// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html

#ifndef OPENCV_HAL_VSX_UTILS_HPP
#define OPENCV_HAL_VSX_UTILS_HPP

#include "opencv2/core/cvdef.h"

#ifndef SKIP_INCLUDES
# include <assert.h>
#endif

//! @addtogroup core_utils_vsx
//! @{

#if CV_VSX

#define __VSX_S16__(c, v) (c){v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v}
#define __VSX_S8__(c, v) (c){v, v, v, v, v, v, v, v}
#define __VSX_S4__(c, v) (c){v, v, v, v}
#define __VSX_S2__(c, v) (c){v, v}

typedef __vector unsigned char vec_uchar16;
#define vec_uchar16_set(...) (vec_uchar16){__VA_ARGS__}
#define vec_uchar16_sp(c) (__VSX_S16__(vec_uchar16, (unsigned char)c))
#define vec_uchar16_c(v) ((vec_uchar16)(v))
#define vec_uchar16_z vec_uchar16_sp(0)

typedef __vector signed char vec_char16;
#define vec_char16_set(...) (vec_char16){__VA_ARGS__}
#define vec_char16_sp(c) (__VSX_S16__(vec_char16, (signed char)c))
#define vec_char16_c(v) ((vec_char16)(v))
#define vec_char16_z vec_char16_sp(0)

typedef __vector unsigned short vec_ushort8;
#define vec_ushort8_set(...) (vec_ushort8){__VA_ARGS__}
#define vec_ushort8_sp(c) (__VSX_S8__(vec_ushort8, (unsigned short)c))
#define vec_ushort8_c(v) ((vec_ushort8)(v))
#define vec_ushort8_z vec_ushort8_sp(0)

typedef __vector signed short vec_short8;
#define vec_short8_set(...) (vec_short8){__VA_ARGS__}
#define vec_short8_sp(c) (__VSX_S8__(vec_short8, (signed short)c))
#define vec_short8_c(v) ((vec_short8)(v))
#define vec_short8_z vec_short8_sp(0)

typedef __vector unsigned int vec_uint4;
#define vec_uint4_set(...) (vec_uint4){__VA_ARGS__}
#define vec_uint4_sp(c) (__VSX_S4__(vec_uint4, (unsigned int)c))
#define vec_uint4_c(v) ((vec_uint4)(v))
#define vec_uint4_z vec_uint4_sp(0)

typedef __vector signed int vec_int4;
#define vec_int4_set(...) (vec_int4){__VA_ARGS__}
#define vec_int4_sp(c) (__VSX_S4__(vec_int4, (signed int)c))
#define vec_int4_c(v) ((vec_int4)(v))
#define vec_int4_z vec_int4_sp(0)

typedef __vector float vec_float4;
#define vec_float4_set(...) (vec_float4){__VA_ARGS__}
#define vec_float4_sp(c) (__VSX_S4__(vec_float4, c))
#define vec_float4_c(v) ((vec_float4)(v))
#define vec_float4_z vec_float4_sp(0)

typedef __vector unsigned long long vec_udword2;
#define vec_udword2_set(...) (vec_udword2){__VA_ARGS__}
#define vec_udword2_sp(c) (__VSX_S2__(vec_udword2, (unsigned long long)c))
#define vec_udword2_c(v) ((vec_udword2)(v))
#define vec_udword2_z vec_udword2_sp(0)

typedef __vector signed long long vec_dword2;
#define vec_dword2_set(...) (vec_dword2){__VA_ARGS__}
#define vec_dword2_sp(c) (__VSX_S2__(vec_dword2, (signed long long)c))
#define vec_dword2_c(v) ((vec_dword2)(v))
#define vec_dword2_z vec_dword2_sp(0)

typedef __vector double vec_double2;
#define vec_double2_set(...) (vec_double2){__VA_ARGS__}
#define vec_double2_c(v) ((vec_double2)(v))
#define vec_double2_sp(c) (__VSX_S2__(vec_double2, c))
#define vec_double2_z vec_double2_sp(0)

#define vec_bchar16 __vector __bool char
#define vec_bchar16_set(...) (vec_bchar16){__VA_ARGS__}
#define vec_bchar16_c(v) ((vec_bchar16)(v))

#define vec_bshort8 __vector __bool short
#define vec_bshort8_set(...) (vec_bshort8){__VA_ARGS__}
#define vec_bshort8_c(v) ((vec_bshort8)(v))

#define vec_bint4 __vector __bool int
#define vec_bint4_set(...) (vec_bint4){__VA_ARGS__}
#define vec_bint4_c(v) ((vec_bint4)(v))

#define vec_bdword2 __vector __bool long long
#define vec_bdword2_set(...) (vec_bdword2){__VA_ARGS__}
#define vec_bdword2_c(v) ((vec_bdword2)(v))
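
// Illustrative usage of the helpers above (a sketch, not part of the original header):
//   vec_int4  a = vec_int4_set(1, 2, 3, 4);   // element-wise initialization
//   vec_int4  b = vec_int4_sp(7);             // splat: {7, 7, 7, 7}
//   vec_int4  z = vec_int4_z;                 // all-zero vector
//   vec_uint4 u = vec_uint4_c(a);             // reinterpret cast between lane types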

#define VSX_FINLINE(tp) extern inline tp __attribute__((always_inline))

#define VSX_REDIRECT_1RG(rt, rg, fnm, fn2) \
VSX_FINLINE(rt) fnm(const rg& a) { return fn2(a); }

#define VSX_REDIRECT_2RG(rt, rg, fnm, fn2) \
VSX_FINLINE(rt) fnm(const rg& a, const rg& b) { return fn2(a, b); }
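
// For reference, a redirect macro simply forwards one intrinsic name to another.
// For example, VSX_REDIRECT_2RG(vec_int4, vec_short8, vec_mule, __builtin_vec_mule)
// expands to roughly:
//   VSX_FINLINE(vec_int4) vec_mule(const vec_short8& a, const vec_short8& b)
//   { return __builtin_vec_mule(a, b); }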

/*
 * GCC VSX compatibility
**/
#if defined(__GNUG__) && !defined(__clang__)

// inline asm helper
#define VSX_IMPL_1RG(rt, rg, opc, fnm) \
VSX_FINLINE(rt) fnm(const rg& a) \
{ rt rs; __asm__ __volatile__(#opc" %x0,%x1" : "=wa" (rs) : "wa" (a)); return rs; }

#define VSX_IMPL_1VRG(rt, rg, opc, fnm) \
VSX_FINLINE(rt) fnm(const rg& a) \
{ rt rs; __asm__ __volatile__(#opc" %0,%1" : "=v" (rs) : "v" (a)); return rs; }

#define VSX_IMPL_2VRG_F(rt, rg, fopc, fnm) \
VSX_FINLINE(rt) fnm(const rg& a, const rg& b) \
{ rt rs; __asm__ __volatile__(fopc : "=v" (rs) : "v" (a), "v" (b)); return rs; }

#define VSX_IMPL_2VRG(rt, rg, opc, fnm) VSX_IMPL_2VRG_F(rt, rg, #opc" %0,%1,%2", fnm)
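
// As a sketch, VSX_IMPL_2VRG(vec_dword2, vec_int4, vmulosw, vec_mule) expands to an
// always-inline wrapper around a single instruction:
//   VSX_FINLINE(vec_dword2) vec_mule(const vec_int4& a, const vec_int4& b)
//   { vec_dword2 rs; __asm__ __volatile__("vmulosw %0,%1,%2" : "=v" (rs) : "v" (a), "v" (b)); return rs; }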

#if __GNUG__ < 8
// Support for int4 -> dword2 expanding multiply was added in GCC 8.
#ifdef vec_mule
#undef vec_mule
#endif
#ifdef vec_mulo
#undef vec_mulo
#endif
VSX_REDIRECT_2RG(vec_ushort8, vec_uchar16, vec_mule, __builtin_vec_mule)
VSX_REDIRECT_2RG(vec_short8, vec_char16, vec_mule, __builtin_vec_mule)
VSX_REDIRECT_2RG(vec_int4, vec_short8, vec_mule, __builtin_vec_mule)
VSX_REDIRECT_2RG(vec_uint4, vec_ushort8, vec_mule, __builtin_vec_mule)
VSX_REDIRECT_2RG(vec_ushort8, vec_uchar16, vec_mulo, __builtin_vec_mulo)
VSX_REDIRECT_2RG(vec_short8, vec_char16, vec_mulo, __builtin_vec_mulo)
VSX_REDIRECT_2RG(vec_int4, vec_short8, vec_mulo, __builtin_vec_mulo)
VSX_REDIRECT_2RG(vec_uint4, vec_ushort8, vec_mulo, __builtin_vec_mulo)
// dword2 variants: the underlying instructions arrived in ISA 2.07, built-in support in GCC 8+
VSX_IMPL_2VRG(vec_dword2, vec_int4, vmulosw, vec_mule)
VSX_IMPL_2VRG(vec_udword2, vec_uint4, vmulouw, vec_mule)
VSX_IMPL_2VRG(vec_dword2, vec_int4, vmulesw, vec_mulo)
VSX_IMPL_2VRG(vec_udword2, vec_uint4, vmuleuw, vec_mulo)
#endif

#if __GNUG__ < 7
// up to GCC 6, vec_mul only supports single/double precision and long long
# ifdef vec_mul
# undef vec_mul
# endif
/*
 * There is no direct instruction for 8-bit and 16-bit element multiplication in ISA 2.07;
 * like XLC, implement it with "multiply even", "multiply odd" and "permute".
**/
# define VSX_IMPL_MULH(Tvec, cperm) \
VSX_FINLINE(Tvec) vec_mul(const Tvec& a, const Tvec& b) \
{ \
    static const vec_uchar16 ev_od = {cperm}; \
    return vec_perm((Tvec)vec_mule(a, b), (Tvec)vec_mulo(a, b), ev_od); \
}
#define VSX_IMPL_MULH_P16 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30
VSX_IMPL_MULH(vec_char16, VSX_IMPL_MULH_P16)
VSX_IMPL_MULH(vec_uchar16, VSX_IMPL_MULH_P16)
#define VSX_IMPL_MULH_P8 0, 1, 16, 17, 4, 5, 20, 21, 8, 9, 24, 25, 12, 13, 28, 29
VSX_IMPL_MULH(vec_short8, VSX_IMPL_MULH_P8)
VSX_IMPL_MULH(vec_ushort8, VSX_IMPL_MULH_P8)
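// Sketch of how the composition above works (assuming the little-endian lane layout
// these permute patterns target): vec_mule(a, b) holds the full-width products of one
// half of the lanes and vec_mulo(a, b) the other half; the ev_od pattern then gathers
// the low half of each product back into the original lane order, which is exactly the
// truncating behavior expected from vec_mul.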
// vmuluwm can be used for both unsigned and signed 32-bit integers (modulo multiply)
VSX_IMPL_2VRG(vec_int4, vec_int4, vmuluwm, vec_mul)
VSX_IMPL_2VRG(vec_uint4, vec_uint4, vmuluwm, vec_mul)
// redirect to the GCC builtin vec_mul, since it already supports single/double precision and long long
VSX_REDIRECT_2RG(vec_float4, vec_float4, vec_mul, __builtin_vec_mul)
VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mul, __builtin_vec_mul)
VSX_REDIRECT_2RG(vec_dword2, vec_dword2, vec_mul, __builtin_vec_mul)
VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mul, __builtin_vec_mul)
#endif // __GNUG__ < 7

#if __GNUG__ < 6
/*
 * The "compare greater than or equal" instructions in ISA 2.07 only support single
 * and double precision.
 * XLC and newer versions of GCC implement the integer variants using "compare greater than" followed by NOR.
**/
# ifdef vec_cmpge
# undef vec_cmpge
# endif
# ifdef vec_cmple
# undef vec_cmple
# endif
# define vec_cmple(a, b) vec_cmpge(b, a)
# define VSX_IMPL_CMPGE(rt, rg, opc, fnm) \
VSX_IMPL_2VRG_F(rt, rg, #opc" %0,%2,%1\n\t xxlnor %x0,%x0,%x0", fnm)
VSX_IMPL_CMPGE(vec_bchar16, vec_char16, vcmpgtsb, vec_cmpge)
VSX_IMPL_CMPGE(vec_bchar16, vec_uchar16, vcmpgtub, vec_cmpge)
VSX_IMPL_CMPGE(vec_bshort8, vec_short8, vcmpgtsh, vec_cmpge)
VSX_IMPL_CMPGE(vec_bshort8, vec_ushort8, vcmpgtuh, vec_cmpge)
VSX_IMPL_CMPGE(vec_bint4, vec_int4, vcmpgtsw, vec_cmpge)
VSX_IMPL_CMPGE(vec_bint4, vec_uint4, vcmpgtuw, vec_cmpge)
VSX_IMPL_CMPGE(vec_bdword2, vec_dword2, vcmpgtsd, vec_cmpge)
VSX_IMPL_CMPGE(vec_bdword2, vec_udword2, vcmpgtud, vec_cmpge)
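// In other words, each wrapper generated above is equivalent to
//   vec_cmpge(a, b) == vec_not(vec_cmpgt(b, a))
// the asm issues the swapped-operand "greater than" compare and then NORs the result
// with itself to negate it.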
// redirect to the GCC builtin cmpge, since it already supports single and double precision
VSX_REDIRECT_2RG(vec_bint4, vec_float4, vec_cmpge, __builtin_vec_cmpge)
VSX_REDIRECT_2RG(vec_bdword2, vec_double2, vec_cmpge, __builtin_vec_cmpge)
// up to GCC 5, vec_nor doesn't support bool long long
# undef vec_nor
template<typename T>
VSX_REDIRECT_2RG(T, T, vec_nor, __builtin_vec_nor)
VSX_FINLINE(vec_bdword2) vec_nor(const vec_bdword2& a, const vec_bdword2& b)
{ return vec_bdword2_c(__builtin_vec_nor(vec_dword2_c(a), vec_dword2_c(b))); }
// vec_packs doesn't support doublewords in GCC 4 and early versions of GCC 5
# undef vec_packs
VSX_REDIRECT_2RG(vec_char16, vec_short8, vec_packs, __builtin_vec_packs)
VSX_REDIRECT_2RG(vec_uchar16, vec_ushort8, vec_packs, __builtin_vec_packs)
VSX_REDIRECT_2RG(vec_short8, vec_int4, vec_packs, __builtin_vec_packs)
VSX_REDIRECT_2RG(vec_ushort8, vec_uint4, vec_packs, __builtin_vec_packs)
VSX_IMPL_2VRG_F(vec_int4, vec_dword2, "vpksdss %0,%2,%1", vec_packs)
VSX_IMPL_2VRG_F(vec_uint4, vec_udword2, "vpkudus %0,%2,%1", vec_packs)
#endif // __GNUG__ < 6

#if __GNUG__ < 5
// vec_xxpermdi in GCC 4 lacks little-endian support, just like clang
# define vec_permi(a, b, c) vec_xxpermdi(b, a, (3 ^ (((c) & 1) << 1 | (c) >> 1)))
// vec_vbpermq has the same issue as vec_xxpermdi
# undef vec_vbpermq
VSX_IMPL_2VRG(vec_udword2, vec_uchar16, vbpermq, vec_vbpermq)
VSX_IMPL_2VRG(vec_dword2, vec_char16, vbpermq, vec_vbpermq)
#else
# define vec_permi vec_xxpermdi
#endif // __GNUG__ < 5
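
// For reference, vec_permi follows XLC's semantics: the two-bit immediate selects one
// doubleword from each source, i.e. vec_permi(a, b, c) == { a[(c >> 1) & 1], b[c & 1] }.
// For example, vec_permi(a, b, 1) yields { a[0], b[1] } and vec_permi(a, b, 2) yields
// { a[1], b[0] }; the little-endian fixup above remaps the immediate so that
// vec_xxpermdi produces the same result.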

// shift left double by word immediate
#ifndef vec_sldw
# define vec_sldw __builtin_vsx_xxsldwi
#endif

// vector population count
VSX_IMPL_1VRG(vec_uchar16, vec_uchar16, vpopcntb, vec_popcntu)
VSX_IMPL_1VRG(vec_uchar16, vec_char16, vpopcntb, vec_popcntu)
VSX_IMPL_1VRG(vec_ushort8, vec_ushort8, vpopcnth, vec_popcntu)
VSX_IMPL_1VRG(vec_ushort8, vec_short8, vpopcnth, vec_popcntu)
VSX_IMPL_1VRG(vec_uint4, vec_uint4, vpopcntw, vec_popcntu)
VSX_IMPL_1VRG(vec_uint4, vec_int4, vpopcntw, vec_popcntu)
VSX_IMPL_1VRG(vec_udword2, vec_udword2, vpopcntd, vec_popcntu)
VSX_IMPL_1VRG(vec_udword2, vec_dword2, vpopcntd, vec_popcntu)

// converts between single and double precision
VSX_REDIRECT_1RG(vec_float4, vec_double2, vec_cvfo, __builtin_vsx_xvcvdpsp)
VSX_REDIRECT_1RG(vec_double2, vec_float4, vec_cvfo, __builtin_vsx_xvcvspdp)

// converts word and doubleword to double-precision
#undef vec_ctd
VSX_IMPL_1RG(vec_double2, vec_int4, xvcvsxwdp, vec_ctdo)
VSX_IMPL_1RG(vec_double2, vec_uint4, xvcvuxwdp, vec_ctdo)
VSX_IMPL_1RG(vec_double2, vec_dword2, xvcvsxddp, vec_ctd)
VSX_IMPL_1RG(vec_double2, vec_udword2, xvcvuxddp, vec_ctd)

// converts word and doubleword to single-precision
#undef vec_ctf
VSX_IMPL_1RG(vec_float4, vec_int4, xvcvsxwsp, vec_ctf)
VSX_IMPL_1RG(vec_float4, vec_uint4, xvcvuxwsp, vec_ctf)
VSX_IMPL_1RG(vec_float4, vec_dword2, xvcvsxdsp, vec_ctfo)
VSX_IMPL_1RG(vec_float4, vec_udword2, xvcvuxdsp, vec_ctfo)

// converts single and double precision to signed word
#undef vec_cts
VSX_IMPL_1RG(vec_int4, vec_double2, xvcvdpsxws, vec_ctso)
VSX_IMPL_1RG(vec_int4, vec_float4, xvcvspsxws, vec_cts)

// converts single and double precision to unsigned word
#undef vec_ctu
VSX_IMPL_1RG(vec_uint4, vec_double2, xvcvdpuxws, vec_ctuo)
VSX_IMPL_1RG(vec_uint4, vec_float4, xvcvspuxws, vec_ctu)

// converts single and double precision to signed doubleword
#undef vec_ctsl
VSX_IMPL_1RG(vec_dword2, vec_double2, xvcvdpsxds, vec_ctsl)
VSX_IMPL_1RG(vec_dword2, vec_float4, xvcvspsxds, vec_ctslo)

// converts single and double precision to unsigned doubleword
#undef vec_ctul
VSX_IMPL_1RG(vec_udword2, vec_double2, xvcvdpuxds, vec_ctul)
VSX_IMPL_1RG(vec_udword2, vec_float4, xvcvspuxds, vec_ctulo)
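
// Naming note for the conversions above and below: the trailing "o" (vec_ctdo, vec_ctfo,
// vec_ctso, vec_ctuo, vec_ctslo, vec_ctulo, vec_cvfo) marks the variants that operate on
// the odd-indexed lanes when the element count changes (4 lanes <-> 2 lanes). The
// even-lane counterparts without the suffix are synthesized later in the common
// GCC/CLANG section by rotating the input with vec_sldw before calling the odd variant.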

// in case GCC doesn't define them
#ifndef vec_xl
# define vec_xl vec_vsx_ld
# define vec_xst vec_vsx_st
#endif

#endif // GCC VSX compatibility

/*
 * CLANG VSX compatibility
**/
#if defined(__clang__) && !defined(__IBMCPP__)

/*
 * CLANG doesn't support %x<n> in inline asm templates, which fixes the register number
 * when using any of the register constraints wa, wd, wf.
 *
 * For more explanation, check out PowerPC and IBM RS6000 in https://gcc.gnu.org/onlinedocs/gcc/Machine-Constraints.html
 * There is also an open bug report: https://bugs.llvm.org/show_bug.cgi?id=31837
 *
 * So we can't use inline asm here; we only use the built-in functions that CLANG supports,
 * and fall back to __builtin_convertvector when CLANG is missing a vector conversion built-in.
 *
 * TODO: the CLANG asm template bug has been fixed; the current workarounds should be reconsidered.
*/

// convert vector helper
#define VSX_IMPL_CONVERT(rt, rg, fnm) \
VSX_FINLINE(rt) fnm(const rg& a) { return __builtin_convertvector(a, rt); }

#ifndef vec_permi
#if __clang_major__ < 5
// implement vec_permi in a dirty way
# define VSX_IMPL_CLANG_4_PERMI(Tvec) \
VSX_FINLINE(Tvec) vec_permi(const Tvec& a, const Tvec& b, unsigned const char c) \
{ \
    switch (c) \
    { \
    case 0: \
        return vec_mergeh(a, b); \
    case 1: \
        return vec_mergel(vec_mergeh(a, a), b); \
    case 2: \
        return vec_mergeh(vec_mergel(a, a), b); \
    default: \
        return vec_mergel(a, b); \
    } \
}
VSX_IMPL_CLANG_4_PERMI(vec_udword2)
VSX_IMPL_CLANG_4_PERMI(vec_dword2)
VSX_IMPL_CLANG_4_PERMI(vec_double2)
// vec_xxsldwi is missing in clang 4
# define vec_xxsldwi(a, b, c) vec_sld(a, b, (c) * 4)
#else
// vec_xxpermdi is missing little-endian support in clang 4, just like gcc 4
# define vec_permi(a, b, c) vec_xxpermdi(b, a, (3 ^ (((c) & 1) << 1 | (c) >> 1)))
#endif // __clang_major__ < 5
#endif

// shift left double by word immediate
#ifndef vec_sldw
# define vec_sldw vec_xxsldwi
#endif

#if __clang_major__ < 13
// Implement vec_rsqrt since clang only supports vec_rsqrte
#ifndef vec_rsqrt
VSX_FINLINE(vec_float4) vec_rsqrt(const vec_float4& a)
{ return vec_div(vec_float4_sp(1), vec_sqrt(a)); }

VSX_FINLINE(vec_double2) vec_rsqrt(const vec_double2& a)
{ return vec_div(vec_double2_sp(1), vec_sqrt(a)); }
#endif

// vec_promote is missing support for doublewords
VSX_FINLINE(vec_dword2) vec_promote(long long a, int b)
{
    vec_dword2 ret = vec_dword2_z;
    ret[b & 1] = a;
    return ret;
}

VSX_FINLINE(vec_udword2) vec_promote(unsigned long long a, int b)
{
    vec_udword2 ret = vec_udword2_z;
    ret[b & 1] = a;
    return ret;
}
#endif
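
// Illustrative use of the doubleword vec_promote overloads above (a sketch, not part of
// the original header): vec_promote(x, 1) returns a vector whose element 1 is x. The
// standard intrinsic leaves the other element undefined, while this fallback
// zero-initializes it:
//   vec_dword2 v = vec_promote((long long)42, 0);   // {42, 0} with this fallback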

// vec_popcnt should return an unsigned type, but clang disagrees, just like gcc's vec_vpopcnt
#define VSX_IMPL_POPCNTU(Tvec, Tvec2, ucast) \
VSX_FINLINE(Tvec) vec_popcntu(const Tvec2& a) \
{ return ucast(vec_popcnt(a)); }
VSX_IMPL_POPCNTU(vec_uchar16, vec_char16, vec_uchar16_c);
VSX_IMPL_POPCNTU(vec_ushort8, vec_short8, vec_ushort8_c);
VSX_IMPL_POPCNTU(vec_uint4, vec_int4, vec_uint4_c);
VSX_IMPL_POPCNTU(vec_udword2, vec_dword2, vec_udword2_c);
// redirect unsigned types
VSX_REDIRECT_1RG(vec_uchar16, vec_uchar16, vec_popcntu, vec_popcnt)
VSX_REDIRECT_1RG(vec_ushort8, vec_ushort8, vec_popcntu, vec_popcnt)
VSX_REDIRECT_1RG(vec_uint4, vec_uint4, vec_popcntu, vec_popcnt)
VSX_REDIRECT_1RG(vec_udword2, vec_udword2, vec_popcntu, vec_popcnt)

// converts between single and double precision
VSX_REDIRECT_1RG(vec_float4, vec_double2, vec_cvfo, __builtin_vsx_xvcvdpsp)
VSX_REDIRECT_1RG(vec_double2, vec_float4, vec_cvfo, __builtin_vsx_xvcvspdp)

// converts word and doubleword to double-precision
#ifdef vec_ctd
# undef vec_ctd
#endif
VSX_REDIRECT_1RG(vec_double2, vec_int4, vec_ctdo, __builtin_vsx_xvcvsxwdp)
VSX_REDIRECT_1RG(vec_double2, vec_uint4, vec_ctdo, __builtin_vsx_xvcvuxwdp)
VSX_IMPL_CONVERT(vec_double2, vec_dword2, vec_ctd)
VSX_IMPL_CONVERT(vec_double2, vec_udword2, vec_ctd)

// converts word and doubleword to single-precision
#if __clang_major__ > 4
# undef vec_ctf
#endif
VSX_IMPL_CONVERT(vec_float4, vec_int4, vec_ctf)
VSX_IMPL_CONVERT(vec_float4, vec_uint4, vec_ctf)
VSX_REDIRECT_1RG(vec_float4, vec_dword2, vec_ctfo, __builtin_vsx_xvcvsxdsp)
VSX_REDIRECT_1RG(vec_float4, vec_udword2, vec_ctfo, __builtin_vsx_xvcvuxdsp)

// converts single and double precision to signed word
#if __clang_major__ > 4
# undef vec_cts
#endif
VSX_REDIRECT_1RG(vec_int4, vec_double2, vec_ctso, __builtin_vsx_xvcvdpsxws)
VSX_IMPL_CONVERT(vec_int4, vec_float4, vec_cts)

// converts single and double precision to unsigned word
#if __clang_major__ > 4
# undef vec_ctu
#endif
VSX_REDIRECT_1RG(vec_uint4, vec_double2, vec_ctuo, __builtin_vsx_xvcvdpuxws)
VSX_IMPL_CONVERT(vec_uint4, vec_float4, vec_ctu)

// converts single and double precision to signed doubleword
#ifdef vec_ctsl
# undef vec_ctsl
#endif
VSX_IMPL_CONVERT(vec_dword2, vec_double2, vec_ctsl)
// __builtin_convertvector can't handle this conversion and the xvcvspsxds built-in is missing,
// so go through double precision instead
VSX_FINLINE(vec_dword2) vec_ctslo(const vec_float4& a)
{ return vec_ctsl(vec_cvfo(a)); }

// converts single and double precision to unsigned doubleword
#ifdef vec_ctul
# undef vec_ctul
#endif
VSX_IMPL_CONVERT(vec_udword2, vec_double2, vec_ctul)
// __builtin_convertvector can't handle this conversion and the xvcvspuxds built-in is missing,
// so go through double precision instead
VSX_FINLINE(vec_udword2) vec_ctulo(const vec_float4& a)
{ return vec_ctul(vec_cvfo(a)); }

#endif // CLANG VSX compatibility

/*
 * Common GCC, CLANG compatibility
**/
#if defined(__GNUG__) && !defined(__IBMCPP__)

#ifdef vec_cvf
# undef vec_cvf
#endif

#define VSX_IMPL_CONV_EVEN_4_2(rt, rg, fnm, fn2) \
VSX_FINLINE(rt) fnm(const rg& a) \
{ return fn2(vec_sldw(a, a, 1)); }

VSX_IMPL_CONV_EVEN_4_2(vec_double2, vec_float4, vec_cvf, vec_cvfo)
VSX_IMPL_CONV_EVEN_4_2(vec_double2, vec_int4, vec_ctd, vec_ctdo)
VSX_IMPL_CONV_EVEN_4_2(vec_double2, vec_uint4, vec_ctd, vec_ctdo)
VSX_IMPL_CONV_EVEN_4_2(vec_dword2, vec_float4, vec_ctsl, vec_ctslo)
VSX_IMPL_CONV_EVEN_4_2(vec_udword2, vec_float4, vec_ctul, vec_ctulo)

#define VSX_IMPL_CONV_EVEN_2_4(rt, rg, fnm, fn2) \
VSX_FINLINE(rt) fnm(const rg& a) \
{ \
    rt v4 = fn2(a); \
    return vec_sldw(v4, v4, 3); \
}

VSX_IMPL_CONV_EVEN_2_4(vec_float4, vec_double2, vec_cvf, vec_cvfo)
VSX_IMPL_CONV_EVEN_2_4(vec_float4, vec_dword2, vec_ctf, vec_ctfo)
VSX_IMPL_CONV_EVEN_2_4(vec_float4, vec_udword2, vec_ctf, vec_ctfo)
VSX_IMPL_CONV_EVEN_2_4(vec_int4, vec_double2, vec_cts, vec_ctso)
VSX_IMPL_CONV_EVEN_2_4(vec_uint4, vec_double2, vec_ctu, vec_ctuo)
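
// Sketch of the even/odd scheme above: the hardware word<->doubleword conversions read
// lanes from fixed positions, which this header exposes as the odd-lane variants
// (vec_cvfo, vec_ctdo, ...). The even-lane wrappers generated here simply rotate the
// vector by one word with vec_sldw so the even lanes land in those positions
// (4 -> 2 lanes), or rotate the 4-lane result back into place (2 -> 4 lanes).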

// Only for Eigen!
/*
 * changing the behavior of the conversion intrinsics for gcc affects Eigen,
 * so we re-define the old behavior again, only for gcc and clang
*/
#if !defined(__clang__) || __clang_major__ > 4
// ignoring the second arg since Eigen only truncates toward zero
# define VSX_IMPL_CONV_2VARIANT(rt, rg, fnm, fn2) \
VSX_FINLINE(rt) fnm(const rg& a, int only_truncate) \
{ \
    assert(only_truncate == 0); \
    CV_UNUSED(only_truncate); \
    return fn2(a); \
}
VSX_IMPL_CONV_2VARIANT(vec_int4, vec_float4, vec_cts, vec_cts)
VSX_IMPL_CONV_2VARIANT(vec_uint4, vec_float4, vec_ctu, vec_ctu)
VSX_IMPL_CONV_2VARIANT(vec_float4, vec_int4, vec_ctf, vec_ctf)
VSX_IMPL_CONV_2VARIANT(vec_float4, vec_uint4, vec_ctf, vec_ctf)
// define vec_cts for converting double precision to signed doubleword,
// which isn't compatible with xlc, but that's okay since Eigen only uses it with gcc
VSX_IMPL_CONV_2VARIANT(vec_dword2, vec_double2, vec_cts, vec_ctsl)
#endif // Eigen

#endif // Common GCC, CLANG compatibility

/*
 * XLC VSX compatibility
**/
#if defined(__IBMCPP__)

// vector population count
#define vec_popcntu vec_popcnt

// overload and redirect with the second arg set to zero,
// since we only support conversions without the second arg
#define VSX_IMPL_OVERLOAD_Z2(rt, rg, fnm) \
VSX_FINLINE(rt) fnm(const rg& a) { return fnm(a, 0); }

VSX_IMPL_OVERLOAD_Z2(vec_double2, vec_int4, vec_ctd)
VSX_IMPL_OVERLOAD_Z2(vec_double2, vec_uint4, vec_ctd)
VSX_IMPL_OVERLOAD_Z2(vec_double2, vec_dword2, vec_ctd)
VSX_IMPL_OVERLOAD_Z2(vec_double2, vec_udword2, vec_ctd)
VSX_IMPL_OVERLOAD_Z2(vec_float4, vec_int4, vec_ctf)
VSX_IMPL_OVERLOAD_Z2(vec_float4, vec_uint4, vec_ctf)
VSX_IMPL_OVERLOAD_Z2(vec_float4, vec_dword2, vec_ctf)
VSX_IMPL_OVERLOAD_Z2(vec_float4, vec_udword2, vec_ctf)
VSX_IMPL_OVERLOAD_Z2(vec_int4, vec_double2, vec_cts)
VSX_IMPL_OVERLOAD_Z2(vec_int4, vec_float4, vec_cts)
VSX_IMPL_OVERLOAD_Z2(vec_uint4, vec_double2, vec_ctu)
VSX_IMPL_OVERLOAD_Z2(vec_uint4, vec_float4, vec_ctu)
VSX_IMPL_OVERLOAD_Z2(vec_dword2, vec_double2, vec_ctsl)
VSX_IMPL_OVERLOAD_Z2(vec_dword2, vec_float4, vec_ctsl)
VSX_IMPL_OVERLOAD_Z2(vec_udword2, vec_double2, vec_ctul)
VSX_IMPL_OVERLOAD_Z2(vec_udword2, vec_float4, vec_ctul)

// FIXME: implement conversions of odd-numbered elements in a dirty way,
// since xlc doesn't support VSX register operands in inline asm
#define VSX_IMPL_CONV_ODD_4_2(rt, rg, fnm, fn2) \
VSX_FINLINE(rt) fnm(const rg& a) { return fn2(vec_sldw(a, a, 3)); }

VSX_IMPL_CONV_ODD_4_2(vec_double2, vec_float4, vec_cvfo, vec_cvf)
VSX_IMPL_CONV_ODD_4_2(vec_double2, vec_int4, vec_ctdo, vec_ctd)
VSX_IMPL_CONV_ODD_4_2(vec_double2, vec_uint4, vec_ctdo, vec_ctd)
VSX_IMPL_CONV_ODD_4_2(vec_dword2, vec_float4, vec_ctslo, vec_ctsl)
VSX_IMPL_CONV_ODD_4_2(vec_udword2, vec_float4, vec_ctulo, vec_ctul)

#define VSX_IMPL_CONV_ODD_2_4(rt, rg, fnm, fn2) \
VSX_FINLINE(rt) fnm(const rg& a) \
{ \
    rt v4 = fn2(a); \
    return vec_sldw(v4, v4, 1); \
}

VSX_IMPL_CONV_ODD_2_4(vec_float4, vec_double2, vec_cvfo, vec_cvf)
VSX_IMPL_CONV_ODD_2_4(vec_float4, vec_dword2, vec_ctfo, vec_ctf)
VSX_IMPL_CONV_ODD_2_4(vec_float4, vec_udword2, vec_ctfo, vec_ctf)
VSX_IMPL_CONV_ODD_2_4(vec_int4, vec_double2, vec_ctso, vec_cts)
VSX_IMPL_CONV_ODD_2_4(vec_uint4, vec_double2, vec_ctuo, vec_ctu)

#endif // XLC VSX compatibility

// ignore the GCC warning caused by -Wunused-but-set-variable in rare cases
#if defined(__GNUG__) && !defined(__clang__)
# define VSX_UNUSED(Tvec) Tvec __attribute__((__unused__))
#else // CLANG, XLC
# define VSX_UNUSED(Tvec) Tvec
#endif

// gcc resolves the cast from long int on its own, while XLC and CLANG treat it as ambiguous
#if defined(__clang__) || defined(__IBMCPP__)
VSX_FINLINE(vec_udword2) vec_splats(uint64 v)
{ return vec_splats((unsigned long long) v); }

VSX_FINLINE(vec_dword2) vec_splats(int64 v)
{ return vec_splats((long long) v); }

VSX_FINLINE(vec_udword2) vec_promote(uint64 a, int b)
{ return vec_promote((unsigned long long) a, b); }

VSX_FINLINE(vec_dword2) vec_promote(int64 a, int b)
{ return vec_promote((long long) a, b); }
#endif

/*
 * Implement vsx_ld(offset, pointer) and vsx_st(vector, offset, pointer):
 * load and store with an offset expressed in elements of the pointer type.
 *
 * Implement vsx_ldf(offset, pointer) and vsx_stf(vector, offset, pointer):
 * load and store with an offset expressed in bytes.
 *
 * Note: in clang, vec_xl and vec_xst fail on unaligned addresses,
 * so we use vec_vsx_ld and vec_vsx_st instead.
*/
#if defined(__clang__) && !defined(__IBMCPP__)
# define vsx_ldf vec_vsx_ld
# define vsx_stf vec_vsx_st
#else // GCC , XLC
# define vsx_ldf vec_xl
# define vsx_stf vec_xst
#endif

#define VSX_OFFSET(o, p) ((o) * sizeof(*(p)))
#define vsx_ld(o, p) vsx_ldf(VSX_OFFSET(o, p), p)
#define vsx_st(v, o, p) vsx_stf(v, VSX_OFFSET(o, p), p)
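
// Illustrative difference between the two forms (a sketch, not part of the original
// header): for a float* p, vsx_ld(4, p) loads 16 bytes starting at p + 4 (the offset is
// scaled by sizeof(float)), while vsx_ldf(16, p) loads 16 bytes starting at the byte
// address (char*)p + 16 -- the same location expressed two ways.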

/*
 * Implement vsx_ld2(offset, pointer) and vsx_st2(vector, offset, pointer) to load and store doublewords.
 *
 * In GCC, vec_xl and vec_xst map to vec_vsx_ld and vec_vsx_st, which don't support long long,
 * and in CLANG we use vec_vsx_ld and vec_vsx_st anyway, because vec_xl and vec_xst fail on unaligned addresses.
 *
 * In XLC, vec_xl and vec_xst fail to cast int64 (long int) to long long.
*/
#if (defined(__GNUG__) || defined(__clang__)) && !defined(__IBMCPP__)
VSX_FINLINE(vec_udword2) vsx_ld2(long o, const uint64* p)
{ return vec_udword2_c(vsx_ldf(VSX_OFFSET(o, p), (unsigned int*)p)); }

VSX_FINLINE(vec_dword2) vsx_ld2(long o, const int64* p)
{ return vec_dword2_c(vsx_ldf(VSX_OFFSET(o, p), (int*)p)); }

VSX_FINLINE(void) vsx_st2(const vec_udword2& vec, long o, uint64* p)
{ vsx_stf(vec_uint4_c(vec), VSX_OFFSET(o, p), (unsigned int*)p); }

VSX_FINLINE(void) vsx_st2(const vec_dword2& vec, long o, int64* p)
{ vsx_stf(vec_int4_c(vec), VSX_OFFSET(o, p), (int*)p); }
#else // XLC
VSX_FINLINE(vec_udword2) vsx_ld2(long o, const uint64* p)
{ return vsx_ldf(VSX_OFFSET(o, p), (unsigned long long*)p); }

VSX_FINLINE(vec_dword2) vsx_ld2(long o, const int64* p)
{ return vsx_ldf(VSX_OFFSET(o, p), (long long*)p); }

VSX_FINLINE(void) vsx_st2(const vec_udword2& vec, long o, uint64* p)
{ vsx_stf(vec, VSX_OFFSET(o, p), (unsigned long long*)p); }

VSX_FINLINE(void) vsx_st2(const vec_dword2& vec, long o, int64* p)
{ vsx_stf(vec, VSX_OFFSET(o, p), (long long*)p); }
#endif

// Store the lower 8 bytes
#define vec_st_l8(v, p) *((uint64*)(p)) = vec_extract(vec_udword2_c(v), 0)

// Store the higher 8 bytes
#define vec_st_h8(v, p) *((uint64*)(p)) = vec_extract(vec_udword2_c(v), 1)

// Load 64 bits of integer data into the lower part
#define VSX_IMPL_LOAD_L8(Tvec, Tp) \
VSX_FINLINE(Tvec) vec_ld_l8(const Tp *p) \
{ return ((Tvec)vec_promote(*((uint64*)p), 0)); }

VSX_IMPL_LOAD_L8(vec_uchar16, uchar)
VSX_IMPL_LOAD_L8(vec_char16, schar)
VSX_IMPL_LOAD_L8(vec_ushort8, ushort)
VSX_IMPL_LOAD_L8(vec_short8, short)
VSX_IMPL_LOAD_L8(vec_uint4, uint)
VSX_IMPL_LOAD_L8(vec_int4, int)
VSX_IMPL_LOAD_L8(vec_float4, float)
VSX_IMPL_LOAD_L8(vec_udword2, uint64)
VSX_IMPL_LOAD_L8(vec_dword2, int64)
VSX_IMPL_LOAD_L8(vec_double2, double)
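
// Illustrative pairing of the helpers above (a sketch, not part of the original header):
//   uchar buf[8];
//   vec_uchar16 v = vec_ld_l8(buf);   // buf fills the low 64 bits; the upper half is unspecified
//   vec_st_l8(v, buf);                // writes the low 64 bits back to buf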

// logical not
#define vec_not(a) vec_nor(a, a)

// POWER9 provides the following natively; emulate them elsewhere
// not equal
#ifndef vec_cmpne
# define vec_cmpne(a, b) vec_not(vec_cmpeq(a, b))
#endif
// absolute difference
#ifndef _ARCH_PWR9
# undef vec_absd
# define vec_absd(a, b) vec_sub(vec_max(a, b), vec_min(a, b))
#endif

/*
 * Implement vec_unpacklu and vec_unpackhu
 * since vec_unpackl, vec_unpackh only support signed integers
**/
#define VSX_IMPL_UNPACKU(rt, rg, zero) \
VSX_FINLINE(rt) vec_unpacklu(const rg& a) \
{ return (rt)(vec_mergel(a, zero)); } \
VSX_FINLINE(rt) vec_unpackhu(const rg& a) \
{ return (rt)(vec_mergeh(a, zero)); }

VSX_IMPL_UNPACKU(vec_ushort8, vec_uchar16, vec_uchar16_z)
VSX_IMPL_UNPACKU(vec_uint4, vec_ushort8, vec_ushort8_z)
VSX_IMPL_UNPACKU(vec_udword2, vec_uint4, vec_uint4_z)

/*
 * Implement vec_mergesqe and vec_mergesqo,
 * which merge the even-indexed and odd-indexed elements of two vectors, in sequence
*/
#define VSX_IMPL_PERM(rt, fnm, ...) \
VSX_FINLINE(rt) fnm(const rt& a, const rt& b) \
{ static const vec_uchar16 perm = {__VA_ARGS__}; return vec_perm(a, b, perm); }

// 16
#define perm16_mergesqe 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
#define perm16_mergesqo 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
VSX_IMPL_PERM(vec_uchar16, vec_mergesqe, perm16_mergesqe)
VSX_IMPL_PERM(vec_uchar16, vec_mergesqo, perm16_mergesqo)
VSX_IMPL_PERM(vec_char16, vec_mergesqe, perm16_mergesqe)
VSX_IMPL_PERM(vec_char16, vec_mergesqo, perm16_mergesqo)
// 8
#define perm8_mergesqe 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
#define perm8_mergesqo 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31
VSX_IMPL_PERM(vec_ushort8, vec_mergesqe, perm8_mergesqe)
VSX_IMPL_PERM(vec_ushort8, vec_mergesqo, perm8_mergesqo)
VSX_IMPL_PERM(vec_short8, vec_mergesqe, perm8_mergesqe)
VSX_IMPL_PERM(vec_short8, vec_mergesqo, perm8_mergesqo)
// 4
#define perm4_mergesqe 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
#define perm4_mergesqo 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
VSX_IMPL_PERM(vec_uint4, vec_mergesqe, perm4_mergesqe)
VSX_IMPL_PERM(vec_uint4, vec_mergesqo, perm4_mergesqo)
VSX_IMPL_PERM(vec_int4, vec_mergesqe, perm4_mergesqe)
VSX_IMPL_PERM(vec_int4, vec_mergesqo, perm4_mergesqo)
VSX_IMPL_PERM(vec_float4, vec_mergesqe, perm4_mergesqe)
VSX_IMPL_PERM(vec_float4, vec_mergesqo, perm4_mergesqo)
// 2
VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mergesqe, vec_mergeh)
VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mergesqo, vec_mergel)
VSX_REDIRECT_2RG(vec_dword2, vec_dword2, vec_mergesqe, vec_mergeh)
VSX_REDIRECT_2RG(vec_dword2, vec_dword2, vec_mergesqo, vec_mergel)
VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mergesqe, vec_mergeh)
VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mergesqo, vec_mergel)
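// For example, with 4-lane vectors a = {a0, a1, a2, a3} and b = {b0, b1, b2, b3}:
//   vec_mergesqe(a, b) -> {a0, a2, b0, b2}   (even-indexed elements of a, then of b)
//   vec_mergesqo(a, b) -> {a1, a3, b1, b3}   (odd-indexed elements of a, then of b)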

/*
 * Implement vec_mergesqh and vec_mergesql,
 * which merge the most and least significant halves of two vectors, in sequence
*/
#define VSX_IMPL_MERGESQHL(Tvec) \
VSX_FINLINE(Tvec) vec_mergesqh(const Tvec& a, const Tvec& b) \
{ return (Tvec)vec_mergeh(vec_udword2_c(a), vec_udword2_c(b)); } \
VSX_FINLINE(Tvec) vec_mergesql(const Tvec& a, const Tvec& b) \
{ return (Tvec)vec_mergel(vec_udword2_c(a), vec_udword2_c(b)); }
VSX_IMPL_MERGESQHL(vec_uchar16)
VSX_IMPL_MERGESQHL(vec_char16)
VSX_IMPL_MERGESQHL(vec_ushort8)
VSX_IMPL_MERGESQHL(vec_short8)
VSX_IMPL_MERGESQHL(vec_uint4)
VSX_IMPL_MERGESQHL(vec_int4)
VSX_IMPL_MERGESQHL(vec_float4)
VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mergesqh, vec_mergeh)
VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mergesql, vec_mergel)
VSX_REDIRECT_2RG(vec_dword2, vec_dword2, vec_mergesqh, vec_mergeh)
VSX_REDIRECT_2RG(vec_dword2, vec_dword2, vec_mergesql, vec_mergel)
VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mergesqh, vec_mergeh)
VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mergesql, vec_mergel)
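// For example, with 4-lane vectors a = {a0, a1, a2, a3} and b = {b0, b1, b2, b3}:
//   vec_mergesqh(a, b) -> {a0, a1, b0, b1}   (first halves of a and b)
//   vec_mergesql(a, b) -> {a2, a3, b2, b3}   (second halves of a and b)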

// 2- and 4-channel interleave for all types except 2-lane ones
#define VSX_IMPL_ST_INTERLEAVE(Tp, Tvec) \
VSX_FINLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b, Tp* ptr) \
{ \
    vsx_stf(vec_mergeh(a, b), 0, ptr); \
    vsx_stf(vec_mergel(a, b), 16, ptr); \
} \
VSX_FINLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b, \
                                    const Tvec& c, const Tvec& d, Tp* ptr) \
{ \
    Tvec ac = vec_mergeh(a, c); \
    Tvec bd = vec_mergeh(b, d); \
    vsx_stf(vec_mergeh(ac, bd), 0, ptr); \
    vsx_stf(vec_mergel(ac, bd), 16, ptr); \
    ac = vec_mergel(a, c); \
    bd = vec_mergel(b, d); \
    vsx_stf(vec_mergeh(ac, bd), 32, ptr); \
    vsx_stf(vec_mergel(ac, bd), 48, ptr); \
}
VSX_IMPL_ST_INTERLEAVE(uchar, vec_uchar16)
VSX_IMPL_ST_INTERLEAVE(schar, vec_char16)
VSX_IMPL_ST_INTERLEAVE(ushort, vec_ushort8)
VSX_IMPL_ST_INTERLEAVE(short, vec_short8)
VSX_IMPL_ST_INTERLEAVE(uint, vec_uint4)
VSX_IMPL_ST_INTERLEAVE(int, vec_int4)
VSX_IMPL_ST_INTERLEAVE(float, vec_float4)
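
// Illustrative effect of the two-channel form above (a sketch): for vec_uint4
// a = {a0, a1, a2, a3} and b = {b0, b1, b2, b3}, vec_st_interleave(a, b, ptr) writes
// a0, b0, a1, b1, a2, b2, a3, b3 to ptr; the four-channel form produces
// a0, b0, c0, d0, a1, b1, c1, d1, ... in the same way.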

// 2- and 4-channel deinterleave for 16 lanes
#define VSX_IMPL_ST_DINTERLEAVE_8(Tp, Tvec) \
VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b) \
{ \
    Tvec v0 = vsx_ld(0, ptr); \
    Tvec v1 = vsx_ld(16, ptr); \
    a = vec_mergesqe(v0, v1); \
    b = vec_mergesqo(v0, v1); \
} \
VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b, \
                                      Tvec& c, Tvec& d) \
{ \
    Tvec v0 = vsx_ld(0, ptr); \
    Tvec v1 = vsx_ld(16, ptr); \
    Tvec v2 = vsx_ld(32, ptr); \
    Tvec v3 = vsx_ld(48, ptr); \
    Tvec m0 = vec_mergesqe(v0, v1); \
    Tvec m1 = vec_mergesqe(v2, v3); \
    a = vec_mergesqe(m0, m1); \
    c = vec_mergesqo(m0, m1); \
    m0 = vec_mergesqo(v0, v1); \
    m1 = vec_mergesqo(v2, v3); \
    b = vec_mergesqe(m0, m1); \
    d = vec_mergesqo(m0, m1); \
}
VSX_IMPL_ST_DINTERLEAVE_8(uchar, vec_uchar16)
VSX_IMPL_ST_DINTERLEAVE_8(schar, vec_char16)

// 2- and 4-channel deinterleave for 8 lanes
#define VSX_IMPL_ST_DINTERLEAVE_16(Tp, Tvec) \
VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b) \
{ \
    Tvec v0 = vsx_ld(0, ptr); \
    Tvec v1 = vsx_ld(8, ptr); \
    a = vec_mergesqe(v0, v1); \
    b = vec_mergesqo(v0, v1); \
} \
VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b, \
                                      Tvec& c, Tvec& d) \
{ \
    Tvec v0 = vsx_ld(0, ptr); \
    Tvec v1 = vsx_ld(8, ptr); \
    Tvec m0 = vec_mergeh(v0, v1); \
    Tvec m1 = vec_mergel(v0, v1); \
    Tvec ab0 = vec_mergeh(m0, m1); \
    Tvec cd0 = vec_mergel(m0, m1); \
    v0 = vsx_ld(16, ptr); \
    v1 = vsx_ld(24, ptr); \
    m0 = vec_mergeh(v0, v1); \
    m1 = vec_mergel(v0, v1); \
    Tvec ab1 = vec_mergeh(m0, m1); \
    Tvec cd1 = vec_mergel(m0, m1); \
    a = vec_mergesqh(ab0, ab1); \
    b = vec_mergesql(ab0, ab1); \
    c = vec_mergesqh(cd0, cd1); \
    d = vec_mergesql(cd0, cd1); \
}
VSX_IMPL_ST_DINTERLEAVE_16(ushort, vec_ushort8)
VSX_IMPL_ST_DINTERLEAVE_16(short, vec_short8)

// 2- and 4-channel deinterleave for 4 lanes
#define VSX_IMPL_ST_DINTERLEAVE_32(Tp, Tvec) \
VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b) \
{ \
    a = vsx_ld(0, ptr); \
    b = vsx_ld(4, ptr); \
    Tvec m0 = vec_mergeh(a, b); \
    Tvec m1 = vec_mergel(a, b); \
    a = vec_mergeh(m0, m1); \
    b = vec_mergel(m0, m1); \
} \
VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b, \
                                      Tvec& c, Tvec& d) \
{ \
    Tvec v0 = vsx_ld(0, ptr); \
    Tvec v1 = vsx_ld(4, ptr); \
    Tvec v2 = vsx_ld(8, ptr); \
    Tvec v3 = vsx_ld(12, ptr); \
    Tvec m0 = vec_mergeh(v0, v2); \
    Tvec m1 = vec_mergeh(v1, v3); \
    a = vec_mergeh(m0, m1); \
    b = vec_mergel(m0, m1); \
    m0 = vec_mergel(v0, v2); \
    m1 = vec_mergel(v1, v3); \
    c = vec_mergeh(m0, m1); \
    d = vec_mergel(m0, m1); \
}
VSX_IMPL_ST_DINTERLEAVE_32(uint, vec_uint4)
VSX_IMPL_ST_DINTERLEAVE_32(int, vec_int4)
VSX_IMPL_ST_DINTERLEAVE_32(float, vec_float4)

// 2- and 4-channel interleave and deinterleave for 2 lanes
#define VSX_IMPL_ST_D_INTERLEAVE_64(Tp, Tvec, ld_func, st_func) \
VSX_FINLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b, Tp* ptr) \
{ \
    st_func(vec_mergeh(a, b), 0, ptr); \
    st_func(vec_mergel(a, b), 2, ptr); \
} \
VSX_FINLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b, \
                                    const Tvec& c, const Tvec& d, Tp* ptr) \
{ \
    st_func(vec_mergeh(a, b), 0, ptr); \
    st_func(vec_mergeh(c, d), 2, ptr); \
    st_func(vec_mergel(a, b), 4, ptr); \
    st_func(vec_mergel(c, d), 6, ptr); \
} \
VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b) \
{ \
    Tvec m0 = ld_func(0, ptr); \
    Tvec m1 = ld_func(2, ptr); \
    a = vec_mergeh(m0, m1); \
    b = vec_mergel(m0, m1); \
} \
VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b, \
                                      Tvec& c, Tvec& d) \
{ \
    Tvec v0 = ld_func(0, ptr); \
    Tvec v1 = ld_func(2, ptr); \
    Tvec v2 = ld_func(4, ptr); \
    Tvec v3 = ld_func(6, ptr); \
    a = vec_mergeh(v0, v2); \
    b = vec_mergel(v0, v2); \
    c = vec_mergeh(v1, v3); \
    d = vec_mergel(v1, v3); \
}
VSX_IMPL_ST_D_INTERLEAVE_64(int64, vec_dword2, vsx_ld2, vsx_st2)
VSX_IMPL_ST_D_INTERLEAVE_64(uint64, vec_udword2, vsx_ld2, vsx_st2)
VSX_IMPL_ST_D_INTERLEAVE_64(double, vec_double2, vsx_ld, vsx_st)

/* 3 channels */
#define VSX_IMPL_ST_INTERLEAVE_3CH_16(Tp, Tvec) \
VSX_FINLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b, \
                                    const Tvec& c, Tp* ptr) \
{ \
    static const vec_uchar16 a12 = {0, 16, 0, 1, 17, 0, 2, 18, 0, 3, 19, 0, 4, 20, 0, 5}; \
    static const vec_uchar16 a123 = {0, 1, 16, 3, 4, 17, 6, 7, 18, 9, 10, 19, 12, 13, 20, 15}; \
    vsx_st(vec_perm(vec_perm(a, b, a12), c, a123), 0, ptr); \
    static const vec_uchar16 b12 = {21, 0, 6, 22, 0, 7, 23, 0, 8, 24, 0, 9, 25, 0, 10, 26}; \
    static const vec_uchar16 b123 = {0, 21, 2, 3, 22, 5, 6, 23, 8, 9, 24, 11, 12, 25, 14, 15}; \
    vsx_st(vec_perm(vec_perm(a, b, b12), c, b123), 16, ptr); \
    static const vec_uchar16 c12 = {0, 11, 27, 0, 12, 28, 0, 13, 29, 0, 14, 30, 0, 15, 31, 0}; \
    static const vec_uchar16 c123 = {26, 1, 2, 27, 4, 5, 28, 7, 8, 29, 10, 11, 30, 13, 14, 31}; \
    vsx_st(vec_perm(vec_perm(a, b, c12), c, c123), 32, ptr); \
} \
VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b, Tvec& c) \
{ \
    Tvec v1 = vsx_ld(0, ptr); \
    Tvec v2 = vsx_ld(16, ptr); \
    Tvec v3 = vsx_ld(32, ptr); \
    static const vec_uchar16 a12_perm = {0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 0, 0, 0, 0, 0}; \
    static const vec_uchar16 a123_perm = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 17, 20, 23, 26, 29}; \
    a = vec_perm(vec_perm(v1, v2, a12_perm), v3, a123_perm); \
    static const vec_uchar16 b12_perm = {1, 4, 7, 10, 13, 16, 19, 22, 25, 28, 31, 0, 0, 0, 0, 0}; \
    static const vec_uchar16 b123_perm = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 18, 21, 24, 27, 30}; \
    b = vec_perm(vec_perm(v1, v2, b12_perm), v3, b123_perm); \
    static const vec_uchar16 c12_perm = {2, 5, 8, 11, 14, 17, 20, 23, 26, 29, 0, 0, 0, 0, 0, 0}; \
    static const vec_uchar16 c123_perm = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16, 19, 22, 25, 28, 31}; \
    c = vec_perm(vec_perm(v1, v2, c12_perm), v3, c123_perm); \
}
VSX_IMPL_ST_INTERLEAVE_3CH_16(uchar, vec_uchar16)
VSX_IMPL_ST_INTERLEAVE_3CH_16(schar, vec_char16)

#define VSX_IMPL_ST_INTERLEAVE_3CH_8(Tp, Tvec) \
VSX_FINLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b, \
                                    const Tvec& c, Tp* ptr) \
{ \
    static const vec_uchar16 a12 = {0, 1, 16, 17, 0, 0, 2, 3, 18, 19, 0, 0, 4, 5, 20, 21}; \
    static const vec_uchar16 a123 = {0, 1, 2, 3, 16, 17, 6, 7, 8, 9, 18, 19, 12, 13, 14, 15}; \
    vsx_st(vec_perm(vec_perm(a, b, a12), c, a123), 0, ptr); \
    static const vec_uchar16 b12 = {0, 0, 6, 7, 22, 23, 0, 0, 8, 9, 24, 25, 0, 0, 10, 11}; \
    static const vec_uchar16 b123 = {20, 21, 2, 3, 4, 5, 22, 23, 8, 9, 10, 11, 24, 25, 14, 15}; \
    vsx_st(vec_perm(vec_perm(a, b, b12), c, b123), 8, ptr); \
    static const vec_uchar16 c12 = {26, 27, 0, 0, 12, 13, 28, 29, 0, 0, 14, 15, 30, 31, 0, 0}; \
    static const vec_uchar16 c123 = {0, 1, 26, 27, 4, 5, 6, 7, 28, 29, 10, 11, 12, 13, 30, 31}; \
    vsx_st(vec_perm(vec_perm(a, b, c12), c, c123), 16, ptr); \
} \
VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b, Tvec& c) \
{ \
    Tvec v1 = vsx_ld(0, ptr); \
    Tvec v2 = vsx_ld(8, ptr); \
    Tvec v3 = vsx_ld(16, ptr); \
    static const vec_uchar16 a12_perm = {0, 1, 6, 7, 12, 13, 18, 19, 24, 25, 30, 31, 0, 0, 0, 0}; \
    static const vec_uchar16 a123_perm = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 20, 21, 26, 27}; \
    a = vec_perm(vec_perm(v1, v2, a12_perm), v3, a123_perm); \
    static const vec_uchar16 b12_perm = {2, 3, 8, 9, 14, 15, 20, 21, 26, 27, 0, 0, 0, 0, 0, 0}; \
    static const vec_uchar16 b123_perm = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16, 17, 22, 23, 28, 29}; \
    b = vec_perm(vec_perm(v1, v2, b12_perm), v3, b123_perm); \
    static const vec_uchar16 c12_perm = {4, 5, 10, 11, 16, 17, 22, 23, 28, 29, 0, 0, 0, 0, 0, 0}; \
    static const vec_uchar16 c123_perm = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 18, 19, 24, 25, 30, 31}; \
    c = vec_perm(vec_perm(v1, v2, c12_perm), v3, c123_perm); \
}
VSX_IMPL_ST_INTERLEAVE_3CH_8(ushort, vec_ushort8)
VSX_IMPL_ST_INTERLEAVE_3CH_8(short, vec_short8)

#define VSX_IMPL_ST_INTERLEAVE_3CH_4(Tp, Tvec) \
VSX_FINLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b, \
                                    const Tvec& c, Tp* ptr) \
{ \
    Tvec hbc = vec_mergeh(b, c); \
    static const vec_uchar16 ahbc = {0, 1, 2, 3, 16, 17, 18, 19, 20, 21, 22, 23, 4, 5, 6, 7}; \
    vsx_st(vec_perm(a, hbc, ahbc), 0, ptr); \
    Tvec lab = vec_mergel(a, b); \
    vsx_st(vec_sld(lab, hbc, 8), 4, ptr); \
    static const vec_uchar16 clab = {8, 9, 10, 11, 24, 25, 26, 27, 28, 29, 30, 31, 12, 13, 14, 15}; \
    vsx_st(vec_perm(c, lab, clab), 8, ptr); \
} \
VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b, Tvec& c) \
{ \
    Tvec v1 = vsx_ld(0, ptr); \
    Tvec v2 = vsx_ld(4, ptr); \
    Tvec v3 = vsx_ld(8, ptr); \
    static const vec_uchar16 flp = {0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19, 28, 29, 30, 31}; \
    a = vec_perm(v1, vec_sld(v3, v2, 8), flp); \
    static const vec_uchar16 flp2 = {28, 29, 30, 31, 0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19}; \
    b = vec_perm(v2, vec_sld(v1, v3, 8), flp2); \
    c = vec_perm(vec_sld(v2, v1, 8), v3, flp); \
}
VSX_IMPL_ST_INTERLEAVE_3CH_4(uint, vec_uint4)
VSX_IMPL_ST_INTERLEAVE_3CH_4(int, vec_int4)
VSX_IMPL_ST_INTERLEAVE_3CH_4(float, vec_float4)

#define VSX_IMPL_ST_INTERLEAVE_3CH_2(Tp, Tvec, ld_func, st_func) \
VSX_FINLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b, \
                                    const Tvec& c, Tp* ptr) \
{ \
    st_func(vec_mergeh(a, b), 0, ptr); \
    st_func(vec_permi(c, a, 1), 2, ptr); \
    st_func(vec_mergel(b, c), 4, ptr); \
} \
VSX_FINLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, \
                                      Tvec& b, Tvec& c) \
{ \
    Tvec v1 = ld_func(0, ptr); \
    Tvec v2 = ld_func(2, ptr); \
    Tvec v3 = ld_func(4, ptr); \
    a = vec_permi(v1, v2, 1); \
    b = vec_permi(v1, v3, 2); \
    c = vec_permi(v2, v3, 1); \
}
VSX_IMPL_ST_INTERLEAVE_3CH_2(int64, vec_dword2, vsx_ld2, vsx_st2)
VSX_IMPL_ST_INTERLEAVE_3CH_2(uint64, vec_udword2, vsx_ld2, vsx_st2)
VSX_IMPL_ST_INTERLEAVE_3CH_2(double, vec_double2, vsx_ld, vsx_st)
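
// Illustrative effect of the 2-lane, 3-channel helpers above (a sketch): with
// a = {a0, a1}, b = {b0, b1}, c = {c0, c1}, vec_st_interleave(a, b, c, ptr) writes
// a0, b0, c0, a1, b1, c1, and vec_ld_deinterleave reverses it; for example,
// a = vec_permi(v1, v2, 1) picks {v1[0], v2[1]} = {a0, a1} from v1 = {a0, b0} and
// v2 = {c0, a1}.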

#endif // CV_VSX

//! @}

#endif // OPENCV_HAL_VSX_UTILS_HPP