/*
 * Loongson SIMD utils
 *
 * Copyright (c) 2016 Loongson Technology Corporation Limited
 * Copyright (c) 2016 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#ifndef AVUTIL_MIPS_MMIUTILS_H
#define AVUTIL_MIPS_MMIUTILS_H

#include <stdint.h>

#include "config.h"
#include "libavutil/mips/asmdefs.h"
#if HAVE_LOONGSON2
  28. #define DECLARE_VAR_LOW32 int32_t low32
  29. #define RESTRICT_ASM_LOW32 [low32]"=&r"(low32),
  30. #define DECLARE_VAR_ALL64 int64_t all64
  31. #define RESTRICT_ASM_ALL64 [all64]"=&r"(all64),
  32. #define DECLARE_VAR_ADDRT mips_reg addrt
  33. #define RESTRICT_ASM_ADDRT [addrt]"=&r"(addrt),
  34. #define MMI_LWX(reg, addr, stride, bias) \
  35. PTR_ADDU "%[addrt], "#addr", "#stride" \n\t" \
  36. "lw "#reg", "#bias"(%[addrt]) \n\t"
  37. #define MMI_SWX(reg, addr, stride, bias) \
  38. PTR_ADDU "%[addrt], "#addr", "#stride" \n\t" \
  39. "sw "#reg", "#bias"(%[addrt]) \n\t"
  40. #define MMI_LDX(reg, addr, stride, bias) \
  41. PTR_ADDU "%[addrt], "#addr", "#stride" \n\t" \
  42. "ld "#reg", "#bias"(%[addrt]) \n\t"
  43. #define MMI_SDX(reg, addr, stride, bias) \
  44. PTR_ADDU "%[addrt], "#addr", "#stride" \n\t" \
  45. "sd "#reg", "#bias"(%[addrt]) \n\t"
  46. #define MMI_LWC1(fp, addr, bias) \
  47. "lwc1 "#fp", "#bias"("#addr") \n\t"
  48. #define MMI_ULWC1(fp, addr, bias) \
  49. "ulw %[low32], "#bias"("#addr") \n\t" \
  50. "mtc1 %[low32], "#fp" \n\t"
  51. #define MMI_LWXC1(fp, addr, stride, bias) \
  52. PTR_ADDU "%[addrt], "#addr", "#stride" \n\t" \
  53. MMI_LWC1(fp, %[addrt], bias)
  54. #define MMI_SWC1(fp, addr, bias) \
  55. "swc1 "#fp", "#bias"("#addr") \n\t"
  56. #define MMI_USWC1(fp, addr, bias) \
  57. "mfc1 %[low32], "#fp" \n\t" \
  58. "usw %[low32], "#bias"("#addr") \n\t"
  59. #define MMI_SWXC1(fp, addr, stride, bias) \
  60. PTR_ADDU "%[addrt], "#addr", "#stride" \n\t" \
  61. MMI_SWC1(fp, %[addrt], bias)
  62. #define MMI_LDC1(fp, addr, bias) \
  63. "ldc1 "#fp", "#bias"("#addr") \n\t"
  64. #define MMI_ULDC1(fp, addr, bias) \
  65. "uld %[all64], "#bias"("#addr") \n\t" \
  66. "dmtc1 %[all64], "#fp" \n\t"
  67. #define MMI_LDXC1(fp, addr, stride, bias) \
  68. PTR_ADDU "%[addrt], "#addr", "#stride" \n\t" \
  69. MMI_LDC1(fp, %[addrt], bias)
  70. #define MMI_SDC1(fp, addr, bias) \
  71. "sdc1 "#fp", "#bias"("#addr") \n\t"
  72. #define MMI_USDC1(fp, addr, bias) \
  73. "dmfc1 %[all64], "#fp" \n\t" \
  74. "usd %[all64], "#bias"("#addr") \n\t"
  75. #define MMI_SDXC1(fp, addr, stride, bias) \
  76. PTR_ADDU "%[addrt], "#addr", "#stride" \n\t" \
  77. MMI_SDC1(fp, %[addrt], bias)
  78. #define MMI_LQ(reg1, reg2, addr, bias) \
  79. "ld "#reg1", "#bias"("#addr") \n\t" \
  80. "ld "#reg2", 8+"#bias"("#addr") \n\t"
  81. #define MMI_SQ(reg1, reg2, addr, bias) \
  82. "sd "#reg1", "#bias"("#addr") \n\t" \
  83. "sd "#reg2", 8+"#bias"("#addr") \n\t"
  84. #define MMI_LQC1(fp1, fp2, addr, bias) \
  85. "ldc1 "#fp1", "#bias"("#addr") \n\t" \
  86. "ldc1 "#fp2", 8+"#bias"("#addr") \n\t"
  87. #define MMI_SQC1(fp1, fp2, addr, bias) \
  88. "sdc1 "#fp1", "#bias"("#addr") \n\t" \
  89. "sdc1 "#fp2", 8+"#bias"("#addr") \n\t"
#elif HAVE_LOONGSON3 /* !HAVE_LOONGSON2 */
  91. #define DECLARE_VAR_ALL64
  92. #define RESTRICT_ASM_ALL64
  93. #define DECLARE_VAR_ADDRT
  94. #define RESTRICT_ASM_ADDRT
  95. #define MMI_LWX(reg, addr, stride, bias) \
  96. "gslwx "#reg", "#bias"("#addr", "#stride") \n\t"
  97. #define MMI_SWX(reg, addr, stride, bias) \
  98. "gsswx "#reg", "#bias"("#addr", "#stride") \n\t"
  99. #define MMI_LDX(reg, addr, stride, bias) \
  100. "gsldx "#reg", "#bias"("#addr", "#stride") \n\t"
  101. #define MMI_SDX(reg, addr, stride, bias) \
  102. "gssdx "#reg", "#bias"("#addr", "#stride") \n\t"
  103. #define MMI_LWC1(fp, addr, bias) \
  104. "lwc1 "#fp", "#bias"("#addr") \n\t"
  105. #if _MIPS_SIM == _ABIO32 /* workaround for 3A2000 gslwlc1 bug */
  106. #define DECLARE_VAR_LOW32 int32_t low32
  107. #define RESTRICT_ASM_LOW32 [low32]"=&r"(low32),
  108. #define MMI_ULWC1(fp, addr, bias) \
  109. "ulw %[low32], "#bias"("#addr") \n\t" \
  110. "mtc1 %[low32], "#fp" \n\t"
  111. #else /* _MIPS_SIM != _ABIO32 */
  112. #define DECLARE_VAR_LOW32
  113. #define RESTRICT_ASM_LOW32
  114. #define MMI_ULWC1(fp, addr, bias) \
  115. "gslwlc1 "#fp", 3+"#bias"("#addr") \n\t" \
  116. "gslwrc1 "#fp", "#bias"("#addr") \n\t"
  117. #endif /* _MIPS_SIM != _ABIO32 */
  118. #define MMI_LWXC1(fp, addr, stride, bias) \
  119. "gslwxc1 "#fp", "#bias"("#addr", "#stride") \n\t"
  120. #define MMI_SWC1(fp, addr, bias) \
  121. "swc1 "#fp", "#bias"("#addr") \n\t"
  122. #define MMI_USWC1(fp, addr, bias) \
  123. "gsswlc1 "#fp", 3+"#bias"("#addr") \n\t" \
  124. "gsswrc1 "#fp", "#bias"("#addr") \n\t"
  125. #define MMI_SWXC1(fp, addr, stride, bias) \
  126. "gsswxc1 "#fp", "#bias"("#addr", "#stride") \n\t"
  127. #define MMI_LDC1(fp, addr, bias) \
  128. "ldc1 "#fp", "#bias"("#addr") \n\t"
  129. #define MMI_ULDC1(fp, addr, bias) \
  130. "gsldlc1 "#fp", 7+"#bias"("#addr") \n\t" \
  131. "gsldrc1 "#fp", "#bias"("#addr") \n\t"
  132. #define MMI_LDXC1(fp, addr, stride, bias) \
  133. "gsldxc1 "#fp", "#bias"("#addr", "#stride") \n\t"
  134. #define MMI_SDC1(fp, addr, bias) \
  135. "sdc1 "#fp", "#bias"("#addr") \n\t"
  136. #define MMI_USDC1(fp, addr, bias) \
  137. "gssdlc1 "#fp", 7+"#bias"("#addr") \n\t" \
  138. "gssdrc1 "#fp", "#bias"("#addr") \n\t"
  139. #define MMI_SDXC1(fp, addr, stride, bias) \
  140. "gssdxc1 "#fp", "#bias"("#addr", "#stride") \n\t"
  141. #define MMI_LQ(reg1, reg2, addr, bias) \
  142. "gslq "#reg1", "#reg2", "#bias"("#addr") \n\t"
  143. #define MMI_SQ(reg1, reg2, addr, bias) \
  144. "gssq "#reg1", "#reg2", "#bias"("#addr") \n\t"
  145. #define MMI_LQC1(fp1, fp2, addr, bias) \
  146. "gslqc1 "#fp1", "#fp2", "#bias"("#addr") \n\t"
  147. #define MMI_SQC1(fp1, fp2, addr, bias) \
  148. "gssqc1 "#fp1", "#fp2", "#bias"("#addr") \n\t"
#endif /* HAVE_LOONGSON2 / HAVE_LOONGSON3 */
  150. /**
  151. * backup register
  152. */
  153. #define BACKUP_REG \
  154. LOCAL_ALIGNED_16(double, temp_backup_reg, [8]); \
  155. if (_MIPS_SIM == _ABI64) \
  156. __asm__ volatile ( \
  157. "gssqc1 $f25, $f24, 0x00(%[temp]) \n\t" \
  158. "gssqc1 $f27, $f26, 0x10(%[temp]) \n\t" \
  159. "gssqc1 $f29, $f28, 0x20(%[temp]) \n\t" \
  160. "gssqc1 $f31, $f30, 0x30(%[temp]) \n\t" \
  161. : \
  162. : [temp]"r"(temp_backup_reg) \
  163. : "memory" \
  164. ); \
  165. else \
  166. __asm__ volatile ( \
  167. "gssqc1 $f22, $f20, 0x00(%[temp]) \n\t" \
  168. "gssqc1 $f26, $f24, 0x10(%[temp]) \n\t" \
  169. "gssqc1 $f30, $f28, 0x20(%[temp]) \n\t" \
  170. : \
  171. : [temp]"r"(temp_backup_reg) \
  172. : "memory" \
  173. );
  174. /**
  175. * recover register
  176. */
  177. #define RECOVER_REG \
  178. if (_MIPS_SIM == _ABI64) \
  179. __asm__ volatile ( \
  180. "gslqc1 $f25, $f24, 0x00(%[temp]) \n\t" \
  181. "gslqc1 $f27, $f26, 0x10(%[temp]) \n\t" \
  182. "gslqc1 $f29, $f28, 0x20(%[temp]) \n\t" \
  183. "gslqc1 $f31, $f30, 0x30(%[temp]) \n\t" \
  184. : \
  185. : [temp]"r"(temp_backup_reg) \
  186. : "memory" \
  187. ); \
  188. else \
  189. __asm__ volatile ( \
  190. "gslqc1 $f22, $f20, 0x00(%[temp]) \n\t" \
  191. "gslqc1 $f26, $f24, 0x10(%[temp]) \n\t" \
  192. "gslqc1 $f30, $f28, 0x20(%[temp]) \n\t" \
  193. : \
  194. : [temp]"r"(temp_backup_reg) \
  195. : "memory" \
  196. );
  197. /**
  198. * brief: Transpose 2X2 word packaged data.
  199. * fr_i0, fr_i1: src
  200. * fr_o0, fr_o1: dst
  201. */
  202. #define TRANSPOSE_2W(fr_i0, fr_i1, fr_o0, fr_o1) \
  203. "punpcklwd "#fr_o0", "#fr_i0", "#fr_i1" \n\t" \
  204. "punpckhwd "#fr_o1", "#fr_i0", "#fr_i1" \n\t"
  205. /**
  206. * brief: Transpose 4X4 half word packaged data.
  207. * fr_i0, fr_i1, fr_i2, fr_i3: src & dst
  208. * fr_t0, fr_t1, fr_t2, fr_t3: temporary register
  209. */
  210. #define TRANSPOSE_4H(fr_i0, fr_i1, fr_i2, fr_i3, \
  211. fr_t0, fr_t1, fr_t2, fr_t3) \
  212. "punpcklhw "#fr_t0", "#fr_i0", "#fr_i1" \n\t" \
  213. "punpckhhw "#fr_t1", "#fr_i0", "#fr_i1" \n\t" \
  214. "punpcklhw "#fr_t2", "#fr_i2", "#fr_i3" \n\t" \
  215. "punpckhhw "#fr_t3", "#fr_i2", "#fr_i3" \n\t" \
  216. "punpcklwd "#fr_i0", "#fr_t0", "#fr_t2" \n\t" \
  217. "punpckhwd "#fr_i1", "#fr_t0", "#fr_t2" \n\t" \
  218. "punpcklwd "#fr_i2", "#fr_t1", "#fr_t3" \n\t" \
  219. "punpckhwd "#fr_i3", "#fr_t1", "#fr_t3" \n\t"
  220. /**
  221. * brief: Transpose 8x8 byte packaged data.
  222. * fr_i0~i7: src & dst
  223. * fr_t0~t3: temporary register
  224. */
  225. #define TRANSPOSE_8B(fr_i0, fr_i1, fr_i2, fr_i3, fr_i4, fr_i5, \
  226. fr_i6, fr_i7, fr_t0, fr_t1, fr_t2, fr_t3) \
  227. "punpcklbh "#fr_t0", "#fr_i0", "#fr_i1" \n\t" \
  228. "punpckhbh "#fr_t1", "#fr_i0", "#fr_i1" \n\t" \
  229. "punpcklbh "#fr_t2", "#fr_i2", "#fr_i3" \n\t" \
  230. "punpckhbh "#fr_t3", "#fr_i2", "#fr_i3" \n\t" \
  231. "punpcklbh "#fr_i0", "#fr_i4", "#fr_i5" \n\t" \
  232. "punpckhbh "#fr_i1", "#fr_i4", "#fr_i5" \n\t" \
  233. "punpcklbh "#fr_i2", "#fr_i6", "#fr_i7" \n\t" \
  234. "punpckhbh "#fr_i3", "#fr_i6", "#fr_i7" \n\t" \
  235. "punpcklhw "#fr_i4", "#fr_t0", "#fr_t2" \n\t" \
  236. "punpckhhw "#fr_i5", "#fr_t0", "#fr_t2" \n\t" \
  237. "punpcklhw "#fr_i6", "#fr_t1", "#fr_t3" \n\t" \
  238. "punpckhhw "#fr_i7", "#fr_t1", "#fr_t3" \n\t" \
  239. "punpcklhw "#fr_t0", "#fr_i0", "#fr_i2" \n\t" \
  240. "punpckhhw "#fr_t1", "#fr_i0", "#fr_i2" \n\t" \
  241. "punpcklhw "#fr_t2", "#fr_i1", "#fr_i3" \n\t" \
  242. "punpckhhw "#fr_t3", "#fr_i1", "#fr_i3" \n\t" \
  243. "punpcklwd "#fr_i0", "#fr_i4", "#fr_t0" \n\t" \
  244. "punpckhwd "#fr_i1", "#fr_i4", "#fr_t0" \n\t" \
  245. "punpcklwd "#fr_i2", "#fr_i5", "#fr_t1" \n\t" \
  246. "punpckhwd "#fr_i3", "#fr_i5", "#fr_t1" \n\t" \
  247. "punpcklwd "#fr_i4", "#fr_i6", "#fr_t2" \n\t" \
  248. "punpckhwd "#fr_i5", "#fr_i6", "#fr_t2" \n\t" \
  249. "punpcklwd "#fr_i6", "#fr_i7", "#fr_t3" \n\t" \
  250. "punpckhwd "#fr_i7", "#fr_i7", "#fr_t3" \n\t"
  251. /**
  252. * brief: Parallel SRA for 8 byte packaged data.
  253. * fr_i0: src
  254. * fr_i1: SRA number(SRAB number + 8)
  255. * fr_t0, fr_t1: temporary register
  256. * fr_d0: dst
  257. */
  258. #define PSRAB_MMI(fr_i0, fr_i1, fr_t0, fr_t1, fr_d0) \
  259. "punpcklbh "#fr_t0", "#fr_t0", "#fr_i0" \n\t" \
  260. "punpckhbh "#fr_t1", "#fr_t1", "#fr_i0" \n\t" \
  261. "psrah "#fr_t0", "#fr_t0", "#fr_i1" \n\t" \
  262. "psrah "#fr_t1", "#fr_t1", "#fr_i1" \n\t" \
  263. "packsshb "#fr_d0", "#fr_t0", "#fr_t1" \n\t"
  264. /**
  265. * brief: Parallel SRL for 8 byte packaged data.
  266. * fr_i0: src
  267. * fr_i1: SRL number(SRLB number + 8)
  268. * fr_t0, fr_t1: temporary register
  269. * fr_d0: dst
  270. */
  271. #define PSRLB_MMI(fr_i0, fr_i1, fr_t0, fr_t1, fr_d0) \
  272. "punpcklbh "#fr_t0", "#fr_t0", "#fr_i0" \n\t" \
  273. "punpckhbh "#fr_t1", "#fr_t1", "#fr_i0" \n\t" \
  274. "psrlh "#fr_t0", "#fr_t0", "#fr_i1" \n\t" \
  275. "psrlh "#fr_t1", "#fr_t1", "#fr_i1" \n\t" \
  276. "packsshb "#fr_d0", "#fr_t0", "#fr_t1" \n\t"
  277. #define PSRAH_4_MMI(fp1, fp2, fp3, fp4, shift) \
  278. "psrah "#fp1", "#fp1", "#shift" \n\t" \
  279. "psrah "#fp2", "#fp2", "#shift" \n\t" \
  280. "psrah "#fp3", "#fp3", "#shift" \n\t" \
  281. "psrah "#fp4", "#fp4", "#shift" \n\t"
  282. #define PSRAH_8_MMI(fp1, fp2, fp3, fp4, fp5, fp6, fp7, fp8, shift) \
  283. PSRAH_4_MMI(fp1, fp2, fp3, fp4, shift) \
  284. PSRAH_4_MMI(fp5, fp6, fp7, fp8, shift)
  285. /**
  286. * brief: (((value) + (1 << ((n) - 1))) >> (n))
  287. * fr_i0: src & dst
  288. * fr_i1: Operand number
  289. * fr_t0, fr_t1: temporary FPR
  290. * gr_t0: temporary GPR
  291. */
  292. #define ROUND_POWER_OF_TWO_MMI(fr_i0, fr_i1, fr_t0, fr_t1, gr_t0) \
  293. "li "#gr_t0", 0x01 \n\t" \
  294. "dmtc1 "#gr_t0", "#fr_t0" \n\t" \
  295. "punpcklwd "#fr_t0", "#fr_t0", "#fr_t0" \n\t" \
  296. "psubw "#fr_t1", "#fr_i1", "#fr_t0" \n\t" \
  297. "psllw "#fr_t1", "#fr_t0", "#fr_t1" \n\t" \
  298. "paddw "#fr_i0", "#fr_i0", "#fr_t1" \n\t" \
  299. "psraw "#fr_i0", "#fr_i0", "#fr_i1" \n\t"
#endif /* AVUTIL_MIPS_MMIUTILS_H */