nnpack.h 32 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659
  1. #pragma once
  2. #include <stddef.h>
  3. #include <stdint.h>
  4. #include <stdbool.h>
  5. #include <pthreadpool.h>
  6. #ifdef __cplusplus
  7. extern "C" {
  8. #endif
  /**
   * @brief Status code for any NNPACK function call.
   * @details Every enumerator has its own numeric value, so a caller can tell failures apart and can list
   *          all of them as separate cases of one switch statement.
   */
  enum nnp_status {
      /** The call succeeded, and all output arguments now contain valid data. */
      nnp_status_success = 0,
      /** NNPACK function was called with batch_size == 0. */
      nnp_status_invalid_batch_size = 2,
      /** NNPACK function was called with channels == 0. */
      nnp_status_invalid_channels = 3,
      /** NNPACK function was called with input_channels == 0. */
      nnp_status_invalid_input_channels = 4,
      /** NNPACK function was called with output_channels == 0. */
      nnp_status_invalid_output_channels = 5,
      /*
       * The next three codes used to be declared as 13, 14 and 15, i.e. with the same values as
       * nnp_status_invalid_kernel_size, nnp_status_invalid_pooling_size and
       * nnp_status_invalid_pooling_stride. That made the failures indistinguishable and made a switch
       * naming both members of a pair a compile error. They now use values that no other status uses.
       * The names are unchanged, but code built against the old values must be rebuilt with the library.
       */
      /** NNPACK function was called with output_subsampling.height == 0 or output_subsampling.width == 0 */
      nnp_status_invalid_output_subsampling = 6,
      /** NNPACK function was called with activation not in nnp_activation enum */
      nnp_status_invalid_activation = 7,
      /** NNPACK function was called with invalid activation parameters */
      nnp_status_invalid_activation_parameters = 8,
      /** NNPACK function was called with input_size.height == 0 or input_size.width == 0 */
      nnp_status_invalid_input_size = 10,
      /** NNPACK function was called with input_stride.height == 0 or input_stride.width == 0 */
      nnp_status_invalid_input_stride = 11,
      /**
       * NNPACK function was called with input_padding not less than respective kernel (or pooling) size,
       * i.e.:
       *
       * - input_padding.left >= kernel_size.width (>= pooling_size.width)
       * - input_padding.right >= kernel_size.width (>= pooling_size.width)
       * - input_padding.top >= kernel_size.height (>= pooling_size.height)
       * - input_padding.bottom >= kernel_size.height (>= pooling_size.height)
       */
      nnp_status_invalid_input_padding = 12,
      /** NNPACK function was called with kernel_size.height == 0 or kernel_size.width == 0 */
      nnp_status_invalid_kernel_size = 13,
      /** NNPACK function was called with pooling_size.height == 0 or pooling_size.width == 0 */
      nnp_status_invalid_pooling_size = 14,
      /** NNPACK function was called with pooling_stride.height == 0 or pooling_stride.width == 0 */
      nnp_status_invalid_pooling_stride = 15,
      /** NNPACK function was called with convolution algorithm not in nnp_convolution_algorithm enumeration */
      nnp_status_invalid_algorithm = 16,
      /** NNPACK function was called with convolution transform strategy not in nnp_convolution_transform_strategy enum */
      nnp_status_invalid_transform_strategy = 17,
      /** NNPACK does not support the particular input size for the function */
      nnp_status_unsupported_input_size = 20,
      /** NNPACK does not support the particular input stride for the function */
      nnp_status_unsupported_input_stride = 21,
      /** NNPACK does not support the particular input padding for the function */
      nnp_status_unsupported_input_padding = 22,
      /** NNPACK does not support the particular kernel size for the function */
      nnp_status_unsupported_kernel_size = 23,
      /** NNPACK does not support the particular pooling size for the function */
      nnp_status_unsupported_pooling_size = 24,
      /** NNPACK does not support the particular pooling stride for the function */
      nnp_status_unsupported_pooling_stride = 25,
      /** NNPACK does not support the particular convolution algorithm for the function */
      nnp_status_unsupported_algorithm = 26,
      /** NNPACK does not support the particular convolution transform strategy for the algorithm */
      nnp_status_unsupported_transform_strategy = 27,
      /** NNPACK does not support the particular activation function for the function */
      nnp_status_unsupported_activation = 28,
      /** NNPACK does not support the particular activation function parameters for the function */
      nnp_status_unsupported_activation_parameters = 29,
      /** NNPACK function was called before the library was initialized */
      nnp_status_uninitialized = 50,
      /** NNPACK does not implement this function for the host CPU */
      nnp_status_unsupported_hardware = 51,
      /** NNPACK failed to allocate memory for temporary buffers */
      nnp_status_out_of_memory = 52,
      /** Scratch space buffer is too small */
      nnp_status_insufficient_buffer = 53,
      /** Scratch space buffer is not properly aligned */
      nnp_status_misaligned_buffer = 54
  };
  /**
   * @brief Activation applied after a convolutional or fully-connected layer.
   */
  enum nnp_activation {
      nnp_activation_identity = 0, /**< Identity activation f(x) := x, i.e. no transformation. */
      nnp_activation_relu     = 1  /**< ReLU activation f(x) := max(0, x). */
  };
  /**
   * @brief Algorithm for computing convolutional layers.
   */
  enum nnp_convolution_algorithm {
      /**< Let NNPACK choose the algorithm depending on layer parameters. */
      nnp_convolution_algorithm_auto          = 0,
      /**< Tiled convolution based on 2D Fourier transform with 8x8 blocks. Supports kernels up to 8x8. */
      nnp_convolution_algorithm_ft8x8         = 1,
      /**< Tiled convolution based on 2D Fourier transform with 16x16 blocks. Supports kernels up to 16x16. */
      nnp_convolution_algorithm_ft16x16       = 2,
      /**< Tiled convolution based on 2D Winograd transform F(3x3, 6x6) with 8x8 blocks. Only 3x3 kernels. */
      nnp_convolution_algorithm_wt8x8         = 3,
      /**< Direct convolution via implicit GEMM. */
      nnp_convolution_algorithm_implicit_gemm = 4,
      /**< Direct convolution implementation. */
      nnp_convolution_algorithm_direct        = 5,
      /**<
       * FP16 variant of the Winograd F(3x3, 6x6) tiled convolution with 8x8 blocks.
       * Supports only 3x3 kernels. Implemented only for new ARM processors (with NEON-HP);
       * on processors without that support it falls back to nnp_convolution_algorithm_wt8x8.
       */
      nnp_convolution_algorithm_wt8x8_fp16    = 6
  };
  /**
   * @brief Strategy for handling kernel transform coefficients, passed to nnp_convolution_inference.
   * @details NOTE(review): this header does not describe the individual values. Going by the names,
   *          "compute" presumably computes the kernel transform during the call, "precompute" presumably
   *          prepares it ahead of time, and "reuse" presumably consumes a previously prepared transform;
   *          confirm against the implementation before relying on this.
   */
  enum nnp_convolution_transform_strategy {
      nnp_convolution_transform_strategy_compute    = 1,
      nnp_convolution_transform_strategy_precompute = 2,
      nnp_convolution_transform_strategy_reuse      = 3
  };

  /*
   * Legacy strategy names kept for backward compatibility.
   * Both of them expand to nnp_convolution_transform_strategy_compute.
   */
  #define nnp_convolution_transform_strategy_block_based nnp_convolution_transform_strategy_compute
  #define nnp_convolution_transform_strategy_tuple_based nnp_convolution_transform_strategy_compute
  /**
   * @brief Size of images, kernels, and pooling filters in NNPACK.
   */
  struct nnp_size {
      size_t width;  /**< Horizontal size of an image, kernel, or pooling filter. */
      size_t height; /**< Vertical size of an image, kernel, or pooling filter. */
  };
  /**
   * @brief Padding of images in NNPACK.
   * @details Members are declared in the order top, right, bottom, left.
   */
  struct nnp_padding {
      size_t top;    /**< Padding above the image data. */
      size_t right;  /**< Padding on the right of the image data. */
      size_t bottom; /**< Padding below the image data. */
      size_t left;   /**< Padding on the left of the image data. */
  };
  /**
   * @brief Profiling information about time spent in different phases of a function call.
   * @details All members are durations in seconds.
   */
  struct nnp_profile {
      double total;                /**< Time spent inside the function call. */
      double input_transform;      /**< Time spent transforming the input or input gradient tensor. */
      double kernel_transform;     /**< Time spent transforming the kernel or kernel gradient tensor. */
      double output_transform;     /**< Time spent transforming the output or output gradient tensor. */
      double block_multiplication; /**< Time spent on multiplication-accumulation of transformed coefficients. */
  };
  /**
   * @brief Initializes the NNPACK library.
   * @details nnp_status_uninitialized is the status documented for NNPACK functions that are called before
   *          the library was initialized, so this call is expected to precede them.
   * @return A status code from enum nnp_status.
   */
  enum nnp_status nnp_initialize(void);

  /**
   * @brief Counterpart of nnp_initialize.
   * @details NOTE(review): presumably releases whatever nnp_initialize set up; this header does not say,
   *          confirm against the implementation.
   * @return A status code from enum nnp_status.
   */
  enum nnp_status nnp_deinitialize(void);
  161. /**
  162. * @brief Computes output of a 2D convolutional layer from input and kernel tensors.
  163. * @details This function targets training of convolutional neural networks and performs forward propagation.
  164. * It is optimized for moderate minibatch sizes (64-128) and can be inefficient on a small minibatch.
  165. * For minibatch size 1, use nnp_convolution_inference for optimal performance.
  166. * @param algorithm The type of algorithm to use for convolution. Possible values are:
  167. *
  168. * - nnp_convolution_algorithm_auto -- let the function choose the algorithm.
  169. * - nnp_convolution_algorithm_ft8x8 -- tiled convolution based on 2D Fourier transform with 8x8 blocks.
  170. * Supports kernels up to 8x8.
  171. * - nnp_convolution_algorithm_ft16x16 -- tiled convolution based on 2D Fourier transform with 16x16 blocks.
  172. * Supports kernels up to 16x16.
  173. * - nnp_convolution_algorithm_wt8x8 -- tiled convolution based on 2D Winograd transform F(3x3, 6x6).
  174. * Supports only 3x3 kernels.
  175. *
  176. * @param batch_size The number of images on the input and output of the convolutional layer.
  177. * @param input_channels The number of channels (AKA features, dimensions) in the input images.
  178. * @param output_channels The number of channels (AKA features, dimensions) in the output images.
  179. * @param input_size Size of input images, excluding implicit zero-padding.
  180. * @param input_padding Implicit zero-padding of input images.
  181. * @param kernel_size Kernel size.
  182. * @param[in] input A 4D tensor input[batch_size][input_channels][input_size.height][input_size.width].
  183. * @param[in] kernel A 4D tensor kernel[output_channels][input_channels][kernel_size.height][kernel_size.width].
  184. * @param[in] bias A 1D array bias[output_channels].
  185. * @param[out] output A 4D tensor output[batch_size][output_channels][output_size.height][output_size.width] where
  186. * output_size.height = (input_padding.top + input_size.height + input_padding.bottom) -
  187. * (kernel_size.height - 1)
  188. * output_size.width = (input_padding.left + input_size.width + input_padding.right) -
  189. * (kernel_size.width - 1)
  190. * @param threadpool A thread pool for parallelization of the computation.
  191. * If threadpool is NULL, the computation would run on the caller thread without parallelization.
  192. * @param[out] profile An optional pointer to profiling structure.
  193. * If provided, the structure would record time spent in different phases of the computation.
  194. */
  195. enum nnp_status nnp_convolution_output(
  196. enum nnp_convolution_algorithm algorithm,
  197. size_t batch_size,
  198. size_t input_channels,
  199. size_t output_channels,
  200. struct nnp_size input_size,
  201. struct nnp_padding input_padding,
  202. struct nnp_size kernel_size,
  203. const float* input,
  204. const float* kernel,
  205. const float* bias,
  206. float* output,
  207. void* workspace_buffer,
  208. size_t* workspace_size,
  209. enum nnp_activation activation,
  210. const void* activation_parameters,
  211. pthreadpool_t threadpool,
  212. struct nnp_profile* profile);
  213. /**
  214. * @brief Computes gradient of input of a 2D convolutional layer from gradient of output and kernel tensors.
  215. * @details This function targets training of convolutional neural networks and performs backward propagation.
  216. * It is optimized for moderate minibatch sizes (64-128) and can be inefficient on a small minibatch.
  217. * @param algorithm The type of algorithm to use for convolution. Possible values are:
  218. *
  219. * - nnp_convolution_algorithm_auto -- let the function choose the algorithm.
  220. * - nnp_convolution_algorithm_ft8x8 -- tiled convolution based on 2D Fourier transform with 8x8 blocks.
  221. * Supports kernels up to 8x8.
  222. * - nnp_convolution_algorithm_ft16x16 -- tiled convolution based on 2D Fourier transform with 16x16 blocks.
  223. * Supports kernels up to 16x16.
  224. * - nnp_convolution_algorithm_wt8x8 -- tiled convolution based on 2D Winograd transform F(3x3, 6x6).
  225. * Supports only 3x3 kernels.
  226. *
  227. * @param batch_size The number of images (and their gradients) on the input and output of the convolutional layer.
  228. * @param input_channels The number of channels (AKA features, dimensions) in the input images (and gradients).
  229. * @param output_channels The number of channels (AKA features, dimensions) in the output images (and gradients).
  230. * @param input_size Size of input images and their gradients, excluding implicit zero-padding.
  231. * @param input_padding Implicit zero-padding of input images.
  232. * @param kernel_size Kernel size.
  233. * @param[in] grad_output A 4D tensor grad_output[batch_size][output_channels][output_size.height][output_size.width]
  234. * where
  235. * output_size.height = (input_padding.top + input_size.height + input_padding.bottom) -
  236. * (kernel_size.height - 1)
  237. * output_size.width = (input_padding.left + input_size.width + input_padding.right) -
  238. * (kernel_size.width - 1)
  239. * @param[in] kernel A 4D tensor kernel[output_channels][input_channels][kernel_size.height][kernel_size.width].
  240. * @param[out] grad_input A 4D tensor grad_input[batch_size][input_channels][input_size.height][input_size.width].
  241. * @param threadpool A thread pool for parallelization of the computation.
  242. * If threadpool is NULL, the computation would run on the caller thread without parallelization.
  243. * @param[out] profile An optional pointer to profiling structure.
  244. * If provided, the structure would record time spent in different phases of the computation.
  245. */
  246. enum nnp_status nnp_convolution_input_gradient(
  247. enum nnp_convolution_algorithm algorithm,
  248. size_t batch_size,
  249. size_t input_channels,
  250. size_t output_channels,
  251. struct nnp_size input_size,
  252. struct nnp_padding input_padding,
  253. struct nnp_size kernel_size,
  254. const float* grad_output,
  255. const float* kernel,
  256. float* grad_input,
  257. void* workspace_buffer,
  258. size_t* workspace_size,
  259. enum nnp_activation activation,
  260. const void* activation_parameters,
  261. pthreadpool_t threadpool,
  262. struct nnp_profile* profile);
  263. /**
  264. * @brief Computes gradient of kernel of a 2D convolutional layer from gradient of output and input tensors.
  265. * @details This function targets training of convolutional neural networks and performs backward propagation.
  266. * It is optimized for moderate minibatch sizes (64-128) and can be inefficient on a small minibatch.
  267. * @param algorithm The type of algorithm to use for convolution. Possible values are:
  268. *
  269. * - nnp_convolution_algorithm_auto -- let the function choose the algorithm.
  270. * - nnp_convolution_algorithm_ft8x8 -- tiled convolution based on 2D Fourier transform with 8x8 blocks.
  271. * Supports kernels up to 8x8.
  272. * - nnp_convolution_algorithm_ft16x16 -- tiled convolution based on 2D Fourier transform with 16x16 blocks.
  273. * Supports kernels up to 16x16.
  274. *
  275. * @param batch_size The number of images (and their gradients) on the input and output of the convolutional layer.
  276. * @param input_channels The number of channels (AKA features, dimensions) in the input images.
  277. * @param output_channels The number of channels (AKA features, dimensions) in the output images (and gradients).
  278. * @param input_size Size of input images and their gradients, excluding implicit zero-padding.
  279. * @param input_padding Implicit zero-padding of input images.
  280. * @param kernel_size Kernel size.
  281. * @param[in] input A 4D tensor input[batch_size][input_channels][input_size.height][input_size.width].
  282. * @param[in] grad_output A 4D tensor grad_output[batch_size][output_channels][output_size.height][output_size.width]
  283. * where
  284. * output_size.height = (input_padding.top + input_size.height + input_padding.bottom) -
  285. * (kernel_size.height - 1)
  286. * output_size.width = (input_padding.left + input_size.width + input_padding.right) -
  287. * (kernel_size.width - 1)
  288. * @param[out] grad_kernel A 4D tensor
  289. * grad_kernel[output_channels][input_channels][kernel_size.height][kernel_size.width].
  290. * @param threadpool A thread pool for parallelization of the computation.
  291. * If threadpool is NULL, the computation would run on the caller thread without parallelization.
  292. * @param[out] profile An optional pointer to profiling structure.
  293. * If provided, the structure would record time spent in different phases of the computation.
  294. */
  295. enum nnp_status nnp_convolution_kernel_gradient(
  296. enum nnp_convolution_algorithm algorithm,
  297. size_t batch_size,
  298. size_t input_channels,
  299. size_t output_channels,
  300. struct nnp_size input_size,
  301. struct nnp_padding input_padding,
  302. struct nnp_size kernel_size,
  303. const float* input,
  304. const float* grad_output,
  305. float* grad_kernel,
  306. void* workspace_buffer,
  307. size_t* workspace_size,
  308. enum nnp_activation activation,
  309. const void* activation_parameters,
  310. pthreadpool_t threadpool,
  311. struct nnp_profile* profile);
  312. /**
  313. * @brief Computes output of a 2D convolutional layer for a single input image and a kernel tensor.
  314. * @details This function targets prediction with convolutional neural networks and performs forward propagation.
  315. * @param algorithm The type of algorithm to use for convolution. Possible values are:
  316. *
  317. * - nnp_convolution_algorithm_auto -- let the function choose the algorithm.
  318. * - nnp_convolution_algorithm_ft8x8 -- tiled convolution based on 2D Fourier transform with 8x8 blocks.
  319. * Supports kernels up to 8x8.
  320. * - nnp_convolution_algorithm_ft16x16 -- tiled convolution based on 2D Fourier transform with 16x16 blocks.
  321. * Supports kernels up to 16x16.
  322. * - nnp_convolution_algorithm_wt8x8 -- tiled convolution based on 2D Winograd transform F(3x3, 6x6).
  323. * Supports only 3x3 kernels.
  324. *
  325. * @param transform_strategy A strategy that guides computation of kernel transforms coefficients.
  326. * Possible values are:
  327. *
  328. * - nnp_convolution_transform_strategy_block_based -- do multiplication-accumulations on blocks of transformed
  329. * coefficients.
  330. * - nnp_convolution_transform_strategy_tuple_based -- do multiplication-accumulations on tuples of transformed
  331. * coefficients.
  332. *
  333. * @param input_channels The number of channels (AKA features, dimensions) in the input image.
  334. * @param output_channels The number of channels (AKA features, dimensions) in the output image.
  335. * @param input_size Size of input image, excluding implicit zero-padding.
  336. * @param input_padding Implicit zero-padding of input image.
  337. * @param kernel_size Kernel size.
  338. * @param output_subsampling Subsample region for output, also known as convolution stride.
  339. * @param[in] input A 3D tensor input[input_channels][input_size.height][input_size.width].
  340. * @param[in] kernel A 4D tensor kernel[output_channels][input_channels][kernel_size.height][kernel_size.width].
  341. * @param[in] bias A 1D array bias[output_channels].
  342. * @param[out] output A 3D tensor output[output_channels][output_size.height][output_size.width] where
  343. * output_size.height = (input_padding.top + input_size.height + input_padding.bottom) -
  344. * (kernel_size.height - 1)
  345. * output_size.width = (input_padding.left + input_size.width + input_padding.right) -
  346. * (kernel_size.width - 1)
  347. * @param[in] workspace_buffer Buffer for scratch memory used during computation. Buffer must be aligned on 64 bytes.
  348. * If workspace_buffer is NULL and workspace_size is non-NULL, NNPACK would store the size
  349. * of required workspace memory at the workspace_size location, and exit without
  350. * computations.
  351. * If workspace_buffer is NULL and workspace_size is NULL, NNPACK would allocate memory
  352. * before and deallocate after this computation, potentially at significant runtime cost.
  353. * @param[in,out] workspace_size Pointer to the size of workspace buffer.
  354. * If workspace_buffer is NULL, NNPACK will write the size of required scratch memory to
  355. * the location specified by this pointer.
  356. * If workspace_buffer is non-NULL, NNPACK expects workspace_size to specify the size of
  357. * the buffer, in bytes.
  358. * If workspace_size is NULL, workspace_buffer must be NULL as well. In this case NNPACK
  359. * would allocate memory before and deallocate after this computation, potentially at
  360. * significant runtime cost.
  361. * @param threadpool A thread pool for parallelization of the computation.
  362. * If threadpool is NULL, the computation would run on the caller thread without parallelization.
  363. * @param[out] profile An optional pointer to profiling structure.
  364. * If provided, the structure would record time spent in different phases of the computation.
  365. */
  366. enum nnp_status nnp_convolution_inference(
  367. enum nnp_convolution_algorithm algorithm,
  368. enum nnp_convolution_transform_strategy transform_strategy,
  369. size_t input_channels,
  370. size_t output_channels,
  371. struct nnp_size input_size,
  372. struct nnp_padding input_padding,
  373. struct nnp_size kernel_size,
  374. struct nnp_size output_subsampling,
  375. const float* input,
  376. const float* kernel,
  377. const float* bias,
  378. float* output,
  379. void* workspace_buffer,
  380. size_t* workspace_size,
  381. enum nnp_activation activation,
  382. const void* activation_parameters,
  383. pthreadpool_t threadpool,
  384. struct nnp_profile* profile);
  385. /**
  386. * @brief Computes output of a fully connected layer from input and kernel matrices.
  387. * @details This function targets training of convolutional neural networks and performs forward propagation.
  388. * It is optimized for moderate minibatch sizes (64-128) and can be inefficient on a small minibatch.
  389. * For minibatch size 1, use nnp_fully_connected_inference for optimal performance.
  390. * @param batch_size The number of vectors on the input and output of the fully connected layer.
  391. * @param input_channels The number of channels (AKA features, dimensions) in the input matrix.
  392. * @param output_channels The number of channels (AKA features, dimensions) in the output matrix.
  393. * @param[in] input A 2D matrix input[batch_size][input_channels].
  394. * @param[in] kernel A 2D matrix kernel[output_channels][input_channels].
  395. * @param[out] output A 2D matrix output[batch_size][output_channels].
  396. * @param threadpool A thread pool for parallelization of the computation.
  397. * If threadpool is NULL, the computation would run on the caller thread without parallelization.
  398. */
  399. enum nnp_status nnp_fully_connected_output(
  400. size_t batch_size,
  401. size_t input_channels,
  402. size_t output_channels,
  403. const float input[],
  404. const float kernel[],
  405. float output[],
  406. pthreadpool_t threadpool,
  407. struct nnp_profile* profile);
  408. /**
  409. * @brief Computes output of a fully connected layer for a single input vector and a kernel matrix.
  410. * @details This function targets prediction with convolutional neural networks and performs forward propagation.
  411. * @param input_channels The number of channels (AKA features, dimensions) in the input vector.
  412. * @param output_channels The number of channels (AKA features, dimensions) in the output vector.
  413. * @param[in] input A 1D array input[input_channels] of FP32 elements.
  414. * @param[in] kernel A 2D matrix kernel[output_channels][input_channels] of FP32 elements.
  415. * @param[out] output A 1D array output[output_channels] of FP32 elements.
  416. * @param threadpool A thread pool for parallelization of the computation.
  417. * If threadpool is NULL, the computation would run on the caller thread without parallelization.
  418. */
  419. enum nnp_status nnp_fully_connected_inference(
  420. size_t input_channels,
  421. size_t output_channels,
  422. const float* input,
  423. const float* kernel,
  424. float* output,
  425. pthreadpool_t threadpool);
  426. /**
  427. * @brief Computes output of a fully connected layer for a single input vector and a kernel matrix.
  428. * @details This function targets prediction with convolutional neural networks and performs forward propagation.
  429. * @param input_channels The number of channels (AKA features, dimensions) in the input vector.
  430. * @param output_channels The number of channels (AKA features, dimensions) in the output vector.
  431. * @param[in] input A 1D array input[input_channels] of FP32 elements.
  432. * @param[in] kernel A 2D matrix kernel[output_channels][input_channels] of FP16 (ARM alternative format) elements.
  433. * @param[out] output A 1D array output[output_channels] of FP32 elements.
  434. * @param threadpool A thread pool for parallelization of the computation.
  435. * If threadpool is NULL, the computation would run on the caller thread without parallelization.
  436. */
  437. enum nnp_status nnp_fully_connected_inference_f16f32(
  438. size_t input_channels,
  439. size_t output_channels,
  440. const float* input,
  441. const void* kernel,
  442. float* output,
  443. pthreadpool_t threadpool);
  444. /**
  445. * @brief Computes output of a max-pooling layer for an input tensor.
  446. * @details This function targets both prediction and training of convolutional neural networks and performs forward
  447. * propagation. Is is optimized for both large and small minibatch sizes.
  448. * @param batch_size The number of images on the input and output of the max-pooling layer.
  449. * @param channels The number of channels (AKA features, dimensions) in both input and output images.
  450. * @param input_size Size of input images, excluding implicit zero-padding.
  451. * @param input_padding Implicit padding of input images. The padding pixels are ignored by the pooling filter, but
  452. * affect the output size.
  453. * @param pooling_size Size of the pooling filter. Only 2x2 filter are currently supported.
  454. * @param pooling_stride Stride of the pooling filter. Only 2x2 strides are currently supported.
  455. * @param[in] input A 4D tensor input[batch_size][channels][input_size.height][input_size.width].
  456. * @param[out] output A 4D tensor output[batch_size][channels][output_size.height][output_size.width] where
  457. * output_size.height = ceil(
  458. * (input_padding.top + input_size.height + input_padding.bottom - pooling_size.height) /
  459. * pooling_stride.height) + 1
  460. * output_size.width = ceil(
  461. * (input_padding.left + input_size.width + input_padding.right - pooling_size.width) /
  462. * pooling_stride.width) + 1
  463. * @param threadpool A thread pool for parallelization of the computation.
  464. * If threadpool is NULL, the computation would run on the caller thread without parallelization.
  465. */
  466. enum nnp_status nnp_max_pooling_output(
  467. size_t batch_size,
  468. size_t channels,
  469. struct nnp_size input_size,
  470. struct nnp_padding input_padding,
  471. struct nnp_size pooling_size,
  472. struct nnp_size pooling_stride,
  473. const float input[],
  474. float output[],
  475. pthreadpool_t threadpool);
  476. /**
  477. * @brief Computes output of a softmax layer for an input matrix.
  478. * @details This function targets both prediction and training of convolutional neural networks and performs forward
  479. * propagation. Is is optimized for both large and small minibatch sizes.
  480. * @param batch_size The number of vectors on the input and output of the softmax layer.
  481. * @param channels The number of channels (AKA features, dimensions) in both input and output vectors.
  482. * @param[in] input A 2D matrix input[batch_size][channels].
  483. * @param[out] output A 2D matrix output[batch_size][channels].
  484. * @param threadpool A thread pool for parallelization of the computation.
  485. * If threadpool is NULL, the computation would run on the caller thread without parallelization.
  486. */
  487. enum nnp_status nnp_softmax_output(
  488. size_t batch_size,
  489. size_t channels,
  490. const float input[],
  491. float output[],
  492. pthreadpool_t threadpool);
  493. /**
  494. * @brief Computes output of a rectified linear unit (ReLU) layer for an input matrix.
  495. * @details This function targets both prediction and training of convolutional neural networks and performs forward
  496. * propagation. Is is optimized for both large and small minibatch sizes.
  497. * @param batch_size The number of vectors on the input and output of the ReLU layer.
  498. * @param channels The number of channels (AKA features, dimensions) in both input and output matrices.
  499. * @param[in] input A 2D matrix input[batch_size][channels].
  500. * @param[out] output A 2D matrix output[batch_size][channels].
  501. * @param threadpool A thread pool for parallelization of the computation.
  502. * If threadpool is NULL, the computation would run on the caller thread without parallelization.
  503. */
  504. enum nnp_status nnp_relu_output(
  505. size_t batch_size,
  506. size_t channels,
  507. const float input[],
  508. float output[],
  509. float negative_slope,
  510. pthreadpool_t threadpool);
  511. /**
  512. * @brief Computes gradient of input of a rectified linear unit (ReLU) layer from gradient of output and input matrices.
  513. * @details This function targets training of convolutional neural networks and performs backward propagation.
  514. * Is is optimized for both large and small minibatch sizes.
  515. * @param batch_size The number of vectors on the input and output of the ReLU layer.
  516. * @param channels The number of channels (AKA features, dimensions) in both input and output matrices.
  517. * @param[in] input A 2D matrix input[batch_size][channels].
  518. * @param[out] output A 2D matrix output[batch_size][channels].
  519. * @param threadpool A thread pool for parallelization of the computation.
  520. * If threadpool is NULL, the computation would run on the caller thread without parallelization.
  521. */
  522. enum nnp_status nnp_relu_input_gradient(
  523. size_t batch_size,
  524. size_t channels,
  525. const float grad_output[],
  526. const float input[],
  527. float grad_input[],
  528. float negative_slope,
  529. pthreadpool_t threadpool);
  530. #ifdef __cplusplus
  531. } /* extern "C" */
  532. #endif
  533. #ifdef __cplusplus
  534. // Backward compatible implementations for nnp_convolution_*, if we are in C++
  535. // mode.
  536. inline enum nnp_status nnp_convolution_output(
  537. enum nnp_convolution_algorithm algorithm,
  538. size_t batch_size,
  539. size_t input_channels,
  540. size_t output_channels,
  541. struct nnp_size input_size,
  542. struct nnp_padding input_padding,
  543. struct nnp_size kernel_size,
  544. const float input[],
  545. const float kernel[],
  546. const float bias[],
  547. float output[],
  548. pthreadpool_t threadpool,
  549. struct nnp_profile* profile)
  550. {
  551. return nnp_convolution_output(
  552. algorithm,
  553. batch_size, input_channels, output_channels,
  554. input_size, input_padding, kernel_size,
  555. input, kernel, bias, output,
  556. NULL, NULL,
  557. nnp_activation_identity, NULL, threadpool, profile);
  558. }
  559. inline enum nnp_status nnp_convolution_input_gradient(
  560. enum nnp_convolution_algorithm algorithm,
  561. size_t batch_size,
  562. size_t input_channels,
  563. size_t output_channels,
  564. struct nnp_size input_size,
  565. struct nnp_padding input_padding,
  566. struct nnp_size kernel_size,
  567. const float grad_output[],
  568. const float kernel[],
  569. float grad_input[],
  570. pthreadpool_t threadpool,
  571. struct nnp_profile* profile)
  572. {
  573. return nnp_convolution_input_gradient(
  574. algorithm,
  575. batch_size, input_channels, output_channels,
  576. input_size, input_padding, kernel_size,
  577. grad_output, kernel, grad_input,
  578. NULL, NULL,
  579. nnp_activation_identity, NULL, threadpool, profile);
  580. }
  581. inline enum nnp_status nnp_convolution_kernel_gradient(
  582. enum nnp_convolution_algorithm algorithm,
  583. size_t batch_size,
  584. size_t input_channels,
  585. size_t output_channels,
  586. struct nnp_size input_size,
  587. struct nnp_padding input_padding,
  588. struct nnp_size kernel_size,
  589. const float input[],
  590. const float grad_output[],
  591. float grad_kernel[],
  592. pthreadpool_t threadpool,
  593. struct nnp_profile* profile)
  594. {
  595. return nnp_convolution_kernel_gradient(
  596. algorithm,
  597. batch_size, input_channels, output_channels,
  598. input_size, input_padding, kernel_size,
  599. input, grad_output, grad_kernel,
  600. NULL, NULL,
  601. nnp_activation_identity, NULL, threadpool, profile);
  602. }
  603. inline enum nnp_status nnp_convolution_inference(
  604. enum nnp_convolution_algorithm algorithm,
  605. enum nnp_convolution_transform_strategy transform_strategy,
  606. size_t input_channels,
  607. size_t output_channels,
  608. struct nnp_size input_size,
  609. struct nnp_padding input_padding,
  610. struct nnp_size kernel_size,
  611. struct nnp_size output_subsampling,
  612. const float input[],
  613. const float kernel[],
  614. const float bias[],
  615. float output[],
  616. pthreadpool_t threadpool,
  617. struct nnp_profile* profile) {
  618. return nnp_convolution_inference(
  619. algorithm, transform_strategy,
  620. input_channels, output_channels,
  621. input_size, input_padding, kernel_size, output_subsampling,
  622. input, kernel, bias, output, NULL, NULL,
  623. nnp_activation_identity, NULL,
  624. threadpool, profile);
  625. }
  626. #endif // __cplusplus