UpSample.cuh

#pragma once
#include <ATen/core/TensorAccessor.h>
#include <ATen/cuda/Atomic.cuh>
#include <c10/util/ArrayRef.h>
#include <c10/util/Optional.h>
#include <c10/util/SmallVector.h>
#include <c10/util/OptionalArrayRef.h>
#include <math.h>

namespace at {
namespace native {

namespace upsample {
// TODO: Remove duplicate declaration.
TORCH_API c10::SmallVector<int64_t, 3> compute_output_size(
    c10::IntArrayRef input_size,  // Full input tensor size.
    at::OptionalIntArrayRef output_size,
    c10::optional<c10::ArrayRef<double>> scale_factors);
} // namespace upsample

namespace upsample_cuda {

// TODO: Remove duplication with Upsample.h (CPU).
inline c10::optional<double> get_scale_value(c10::optional<c10::ArrayRef<double>> scales, int idx) {
  if (!scales) {
    return nullopt;
  }
  return scales->at(idx);
}
} // namespace upsample_cuda

/* TODO: move this to a common place */
template <typename scalar_t>
__device__ inline scalar_t min(scalar_t a, scalar_t b) {
  return a < b ? a : b;
}

template <typename scalar_t>
__device__ inline scalar_t max(scalar_t a, scalar_t b) {
  return a > b ? a : b;
}
// NOTE [ Nearest neighbor upsampling kernel implementation ]
//
// The nearest neighbor upsampling kernel implementation is symmetrical as
// expected. We launch kernels with threads mapped to the destination tensor
// (where the kernel writes its results), and each thread reads from the
// source tensor. This means:
// 1. In the forward kernel,
//      src_xxx refers to properties of input tensors;
//      dst_xxx refers to properties of output tensors;
//      scale_factor is the ratio of src_size to dst_size;
// 2. In the backward kernel,
//      src_xxx refers to properties of grad_output tensors;
//      dst_xxx refers to properties of grad_input tensors;
//      scale_factor is the ratio of src_size to dst_size;
//
// Because of this, we need to take the reciprocal of the scale defined by the
// upsample layer during the forward pass. The motivation is to avoid slow
// division in the kernel code, so that we can use faster multiplication
// instead. This is not necessary during the backward pass, since the
// scale_factor is already the reciprocal of the corresponding scale_factor
// used in the forward pass, due to the swap of source and destination
// tensors.
//
// Similarly, since the mapping from grad_input to grad_output during the
// backward pass is the reverse of the mapping of output to input, we need the
// opposite mapping functions to compute the source index.

// see NOTE [ Nearest neighbor upsampling kernel implementation ]
template <typename accscalar_t>
__host__ __forceinline__ static accscalar_t compute_scales_value(
    const c10::optional<double> scale,
    int64_t src_size,
    int64_t dst_size) {
  // FIXME: remove magic > 0 after we ensure no models were serialized with -1 defaults.
  return (scale.has_value() && scale.value() > 0.)
      ? (accscalar_t)(1.0 / scale.value())
      : (accscalar_t)src_size / dst_size;
}
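// Example (illustrative): for a 2x forward upsample of width 4 -> 8,
// compute_scales_value<float>(2.0, /*src_size=*/4, /*dst_size=*/8) returns
// 1.0 / 2.0 = 0.5 (the reciprocal, so kernels multiply instead of divide);
// with no explicit scale it falls back to src_size / dst_size = 4.0 / 8 = 0.5.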

// see NOTE [ Nearest neighbor upsampling kernel implementation ]
template <typename accscalar_t>
__host__ __forceinline__ static accscalar_t compute_scales_value_backwards(
    const c10::optional<double> scale,
    int64_t src_size,
    int64_t dst_size) {
  // FIXME: remove magic > 0 after we ensure no models were serialized with -1 defaults.
  return (scale.has_value() && scale.value() > 0.)
      ? (accscalar_t)scale.value()
      : (accscalar_t)src_size / dst_size;
}

template <typename accscalar_t>
__host__ __forceinline__ static accscalar_t area_pixel_compute_scale(
    int input_size,
    int output_size,
    bool align_corners,
    const c10::optional<double> scale) {
  if (align_corners) {
    if (output_size > 1) {
      return (accscalar_t)(input_size - 1) / (output_size - 1);
    } else {
      return static_cast<accscalar_t>(0);
    }
  } else {
    return compute_scales_value<accscalar_t>(scale, input_size, output_size);
  }
}
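// Example (illustrative): resizing width 4 -> 8 with align_corners=true gives
// (4 - 1) / (8 - 1) = 3.0 / 7.0 ~= 0.4286, so the first and last output
// pixels land exactly on the first and last input pixels; with
// align_corners=false (and no explicit scale) it gives 4.0 / 8 = 0.5.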

template <typename accscalar_t>
__device__ __forceinline__ static accscalar_t area_pixel_compute_source_index(
    accscalar_t scale,
    int dst_index,
    bool align_corners,
    bool cubic) {
  if (align_corners) {
    return scale * dst_index;
  } else {
    accscalar_t src_idx = scale * (dst_index + static_cast<accscalar_t>(0.5)) -
        static_cast<accscalar_t>(0.5);
    // See Note[Follow Opencv resize logic]
    return (!cubic && src_idx < static_cast<accscalar_t>(0))
        ? static_cast<accscalar_t>(0)
        : src_idx;
  }
}
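// Example (illustrative): with scale = 0.5 and align_corners=false, output
// index 3 maps to 0.5 * (3 + 0.5) - 0.5 = 1.25, i.e. between input pixels 1
// and 2; output index 0 maps to -0.25, which is clamped to 0 for the
// non-cubic filters.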

// see NOTE [ Nearest neighbor upsampling kernel implementation ]
__device__ __forceinline__ static int nearest_neighbor_compute_source_index(
    const float scale,
    int dst_index,
    int input_size) {
  // index_f32 = (output_index) * scale
  // input_index = round(index_f32)
  // Same as a buggy OpenCV INTER_NEAREST
  // We keep this method for BC and consider it deprecated.
  // See nearest_neighbor_exact_compute_source_index as a replacement.
  const int src_index =
      min(static_cast<int>(floorf((dst_index) * scale)), input_size - 1);
  return src_index;
}
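// Example (illustrative): upsampling width 4 -> 8 gives scale = 0.5, so
// output indices 0..7 read input indices floor(i * 0.5) = 0,0,1,1,2,2,3,3,
// clamped to input_size - 1.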

__device__ __forceinline__ static int nearest_neighbor_exact_compute_source_index(
    const float scale,
    int dst_index,
    int input_size) {
  // index_f32 = (output_index + 0.5) * scale - 0.5
  // input_index = round(index_f32)
  // Same as Pillow and Scikit-Image/Scipy ndi.zoom
  const int src_index =
      min(static_cast<int>(floorf((dst_index + static_cast<float>(0.5)) * scale)), input_size - 1);
  return src_index;
}
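// Example (illustrative): downsampling width 5 -> 2 gives scale = 2.5; the
// "exact" mapping reads floor((i + 0.5) * 2.5) = 1 and 3 (pixel centers, as
// in Pillow), whereas the legacy mapping above would read 0 and 2.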

// see NOTE [ Nearest neighbor upsampling kernel implementation ]
__device__ __forceinline__ static int nearest_neighbor_bw_compute_source_index(
    const float scale,
    int dst_index,
    int output_size) {
  // Equivalent to buggy OpenCV INTER_NEAREST
  // We keep this method for BC and consider it deprecated.
  // See nearest_neighbor_exact_bw_compute_source_index as a replacement.
  const int src_index =
      min(static_cast<int>(ceilf(dst_index * scale)), output_size);
  return src_index;
}

// see NOTE [ Nearest neighbor upsampling kernel implementation ]
__device__ __forceinline__ static int nearest_neighbor_exact_bw_compute_source_index(
    const float scale,
    int dst_index,
    int output_size) {
  // Equivalent to Pillow and Scikit-Image/Scipy ndi.zoom
  const int src_index =
      min(static_cast<int>(ceilf(dst_index * scale - static_cast<float>(0.5))), output_size);
  return src_index;
}
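// Example (illustrative): in the backward pass of a 4 -> 8 upsample, the
// scale is 8 / 4 = 2.0 (see compute_scales_value_backwards), and the
// grad_output indices contributing to grad_input index i span the half-open
// range [ceil(i * 2.0), ceil((i + 1) * 2.0)); e.g. i = 1 covers grad_output
// indices 2 and 3, matching the forward mapping above.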

/* Used by UpSampleBicubic2d.cu */
template <typename scalar_t>
__device__ __forceinline__ static scalar_t upsample_get_value_bounded(
    const PackedTensorAccessor64<scalar_t, 4>& data,
    int batch,
    int channel,
    int height,
    int width,
    int y,
    int x) {
  int access_y = max(min(y, height - 1), 0);
  int access_x = max(min(x, width - 1), 0);
  return data[batch][channel][access_y][access_x];
}
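// Example (illustrative): the clamp acts as replicate (border) padding for
// out-of-range taps of the cubic kernel, e.g. y = -2 reads row 0 and
// x = width + 1 reads column width - 1.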

/* Used by UpSampleBicubic2d.cu */
template <typename scalar_t, typename accscalar_t>
__device__ __forceinline__ static void upsample_increment_value_bounded(
    PackedTensorAccessor64<scalar_t, 4>& data,
    int batch,
    int channel,
    int height,
    int width,
    int y,
    int x,
    accscalar_t value) {
  int access_y = max(min(y, height - 1), 0);
  int access_x = max(min(x, width - 1), 0);
  /* TODO: result here is truncated to scalar_t,
     check: https://github.com/pytorch/pytorch/pull/19630#discussion_r281426912
   */
  gpuAtomicAddNoReturn(
      &data[batch][channel][access_y][access_x], static_cast<scalar_t>(value));
}

// Based on
// https://en.wikipedia.org/wiki/Bicubic_interpolation#Bicubic_convolution_algorithm
template <typename accscalar_t>
__device__ __forceinline__ static accscalar_t cubic_convolution1(
    accscalar_t x,
    accscalar_t A) {
  return ((A + 2) * x - (A + 3)) * x * x + 1;
}

template <typename accscalar_t>
__device__ __forceinline__ static accscalar_t cubic_convolution2(
    accscalar_t x,
    accscalar_t A) {
  return ((A * x - 5 * A) * x + 8 * A) * x - 4 * A;
}

template <typename accscalar_t>
__device__ __forceinline__ static void get_cubic_upsampling_coefficients(
    accscalar_t coeffs[4],
    accscalar_t t) {
  accscalar_t A = -0.75;

  accscalar_t x1 = t;
  coeffs[0] = cubic_convolution2<accscalar_t>(x1 + 1.0, A);
  coeffs[1] = cubic_convolution1<accscalar_t>(x1, A);

  // opposite coefficients
  accscalar_t x2 = 1.0 - t;
  coeffs[2] = cubic_convolution1<accscalar_t>(x2, A);
  coeffs[3] = cubic_convolution2<accscalar_t>(x2 + 1.0, A);
}
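// Example (illustrative): cubic_convolution1 covers |x| < 1 and
// cubic_convolution2 covers 1 <= |x| < 2 of the cubic convolution kernel with
// A = -0.75. At t = 0 the coefficients are {0, 1, 0, 0} (the source sample is
// reproduced exactly); at t = 0.5 they are
// {-0.09375, 0.59375, 0.59375, -0.09375}, which sum to 1.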

template <typename scalar_t, typename accscalar_t>
__device__ __forceinline__ static accscalar_t cubic_interp1d(
    scalar_t x0,
    scalar_t x1,
    scalar_t x2,
    scalar_t x3,
    accscalar_t t) {
  accscalar_t coeffs[4];
  get_cubic_upsampling_coefficients<accscalar_t>(coeffs, t);

  return x0 * coeffs[0] + x1 * coeffs[1] + x2 * coeffs[2] + x3 * coeffs[3];
}
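// Example (illustrative): t in [0, 1) is the fractional position between x1
// and x2, with x0 and x3 as the outer neighbors; with x0 = x1 = 0 and
// x2 = x3 = 1, cubic_interp1d(0, 0, 1, 1, 0.5f) evaluates to
// 0.59375 - 0.09375 = 0.5.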

namespace upsample_antialias {

// taken from
// https://github.com/python-pillow/Pillow/blob/6812205f18ca4ef54372e87e1a13ce4a859434df/
// src/libImaging/Resample.c#L20-L29
struct BilinearFilterFunctor {

  template <typename accscalar_t>
  __device__ accscalar_t operator()(accscalar_t x) const {
    if (x < 0) {
      x = -x;
    }
    if (x < 1) {
      return 1 - x;
    }
    return 0;
  }

  static const int size = 2;
};
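// Example (illustrative): a triangle (tent) filter with a 2-pixel footprint;
// the weight falls off linearly with distance, e.g. f(0.25) = 0.75 and
// f(x) = 0 for |x| >= 1.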

// taken from
// https://github.com/python-pillow/Pillow/blob/6812205f18ca4ef54372e87e1a13ce4a859434df/
// src/libImaging/Resample.c#L46-L62
struct BicubicFilterFunctor {

  template <typename accscalar_t>
  __device__ accscalar_t operator()(accscalar_t x) const {
    // https://en.wikipedia.org/wiki/Bicubic_interpolation#Bicubic_convolution_algorithm
    const accscalar_t a = -0.5;
    if (x < 0) {
      x = -x;
    }
    if (x < 1) {
      return ((a + 2) * x - (a + 3)) * x * x + 1;
    }
    if (x < 2) {
      return (((x - 5) * x + 8) * x - 4) * a;
    }
    return 0;
  }

  static const int size = 4;
};
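// Example (illustrative): the same cubic convolution kernel as above but with
// a = -0.5 (Pillow's choice) and a 4-pixel footprint, e.g. f(0) = 1,
// f(1) = 0, and f(1.5) = -0.0625 (the small negative lobe).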

template <typename accscalar_t>
__device__ __forceinline__ static void _compute_weights_span(
    const int i,
    const int input_size,
    const accscalar_t scale,
    const accscalar_t support,
    int& xmin,
    int& xsize,
    accscalar_t& center) {
  center = scale * (i + static_cast<accscalar_t>(0.5));
  xmin = max(static_cast<int>(center - support + static_cast<accscalar_t>(0.5)), static_cast<int>(0));
  xsize = min(static_cast<int>(center + support + static_cast<accscalar_t>(0.5)), input_size) - xmin;
}
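// Example (illustrative): downsampling width 10 -> 4 with the bilinear filter
// gives scale = 2.5 and, assuming the caller sets
// support = (interp_size / 2) * scale = 2.5, output index i = 1 has
// center = 3.75, xmin = 1 and xsize = 5, i.e. five input taps starting at
// input index 1.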

template <typename scalar_t, typename accscalar_t, typename interp_filter_t>
__device__ __forceinline__ static void _compute_weights(
    scalar_t* wt_ptr,
    const accscalar_t scale,
    int interp_size,
    const interp_filter_t& interp_filter,
    accscalar_t xmin_m_center,
    int xsize) {

  accscalar_t invscale = (scale >= 1.0) ? 1.0 / scale : 1.0;
  accscalar_t total_w = 0.0;
  int j = 0;
  for (j = 0; j < xsize; j++) {
    accscalar_t w = interp_filter((j + xmin_m_center + static_cast<accscalar_t>(0.5)) * invscale);
    wt_ptr[j] = static_cast<scalar_t>(w);
    total_w += w;
  }
  for (j = 0; j < xsize; j++) {
    if (total_w != 0.0) {
      wt_ptr[j] /= total_w;
    }
  }
  for (; j < interp_size; j++) {
    wt_ptr[j] = static_cast<scalar_t>(0.0);
  }
}
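// Example (illustrative): continuing the 10 -> 4 example above (scale = 2.5,
// so invscale = 0.4, xmin_m_center = 1 - 3.75 = -2.75, xsize = 5), the raw
// bilinear weights are {0.1, 0.5, 0.9, 0.7, 0.3}; they are normalized by
// total_w = 2.5 to {0.04, 0.2, 0.36, 0.28, 0.12}, and any remaining slots up
// to interp_size are zeroed.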

template <typename scalar_t, typename accscalar_t>
__device__ __forceinline__ static accscalar_t interpolate_aa_single_dim(
    const scalar_t* src,
    const scalar_t* weights,
    int size) {
  scalar_t t = static_cast<accscalar_t>(*src);
  scalar_t wts = static_cast<accscalar_t>(weights[0]);
  accscalar_t output = t * wts;

  int j = 1;
  for (; j < size; j++) {
    wts = static_cast<accscalar_t>(weights[j]);
    t = static_cast<accscalar_t>(*(src + j));
    output += t * wts;
  }
  return output;
}
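// Example (illustrative): a dot product of `size` contiguous source values
// with the precomputed weights, e.g. src = {10, 20}, weights = {0.25, 0.75}
// yields 10 * 0.25 + 20 * 0.75 = 17.5.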

} // namespace upsample_antialias

} // namespace native
} // namespace at