QnnpackUtils.h
#pragma once

#ifdef USE_PYTORCH_QNNPACK
#include <ATen/core/Tensor.h>
#include <c10/util/irange.h>
#include <pytorch_qnnpack.h>
#include <qnnpack_func.h>
#include <ATen/native/quantized/cpu/XnnpackUtils.h>
#include <ATen/native/quantized/PackedParams.h>
#include <ATen/native/utils/Factory.h>

#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#else
#include <ATen/ops/empty.h>
#endif

#include <utility>

struct QnnpackOperatorDeleter {
  void operator()(pytorch_qnnp_operator_t op) {
    pytorch_qnnp_delete_operator(op);
  }
};

// The PackedWeight struct for QNNPACK stores the original weight and bias,
// since QNNPACK currently does not provide an unpack function.
// For PyTorch Mobile, once the model is scripted and serialized we don't need
// to call unpack, so we can save some memory by detecting this case and
// freeing the original weights after packing.
// The input scale is set to null in the pre-pack step. QNNPACK needs the bias
// quantized with the input scale, which is only available at runtime in
// PyTorch. If the input scale changes at runtime, we requantize the bias with
// the updated scale. For inference we expect the graph to be static, so the
// input scale should not change across consecutive inference calls.
struct PackedLinearWeightsQnnp : public LinearPackedParamsBase {
  PackedLinearWeightsQnnp(
      std::unique_ptr<qnnpack::PackBMatrix> w,
      at::Tensor orig_weight,
      at::Tensor bias,
      c10::optional<double> input_scale,
      at::Tensor w_scales,
      std::vector<uint8_t>&& w_zps)
      : w(std::move(w)),
        orig_weight(std::move(orig_weight)),
        bias_(at::native::mobile::allocate_padded_contiguous_if_needed(
            bias, bias.suggest_memory_format())),
        per_channel_(this->orig_weight.qscheme() == at::kPerChannelAffine),
        input_scale(std::move(input_scale)),
        w_scales(std::move(w_scales)),
        w_zero_points(std::move(w_zps)) {
    weight_sizes = this->orig_weight.sizes().vec();
    n_elements = std::accumulate(
        std::begin(weight_sizes),
        std::end(weight_sizes),
        1,
        std::multiplies<double>());
  }

  std::unique_ptr<qnnpack::PackBMatrix> w;
  at::Tensor orig_weight;
  at::Tensor bias_;
  bool per_channel_;
  c10::optional<double> input_scale;
  at::Tensor w_scales;
  std::vector<uint8_t> w_zero_points;
  std::vector<float> requantization_scales;
  std::vector<int64_t> weight_sizes;
  int n_elements;

  at::Tensor apply(
      at::Tensor input,
      double output_scale,
      int64_t output_zero_point) override;
  at::Tensor apply_relu(
      at::Tensor input,
      double output_scale,
      int64_t output_zero_point) override;

  at::Tensor apply_dynamic(at::Tensor input, bool reduce_range = false) override;
  at::Tensor apply_dynamic_relu(at::Tensor input, bool reduce_range = false) override;

  std::tuple<at::Tensor, c10::optional<at::Tensor>> unpack() override;

  c10::optional<at::Tensor> bias() override {
    return bias_;
  }

  static c10::intrusive_ptr<LinearPackedParamsBase> prepack(
      at::Tensor weight,
      c10::optional<at::Tensor> bias);

  bool per_channel() const {
    return per_channel_;
  }

 private:
  std::mutex qnnp_mutex_;

#ifdef USE_XNNPACK
  xnnpack_operator xnnp_linear_op;

  template <typename scalar_t, bool kReluFused>
  at::Tensor apply_impl_xnnp(
      const at::Tensor& input,
      double output_scale,
      int64_t output_zero_point);
#endif // USE_XNNPACK

  template <bool ReluFused>
  at::Tensor apply_impl(
      at::Tensor input,
      double output_scale,
      int64_t output_zero_point);

  template <bool ReluFused>
  at::Tensor apply_dynamic_impl(at::Tensor input, bool reduce_range);
};
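
// Illustrative usage sketch for the interface above; shapes, scales, and zero
// points are hypothetical. Weights are expected as per-tensor or per-channel
// qint8 and activations as quint8; bias quantization is deferred to run time,
// when the input scale is known.
//
//   at::Tensor w_q = at::quantize_per_tensor(
//       at::randn({4, 8}), /*scale=*/0.05, /*zero_point=*/0, at::kQInt8);
//   at::Tensor b = at::randn({4});
//   auto packed = PackedLinearWeightsQnnp::prepack(w_q, b);
//
//   at::Tensor x_q = at::quantize_per_tensor(
//       at::randn({2, 8}), /*scale=*/0.1, /*zero_point=*/128, at::kQUInt8);
//   at::Tensor y_q = packed->apply(x_q, /*output_scale=*/0.2, /*output_zero_point=*/128);
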
template <int kSpatialDim = 2>
struct PackedConvWeightsQnnp : public ConvPackedParamsBase<kSpatialDim> {
  PackedConvWeightsQnnp(
      std::unique_ptr<qnnpack::PrePackConvWeights> w,
      at::Tensor orig_weight,
      at::Tensor bias,
      torch::List<int64_t> stride,
      torch::List<int64_t> padding,
      torch::List<int64_t> output_padding,
      torch::List<int64_t> dilation,
      int64_t groups,
      bool transpose,
      c10::optional<double> input_scale,
      std::vector<int64_t> kernel,
      at::Tensor w_scale,
      std::vector<uint8_t>&& w_zps,
      bool is_per_channel)
      : w(std::move(w)),
        orig_weight(std::move(orig_weight)),
        bias(std::move(bias)),
        stride_(std::move(stride)),
        padding_(std::move(padding)),
        output_padding_(std::move(output_padding)),
        dilation_(std::move(dilation)),
        groups_(groups),
        transpose_(transpose),
        is_per_channel_(is_per_channel),
        input_scale(input_scale),
        kernel_(std::move(kernel)),
        w_scales(std::move(w_scale)),
        w_zero_points(std::move(w_zps)) {
    const bool any_padding = std::any_of(
        padding_.begin(), padding_.end(), [](const auto& e) { return e != 0; });
    const size_t kernel_size =
        std::accumulate(kernel_.begin(), kernel_.end(), 1, std::multiplies<>());

    const size_t group_input_channels = transpose
        ? this->orig_weight.size(0) / groups
        : this->orig_weight.size(1);
    const size_t group_output_channels = transpose
        ? this->orig_weight.size(1)
        : this->orig_weight.size(0) / groups;

    const size_t kernel_depth = kSpatialDim == 3 ? kernel_[0] : 1;
    const size_t kernel_height = kernel_[kSpatialDim - 2];
    const size_t kernel_width = kernel_[kSpatialDim - 1];

    pytorch_qnnp_ukernel_type ukernel_type;
    if (transpose_) {
      ukernel_type = pytorch_qnnp_ukernel_type_conv;
    } else {
      ukernel_type = pytorch_qnnp_ukernel_type_none;

      const bool has_depthwise_dimensions =
          (kSpatialDim == 2 &&
           ((kernel_height == 3 && kernel_width == 3) ||
            (kernel_height == 5 && kernel_width == 5))) ||
          (kSpatialDim == 3 && kernel_height == 3 && kernel_width == 3 &&
           kernel_depth == 3);
      const bool has_depthwise_grouping =
          group_input_channels == 1 && group_output_channels == 1 && groups > 1;

      if (has_depthwise_dimensions && has_depthwise_grouping) {
        ukernel_type = pytorch_qnnp_ukernel_type_dwconv;
      } else if (
          kernel_size == 1 &&
          std::all_of(
              stride_.begin(),
              stride_.end(),
              [](const auto& e) { return e == 1; }) &&
          !any_padding) {
        ukernel_type = group_input_channels >= SIZE_MAX
            ? pytorch_qnnp_ukernel_type_xzp_gemm
            : pytorch_qnnp_ukernel_type_gemm;
      } else {
        ukernel_type = pytorch_qnnp_ukernel_type_conv;
      }
    }

    if (is_per_channel && ukernel_type == pytorch_qnnp_ukernel_type_xzp_gemm) {
      TORCH_INTERNAL_ASSERT(
          false, "Per channel quantized weights are not supported for XZP kernels");
    }

    pytorch_qnnp_operator_t convolution{nullptr};
    // Initially all the params are set to zero.
    convolution = static_cast<pytorch_qnnp_operator_t>(
        calloc(1, sizeof(struct pytorch_qnnp_operator)));
    if (convolution == nullptr) {
      TORCH_INTERNAL_ASSERT(
          false, "failed to allocate %zu bytes for pytorch_qnnp_operator structure",
          sizeof(struct pytorch_qnnp_operator));
    }

    convolution_op =
        std::unique_ptr<pytorch_qnnp_operator, QnnpackOperatorDeleter>(
            convolution);

    // NOLINTNEXTLINE(clang-analyzer-core.NullDereference)
    convolution->ukernel_type = ukernel_type;
    convolution->groups = groups;
    convolution->group_input_channels = group_input_channels;
    convolution->group_output_channels = group_output_channels;
    convolution->kernel_depth = kernel_depth;
    convolution->kernel_height = kernel_height;
    convolution->kernel_width = kernel_width;
    convolution->stride_depth = kSpatialDim == 3 ? stride_[0] : 1;
    convolution->stride_height = stride_[kSpatialDim - 2];
    convolution->stride_width = stride_[kSpatialDim - 1];
    convolution->dilation_depth = kSpatialDim == 3 ? dilation_[0] : 1;
    convolution->dilation_height = dilation_[kSpatialDim - 2];
    convolution->dilation_width = dilation_[kSpatialDim - 1];
    convolution->input_padding_height = padding_[kSpatialDim - 2];
    convolution->input_padding_width = padding_[kSpatialDim - 1];
    convolution->input_padding_depth = kSpatialDim == 3 ? padding_[0] : 0;
    convolution->per_channel = is_per_channel_;
    convolution->transpose = transpose_;

    const uint32_t kr = pytorch_qnnp_params.q8conv.kr;
    const size_t k_stride = (group_input_channels + (kr - 1)) & -kr;

    size_t zero_size = sizeof(uint8_t) * k_stride;
    size_t zero_offset = 0;

    if (transpose_) {
      convolution->adjustment_width = output_padding_[1];
      convolution->adjustment_height = output_padding_[0];
      if (group_input_channels < 8) {
        zero_size += 8;
        zero_offset = 8;
      }
    } else {
      zero_buffer_size = 0;
      if (any_padding) {
        zero_size = 0;
        zero_offset = 0;
        if (ukernel_type == pytorch_qnnp_ukernel_type_dwconv) {
          const uint32_t cr = pytorch_qnnp_params.q8dw9.cr;
          const size_t group_stride = (groups + (cr - 1)) & -cr;
          if (groups >= 8) {
            zero_size = sizeof(uint8_t) * group_stride;
            zero_offset = 0;
          } else {
            zero_size = sizeof(uint8_t) * group_stride + 8;
            zero_offset = sizeof(uint8_t) * 8;
          }
        } else if (
            ukernel_type == pytorch_qnnp_ukernel_type_conv ||
            ukernel_type == pytorch_qnnp_ukernel_type_gemm) {
          if (group_input_channels >= 8) {
            zero_size = sizeof(uint8_t) * k_stride;
            zero_offset = 0;
          } else {
            zero_size = sizeof(uint8_t) * k_stride + 8;
            zero_offset = 8;
          }
        }
      }
    }

    // NOLINTNEXTLINE(clang-analyzer-optin.portability.UnixAPI)
    void* zero_buffer = malloc(zero_size);
    if (zero_buffer == nullptr) {
      pytorch_qnnp_delete_operator(convolution);
      TORCH_INTERNAL_ASSERT(
          false, "failed to allocate %zu bytes for zero padding",
          zero_size);
    }
    // Need to set to input zero point
    // memset(zero_buffer, input_zero_point, zero_size);
    zero_buffer_size = zero_size;
    convolution->zero_buffer = zero_buffer;
    convolution->zero_pointer = (void*)((uintptr_t)zero_buffer + zero_offset);
  }

  std::unique_ptr<pytorch_qnnp_operator, QnnpackOperatorDeleter> convolution_op;
#ifdef USE_XNNPACK
  xnnpack_operator xnnp_convolution_op;
#endif // USE_XNNPACK
  std::unique_ptr<qnnpack::PrePackConvWeights> w;
  at::Tensor orig_weight;
  at::Tensor bias;
  torch::List<int64_t> stride_;
  torch::List<int64_t> padding_;
  torch::List<int64_t> output_padding_;
  torch::List<int64_t> dilation_;
  int64_t groups_;
  bool transpose_;
  bool is_per_channel_;
  c10::optional<double> input_scale;
  std::vector<int64_t> kernel_;
  at::Tensor w_scales;
  std::vector<uint8_t> w_zero_points;
  std::vector<float> requantization_scales;
  size_t zero_buffer_size;

  at::Tensor apply(
      const at::Tensor& input,
      double output_scale,
      int64_t output_zero_point) override;
  at::Tensor apply_relu(
      const at::Tensor& input,
      double output_scale,
      int64_t output_zero_point) override;
  at::Tensor apply_dynamic(
      const at::Tensor& input,
      bool reduce_range = false) override;

  std::tuple<at::Tensor, c10::optional<at::Tensor>> unpack() override;

  static c10::intrusive_ptr<ConvPackedParamsBase<kSpatialDim>> prepack(
      at::Tensor weight,
      c10::optional<at::Tensor> bias,
      torch::List<int64_t> stride,
      torch::List<int64_t> padding,
      torch::List<int64_t> output_padding,
      torch::List<int64_t> dilation,
      int64_t groups,
      bool transpose);

  torch::List<int64_t> stride() const override {
    return stride_;
  }
  torch::List<int64_t> padding() const override {
    return padding_;
  }
  torch::List<int64_t> output_padding() const override {
    return output_padding_;
  }
  torch::List<int64_t> dilation() const override {
    return dilation_;
  }
  int64_t groups() const override {
    return groups_;
  }
  bool transpose() const override {
    return transpose_;
  }
  bool per_channel() const {
    return is_per_channel_;
  }

 private:
  std::mutex qnnp_mutex_;

  template <bool ReluFused>
  at::Tensor apply_impl(
      const at::Tensor& input,
      double output_scale,
      int64_t output_zero_point);

#ifdef USE_XNNPACK
  template <typename scalar_t, bool ReluFused>
  at::Tensor apply_impl_xnnp(
      const at::Tensor& input,
      double output_scale,
      int64_t output_zero_point);
#endif // USE_XNNPACK
};
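
// Illustrative usage sketch for the conv interface above; shapes, scales, and
// zero points are hypothetical. For a non-transposed 2-D conv the weight
// layout is [out_channels, in_channels / groups, kH, kW].
//
//   at::Tensor w_q = at::quantize_per_tensor(
//       at::randn({16, 3, 3, 3}), /*scale=*/0.05, /*zero_point=*/0, at::kQInt8);
//   c10::optional<at::Tensor> b = at::randn({16});
//   torch::List<int64_t> stride({1, 1}), padding({1, 1}), output_padding({0, 0}),
//       dilation({1, 1});
//   auto packed = PackedConvWeightsQnnp<2>::prepack(
//       w_q, b, stride, padding, output_padding, dilation, /*groups=*/1,
//       /*transpose=*/false);
//
//   at::Tensor x_q = at::quantize_per_tensor(
//       at::randn({1, 3, 32, 32}), /*scale=*/0.1, /*zero_point=*/128, at::kQUInt8);
//   at::Tensor y_q = packed->apply(x_q, /*output_scale=*/0.2, /*output_zero_point=*/128);
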
enum class Activation : uint8_t { NONE = 0, RELU = 1 };

#if defined(__ANDROID__) && !defined(__NDK_MAJOR__)
template <class T>
inline float Round(const float x) {
  return ::nearbyintf(x);
}
inline double Round(const double x) {
  return ::nearbyint(x);
}
#else
template <class T>
inline T Round(const T x) {
  return std::nearbyint(x);
}
#endif

template <typename T>
inline T QuantizeValue(float scale, int32_t zero_point, float value) {
  const int32_t qmin = std::numeric_limits<T>::min();
  const int32_t qmax = std::numeric_limits<T>::max();
  auto r = zero_point + static_cast<int32_t>(Round(value / scale));
  r = std::max(r, qmin);
  r = std::min(r, qmax);
  return static_cast<T>(r);
}
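
// Example (assuming T = uint8_t): QuantizeValue<uint8_t>(/*scale=*/0.5f,
// /*zero_point=*/10, /*value=*/2.0f) evaluates to 10 + round(2.0 / 0.5) = 14,
// clamped to the representable range [0, 255].
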
template <typename T>
inline std::pair<T, T> activationLimits(
    float scale,
    int32_t zero_point,
    Activation Ac) {
  switch (Ac) {
    case Activation::NONE:
      return {std::numeric_limits<T>::min(),
              std::numeric_limits<T>::max()};
    case Activation::RELU:
      return {QuantizeValue<T>(scale, zero_point, 0.0),
              std::numeric_limits<T>::max()};
    default:
#ifdef _MSC_VER
      __assume(0);
#else
      __builtin_unreachable();
#endif
  }
}
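
// Example (assuming T = uint8_t): activationLimits<uint8_t>(/*scale=*/0.5f,
// /*zero_point=*/10, Activation::RELU) returns {10, 255}, because the
// quantized representation of 0.0f is the zero point itself; Activation::NONE
// simply yields the full range {0, 255}.
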
namespace at {
namespace native {
namespace qnnp_avgpool_helper {
Tensor qnnpack_avg_pool2d(
    Tensor input,
    IntArrayRef kernel_size,
    IntArrayRef stride,
    IntArrayRef padding,
    bool ceil_mode,
    bool count_include_pad,
    c10::optional<int64_t> divisor_override);
} // namespace qnnp_avgpool_helper
} // namespace native
} // namespace at

namespace {
C10_UNUSED std::vector<float> generate_requantization_scales(
    const at::Tensor& weight_scales,
    const float input_scale,
    const float output_scale,
    std::vector<float>& requant_scales) {
  // Since weight scale is allocated with padding,
  // weight_scales.numel() gives us the padded number of elements.
  const auto num_output_channels_padded = weight_scales.numel();
  float* const weight_scales_data = weight_scales.data_ptr<float>();
  if (static_cast<int64_t>(requant_scales.size()) < num_output_channels_padded) {
    requant_scales.resize(num_output_channels_padded);
  }
  for (const auto i : c10::irange(num_output_channels_padded)) {
    const auto inverse_output_scale = 1.f / output_scale;
    requant_scales[i] = (weight_scales_data[i] * input_scale) * inverse_output_scale;
    TORCH_CHECK(
        (requant_scales[i] > 0.0f && std::isnormal(requant_scales[i])),
        "failed to create op with requantization scale: ",
        requant_scales[i],
        ": requantization scale must be finite and positive");
  }
  return requant_scales;
}
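
// Worked example (hypothetical values): a per-channel weight scale of 0.02,
// an input scale of 0.1, and an output scale of 0.5 give a requantization
// scale of 0.02 * 0.1 / 0.5 = 0.004.
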
C10_UNUSED std::pair<std::vector<uint8_t>, at::Tensor> make_zero_points_and_scales_tensor(
    const at::Tensor& weight_contig,
    bool transpose = false,
    uint32_t groups = 1) {
  const int out_ch_idx = transpose ? 1 : 0;
  const auto num_output_channels = weight_contig.size(out_ch_idx) * (transpose ? groups : 1);
  // Add 8 to account for buffering needed by QNNPACK.
  const auto num_output_channels_padded = num_output_channels + 8;
  const auto qtype = weight_contig.qscheme();
  std::vector<uint8_t> weight_zp(num_output_channels_padded, 0);
  // Adjust weight zero point, similar to weight data.
  if (qtype == at::kPerTensorAffine) {
    for (const auto i : c10::irange(num_output_channels)) {
      weight_zp[i] = (uint8_t)(weight_contig.q_zero_point() + 128);
    }
  } else if (qtype == at::kPerChannelAffine) {
    TORCH_CHECK(
        weight_contig.q_per_channel_zero_points().scalar_type() == at::kLong,
        "Per channel zero points dtype must be long int.");
    const int64_t* per_channel_zero_points =
        weight_contig.q_per_channel_zero_points().data_ptr<int64_t>();
    for (const auto i : c10::irange(num_output_channels)) {
      weight_zp[i] = (uint8_t)(per_channel_zero_points[i] + 128);
    }
  } else {
    TORCH_INTERNAL_ASSERT(false, "Unsupported quantization scheme.");
  }
  at::Tensor weight_scales =
      at::empty(
          {num_output_channels_padded},
          at::device(at::kCPU).dtype(at::kFloat));
  float* const weight_scales_data = weight_scales.data_ptr<float>();
  if (qtype == at::kPerTensorAffine) {
    for (const auto i : c10::irange(num_output_channels)) {
      weight_scales_data[i] = weight_contig.q_scale();
    }
  } else if (qtype == at::kPerChannelAffine) {
    TORCH_CHECK(
        weight_contig.q_per_channel_scales().scalar_type() == at::kDouble,
        "Per channel scales dtype must be double.");
    const double* const per_channel_scales =
        weight_contig.q_per_channel_scales().data_ptr<double>();
    for (const auto i : c10::irange(num_output_channels)) {
      weight_scales_data[i] = static_cast<float>(per_channel_scales[i]);
    }
  } else {
    TORCH_INTERNAL_ASSERT(false, "Unsupported quantization scheme.");
  }
  for (const auto i : c10::irange(num_output_channels, num_output_channels_padded)) {
    weight_scales_data[i] = 1.f;
  }
  return {weight_zp, weight_scales};
}
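
// Example (hypothetical values): for a per-tensor qint8 weight with
// q_zero_point() == -3 and q_scale() == 0.05, each valid output channel gets
// a uint8 zero point of 125 (i.e. -3 + 128) and a scale of 0.05f, while the
// 8 padding entries keep a zero point of 0 and a scale of 1.f.
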
} // namespace
#endif