  1. """
  2. This module implements modules which are used to perform fake quantization
  3. during QAT.
  4. """
  5. import torch
  6. from torch.nn import Module
  7. from torch.ao.quantization.observer import (
  8. MovingAverageMinMaxObserver,
  9. HistogramObserver,
  10. MovingAveragePerChannelMinMaxObserver,
  11. FixedQParamsObserver,
  12. default_fixed_qparams_range_0to1_observer,
  13. default_fixed_qparams_range_neg1to1_observer,
  14. _with_args,
  15. )
  16. import re
  17. from abc import ABC, abstractmethod
  18. from typing import Any, Tuple
  19. __all__ = [
  20. "FakeQuantizeBase",
  21. "FakeQuantize",
  22. "FixedQParamsFakeQuantize",
  23. "FusedMovingAvgObsFakeQuantize",
  24. "disable_fake_quant",
  25. "disable_observer",
  26. "enable_fake_quant",
  27. "enable_observer",
  28. "default_fake_quant",
  29. "default_weight_fake_quant",
  30. "default_dynamic_fake_quant",
  31. "default_fixed_qparams_range_neg1to1_fake_quant",
  32. "default_fixed_qparams_range_0to1_fake_quant",
  33. "default_symmetric_fixed_qparams_fake_quant",
  34. "default_affine_fixed_qparams_fake_quant",
  35. "default_per_channel_weight_fake_quant",
  36. "default_embedding_fake_quant",
  37. "default_embedding_fake_quant_4bit",
  38. "default_histogram_fake_quant",
  39. "default_fused_act_fake_quant",
  40. "default_fused_wt_fake_quant",
  41. "default_fused_per_channel_wt_fake_quant",
  42. "fused_wt_fake_quant_range_neg_127_to_127",
  43. "fused_per_channel_wt_fake_quant_range_neg_127_to_127",
  44. ]
  45. def _is_per_channel(qscheme: 'torch.qscheme') -> bool:
  46. return qscheme in [torch.per_channel_symmetric, torch.per_channel_affine, torch.per_channel_affine_float_qparams]
  47. def _is_per_tensor(qscheme: 'torch.qscheme') -> bool:
  48. return qscheme in [torch.per_tensor_symmetric, torch.per_tensor_affine]
  49. def _is_symmetric_quant(qscheme: 'torch.qscheme') -> bool:
  50. return qscheme in [torch.per_tensor_symmetric, torch.per_channel_symmetric]
  51. def _is_float_qparams(qscheme: 'torch.qscheme') -> bool:
  52. return qscheme in [torch.per_channel_affine_float_qparams, ]
  53. class FakeQuantizeBase(ABC, Module):
  54. r""" Base fake quantize module
  55. Any fake quantize implementation should derive from this class.
  56. Concrete fake quantize module should follow the same API. In forward, they will update
  57. the statistics of the observed Tensor and fake quantize the input. They should also provide a
  58. `calculate_qparams` function that computes the quantization parameters given
  59. the collected statistics.
  60. """
  61. fake_quant_enabled: torch.Tensor
  62. observer_enabled: torch.Tensor
  63. def __init__(self):
  64. super().__init__()
  65. # fake_quant_enabled and observer_enabled are buffers to support their
  66. # replication in DDP. Data type is uint8 because NCCL does not support
  67. # bool tensors.
  68. self.register_buffer('fake_quant_enabled', torch.tensor([1], dtype=torch.uint8))
  69. self.register_buffer('observer_enabled', torch.tensor([1], dtype=torch.uint8))
  70. @abstractmethod
  71. def forward(self, x):
  72. pass
  73. @abstractmethod
  74. def calculate_qparams(self, **kwargs):
  75. pass
  76. @torch.jit.export
  77. def enable_fake_quant(self, enabled: bool = True) -> None:
  78. self.fake_quant_enabled[0] = 1 if enabled else 0
  79. @torch.jit.export
  80. def disable_fake_quant(self):
  81. self.enable_fake_quant(False)
  82. @torch.jit.export
  83. def enable_observer(self, enabled: bool = True) -> None:
  84. self.observer_enabled[0] = 1 if enabled else 0
  85. @torch.jit.export
  86. def disable_observer(self):
  87. self.enable_observer(False)
  88. @classmethod
  89. def with_args(cls, **kwargs):
  90. fake_quant_constructor = _with_args(cls, **kwargs)
  91. # need to assign the correct module to fake_quantize
  92. # constructors to satisfy public v private requirements
  93. fake_quant_constructor.__module__ = "torch.ao.quantization.fake_quantize"
  94. return fake_quant_constructor
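
# A concrete implementation supplies `forward` and `calculate_qparams` on top of
# the toggle buffers defined above. The commented sketch below is illustrative
# only (the class name and the hard-coded qparams are invented for this example
# and are not part of this module's API):
#
#     class ConstantFakeQuantize(FakeQuantizeBase):
#         """Toy subclass: fake-quantizes with fixed, hard-coded qparams."""
#         def calculate_qparams(self):
#             return torch.tensor([0.1]), torch.tensor([0], dtype=torch.int)
#
#         def forward(self, x):
#             if self.fake_quant_enabled[0] == 1:
#                 scale, zero_point = self.calculate_qparams()
#                 x = torch.fake_quantize_per_tensor_affine(x, scale, zero_point, 0, 255)
#             return x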

class FakeQuantize(FakeQuantizeBase):
    r"""Simulate the quantize and dequantize operations in training time.

    The output of this module is given by::

        x_out = (
          clamp(round(x/scale + zero_point), quant_min, quant_max) - zero_point
        ) * scale

    * :attr:`scale` defines the scale factor used for quantization.

    * :attr:`zero_point` specifies the quantized value to which 0 in floating point maps.

    * :attr:`fake_quant_enabled` controls the application of fake quantization on tensors; note that
      statistics can still be updated.

    * :attr:`observer_enabled` controls statistics collection on tensors.

    * :attr:`dtype` specifies the quantized dtype that is being emulated with fake-quantization;
      allowable values are torch.qint8 and torch.quint8.

    Args:

        observer (module): Module for observing statistics on input tensors and calculating scale
          and zero-point.
        observer_kwargs (optional): Arguments for the observer module

    Attributes:

        activation_post_process (Module): User provided module that collects statistics on the input tensor and
          provides a method to calculate scale and zero-point.

    """

    scale: torch.Tensor
    zero_point: torch.Tensor

    def __init__(self, observer=MovingAverageMinMaxObserver, quant_min=None, quant_max=None, **observer_kwargs):
        super().__init__()
        # Populate quant_min/quant_max to observer_kwargs if valid
        if quant_min is not None and quant_max is not None:
            assert quant_min <= quant_max, \
                'quant_min must be less than or equal to quant_max'
            dtype = observer_kwargs.get("dtype", torch.quint8)
            if hasattr(observer, "p"):
                # In case observer is _PartialWrapper, dtype can be stored in
                # observer.p.keywords["dtype"]
                dtype = getattr(getattr(observer, "p", {}), "keywords", {}).get(
                    "dtype", dtype
                )
            assert torch.iinfo(dtype).min <= quant_min, 'quant_min out of bound'
            assert quant_max <= torch.iinfo(dtype).max, 'quant_max out of bound'
            observer_kwargs.update({"quant_min": quant_min, "quant_max": quant_max})
        self.activation_post_process = observer(**observer_kwargs)
        # TODO: keeping self.quant_min/max for BC; remove after a couple releases
        # Users should use self.activation_post_process.quant_min
        self.quant_min = self.activation_post_process.quant_min
        self.quant_max = self.activation_post_process.quant_max
        if _is_float_qparams(self.activation_post_process.qscheme):
            zero_point_dtype = torch.float
        else:
            zero_point_dtype = torch.int
        self.register_buffer('scale', torch.tensor([1.0], dtype=torch.float))
        self.register_buffer('zero_point', torch.tensor([0], dtype=zero_point_dtype))
        self.dtype = self.activation_post_process.dtype
        self.qscheme = self.activation_post_process.qscheme
        self.ch_axis = self.activation_post_process.ch_axis \
            if hasattr(self.activation_post_process, 'ch_axis') else -1
        assert _is_per_channel(self.qscheme) or \
            _is_per_tensor(self.qscheme), \
            'Only per channel and per tensor quantization are supported in fake quantize' + \
            ' got qscheme: ' + str(self.qscheme)
        self.is_per_channel = _is_per_channel(self.qscheme)

    @torch.jit.export
    def calculate_qparams(self):
        return self.activation_post_process.calculate_qparams()

    def forward(self, X):
        if self.observer_enabled[0] == 1:
            self.activation_post_process(X.detach())
            _scale, _zero_point = self.calculate_qparams()
            _scale, _zero_point = _scale.to(self.scale.device), _zero_point.to(self.zero_point.device)
            if self.scale.shape != _scale.shape:
                self.scale.resize_(_scale.shape)
                self.zero_point.resize_(_zero_point.shape)
            self.scale.copy_(_scale)
            self.zero_point.copy_(_zero_point)

        if self.fake_quant_enabled[0] == 1:
            if self.is_per_channel:
                X = torch.fake_quantize_per_channel_affine(
                    X, self.scale, self.zero_point,
                    self.ch_axis, self.activation_post_process.quant_min, self.activation_post_process.quant_max)
            else:
                X = torch.fake_quantize_per_tensor_affine(
                    X, self.scale, self.zero_point,
                    self.activation_post_process.quant_min, self.activation_post_process.quant_max)
        return X

    @torch.jit.export
    def extra_repr(self):
        return 'fake_quant_enabled={}, observer_enabled={}, ' \
               'quant_min={}, quant_max={}, dtype={}, qscheme={}, ch_axis={}, ' \
               'scale={}, zero_point={}'.format(
                   self.fake_quant_enabled, self.observer_enabled,
                   self.activation_post_process.quant_min, self.activation_post_process.quant_max,
                   self.dtype, self.qscheme, self.ch_axis, self.scale, self.zero_point)

    def _save_to_state_dict(self, destination, prefix, keep_vars):
        # We cannot currently register scalar values as buffers, so need to manually
        # specify serialization here.
        super()._save_to_state_dict(destination, prefix, keep_vars)
        destination[prefix + 'scale'] = self.scale
        destination[prefix + 'zero_point'] = self.zero_point

    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
                              missing_keys, unexpected_keys, error_msgs):
        # Removing this function throws an error that the size of the loaded tensor does not match the original size,
        # i.e., these buffers start out with numel 0 and become numel 1 once they have their first forward pass.
        local_state = ['scale', 'zero_point']
        for name in local_state:
            key = prefix + name
            if key in state_dict:
                val = state_dict[key]
                # Custom handling to allow loading scale and zero_point
                # of size N into uninitialized buffers of size 0. The
                # buffers are resized here, and the values are copied in
                # the default state_dict loading code of the parent.
                if name == 'scale':
                    self.scale.resize_(val.shape)
                else:
                    assert name == 'zero_point'
                    self.zero_point.resize_(val.shape)
                # For torchscript module we need to update the attributes here since we do not
                # call the `_load_from_state_dict` function defined in module.py
                if torch.jit.is_scripting():
                    if name == 'scale':
                        self.scale.copy_(val)
                    else:
                        assert name == 'zero_point'
                        self.zero_point.copy_(val)
            elif strict:
                missing_keys.append(key)
        super()._load_from_state_dict(state_dict, prefix, local_metadata, strict,
                                      missing_keys, unexpected_keys, error_msgs)
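
# Example usage of FakeQuantize (a minimal sketch; the tensor shape and observer
# arguments below are arbitrary, not prescribed by this module):
#
#     fq = FakeQuantize(observer=MovingAverageMinMaxObserver,
#                       quant_min=0, quant_max=255,
#                       dtype=torch.quint8, qscheme=torch.per_tensor_affine)
#     x = torch.randn(4, 8)
#     y = fq(x)                                   # observe x, then fake-quantize it
#     scale, zero_point = fq.calculate_qparams()  # qparams from the observed min/max
#     fq.disable_observer()                       # freeze statistics; fake-quant stays on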

class FixedQParamsFakeQuantize(FakeQuantize):
    """Simulate quantize and dequantize with fixed quantization
    parameters in training time. Only per tensor quantization
    is supported.
    """

    def __init__(self, observer):
        super().__init__(observer=observer)
        assert type(self.activation_post_process) == FixedQParamsObserver, \
            "%s's observer must be a %s" % (self.__class__.__name__, FixedQParamsObserver.__name__)
        self._observer_ctr = observer
        self.scale = self.activation_post_process.scale
        self.zero_point = self.activation_post_process.zero_point
        assert _is_per_tensor(self.qscheme), 'Only per tensor quantization is supported in' + \
            ' FixedQParamsFakeQuantize module, got qscheme: ' + str(self.qscheme)

    @torch.jit.export
    def calculate_qparams(self):
        return self.scale, self.zero_point

    @torch.jit.export
    def extra_repr(self):
        return 'fake_quant_enabled={}, observer_enabled={}, scale={}, zero_point={}, ' \
               'dtype={}, quant_min={}, quant_max={}, qscheme={}'.format(
                   self.fake_quant_enabled, self.observer_enabled,
                   self.scale, self.zero_point, self.dtype,
                   self.activation_post_process.quant_min, self.activation_post_process.quant_max, self.qscheme)
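
# Example usage of FixedQParamsFakeQuantize (a sketch; pairing it with sigmoid is
# just one illustration of an op whose output range is known to be [0, 1]):
#
#     fq = FixedQParamsFakeQuantize(observer=default_fixed_qparams_range_0to1_observer)
#     y = fq(torch.sigmoid(torch.randn(4, 8)))
#     scale, zero_point = fq.calculate_qparams()  # always the fixed qparams from the observer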

class FusedMovingAvgObsFakeQuantize(FakeQuantize):
    r"""Fused module that is used to observe the input tensor (compute min/max), compute
    scale/zero_point and fake_quantize the tensor.

    This module uses a calculation similar to MovingAverageMinMaxObserver for the inputs
    to compute the min/max values in order to compute the scale/zero_point.
    The qscheme input in the observer is used to differentiate between symmetric/affine
    quantization schemes.

    The output of this module is given by

        x_out = (clamp(round(x/scale + zero_point), quant_min, quant_max) - zero_point) * scale

    Similar to :class:`~torch.ao.quantization.FakeQuantize`, this module accepts the same attributes as the
    base class.
    """

    def __init__(
        self,
        observer: Any = MovingAverageMinMaxObserver,
        quant_min: int = 0,
        quant_max: int = 255,
        **observer_kwargs: Any
    ) -> None:
        super().__init__(observer, quant_min, quant_max, **observer_kwargs)
        assert isinstance(self.activation_post_process, (MovingAverageMinMaxObserver, MovingAveragePerChannelMinMaxObserver)), \
            "Fused observer+fake_quant module only works with MovingAverageMinMaxObserver"
        self.register_buffer("fake_quant_enabled", torch.tensor([1], dtype=torch.long))
        self.register_buffer("observer_enabled", torch.tensor([1], dtype=torch.long))
        self.is_symmetric_quant = _is_symmetric_quant(self.activation_post_process.qscheme)

    @torch.jit.export
    def calculate_qparams(self) -> Tuple[torch.Tensor, torch.Tensor]:
        return self.activation_post_process.calculate_qparams()

    @torch.jit.export
    def extra_repr(self) -> str:
        return (
            "fake_quant_enabled={}, observer_enabled={}, scale={}, zero_point={}, "
            "dtype={}, quant_min={}, quant_max={}, qscheme={}, reduce_range={}".format(
                self.fake_quant_enabled,
                self.observer_enabled,
                self.scale,
                self.zero_point,
                self.dtype,
                self.activation_post_process.quant_min,
                self.activation_post_process.quant_max,
                self.qscheme,
                self.activation_post_process.reduce_range,
            )
        )

    def forward(self, X: torch.Tensor) -> torch.Tensor:
        return torch.fused_moving_avg_obs_fake_quant(
            X,
            self.observer_enabled,
            self.fake_quant_enabled,
            self.activation_post_process.min_val,
            self.activation_post_process.max_val,
            self.scale,
            self.zero_point,
            self.activation_post_process.averaging_constant,
            self.activation_post_process.quant_min,
            self.activation_post_process.quant_max,
            self.ch_axis,
            self.is_per_channel,
            self.is_symmetric_quant,
        )
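
# Example usage of FusedMovingAvgObsFakeQuantize (a sketch; with the defaults this
# emulates quint8 per-tensor affine quantization, and each call runs observation,
# qparam computation, and fake quantization in a single fused op):
#
#     fq = FusedMovingAvgObsFakeQuantize()
#     for _ in range(10):
#         y = fq(torch.randn(4, 8))               # running min/max are updated in place
#     scale, zero_point = fq.calculate_qparams()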

default_fake_quant = FakeQuantize.with_args(observer=MovingAverageMinMaxObserver, quant_min=0, quant_max=255,
                                            dtype=torch.quint8, qscheme=torch.per_tensor_affine, reduce_range=True)
"""
Default fake_quant for activations.
"""

default_weight_fake_quant = FakeQuantize.with_args(observer=MovingAverageMinMaxObserver, quant_min=-128, quant_max=127,
                                                   dtype=torch.qint8, qscheme=torch.per_tensor_symmetric, reduce_range=False)
"""
Default fake_quant for weights.
Observer is memoryless since averaging_constant is 1.
"""

default_dynamic_fake_quant = FakeQuantize.with_args(observer=MovingAverageMinMaxObserver, quant_min=0, quant_max=255,
                                                    dtype=torch.quint8, averaging_constant=1)
"""
Default dynamic fake_quant for activations.
"""

default_fixed_qparams_range_neg1to1_fake_quant = (
    FixedQParamsFakeQuantize.with_args(observer=default_fixed_qparams_range_neg1to1_observer)
)
default_fixed_qparams_range_0to1_fake_quant = (
    FixedQParamsFakeQuantize.with_args(observer=default_fixed_qparams_range_0to1_observer)
)
# TODO: the following 2 variables are kept for backwards compatibility; remove after a few releases
default_symmetric_fixed_qparams_fake_quant = default_fixed_qparams_range_neg1to1_fake_quant
default_affine_fixed_qparams_fake_quant = default_fixed_qparams_range_0to1_fake_quant

default_per_channel_weight_fake_quant = FakeQuantize.with_args(observer=MovingAveragePerChannelMinMaxObserver,
                                                               quant_min=-128,
                                                               quant_max=127,
                                                               dtype=torch.qint8,
                                                               qscheme=torch.per_channel_symmetric,
                                                               reduce_range=False,
                                                               ch_axis=0)
"""
Default fake_quant for per-channel weights.
Observer is memoryless since averaging_constant is 1.
"""

default_embedding_fake_quant = FakeQuantize.with_args(observer=MovingAveragePerChannelMinMaxObserver,
                                                      qscheme=torch.per_channel_affine_float_qparams,
                                                      dtype=torch.quint8,
                                                      quant_min=0,
                                                      quant_max=255,
                                                      ch_axis=0,
                                                      averaging_constant=1)
"""
Default fake_quant for embeddings.
Observer is memoryless since averaging_constant is 1.
"""

default_embedding_fake_quant_4bit = FakeQuantize.with_args(observer=MovingAveragePerChannelMinMaxObserver,
                                                           qscheme=torch.per_channel_affine_float_qparams,
                                                           ch_axis=0,
                                                           dtype=torch.quint4x2,
                                                           averaging_constant=1)

default_histogram_fake_quant = FakeQuantize.with_args(observer=HistogramObserver,
                                                      quant_min=0,
                                                      quant_max=255,
                                                      dtype=torch.quint8,
                                                      qscheme=torch.per_tensor_affine,
                                                      reduce_range=True)
"""
Fake_quant for activations using a histogram.
"""

default_fused_act_fake_quant = FusedMovingAvgObsFakeQuantize.with_args(observer=MovingAverageMinMaxObserver,
                                                                       quant_min=0,
                                                                       quant_max=255,
                                                                       dtype=torch.quint8)
"""
Fused version of `default_fake_quant`, with improved performance.
"""

default_fused_wt_fake_quant = FusedMovingAvgObsFakeQuantize.with_args(observer=MovingAverageMinMaxObserver,
                                                                      quant_min=-128,
                                                                      quant_max=127,
                                                                      dtype=torch.qint8,
                                                                      qscheme=torch.per_tensor_symmetric)
"""
Fused version of `default_weight_fake_quant`, with improved performance.
"""

default_fused_per_channel_wt_fake_quant = FusedMovingAvgObsFakeQuantize.with_args(observer=MovingAveragePerChannelMinMaxObserver,
                                                                                  quant_min=-128,
                                                                                  quant_max=127,
                                                                                  dtype=torch.qint8,
                                                                                  qscheme=torch.per_channel_symmetric)
"""
Fused version of `default_per_channel_weight_fake_quant`, with improved performance.
"""

fused_wt_fake_quant_range_neg_127_to_127 = FusedMovingAvgObsFakeQuantize.with_args(observer=MovingAverageMinMaxObserver,
                                                                                   quant_min=-127,
                                                                                   quant_max=127,
                                                                                   dtype=torch.qint8,
                                                                                   qscheme=torch.per_tensor_symmetric,
                                                                                   eps=2 ** -12)
"""
Fused version of `default_weight_fake_quant`, with the 8-bit values restricted to [-127, +127], excluding -128.
"""

fused_per_channel_wt_fake_quant_range_neg_127_to_127 = \
    FusedMovingAvgObsFakeQuantize.with_args(observer=MovingAveragePerChannelMinMaxObserver,
                                            quant_min=-127,
                                            quant_max=127,
                                            dtype=torch.qint8,
                                            qscheme=torch.per_channel_symmetric,
                                            eps=2 ** -12)
"""
Fused version of `default_per_channel_weight_fake_quant`, with the 8-bit values restricted to [-127, +127], excluding -128.
"""

def _is_fake_quant_script_module(mod):
    ''' Returns true if the given mod is an instance of the FakeQuantize script module.
    '''
    if isinstance(mod, torch.jit.RecursiveScriptModule):
        # qualified name looks like '__torch__.torch.ao.quantization.fake_quantize.___torch_mangle_2.FakeQuantize'
        suffix = mod._c.qualified_name.split('.', 1)[1]
        name = re.sub(r'\.___torch_mangle_\d+', '', suffix)
        return name == 'torch.ao.quantization.fake_quantize.FakeQuantize' or \
            name == 'torch.ao.quantization.fake_quantize.FusedMovingAvgObsFakeQuantize'
    return False

def disable_fake_quant(mod):
    """
    Disable fake quantization for this module, if applicable. Example usage::

      # model is any PyTorch model
      model.apply(torch.ao.quantization.disable_fake_quant)

    """
    if isinstance(mod, FakeQuantizeBase) or _is_fake_quant_script_module(mod):
        mod.disable_fake_quant()

def enable_fake_quant(mod):
    """
    Enable fake quantization for this module, if applicable. Example usage::

      # model is any PyTorch model
      model.apply(torch.ao.quantization.enable_fake_quant)

    """
    if isinstance(mod, FakeQuantizeBase) or _is_fake_quant_script_module(mod):
        mod.enable_fake_quant()

def disable_observer(mod):
    """
    Disable observation for this module, if applicable. Example usage::

      # model is any PyTorch model
      model.apply(torch.ao.quantization.disable_observer)

    """
    if isinstance(mod, FakeQuantizeBase) or _is_fake_quant_script_module(mod):
        mod.disable_observer()

def enable_observer(mod):
    """
    Enable observation for this module, if applicable. Example usage::

      # model is any PyTorch model
      model.apply(torch.ao.quantization.enable_observer)

    """
    if isinstance(mod, FakeQuantizeBase) or _is_fake_quant_script_module(mod):
        mod.enable_observer()
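
# A common QAT recipe built on the helpers above (a sketch; the epoch threshold
# is arbitrary and `model`, `num_epochs`, and `train_one_epoch` are assumed to
# be defined elsewhere):
#
#     for epoch in range(num_epochs):
#         if epoch == 3:
#             # freeze qparams once the observed statistics have settled
#             model.apply(torch.ao.quantization.disable_observer)
#         train_one_epoch(model)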