_geometry.py 67 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447
  1. import math
  2. import numbers
  3. import warnings
  4. from typing import Any, Callable, cast, Dict, List, Literal, Optional, Sequence, Tuple, Type, Union
  5. import PIL.Image
  6. import torch
  7. from torchvision import transforms as _transforms, tv_tensors
  8. from torchvision.ops.boxes import box_iou
  9. from torchvision.transforms.functional import _get_perspective_coeffs
  10. from torchvision.transforms.v2 import functional as F, InterpolationMode, Transform
  11. from torchvision.transforms.v2.functional._geometry import _check_interpolation
  12. from torchvision.transforms.v2.functional._utils import _FillType
  13. from ._transform import _RandomApplyTransform
  14. from ._utils import (
  15. _check_padding_arg,
  16. _check_padding_mode_arg,
  17. _check_sequence_input,
  18. _get_fill,
  19. _setup_angle,
  20. _setup_fill_arg,
  21. _setup_number_or_seq,
  22. _setup_size,
  23. get_bounding_boxes,
  24. has_all,
  25. has_any,
  26. is_pure_tensor,
  27. query_size,
  28. )
  29. class RandomHorizontalFlip(_RandomApplyTransform):
  30. """[BETA] Horizontally flip the input with a given probability.
  31. .. v2betastatus:: RandomHorizontalFlip transform
  32. If the input is a :class:`torch.Tensor` or a ``TVTensor`` (e.g. :class:`~torchvision.tv_tensors.Image`,
  33. :class:`~torchvision.tv_tensors.Video`, :class:`~torchvision.tv_tensors.BoundingBoxes` etc.)
  34. it can have arbitrary number of leading batch dimensions. For example,
  35. the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.
  36. Args:
  37. p (float, optional): probability of the input being flipped. Default value is 0.5
  38. """
  39. _v1_transform_cls = _transforms.RandomHorizontalFlip
  40. def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
  41. return self._call_kernel(F.horizontal_flip, inpt)
  42. class RandomVerticalFlip(_RandomApplyTransform):
  43. """[BETA] Vertically flip the input with a given probability.
  44. .. v2betastatus:: RandomVerticalFlip transform
  45. If the input is a :class:`torch.Tensor` or a ``TVTensor`` (e.g. :class:`~torchvision.tv_tensors.Image`,
  46. :class:`~torchvision.tv_tensors.Video`, :class:`~torchvision.tv_tensors.BoundingBoxes` etc.)
  47. it can have arbitrary number of leading batch dimensions. For example,
  48. the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.
  49. Args:
  50. p (float, optional): probability of the input being flipped. Default value is 0.5
  51. """
  52. _v1_transform_cls = _transforms.RandomVerticalFlip
  53. def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
  54. return self._call_kernel(F.vertical_flip, inpt)
  55. class Resize(Transform):
  56. """[BETA] Resize the input to the given size.
  57. .. v2betastatus:: Resize transform
  58. If the input is a :class:`torch.Tensor` or a ``TVTensor`` (e.g. :class:`~torchvision.tv_tensors.Image`,
  59. :class:`~torchvision.tv_tensors.Video`, :class:`~torchvision.tv_tensors.BoundingBoxes` etc.)
  60. it can have arbitrary number of leading batch dimensions. For example,
  61. the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.
  62. .. warning::
  63. The output image might be different depending on its type: when downsampling, the interpolation of PIL images
  64. and tensors is slightly different, because PIL applies antialiasing. This may lead to significant differences
  65. in the performance of a network. Therefore, it is preferable to train and serve a model with the same input
  66. types. See also below the ``antialias`` parameter, which can help making the output of PIL images and tensors
  67. closer.
  68. Args:
  69. size (sequence or int): Desired output size. If size is a sequence like
  70. (h, w), output size will be matched to this. If size is an int,
  71. smaller edge of the image will be matched to this number.
  72. i.e, if height > width, then image will be rescaled to
  73. (size * height / width, size).
  74. .. note::
  75. In torchscript mode size as single int is not supported, use a sequence of length 1: ``[size, ]``.
  76. interpolation (InterpolationMode, optional): Desired interpolation enum defined by
  77. :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``.
  78. If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.NEAREST_EXACT``,
  79. ``InterpolationMode.BILINEAR`` and ``InterpolationMode.BICUBIC`` are supported.
  80. The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well.
  81. max_size (int, optional): The maximum allowed for the longer edge of
  82. the resized image. If the longer edge of the image is greater
  83. than ``max_size`` after being resized according to ``size``,
  84. ``size`` will be overruled so that the longer edge is equal to
  85. ``max_size``.
  86. As a result, the smaller edge may be shorter than ``size``. This
  87. is only supported if ``size`` is an int (or a sequence of length
  88. 1 in torchscript mode).
  89. antialias (bool, optional): Whether to apply antialiasing.
  90. It only affects **tensors** with bilinear or bicubic modes and it is
  91. ignored otherwise: on PIL images, antialiasing is always applied on
  92. bilinear or bicubic modes; on other modes (for PIL images and
  93. tensors), antialiasing makes no sense and this parameter is ignored.
  94. Possible values are:
  95. - ``True``: will apply antialiasing for bilinear or bicubic modes.
  96. Other mode aren't affected. This is probably what you want to use.
  97. - ``False``: will not apply antialiasing for tensors on any mode. PIL
  98. images are still antialiased on bilinear or bicubic modes, because
  99. PIL doesn't support no antialias.
  100. - ``None``: equivalent to ``False`` for tensors and ``True`` for
  101. PIL images. This value exists for legacy reasons and you probably
  102. don't want to use it unless you really know what you are doing.
  103. The current default is ``None`` **but will change to** ``True`` **in
  104. v0.17** for the PIL and Tensor backends to be consistent.
  105. """
  106. _v1_transform_cls = _transforms.Resize
  107. def __init__(
  108. self,
  109. size: Union[int, Sequence[int]],
  110. interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR,
  111. max_size: Optional[int] = None,
  112. antialias: Optional[Union[str, bool]] = "warn",
  113. ) -> None:
  114. super().__init__()
  115. if isinstance(size, int):
  116. size = [size]
  117. elif isinstance(size, (list, tuple)) and len(size) in {1, 2}:
  118. size = list(size)
  119. else:
  120. raise ValueError(
  121. f"size can either be an integer or a list or tuple of one or two integers, " f"but got {size} instead."
  122. )
  123. self.size = size
  124. self.interpolation = _check_interpolation(interpolation)
  125. self.max_size = max_size
  126. self.antialias = antialias
  127. def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
  128. return self._call_kernel(
  129. F.resize,
  130. inpt,
  131. self.size,
  132. interpolation=self.interpolation,
  133. max_size=self.max_size,
  134. antialias=self.antialias,
  135. )
  136. class CenterCrop(Transform):
  137. """[BETA] Crop the input at the center.
  138. .. v2betastatus:: CenterCrop transform
  139. If the input is a :class:`torch.Tensor` or a ``TVTensor`` (e.g. :class:`~torchvision.tv_tensors.Image`,
  140. :class:`~torchvision.tv_tensors.Video`, :class:`~torchvision.tv_tensors.BoundingBoxes` etc.)
  141. it can have arbitrary number of leading batch dimensions. For example,
  142. the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.
  143. If image size is smaller than output size along any edge, image is padded with 0 and then center cropped.
  144. Args:
  145. size (sequence or int): Desired output size of the crop. If size is an
  146. int instead of sequence like (h, w), a square crop (size, size) is
  147. made. If provided a sequence of length 1, it will be interpreted as (size[0], size[0]).
  148. """
  149. _v1_transform_cls = _transforms.CenterCrop
  150. def __init__(self, size: Union[int, Sequence[int]]):
  151. super().__init__()
  152. self.size = _setup_size(size, error_msg="Please provide only two dimensions (h, w) for size.")
  153. def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
  154. return self._call_kernel(F.center_crop, inpt, output_size=self.size)
  155. class RandomResizedCrop(Transform):
  156. """[BETA] Crop a random portion of the input and resize it to a given size.
  157. .. v2betastatus:: RandomResizedCrop transform
  158. If the input is a :class:`torch.Tensor` or a ``TVTensor`` (e.g. :class:`~torchvision.tv_tensors.Image`,
  159. :class:`~torchvision.tv_tensors.Video`, :class:`~torchvision.tv_tensors.BoundingBoxes` etc.)
  160. it can have arbitrary number of leading batch dimensions. For example,
  161. the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.
  162. A crop of the original input is made: the crop has a random area (H * W)
  163. and a random aspect ratio. This crop is finally resized to the given
  164. size. This is popularly used to train the Inception networks.
  165. Args:
  166. size (int or sequence): expected output size of the crop, for each edge. If size is an
  167. int instead of sequence like (h, w), a square output size ``(size, size)`` is
  168. made. If provided a sequence of length 1, it will be interpreted as (size[0], size[0]).
  169. .. note::
  170. In torchscript mode size as single int is not supported, use a sequence of length 1: ``[size, ]``.
  171. scale (tuple of float, optional): Specifies the lower and upper bounds for the random area of the crop,
  172. before resizing. The scale is defined with respect to the area of the original image.
  173. ratio (tuple of float, optional): lower and upper bounds for the random aspect ratio of the crop, before
  174. resizing.
  175. interpolation (InterpolationMode, optional): Desired interpolation enum defined by
  176. :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``.
  177. If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.NEAREST_EXACT``,
  178. ``InterpolationMode.BILINEAR`` and ``InterpolationMode.BICUBIC`` are supported.
  179. The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well.
  180. antialias (bool, optional): Whether to apply antialiasing.
  181. It only affects **tensors** with bilinear or bicubic modes and it is
  182. ignored otherwise: on PIL images, antialiasing is always applied on
  183. bilinear or bicubic modes; on other modes (for PIL images and
  184. tensors), antialiasing makes no sense and this parameter is ignored.
  185. Possible values are:
  186. - ``True``: will apply antialiasing for bilinear or bicubic modes.
  187. Other mode aren't affected. This is probably what you want to use.
  188. - ``False``: will not apply antialiasing for tensors on any mode. PIL
  189. images are still antialiased on bilinear or bicubic modes, because
  190. PIL doesn't support no antialias.
  191. - ``None``: equivalent to ``False`` for tensors and ``True`` for
  192. PIL images. This value exists for legacy reasons and you probably
  193. don't want to use it unless you really know what you are doing.
  194. The current default is ``None`` **but will change to** ``True`` **in
  195. v0.17** for the PIL and Tensor backends to be consistent.
  196. """
  197. _v1_transform_cls = _transforms.RandomResizedCrop
  198. def __init__(
  199. self,
  200. size: Union[int, Sequence[int]],
  201. scale: Tuple[float, float] = (0.08, 1.0),
  202. ratio: Tuple[float, float] = (3.0 / 4.0, 4.0 / 3.0),
  203. interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR,
  204. antialias: Optional[Union[str, bool]] = "warn",
  205. ) -> None:
  206. super().__init__()
  207. self.size = _setup_size(size, error_msg="Please provide only two dimensions (h, w) for size.")
  208. if not isinstance(scale, Sequence):
  209. raise TypeError("Scale should be a sequence")
  210. scale = cast(Tuple[float, float], scale)
  211. if not isinstance(ratio, Sequence):
  212. raise TypeError("Ratio should be a sequence")
  213. ratio = cast(Tuple[float, float], ratio)
  214. if (scale[0] > scale[1]) or (ratio[0] > ratio[1]):
  215. warnings.warn("Scale and ratio should be of kind (min, max)")
  216. self.scale = scale
  217. self.ratio = ratio
  218. self.interpolation = _check_interpolation(interpolation)
  219. self.antialias = antialias
  220. self._log_ratio = torch.log(torch.tensor(self.ratio))
  221. def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]:
  222. height, width = query_size(flat_inputs)
  223. area = height * width
  224. log_ratio = self._log_ratio
  225. for _ in range(10):
  226. target_area = area * torch.empty(1).uniform_(self.scale[0], self.scale[1]).item()
  227. aspect_ratio = torch.exp(
  228. torch.empty(1).uniform_(
  229. log_ratio[0], # type: ignore[arg-type]
  230. log_ratio[1], # type: ignore[arg-type]
  231. )
  232. ).item()
  233. w = int(round(math.sqrt(target_area * aspect_ratio)))
  234. h = int(round(math.sqrt(target_area / aspect_ratio)))
  235. if 0 < w <= width and 0 < h <= height:
  236. i = torch.randint(0, height - h + 1, size=(1,)).item()
  237. j = torch.randint(0, width - w + 1, size=(1,)).item()
  238. break
  239. else:
  240. # Fallback to central crop
  241. in_ratio = float(width) / float(height)
  242. if in_ratio < min(self.ratio):
  243. w = width
  244. h = int(round(w / min(self.ratio)))
  245. elif in_ratio > max(self.ratio):
  246. h = height
  247. w = int(round(h * max(self.ratio)))
  248. else: # whole image
  249. w = width
  250. h = height
  251. i = (height - h) // 2
  252. j = (width - w) // 2
  253. return dict(top=i, left=j, height=h, width=w)
  254. def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
  255. return self._call_kernel(
  256. F.resized_crop, inpt, **params, size=self.size, interpolation=self.interpolation, antialias=self.antialias
  257. )
  258. class FiveCrop(Transform):
  259. """[BETA] Crop the image or video into four corners and the central crop.
  260. .. v2betastatus:: FiveCrop transform
  261. If the input is a :class:`torch.Tensor` or a :class:`~torchvision.tv_tensors.Image` or a
  262. :class:`~torchvision.tv_tensors.Video` it can have arbitrary number of leading batch dimensions.
  263. For example, the image can have ``[..., C, H, W]`` shape.
  264. .. Note::
  265. This transform returns a tuple of images and there may be a mismatch in the number of
  266. inputs and targets your Dataset returns. See below for an example of how to deal with
  267. this.
  268. Args:
  269. size (sequence or int): Desired output size of the crop. If size is an ``int``
  270. instead of sequence like (h, w), a square crop of size (size, size) is made.
  271. If provided a sequence of length 1, it will be interpreted as (size[0], size[0]).
  272. Example:
  273. >>> class BatchMultiCrop(transforms.Transform):
  274. ... def forward(self, sample: Tuple[Tuple[Union[tv_tensors.Image, tv_tensors.Video], ...], int]):
  275. ... images_or_videos, labels = sample
  276. ... batch_size = len(images_or_videos)
  277. ... image_or_video = images_or_videos[0]
  278. ... images_or_videos = tv_tensors.wrap(torch.stack(images_or_videos), like=image_or_video)
  279. ... labels = torch.full((batch_size,), label, device=images_or_videos.device)
  280. ... return images_or_videos, labels
  281. ...
  282. >>> image = tv_tensors.Image(torch.rand(3, 256, 256))
  283. >>> label = 3
  284. >>> transform = transforms.Compose([transforms.FiveCrop(224), BatchMultiCrop()])
  285. >>> images, labels = transform(image, label)
  286. >>> images.shape
  287. torch.Size([5, 3, 224, 224])
  288. >>> labels
  289. tensor([3, 3, 3, 3, 3])
  290. """
  291. _v1_transform_cls = _transforms.FiveCrop
  292. def __init__(self, size: Union[int, Sequence[int]]) -> None:
  293. super().__init__()
  294. self.size = _setup_size(size, error_msg="Please provide only two dimensions (h, w) for size.")
  295. def _call_kernel(self, functional: Callable, inpt: Any, *args: Any, **kwargs: Any) -> Any:
  296. if isinstance(inpt, (tv_tensors.BoundingBoxes, tv_tensors.Mask)):
  297. warnings.warn(
  298. f"{type(self).__name__}() is currently passing through inputs of type "
  299. f"tv_tensors.{type(inpt).__name__}. This will likely change in the future."
  300. )
  301. return super()._call_kernel(functional, inpt, *args, **kwargs)
  302. def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
  303. return self._call_kernel(F.five_crop, inpt, self.size)
  304. def _check_inputs(self, flat_inputs: List[Any]) -> None:
  305. if has_any(flat_inputs, tv_tensors.BoundingBoxes, tv_tensors.Mask):
  306. raise TypeError(f"BoundingBoxes'es and Mask's are not supported by {type(self).__name__}()")
  307. class TenCrop(Transform):
  308. """[BETA] Crop the image or video into four corners and the central crop plus the flipped version of
  309. these (horizontal flipping is used by default).
  310. .. v2betastatus:: TenCrop transform
  311. If the input is a :class:`torch.Tensor` or a :class:`~torchvision.tv_tensors.Image` or a
  312. :class:`~torchvision.tv_tensors.Video` it can have arbitrary number of leading batch dimensions.
  313. For example, the image can have ``[..., C, H, W]`` shape.
  314. See :class:`~torchvision.transforms.v2.FiveCrop` for an example.
  315. .. Note::
  316. This transform returns a tuple of images and there may be a mismatch in the number of
  317. inputs and targets your Dataset returns. See below for an example of how to deal with
  318. this.
  319. Args:
  320. size (sequence or int): Desired output size of the crop. If size is an
  321. int instead of sequence like (h, w), a square crop (size, size) is
  322. made. If provided a sequence of length 1, it will be interpreted as (size[0], size[0]).
  323. vertical_flip (bool, optional): Use vertical flipping instead of horizontal
  324. """
  325. _v1_transform_cls = _transforms.TenCrop
  326. def __init__(self, size: Union[int, Sequence[int]], vertical_flip: bool = False) -> None:
  327. super().__init__()
  328. self.size = _setup_size(size, error_msg="Please provide only two dimensions (h, w) for size.")
  329. self.vertical_flip = vertical_flip
  330. def _call_kernel(self, functional: Callable, inpt: Any, *args: Any, **kwargs: Any) -> Any:
  331. if isinstance(inpt, (tv_tensors.BoundingBoxes, tv_tensors.Mask)):
  332. warnings.warn(
  333. f"{type(self).__name__}() is currently passing through inputs of type "
  334. f"tv_tensors.{type(inpt).__name__}. This will likely change in the future."
  335. )
  336. return super()._call_kernel(functional, inpt, *args, **kwargs)
  337. def _check_inputs(self, flat_inputs: List[Any]) -> None:
  338. if has_any(flat_inputs, tv_tensors.BoundingBoxes, tv_tensors.Mask):
  339. raise TypeError(f"BoundingBoxes'es and Mask's are not supported by {type(self).__name__}()")
  340. def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
  341. return self._call_kernel(F.ten_crop, inpt, self.size, vertical_flip=self.vertical_flip)
  342. class Pad(Transform):
  343. """[BETA] Pad the input on all sides with the given "pad" value.
  344. .. v2betastatus:: Pad transform
  345. If the input is a :class:`torch.Tensor` or a ``TVTensor`` (e.g. :class:`~torchvision.tv_tensors.Image`,
  346. :class:`~torchvision.tv_tensors.Video`, :class:`~torchvision.tv_tensors.BoundingBoxes` etc.)
  347. it can have arbitrary number of leading batch dimensions. For example,
  348. the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.
  349. Args:
  350. padding (int or sequence): Padding on each border. If a single int is provided this
  351. is used to pad all borders. If sequence of length 2 is provided this is the padding
  352. on left/right and top/bottom respectively. If a sequence of length 4 is provided
  353. this is the padding for the left, top, right and bottom borders respectively.
  354. .. note::
  355. In torchscript mode padding as single int is not supported, use a sequence of
  356. length 1: ``[padding, ]``.
  357. fill (number or tuple or dict, optional): Pixel fill value used when the ``padding_mode`` is constant.
  358. Default is 0. If a tuple of length 3, it is used to fill R, G, B channels respectively.
  359. Fill value can be also a dictionary mapping data type to the fill value, e.g.
  360. ``fill={tv_tensors.Image: 127, tv_tensors.Mask: 0}`` where ``Image`` will be filled with 127 and
  361. ``Mask`` will be filled with 0.
  362. padding_mode (str, optional): Type of padding. Should be: constant, edge, reflect or symmetric.
  363. Default is "constant".
  364. - constant: pads with a constant value, this value is specified with fill
  365. - edge: pads with the last value at the edge of the image.
  366. - reflect: pads with reflection of image without repeating the last value on the edge.
  367. For example, padding [1, 2, 3, 4] with 2 elements on both sides in reflect mode
  368. will result in [3, 2, 1, 2, 3, 4, 3, 2]
  369. - symmetric: pads with reflection of image repeating the last value on the edge.
  370. For example, padding [1, 2, 3, 4] with 2 elements on both sides in symmetric mode
  371. will result in [2, 1, 1, 2, 3, 4, 4, 3]
  372. """
  373. _v1_transform_cls = _transforms.Pad
  374. def _extract_params_for_v1_transform(self) -> Dict[str, Any]:
  375. params = super()._extract_params_for_v1_transform()
  376. if not (params["fill"] is None or isinstance(params["fill"], (int, float))):
  377. raise ValueError(f"{type(self).__name__}() can only be scripted for a scalar `fill`, but got {self.fill}.")
  378. return params
  379. def __init__(
  380. self,
  381. padding: Union[int, Sequence[int]],
  382. fill: Union[_FillType, Dict[Union[Type, str], _FillType]] = 0,
  383. padding_mode: Literal["constant", "edge", "reflect", "symmetric"] = "constant",
  384. ) -> None:
  385. super().__init__()
  386. _check_padding_arg(padding)
  387. _check_padding_mode_arg(padding_mode)
  388. # This cast does Sequence[int] -> List[int] and is required to make mypy happy
  389. if not isinstance(padding, int):
  390. padding = list(padding)
  391. self.padding = padding
  392. self.fill = fill
  393. self._fill = _setup_fill_arg(fill)
  394. self.padding_mode = padding_mode
  395. def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
  396. fill = _get_fill(self._fill, type(inpt))
  397. return self._call_kernel(F.pad, inpt, padding=self.padding, fill=fill, padding_mode=self.padding_mode) # type: ignore[arg-type]
  398. class RandomZoomOut(_RandomApplyTransform):
  399. """[BETA] "Zoom out" transformation from
  400. `"SSD: Single Shot MultiBox Detector" <https://arxiv.org/abs/1512.02325>`_.
  401. .. v2betastatus:: RandomZoomOut transform
  402. This transformation randomly pads images, videos, bounding boxes and masks creating a zoom out effect.
  403. Output spatial size is randomly sampled from original size up to a maximum size configured
  404. with ``side_range`` parameter:
  405. .. code-block:: python
  406. r = uniform_sample(side_range[0], side_range[1])
  407. output_width = input_width * r
  408. output_height = input_height * r
  409. If the input is a :class:`torch.Tensor` or a ``TVTensor`` (e.g. :class:`~torchvision.tv_tensors.Image`,
  410. :class:`~torchvision.tv_tensors.Video`, :class:`~torchvision.tv_tensors.BoundingBoxes` etc.)
  411. it can have arbitrary number of leading batch dimensions. For example,
  412. the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.
  413. Args:
  414. fill (number or tuple or dict, optional): Pixel fill value used when the ``padding_mode`` is constant.
  415. Default is 0. If a tuple of length 3, it is used to fill R, G, B channels respectively.
  416. Fill value can be also a dictionary mapping data type to the fill value, e.g.
  417. ``fill={tv_tensors.Image: 127, tv_tensors.Mask: 0}`` where ``Image`` will be filled with 127 and
  418. ``Mask`` will be filled with 0.
  419. side_range (sequence of floats, optional): tuple of two floats defines minimum and maximum factors to
  420. scale the input size.
  421. p (float, optional): probability that the zoom operation will be performed.
  422. """
  423. def __init__(
  424. self,
  425. fill: Union[_FillType, Dict[Union[Type, str], _FillType]] = 0,
  426. side_range: Sequence[float] = (1.0, 4.0),
  427. p: float = 0.5,
  428. ) -> None:
  429. super().__init__(p=p)
  430. self.fill = fill
  431. self._fill = _setup_fill_arg(fill)
  432. _check_sequence_input(side_range, "side_range", req_sizes=(2,))
  433. self.side_range = side_range
  434. if side_range[0] < 1.0 or side_range[0] > side_range[1]:
  435. raise ValueError(f"Invalid canvas side range provided {side_range}.")
  436. def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]:
  437. orig_h, orig_w = query_size(flat_inputs)
  438. r = self.side_range[0] + torch.rand(1) * (self.side_range[1] - self.side_range[0])
  439. canvas_width = int(orig_w * r)
  440. canvas_height = int(orig_h * r)
  441. r = torch.rand(2)
  442. left = int((canvas_width - orig_w) * r[0])
  443. top = int((canvas_height - orig_h) * r[1])
  444. right = canvas_width - (left + orig_w)
  445. bottom = canvas_height - (top + orig_h)
  446. padding = [left, top, right, bottom]
  447. return dict(padding=padding)
  448. def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
  449. fill = _get_fill(self._fill, type(inpt))
  450. return self._call_kernel(F.pad, inpt, **params, fill=fill)
  451. class RandomRotation(Transform):
  452. """[BETA] Rotate the input by angle.
  453. .. v2betastatus:: RandomRotation transform
  454. If the input is a :class:`torch.Tensor` or a ``TVTensor`` (e.g. :class:`~torchvision.tv_tensors.Image`,
  455. :class:`~torchvision.tv_tensors.Video`, :class:`~torchvision.tv_tensors.BoundingBoxes` etc.)
  456. it can have arbitrary number of leading batch dimensions. For example,
  457. the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.
  458. Args:
  459. degrees (sequence or number): Range of degrees to select from.
  460. If degrees is a number instead of sequence like (min, max), the range of degrees
  461. will be (-degrees, +degrees).
  462. interpolation (InterpolationMode, optional): Desired interpolation enum defined by
  463. :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``.
  464. If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported.
  465. The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well.
  466. expand (bool, optional): Optional expansion flag.
  467. If true, expands the output to make it large enough to hold the entire rotated image.
  468. If false or omitted, make the output image the same size as the input image.
  469. Note that the expand flag assumes rotation around the center (see note below) and no translation.
  470. center (sequence, optional): Optional center of rotation, (x, y). Origin is the upper left corner.
  471. Default is the center of the image.
  472. .. note::
  473. In theory, setting ``center`` has no effect if ``expand=True``, since the image center will become the
  474. center of rotation. In practice however, due to numerical precision, this can lead to off-by-one
  475. differences of the resulting image size compared to using the image center in the first place. Thus, when
  476. setting ``expand=True``, it's best to leave ``center=None`` (default).
  477. fill (number or tuple or dict, optional): Pixel fill value used when the ``padding_mode`` is constant.
  478. Default is 0. If a tuple of length 3, it is used to fill R, G, B channels respectively.
  479. Fill value can be also a dictionary mapping data type to the fill value, e.g.
  480. ``fill={tv_tensors.Image: 127, tv_tensors.Mask: 0}`` where ``Image`` will be filled with 127 and
  481. ``Mask`` will be filled with 0.
  482. .. _filters: https://pillow.readthedocs.io/en/latest/handbook/concepts.html#filters
  483. """
  484. _v1_transform_cls = _transforms.RandomRotation
  485. def __init__(
  486. self,
  487. degrees: Union[numbers.Number, Sequence],
  488. interpolation: Union[InterpolationMode, int] = InterpolationMode.NEAREST,
  489. expand: bool = False,
  490. center: Optional[List[float]] = None,
  491. fill: Union[_FillType, Dict[Union[Type, str], _FillType]] = 0,
  492. ) -> None:
  493. super().__init__()
  494. self.degrees = _setup_angle(degrees, name="degrees", req_sizes=(2,))
  495. self.interpolation = _check_interpolation(interpolation)
  496. self.expand = expand
  497. self.fill = fill
  498. self._fill = _setup_fill_arg(fill)
  499. if center is not None:
  500. _check_sequence_input(center, "center", req_sizes=(2,))
  501. self.center = center
  502. def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]:
  503. angle = torch.empty(1).uniform_(self.degrees[0], self.degrees[1]).item()
  504. return dict(angle=angle)
  505. def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
  506. fill = _get_fill(self._fill, type(inpt))
  507. return self._call_kernel(
  508. F.rotate,
  509. inpt,
  510. **params,
  511. interpolation=self.interpolation,
  512. expand=self.expand,
  513. center=self.center,
  514. fill=fill,
  515. )
  516. class RandomAffine(Transform):
  517. """[BETA] Random affine transformation the input keeping center invariant.
  518. .. v2betastatus:: RandomAffine transform
  519. If the input is a :class:`torch.Tensor` or a ``TVTensor`` (e.g. :class:`~torchvision.tv_tensors.Image`,
  520. :class:`~torchvision.tv_tensors.Video`, :class:`~torchvision.tv_tensors.BoundingBoxes` etc.)
  521. it can have arbitrary number of leading batch dimensions. For example,
  522. the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.
  523. Args:
  524. degrees (sequence or number): Range of degrees to select from.
  525. If degrees is a number instead of sequence like (min, max), the range of degrees
  526. will be (-degrees, +degrees). Set to 0 to deactivate rotations.
  527. translate (tuple, optional): tuple of maximum absolute fraction for horizontal
  528. and vertical translations. For example translate=(a, b), then horizontal shift
  529. is randomly sampled in the range -img_width * a < dx < img_width * a and vertical shift is
  530. randomly sampled in the range -img_height * b < dy < img_height * b. Will not translate by default.
  531. scale (tuple, optional): scaling factor interval, e.g (a, b), then scale is
  532. randomly sampled from the range a <= scale <= b. Will keep original scale by default.
  533. shear (sequence or number, optional): Range of degrees to select from.
  534. If shear is a number, a shear parallel to the x-axis in the range (-shear, +shear)
  535. will be applied. Else if shear is a sequence of 2 values a shear parallel to the x-axis in the
  536. range (shear[0], shear[1]) will be applied. Else if shear is a sequence of 4 values,
  537. an x-axis shear in (shear[0], shear[1]) and y-axis shear in (shear[2], shear[3]) will be applied.
  538. Will not apply shear by default.
  539. interpolation (InterpolationMode, optional): Desired interpolation enum defined by
  540. :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.NEAREST``.
  541. If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported.
  542. The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well.
  543. fill (number or tuple or dict, optional): Pixel fill value used when the ``padding_mode`` is constant.
  544. Default is 0. If a tuple of length 3, it is used to fill R, G, B channels respectively.
  545. Fill value can be also a dictionary mapping data type to the fill value, e.g.
  546. ``fill={tv_tensors.Image: 127, tv_tensors.Mask: 0}`` where ``Image`` will be filled with 127 and
  547. ``Mask`` will be filled with 0.
  548. center (sequence, optional): Optional center of rotation, (x, y). Origin is the upper left corner.
  549. Default is the center of the image.
  550. .. _filters: https://pillow.readthedocs.io/en/latest/handbook/concepts.html#filters
  551. """
  552. _v1_transform_cls = _transforms.RandomAffine
  553. def __init__(
  554. self,
  555. degrees: Union[numbers.Number, Sequence],
  556. translate: Optional[Sequence[float]] = None,
  557. scale: Optional[Sequence[float]] = None,
  558. shear: Optional[Union[int, float, Sequence[float]]] = None,
  559. interpolation: Union[InterpolationMode, int] = InterpolationMode.NEAREST,
  560. fill: Union[_FillType, Dict[Union[Type, str], _FillType]] = 0,
  561. center: Optional[List[float]] = None,
  562. ) -> None:
  563. super().__init__()
  564. self.degrees = _setup_angle(degrees, name="degrees", req_sizes=(2,))
  565. if translate is not None:
  566. _check_sequence_input(translate, "translate", req_sizes=(2,))
  567. for t in translate:
  568. if not (0.0 <= t <= 1.0):
  569. raise ValueError("translation values should be between 0 and 1")
  570. self.translate = translate
  571. if scale is not None:
  572. _check_sequence_input(scale, "scale", req_sizes=(2,))
  573. for s in scale:
  574. if s <= 0:
  575. raise ValueError("scale values should be positive")
  576. self.scale = scale
  577. if shear is not None:
  578. self.shear = _setup_angle(shear, name="shear", req_sizes=(2, 4))
  579. else:
  580. self.shear = shear
  581. self.interpolation = _check_interpolation(interpolation)
  582. self.fill = fill
  583. self._fill = _setup_fill_arg(fill)
  584. if center is not None:
  585. _check_sequence_input(center, "center", req_sizes=(2,))
  586. self.center = center
  587. def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]:
  588. height, width = query_size(flat_inputs)
  589. angle = torch.empty(1).uniform_(self.degrees[0], self.degrees[1]).item()
  590. if self.translate is not None:
  591. max_dx = float(self.translate[0] * width)
  592. max_dy = float(self.translate[1] * height)
  593. tx = int(round(torch.empty(1).uniform_(-max_dx, max_dx).item()))
  594. ty = int(round(torch.empty(1).uniform_(-max_dy, max_dy).item()))
  595. translate = (tx, ty)
  596. else:
  597. translate = (0, 0)
  598. if self.scale is not None:
  599. scale = torch.empty(1).uniform_(self.scale[0], self.scale[1]).item()
  600. else:
  601. scale = 1.0
  602. shear_x = shear_y = 0.0
  603. if self.shear is not None:
  604. shear_x = torch.empty(1).uniform_(self.shear[0], self.shear[1]).item()
  605. if len(self.shear) == 4:
  606. shear_y = torch.empty(1).uniform_(self.shear[2], self.shear[3]).item()
  607. shear = (shear_x, shear_y)
  608. return dict(angle=angle, translate=translate, scale=scale, shear=shear)
  609. def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
  610. fill = _get_fill(self._fill, type(inpt))
  611. return self._call_kernel(
  612. F.affine,
  613. inpt,
  614. **params,
  615. interpolation=self.interpolation,
  616. fill=fill,
  617. center=self.center,
  618. )
  619. class RandomCrop(Transform):
  620. """[BETA] Crop the input at a random location.
  621. .. v2betastatus:: RandomCrop transform
  622. If the input is a :class:`torch.Tensor` or a ``TVTensor`` (e.g. :class:`~torchvision.tv_tensors.Image`,
  623. :class:`~torchvision.tv_tensors.Video`, :class:`~torchvision.tv_tensors.BoundingBoxes` etc.)
  624. it can have arbitrary number of leading batch dimensions. For example,
  625. the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.
  626. Args:
  627. size (sequence or int): Desired output size of the crop. If size is an
  628. int instead of sequence like (h, w), a square crop (size, size) is
  629. made. If provided a sequence of length 1, it will be interpreted as (size[0], size[0]).
  630. padding (int or sequence, optional): Optional padding on each border
  631. of the image. Default is None. If a single int is provided this
  632. is used to pad all borders. If sequence of length 2 is provided this is the padding
  633. on left/right and top/bottom respectively. If a sequence of length 4 is provided
  634. this is the padding for the left, top, right and bottom borders respectively.
  635. .. note::
  636. In torchscript mode padding as single int is not supported, use a sequence of
  637. length 1: ``[padding, ]``.
  638. pad_if_needed (boolean, optional): It will pad the image if smaller than the
  639. desired size to avoid raising an exception. Since cropping is done
  640. after padding, the padding seems to be done at a random offset.
  641. fill (number or tuple or dict, optional): Pixel fill value used when the ``padding_mode`` is constant.
  642. Default is 0. If a tuple of length 3, it is used to fill R, G, B channels respectively.
  643. Fill value can be also a dictionary mapping data type to the fill value, e.g.
  644. ``fill={tv_tensors.Image: 127, tv_tensors.Mask: 0}`` where ``Image`` will be filled with 127 and
  645. ``Mask`` will be filled with 0.
  646. padding_mode (str, optional): Type of padding. Should be: constant, edge, reflect or symmetric.
  647. Default is constant.
  648. - constant: pads with a constant value, this value is specified with fill
  649. - edge: pads with the last value at the edge of the image.
  650. - reflect: pads with reflection of image without repeating the last value on the edge.
  651. For example, padding [1, 2, 3, 4] with 2 elements on both sides in reflect mode
  652. will result in [3, 2, 1, 2, 3, 4, 3, 2]
  653. - symmetric: pads with reflection of image repeating the last value on the edge.
  654. For example, padding [1, 2, 3, 4] with 2 elements on both sides in symmetric mode
  655. will result in [2, 1, 1, 2, 3, 4, 4, 3]
  656. """
  657. _v1_transform_cls = _transforms.RandomCrop
  658. def _extract_params_for_v1_transform(self) -> Dict[str, Any]:
  659. params = super()._extract_params_for_v1_transform()
  660. if not (params["fill"] is None or isinstance(params["fill"], (int, float))):
  661. raise ValueError(f"{type(self).__name__}() can only be scripted for a scalar `fill`, but got {self.fill}.")
  662. padding = self.padding
  663. if padding is not None:
  664. pad_left, pad_right, pad_top, pad_bottom = padding
  665. padding = [pad_left, pad_top, pad_right, pad_bottom]
  666. params["padding"] = padding
  667. return params
  668. def __init__(
  669. self,
  670. size: Union[int, Sequence[int]],
  671. padding: Optional[Union[int, Sequence[int]]] = None,
  672. pad_if_needed: bool = False,
  673. fill: Union[_FillType, Dict[Union[Type, str], _FillType]] = 0,
  674. padding_mode: Literal["constant", "edge", "reflect", "symmetric"] = "constant",
  675. ) -> None:
  676. super().__init__()
  677. self.size = _setup_size(size, error_msg="Please provide only two dimensions (h, w) for size.")
  678. if pad_if_needed or padding is not None:
  679. if padding is not None:
  680. _check_padding_arg(padding)
  681. _check_padding_mode_arg(padding_mode)
  682. self.padding = F._geometry._parse_pad_padding(padding) if padding else None # type: ignore[arg-type]
  683. self.pad_if_needed = pad_if_needed
  684. self.fill = fill
  685. self._fill = _setup_fill_arg(fill)
  686. self.padding_mode = padding_mode
  687. def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]:
  688. padded_height, padded_width = query_size(flat_inputs)
  689. if self.padding is not None:
  690. pad_left, pad_right, pad_top, pad_bottom = self.padding
  691. padded_height += pad_top + pad_bottom
  692. padded_width += pad_left + pad_right
  693. else:
  694. pad_left = pad_right = pad_top = pad_bottom = 0
  695. cropped_height, cropped_width = self.size
  696. if self.pad_if_needed:
  697. if padded_height < cropped_height:
  698. diff = cropped_height - padded_height
  699. pad_top += diff
  700. pad_bottom += diff
  701. padded_height += 2 * diff
  702. if padded_width < cropped_width:
  703. diff = cropped_width - padded_width
  704. pad_left += diff
  705. pad_right += diff
  706. padded_width += 2 * diff
  707. if padded_height < cropped_height or padded_width < cropped_width:
  708. raise ValueError(
  709. f"Required crop size {(cropped_height, cropped_width)} is larger than "
  710. f"{'padded ' if self.padding is not None else ''}input image size {(padded_height, padded_width)}."
  711. )
  712. # We need a different order here than we have in self.padding since this padding will be parsed again in `F.pad`
  713. padding = [pad_left, pad_top, pad_right, pad_bottom]
  714. needs_pad = any(padding)
  715. needs_vert_crop, top = (
  716. (True, int(torch.randint(0, padded_height - cropped_height + 1, size=())))
  717. if padded_height > cropped_height
  718. else (False, 0)
  719. )
  720. needs_horz_crop, left = (
  721. (True, int(torch.randint(0, padded_width - cropped_width + 1, size=())))
  722. if padded_width > cropped_width
  723. else (False, 0)
  724. )
  725. return dict(
  726. needs_crop=needs_vert_crop or needs_horz_crop,
  727. top=top,
  728. left=left,
  729. height=cropped_height,
  730. width=cropped_width,
  731. needs_pad=needs_pad,
  732. padding=padding,
  733. )
  734. def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
  735. if params["needs_pad"]:
  736. fill = _get_fill(self._fill, type(inpt))
  737. inpt = self._call_kernel(F.pad, inpt, padding=params["padding"], fill=fill, padding_mode=self.padding_mode)
  738. if params["needs_crop"]:
  739. inpt = self._call_kernel(
  740. F.crop, inpt, top=params["top"], left=params["left"], height=params["height"], width=params["width"]
  741. )
  742. return inpt
  743. class RandomPerspective(_RandomApplyTransform):
  744. """[BETA] Perform a random perspective transformation of the input with a given probability.
  745. .. v2betastatus:: RandomPerspective transform
  746. If the input is a :class:`torch.Tensor` or a ``TVTensor`` (e.g. :class:`~torchvision.tv_tensors.Image`,
  747. :class:`~torchvision.tv_tensors.Video`, :class:`~torchvision.tv_tensors.BoundingBoxes` etc.)
  748. it can have arbitrary number of leading batch dimensions. For example,
  749. the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.
  750. Args:
  751. distortion_scale (float, optional): argument to control the degree of distortion and ranges from 0 to 1.
  752. Default is 0.5.
  753. p (float, optional): probability of the input being transformed. Default is 0.5.
  754. interpolation (InterpolationMode, optional): Desired interpolation enum defined by
  755. :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``.
  756. If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported.
  757. The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well.
  758. fill (number or tuple or dict, optional): Pixel fill value used when the ``padding_mode`` is constant.
  759. Default is 0. If a tuple of length 3, it is used to fill R, G, B channels respectively.
  760. Fill value can be also a dictionary mapping data type to the fill value, e.g.
  761. ``fill={tv_tensors.Image: 127, tv_tensors.Mask: 0}`` where ``Image`` will be filled with 127 and
  762. ``Mask`` will be filled with 0.
  763. """
  764. _v1_transform_cls = _transforms.RandomPerspective
  765. def __init__(
  766. self,
  767. distortion_scale: float = 0.5,
  768. p: float = 0.5,
  769. interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR,
  770. fill: Union[_FillType, Dict[Union[Type, str], _FillType]] = 0,
  771. ) -> None:
  772. super().__init__(p=p)
  773. if not (0 <= distortion_scale <= 1):
  774. raise ValueError("Argument distortion_scale value should be between 0 and 1")
  775. self.distortion_scale = distortion_scale
  776. self.interpolation = _check_interpolation(interpolation)
  777. self.fill = fill
  778. self._fill = _setup_fill_arg(fill)
  779. def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]:
  780. height, width = query_size(flat_inputs)
  781. distortion_scale = self.distortion_scale
  782. half_height = height // 2
  783. half_width = width // 2
  784. bound_height = int(distortion_scale * half_height) + 1
  785. bound_width = int(distortion_scale * half_width) + 1
  786. topleft = [
  787. int(torch.randint(0, bound_width, size=(1,))),
  788. int(torch.randint(0, bound_height, size=(1,))),
  789. ]
  790. topright = [
  791. int(torch.randint(width - bound_width, width, size=(1,))),
  792. int(torch.randint(0, bound_height, size=(1,))),
  793. ]
  794. botright = [
  795. int(torch.randint(width - bound_width, width, size=(1,))),
  796. int(torch.randint(height - bound_height, height, size=(1,))),
  797. ]
  798. botleft = [
  799. int(torch.randint(0, bound_width, size=(1,))),
  800. int(torch.randint(height - bound_height, height, size=(1,))),
  801. ]
  802. startpoints = [[0, 0], [width - 1, 0], [width - 1, height - 1], [0, height - 1]]
  803. endpoints = [topleft, topright, botright, botleft]
  804. perspective_coeffs = _get_perspective_coeffs(startpoints, endpoints)
  805. return dict(coefficients=perspective_coeffs)
  806. def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
  807. fill = _get_fill(self._fill, type(inpt))
  808. return self._call_kernel(
  809. F.perspective,
  810. inpt,
  811. None,
  812. None,
  813. fill=fill,
  814. interpolation=self.interpolation,
  815. **params,
  816. )
  817. class ElasticTransform(Transform):
  818. """[BETA] Transform the input with elastic transformations.
  819. .. v2betastatus:: RandomPerspective transform
  820. If the input is a :class:`torch.Tensor` or a ``TVTensor`` (e.g. :class:`~torchvision.tv_tensors.Image`,
  821. :class:`~torchvision.tv_tensors.Video`, :class:`~torchvision.tv_tensors.BoundingBoxes` etc.)
  822. it can have arbitrary number of leading batch dimensions. For example,
  823. the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.
  824. Given alpha and sigma, it will generate displacement
  825. vectors for all pixels based on random offsets. Alpha controls the strength
  826. and sigma controls the smoothness of the displacements.
  827. The displacements are added to an identity grid and the resulting grid is
  828. used to transform the input.
  829. .. note::
  830. Implementation to transform bounding boxes is approximative (not exact).
  831. We construct an approximation of the inverse grid as ``inverse_grid = identity - displacement``.
  832. This is not an exact inverse of the grid used to transform images, i.e. ``grid = identity + displacement``.
  833. Our assumption is that ``displacement * displacement`` is small and can be ignored.
  834. Large displacements would lead to large errors in the approximation.
  835. Applications:
  836. Randomly transforms the morphology of objects in images and produces a
  837. see-through-water-like effect.
  838. Args:
  839. alpha (float or sequence of floats, optional): Magnitude of displacements. Default is 50.0.
  840. sigma (float or sequence of floats, optional): Smoothness of displacements. Default is 5.0.
  841. interpolation (InterpolationMode, optional): Desired interpolation enum defined by
  842. :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``.
  843. If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.BILINEAR`` are supported.
  844. The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well.
  845. fill (number or tuple or dict, optional): Pixel fill value used when the ``padding_mode`` is constant.
  846. Default is 0. If a tuple of length 3, it is used to fill R, G, B channels respectively.
  847. Fill value can be also a dictionary mapping data type to the fill value, e.g.
  848. ``fill={tv_tensors.Image: 127, tv_tensors.Mask: 0}`` where ``Image`` will be filled with 127 and
  849. ``Mask`` will be filled with 0.
  850. """
  851. _v1_transform_cls = _transforms.ElasticTransform
  852. def __init__(
  853. self,
  854. alpha: Union[float, Sequence[float]] = 50.0,
  855. sigma: Union[float, Sequence[float]] = 5.0,
  856. interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR,
  857. fill: Union[_FillType, Dict[Union[Type, str], _FillType]] = 0,
  858. ) -> None:
  859. super().__init__()
  860. self.alpha = _setup_number_or_seq(alpha, "alpha")
  861. self.sigma = _setup_number_or_seq(sigma, "sigma")
  862. self.interpolation = _check_interpolation(interpolation)
  863. self.fill = fill
  864. self._fill = _setup_fill_arg(fill)
  865. def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]:
  866. size = list(query_size(flat_inputs))
  867. dx = torch.rand([1, 1] + size) * 2 - 1
  868. if self.sigma[0] > 0.0:
  869. kx = int(8 * self.sigma[0] + 1)
  870. # if kernel size is even we have to make it odd
  871. if kx % 2 == 0:
  872. kx += 1
  873. dx = self._call_kernel(F.gaussian_blur, dx, [kx, kx], list(self.sigma))
  874. dx = dx * self.alpha[0] / size[0]
  875. dy = torch.rand([1, 1] + size) * 2 - 1
  876. if self.sigma[1] > 0.0:
  877. ky = int(8 * self.sigma[1] + 1)
  878. # if kernel size is even we have to make it odd
  879. if ky % 2 == 0:
  880. ky += 1
  881. dy = self._call_kernel(F.gaussian_blur, dy, [ky, ky], list(self.sigma))
  882. dy = dy * self.alpha[1] / size[1]
  883. displacement = torch.concat([dx, dy], 1).permute([0, 2, 3, 1]) # 1 x H x W x 2
  884. return dict(displacement=displacement)
  885. def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
  886. fill = _get_fill(self._fill, type(inpt))
  887. return self._call_kernel(
  888. F.elastic,
  889. inpt,
  890. **params,
  891. fill=fill,
  892. interpolation=self.interpolation,
  893. )
  894. class RandomIoUCrop(Transform):
  895. """[BETA] Random IoU crop transformation from
  896. `"SSD: Single Shot MultiBox Detector" <https://arxiv.org/abs/1512.02325>`_.
  897. .. v2betastatus:: RandomIoUCrop transform
  898. This transformation requires an image or video data and ``tv_tensors.BoundingBoxes`` in the input.
  899. .. warning::
  900. In order to properly remove the bounding boxes below the IoU threshold, `RandomIoUCrop`
  901. must be followed by :class:`~torchvision.transforms.v2.SanitizeBoundingBoxes`, either immediately
  902. after or later in the transforms pipeline.
  903. If the input is a :class:`torch.Tensor` or a ``TVTensor`` (e.g. :class:`~torchvision.tv_tensors.Image`,
  904. :class:`~torchvision.tv_tensors.Video`, :class:`~torchvision.tv_tensors.BoundingBoxes` etc.)
  905. it can have arbitrary number of leading batch dimensions. For example,
  906. the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.
  907. Args:
  908. min_scale (float, optional): Minimum factors to scale the input size.
  909. max_scale (float, optional): Maximum factors to scale the input size.
  910. min_aspect_ratio (float, optional): Minimum aspect ratio for the cropped image or video.
  911. max_aspect_ratio (float, optional): Maximum aspect ratio for the cropped image or video.
  912. sampler_options (list of float, optional): List of minimal IoU (Jaccard) overlap between all the boxes and
  913. a cropped image or video. Default, ``None`` which corresponds to ``[0.0, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0]``
  914. trials (int, optional): Number of trials to find a crop for a given value of minimal IoU (Jaccard) overlap.
  915. Default, 40.
  916. """
  917. def __init__(
  918. self,
  919. min_scale: float = 0.3,
  920. max_scale: float = 1.0,
  921. min_aspect_ratio: float = 0.5,
  922. max_aspect_ratio: float = 2.0,
  923. sampler_options: Optional[List[float]] = None,
  924. trials: int = 40,
  925. ):
  926. super().__init__()
  927. # Configuration similar to https://github.com/weiliu89/caffe/blob/ssd/examples/ssd/ssd_coco.py#L89-L174
  928. self.min_scale = min_scale
  929. self.max_scale = max_scale
  930. self.min_aspect_ratio = min_aspect_ratio
  931. self.max_aspect_ratio = max_aspect_ratio
  932. if sampler_options is None:
  933. sampler_options = [0.0, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0]
  934. self.options = sampler_options
  935. self.trials = trials
  936. def _check_inputs(self, flat_inputs: List[Any]) -> None:
  937. if not (
  938. has_all(flat_inputs, tv_tensors.BoundingBoxes)
  939. and has_any(flat_inputs, PIL.Image.Image, tv_tensors.Image, is_pure_tensor)
  940. ):
  941. raise TypeError(
  942. f"{type(self).__name__}() requires input sample to contain tensor or PIL images "
  943. "and bounding boxes. Sample can also contain masks."
  944. )
  945. def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]:
  946. orig_h, orig_w = query_size(flat_inputs)
  947. bboxes = get_bounding_boxes(flat_inputs)
  948. while True:
  949. # sample an option
  950. idx = int(torch.randint(low=0, high=len(self.options), size=(1,)))
  951. min_jaccard_overlap = self.options[idx]
  952. if min_jaccard_overlap >= 1.0: # a value larger than 1 encodes the leave as-is option
  953. return dict()
  954. for _ in range(self.trials):
  955. # check the aspect ratio limitations
  956. r = self.min_scale + (self.max_scale - self.min_scale) * torch.rand(2)
  957. new_w = int(orig_w * r[0])
  958. new_h = int(orig_h * r[1])
  959. aspect_ratio = new_w / new_h
  960. if not (self.min_aspect_ratio <= aspect_ratio <= self.max_aspect_ratio):
  961. continue
  962. # check for 0 area crops
  963. r = torch.rand(2)
  964. left = int((orig_w - new_w) * r[0])
  965. top = int((orig_h - new_h) * r[1])
  966. right = left + new_w
  967. bottom = top + new_h
  968. if left == right or top == bottom:
  969. continue
  970. # check for any valid boxes with centers within the crop area
  971. xyxy_bboxes = F.convert_bounding_box_format(
  972. bboxes.as_subclass(torch.Tensor),
  973. bboxes.format,
  974. tv_tensors.BoundingBoxFormat.XYXY,
  975. )
  976. cx = 0.5 * (xyxy_bboxes[..., 0] + xyxy_bboxes[..., 2])
  977. cy = 0.5 * (xyxy_bboxes[..., 1] + xyxy_bboxes[..., 3])
  978. is_within_crop_area = (left < cx) & (cx < right) & (top < cy) & (cy < bottom)
  979. if not is_within_crop_area.any():
  980. continue
  981. # check at least 1 box with jaccard limitations
  982. xyxy_bboxes = xyxy_bboxes[is_within_crop_area]
  983. ious = box_iou(
  984. xyxy_bboxes,
  985. torch.tensor([[left, top, right, bottom]], dtype=xyxy_bboxes.dtype, device=xyxy_bboxes.device),
  986. )
  987. if ious.max() < min_jaccard_overlap:
  988. continue
  989. return dict(top=top, left=left, height=new_h, width=new_w, is_within_crop_area=is_within_crop_area)
  990. def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
  991. if len(params) < 1:
  992. return inpt
  993. output = self._call_kernel(
  994. F.crop, inpt, top=params["top"], left=params["left"], height=params["height"], width=params["width"]
  995. )
  996. if isinstance(output, tv_tensors.BoundingBoxes):
  997. # We "mark" the invalid boxes as degenreate, and they can be
  998. # removed by a later call to SanitizeBoundingBoxes()
  999. output[~params["is_within_crop_area"]] = 0
  1000. return output
  1001. class ScaleJitter(Transform):
  1002. """[BETA] Perform Large Scale Jitter on the input according to
  1003. `"Simple Copy-Paste is a Strong Data Augmentation Method for Instance Segmentation" <https://arxiv.org/abs/2012.07177>`_.
  1004. .. v2betastatus:: ScaleJitter transform
  1005. If the input is a :class:`torch.Tensor` or a ``TVTensor`` (e.g. :class:`~torchvision.tv_tensors.Image`,
  1006. :class:`~torchvision.tv_tensors.Video`, :class:`~torchvision.tv_tensors.BoundingBoxes` etc.)
  1007. it can have arbitrary number of leading batch dimensions. For example,
  1008. the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.
  1009. Args:
  1010. target_size (tuple of int): Target size. This parameter defines base scale for jittering,
  1011. e.g. ``min(target_size[0] / width, target_size[1] / height)``.
  1012. scale_range (tuple of float, optional): Minimum and maximum of the scale range. Default, ``(0.1, 2.0)``.
  1013. interpolation (InterpolationMode, optional): Desired interpolation enum defined by
  1014. :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``.
  1015. If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.NEAREST_EXACT``,
  1016. ``InterpolationMode.BILINEAR`` and ``InterpolationMode.BICUBIC`` are supported.
  1017. The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well.
  1018. antialias (bool, optional): Whether to apply antialiasing.
  1019. It only affects **tensors** with bilinear or bicubic modes and it is
  1020. ignored otherwise: on PIL images, antialiasing is always applied on
  1021. bilinear or bicubic modes; on other modes (for PIL images and
  1022. tensors), antialiasing makes no sense and this parameter is ignored.
  1023. Possible values are:
  1024. - ``True``: will apply antialiasing for bilinear or bicubic modes.
  1025. Other mode aren't affected. This is probably what you want to use.
  1026. - ``False``: will not apply antialiasing for tensors on any mode. PIL
  1027. images are still antialiased on bilinear or bicubic modes, because
  1028. PIL doesn't support no antialias.
  1029. - ``None``: equivalent to ``False`` for tensors and ``True`` for
  1030. PIL images. This value exists for legacy reasons and you probably
  1031. don't want to use it unless you really know what you are doing.
  1032. The current default is ``None`` **but will change to** ``True`` **in
  1033. v0.17** for the PIL and Tensor backends to be consistent.
  1034. """
  1035. def __init__(
  1036. self,
  1037. target_size: Tuple[int, int],
  1038. scale_range: Tuple[float, float] = (0.1, 2.0),
  1039. interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR,
  1040. antialias: Optional[Union[str, bool]] = "warn",
  1041. ):
  1042. super().__init__()
  1043. self.target_size = target_size
  1044. self.scale_range = scale_range
  1045. self.interpolation = _check_interpolation(interpolation)
  1046. self.antialias = antialias
  1047. def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]:
  1048. orig_height, orig_width = query_size(flat_inputs)
  1049. scale = self.scale_range[0] + torch.rand(1) * (self.scale_range[1] - self.scale_range[0])
  1050. r = min(self.target_size[1] / orig_height, self.target_size[0] / orig_width) * scale
  1051. new_width = int(orig_width * r)
  1052. new_height = int(orig_height * r)
  1053. return dict(size=(new_height, new_width))
  1054. def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
  1055. return self._call_kernel(
  1056. F.resize, inpt, size=params["size"], interpolation=self.interpolation, antialias=self.antialias
  1057. )
  1058. class RandomShortestSize(Transform):
  1059. """[BETA] Randomly resize the input.
  1060. .. v2betastatus:: RandomShortestSize transform
  1061. If the input is a :class:`torch.Tensor` or a ``TVTensor`` (e.g. :class:`~torchvision.tv_tensors.Image`,
  1062. :class:`~torchvision.tv_tensors.Video`, :class:`~torchvision.tv_tensors.BoundingBoxes` etc.)
  1063. it can have arbitrary number of leading batch dimensions. For example,
  1064. the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.
  1065. Args:
  1066. min_size (int or sequence of int): Minimum spatial size. Single integer value or a sequence of integer values.
  1067. max_size (int, optional): Maximum spatial size. Default, None.
  1068. interpolation (InterpolationMode, optional): Desired interpolation enum defined by
  1069. :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``.
  1070. If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.NEAREST_EXACT``,
  1071. ``InterpolationMode.BILINEAR`` and ``InterpolationMode.BICUBIC`` are supported.
  1072. The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well.
  1073. antialias (bool, optional): Whether to apply antialiasing.
  1074. It only affects **tensors** with bilinear or bicubic modes and it is
  1075. ignored otherwise: on PIL images, antialiasing is always applied on
  1076. bilinear or bicubic modes; on other modes (for PIL images and
  1077. tensors), antialiasing makes no sense and this parameter is ignored.
  1078. Possible values are:
  1079. - ``True``: will apply antialiasing for bilinear or bicubic modes.
  1080. Other mode aren't affected. This is probably what you want to use.
  1081. - ``False``: will not apply antialiasing for tensors on any mode. PIL
  1082. images are still antialiased on bilinear or bicubic modes, because
  1083. PIL doesn't support no antialias.
  1084. - ``None``: equivalent to ``False`` for tensors and ``True`` for
  1085. PIL images. This value exists for legacy reasons and you probably
  1086. don't want to use it unless you really know what you are doing.
  1087. The current default is ``None`` **but will change to** ``True`` **in
  1088. v0.17** for the PIL and Tensor backends to be consistent.
  1089. """
  1090. def __init__(
  1091. self,
  1092. min_size: Union[List[int], Tuple[int], int],
  1093. max_size: Optional[int] = None,
  1094. interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR,
  1095. antialias: Optional[Union[str, bool]] = "warn",
  1096. ):
  1097. super().__init__()
  1098. self.min_size = [min_size] if isinstance(min_size, int) else list(min_size)
  1099. self.max_size = max_size
  1100. self.interpolation = _check_interpolation(interpolation)
  1101. self.antialias = antialias
  1102. def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]:
  1103. orig_height, orig_width = query_size(flat_inputs)
  1104. min_size = self.min_size[int(torch.randint(len(self.min_size), ()))]
  1105. r = min_size / min(orig_height, orig_width)
  1106. if self.max_size is not None:
  1107. r = min(r, self.max_size / max(orig_height, orig_width))
  1108. new_width = int(orig_width * r)
  1109. new_height = int(orig_height * r)
  1110. return dict(size=(new_height, new_width))
  1111. def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
  1112. return self._call_kernel(
  1113. F.resize, inpt, size=params["size"], interpolation=self.interpolation, antialias=self.antialias
  1114. )
  1115. class RandomResize(Transform):
  1116. """[BETA] Randomly resize the input.
  1117. .. v2betastatus:: RandomResize transform
  1118. This transformation can be used together with ``RandomCrop`` as data augmentations to train
  1119. models on image segmentation task.
  1120. Output spatial size is randomly sampled from the interval ``[min_size, max_size]``:
  1121. .. code-block:: python
  1122. size = uniform_sample(min_size, max_size)
  1123. output_width = size
  1124. output_height = size
  1125. If the input is a :class:`torch.Tensor` or a ``TVTensor`` (e.g. :class:`~torchvision.tv_tensors.Image`,
  1126. :class:`~torchvision.tv_tensors.Video`, :class:`~torchvision.tv_tensors.BoundingBoxes` etc.)
  1127. it can have arbitrary number of leading batch dimensions. For example,
  1128. the image can have ``[..., C, H, W]`` shape. A bounding box can have ``[..., 4]`` shape.
  1129. Args:
  1130. min_size (int): Minimum output size for random sampling
  1131. max_size (int): Maximum output size for random sampling
  1132. interpolation (InterpolationMode, optional): Desired interpolation enum defined by
  1133. :class:`torchvision.transforms.InterpolationMode`. Default is ``InterpolationMode.BILINEAR``.
  1134. If input is Tensor, only ``InterpolationMode.NEAREST``, ``InterpolationMode.NEAREST_EXACT``,
  1135. ``InterpolationMode.BILINEAR`` and ``InterpolationMode.BICUBIC`` are supported.
  1136. The corresponding Pillow integer constants, e.g. ``PIL.Image.BILINEAR`` are accepted as well.
  1137. antialias (bool, optional): Whether to apply antialiasing.
  1138. It only affects **tensors** with bilinear or bicubic modes and it is
  1139. ignored otherwise: on PIL images, antialiasing is always applied on
  1140. bilinear or bicubic modes; on other modes (for PIL images and
  1141. tensors), antialiasing makes no sense and this parameter is ignored.
  1142. Possible values are:
  1143. - ``True``: will apply antialiasing for bilinear or bicubic modes.
  1144. Other mode aren't affected. This is probably what you want to use.
  1145. - ``False``: will not apply antialiasing for tensors on any mode. PIL
  1146. images are still antialiased on bilinear or bicubic modes, because
  1147. PIL doesn't support no antialias.
  1148. - ``None``: equivalent to ``False`` for tensors and ``True`` for
  1149. PIL images. This value exists for legacy reasons and you probably
  1150. don't want to use it unless you really know what you are doing.
  1151. The current default is ``None`` **but will change to** ``True`` **in
  1152. v0.17** for the PIL and Tensor backends to be consistent.
  1153. """
  1154. def __init__(
  1155. self,
  1156. min_size: int,
  1157. max_size: int,
  1158. interpolation: Union[InterpolationMode, int] = InterpolationMode.BILINEAR,
  1159. antialias: Optional[Union[str, bool]] = "warn",
  1160. ) -> None:
  1161. super().__init__()
  1162. self.min_size = min_size
  1163. self.max_size = max_size
  1164. self.interpolation = _check_interpolation(interpolation)
  1165. self.antialias = antialias
  1166. def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]:
  1167. size = int(torch.randint(self.min_size, self.max_size, ()))
  1168. return dict(size=[size])
  1169. def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any:
  1170. return self._call_kernel(
  1171. F.resize, inpt, params["size"], interpolation=self.interpolation, antialias=self.antialias
  1172. )