# boxes.py
from typing import Tuple

import torch
import torchvision
from torch import Tensor
from torchvision.extension import _assert_has_ops

from ..utils import _log_api_usage_once
from ._box_convert import _box_cxcywh_to_xyxy, _box_xywh_to_xyxy, _box_xyxy_to_cxcywh, _box_xyxy_to_xywh
from ._utils import _upcast
  9. def nms(boxes: Tensor, scores: Tensor, iou_threshold: float) -> Tensor:
  10. """
  11. Performs non-maximum suppression (NMS) on the boxes according
  12. to their intersection-over-union (IoU).
  13. NMS iteratively removes lower scoring boxes which have an
  14. IoU greater than iou_threshold with another (higher scoring)
  15. box.
  16. If multiple boxes have the exact same score and satisfy the IoU
  17. criterion with respect to a reference box, the selected box is
  18. not guaranteed to be the same between CPU and GPU. This is similar
  19. to the behavior of argsort in PyTorch when repeated values are present.
  20. Args:
  21. boxes (Tensor[N, 4])): boxes to perform NMS on. They
  22. are expected to be in ``(x1, y1, x2, y2)`` format with ``0 <= x1 < x2`` and
  23. ``0 <= y1 < y2``.
  24. scores (Tensor[N]): scores for each one of the boxes
  25. iou_threshold (float): discards all overlapping boxes with IoU > iou_threshold
  26. Returns:
  27. Tensor: int64 tensor with the indices of the elements that have been kept
  28. by NMS, sorted in decreasing order of scores
  29. """
  30. if not torch.jit.is_scripting() and not torch.jit.is_tracing():
  31. _log_api_usage_once(nms)
  32. _assert_has_ops()
  33. return torch.ops.torchvision.nms(boxes, scores, iou_threshold)
  34. def batched_nms(
  35. boxes: Tensor,
  36. scores: Tensor,
  37. idxs: Tensor,
  38. iou_threshold: float,
  39. ) -> Tensor:
  40. """
  41. Performs non-maximum suppression in a batched fashion.
  42. Each index value correspond to a category, and NMS
  43. will not be applied between elements of different categories.
  44. Args:
  45. boxes (Tensor[N, 4]): boxes where NMS will be performed. They
  46. are expected to be in ``(x1, y1, x2, y2)`` format with ``0 <= x1 < x2`` and
  47. ``0 <= y1 < y2``.
  48. scores (Tensor[N]): scores for each one of the boxes
  49. idxs (Tensor[N]): indices of the categories for each one of the boxes.
  50. iou_threshold (float): discards all overlapping boxes with IoU > iou_threshold
  51. Returns:
  52. Tensor: int64 tensor with the indices of the elements that have been kept by NMS, sorted
  53. in decreasing order of scores
  54. """
  55. if not torch.jit.is_scripting() and not torch.jit.is_tracing():
  56. _log_api_usage_once(batched_nms)
  57. # Benchmarks that drove the following thresholds are at
  58. # https://github.com/pytorch/vision/issues/1311#issuecomment-781329339
  59. if boxes.numel() > (4000 if boxes.device.type == "cpu" else 20000) and not torchvision._is_tracing():
  60. return _batched_nms_vanilla(boxes, scores, idxs, iou_threshold)
  61. else:
  62. return _batched_nms_coordinate_trick(boxes, scores, idxs, iou_threshold)
  63. @torch.jit._script_if_tracing
  64. def _batched_nms_coordinate_trick(
  65. boxes: Tensor,
  66. scores: Tensor,
  67. idxs: Tensor,
  68. iou_threshold: float,
  69. ) -> Tensor:
  70. # strategy: in order to perform NMS independently per class,
  71. # we add an offset to all the boxes. The offset is dependent
  72. # only on the class idx, and is large enough so that boxes
  73. # from different classes do not overlap
  74. if boxes.numel() == 0:
  75. return torch.empty((0,), dtype=torch.int64, device=boxes.device)
  76. max_coordinate = boxes.max()
  77. offsets = idxs.to(boxes) * (max_coordinate + torch.tensor(1).to(boxes))
  78. boxes_for_nms = boxes + offsets[:, None]
  79. keep = nms(boxes_for_nms, scores, iou_threshold)
  80. return keep
  81. @torch.jit._script_if_tracing
  82. def _batched_nms_vanilla(
  83. boxes: Tensor,
  84. scores: Tensor,
  85. idxs: Tensor,
  86. iou_threshold: float,
  87. ) -> Tensor:
  88. # Based on Detectron2 implementation, just manually call nms() on each class independently
  89. keep_mask = torch.zeros_like(scores, dtype=torch.bool)
  90. for class_id in torch.unique(idxs):
  91. curr_indices = torch.where(idxs == class_id)[0]
  92. curr_keep_indices = nms(boxes[curr_indices], scores[curr_indices], iou_threshold)
  93. keep_mask[curr_indices[curr_keep_indices]] = True
  94. keep_indices = torch.where(keep_mask)[0]
  95. return keep_indices[scores[keep_indices].sort(descending=True)[1]]
  96. def remove_small_boxes(boxes: Tensor, min_size: float) -> Tensor:
  97. """
  98. Remove boxes which contains at least one side smaller than min_size.
  99. Args:
  100. boxes (Tensor[N, 4]): boxes in ``(x1, y1, x2, y2)`` format
  101. with ``0 <= x1 < x2`` and ``0 <= y1 < y2``.
  102. min_size (float): minimum size
  103. Returns:
  104. Tensor[K]: indices of the boxes that have both sides
  105. larger than min_size
  106. """
  107. if not torch.jit.is_scripting() and not torch.jit.is_tracing():
  108. _log_api_usage_once(remove_small_boxes)
  109. ws, hs = boxes[:, 2] - boxes[:, 0], boxes[:, 3] - boxes[:, 1]
  110. keep = (ws >= min_size) & (hs >= min_size)
  111. keep = torch.where(keep)[0]
  112. return keep
  113. def clip_boxes_to_image(boxes: Tensor, size: Tuple[int, int]) -> Tensor:
  114. """
  115. Clip boxes so that they lie inside an image of size `size`.
  116. Args:
  117. boxes (Tensor[N, 4]): boxes in ``(x1, y1, x2, y2)`` format
  118. with ``0 <= x1 < x2`` and ``0 <= y1 < y2``.
  119. size (Tuple[height, width]): size of the image
  120. Returns:
  121. Tensor[N, 4]: clipped boxes
  122. """
  123. if not torch.jit.is_scripting() and not torch.jit.is_tracing():
  124. _log_api_usage_once(clip_boxes_to_image)
  125. dim = boxes.dim()
  126. boxes_x = boxes[..., 0::2]
  127. boxes_y = boxes[..., 1::2]
  128. height, width = size
  129. if torchvision._is_tracing():
  130. boxes_x = torch.max(boxes_x, torch.tensor(0, dtype=boxes.dtype, device=boxes.device))
  131. boxes_x = torch.min(boxes_x, torch.tensor(width, dtype=boxes.dtype, device=boxes.device))
  132. boxes_y = torch.max(boxes_y, torch.tensor(0, dtype=boxes.dtype, device=boxes.device))
  133. boxes_y = torch.min(boxes_y, torch.tensor(height, dtype=boxes.dtype, device=boxes.device))
  134. else:
  135. boxes_x = boxes_x.clamp(min=0, max=width)
  136. boxes_y = boxes_y.clamp(min=0, max=height)
  137. clipped_boxes = torch.stack((boxes_x, boxes_y), dim=dim)
  138. return clipped_boxes.reshape(boxes.shape)
  139. def box_convert(boxes: Tensor, in_fmt: str, out_fmt: str) -> Tensor:
  140. """
  141. Converts boxes from given in_fmt to out_fmt.
  142. Supported in_fmt and out_fmt are:
  143. 'xyxy': boxes are represented via corners, x1, y1 being top left and x2, y2 being bottom right.
  144. This is the format that torchvision utilities expect.
  145. 'xywh' : boxes are represented via corner, width and height, x1, y2 being top left, w, h being width and height.
  146. 'cxcywh' : boxes are represented via centre, width and height, cx, cy being center of box, w, h
  147. being width and height.
  148. Args:
  149. boxes (Tensor[N, 4]): boxes which will be converted.
  150. in_fmt (str): Input format of given boxes. Supported formats are ['xyxy', 'xywh', 'cxcywh'].
  151. out_fmt (str): Output format of given boxes. Supported formats are ['xyxy', 'xywh', 'cxcywh']
  152. Returns:
  153. Tensor[N, 4]: Boxes into converted format.
  154. """
  155. if not torch.jit.is_scripting() and not torch.jit.is_tracing():
  156. _log_api_usage_once(box_convert)
  157. allowed_fmts = ("xyxy", "xywh", "cxcywh")
  158. if in_fmt not in allowed_fmts or out_fmt not in allowed_fmts:
  159. raise ValueError("Unsupported Bounding Box Conversions for given in_fmt and out_fmt")
  160. if in_fmt == out_fmt:
  161. return boxes.clone()
  162. if in_fmt != "xyxy" and out_fmt != "xyxy":
  163. # convert to xyxy and change in_fmt xyxy
  164. if in_fmt == "xywh":
  165. boxes = _box_xywh_to_xyxy(boxes)
  166. elif in_fmt == "cxcywh":
  167. boxes = _box_cxcywh_to_xyxy(boxes)
  168. in_fmt = "xyxy"
  169. if in_fmt == "xyxy":
  170. if out_fmt == "xywh":
  171. boxes = _box_xyxy_to_xywh(boxes)
  172. elif out_fmt == "cxcywh":
  173. boxes = _box_xyxy_to_cxcywh(boxes)
  174. elif out_fmt == "xyxy":
  175. if in_fmt == "xywh":
  176. boxes = _box_xywh_to_xyxy(boxes)
  177. elif in_fmt == "cxcywh":
  178. boxes = _box_cxcywh_to_xyxy(boxes)
  179. return boxes
  180. def box_area(boxes: Tensor) -> Tensor:
  181. """
  182. Computes the area of a set of bounding boxes, which are specified by their
  183. (x1, y1, x2, y2) coordinates.
  184. Args:
  185. boxes (Tensor[N, 4]): boxes for which the area will be computed. They
  186. are expected to be in (x1, y1, x2, y2) format with
  187. ``0 <= x1 < x2`` and ``0 <= y1 < y2``.
  188. Returns:
  189. Tensor[N]: the area for each box
  190. """
  191. if not torch.jit.is_scripting() and not torch.jit.is_tracing():
  192. _log_api_usage_once(box_area)
  193. boxes = _upcast(boxes)
  194. return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
  195. # implementation from https://github.com/kuangliu/torchcv/blob/master/torchcv/utils/box.py
  196. # with slight modifications
  197. def _box_inter_union(boxes1: Tensor, boxes2: Tensor) -> Tuple[Tensor, Tensor]:
  198. area1 = box_area(boxes1)
  199. area2 = box_area(boxes2)
  200. lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2]
  201. rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2]
  202. wh = _upcast(rb - lt).clamp(min=0) # [N,M,2]
  203. inter = wh[:, :, 0] * wh[:, :, 1] # [N,M]
  204. union = area1[:, None] + area2 - inter
  205. return inter, union
  206. def box_iou(boxes1: Tensor, boxes2: Tensor) -> Tensor:
  207. """
  208. Return intersection-over-union (Jaccard index) between two sets of boxes.
  209. Both sets of boxes are expected to be in ``(x1, y1, x2, y2)`` format with
  210. ``0 <= x1 < x2`` and ``0 <= y1 < y2``.
  211. Args:
  212. boxes1 (Tensor[N, 4]): first set of boxes
  213. boxes2 (Tensor[M, 4]): second set of boxes
  214. Returns:
  215. Tensor[N, M]: the NxM matrix containing the pairwise IoU values for every element in boxes1 and boxes2
  216. """
  217. if not torch.jit.is_scripting() and not torch.jit.is_tracing():
  218. _log_api_usage_once(box_iou)
  219. inter, union = _box_inter_union(boxes1, boxes2)
  220. iou = inter / union
  221. return iou
  222. # Implementation adapted from https://github.com/facebookresearch/detr/blob/master/util/box_ops.py
  223. def generalized_box_iou(boxes1: Tensor, boxes2: Tensor) -> Tensor:
  224. """
  225. Return generalized intersection-over-union (Jaccard index) between two sets of boxes.
  226. Both sets of boxes are expected to be in ``(x1, y1, x2, y2)`` format with
  227. ``0 <= x1 < x2`` and ``0 <= y1 < y2``.
  228. Args:
  229. boxes1 (Tensor[N, 4]): first set of boxes
  230. boxes2 (Tensor[M, 4]): second set of boxes
  231. Returns:
  232. Tensor[N, M]: the NxM matrix containing the pairwise generalized IoU values
  233. for every element in boxes1 and boxes2
  234. """
  235. if not torch.jit.is_scripting() and not torch.jit.is_tracing():
  236. _log_api_usage_once(generalized_box_iou)
  237. inter, union = _box_inter_union(boxes1, boxes2)
  238. iou = inter / union
  239. lti = torch.min(boxes1[:, None, :2], boxes2[:, :2])
  240. rbi = torch.max(boxes1[:, None, 2:], boxes2[:, 2:])
  241. whi = _upcast(rbi - lti).clamp(min=0) # [N,M,2]
  242. areai = whi[:, :, 0] * whi[:, :, 1]
  243. return iou - (areai - union) / areai
  244. def complete_box_iou(boxes1: Tensor, boxes2: Tensor, eps: float = 1e-7) -> Tensor:
  245. """
  246. Return complete intersection-over-union (Jaccard index) between two sets of boxes.
  247. Both sets of boxes are expected to be in ``(x1, y1, x2, y2)`` format with
  248. ``0 <= x1 < x2`` and ``0 <= y1 < y2``.
  249. Args:
  250. boxes1 (Tensor[N, 4]): first set of boxes
  251. boxes2 (Tensor[M, 4]): second set of boxes
  252. eps (float, optional): small number to prevent division by zero. Default: 1e-7
  253. Returns:
  254. Tensor[N, M]: the NxM matrix containing the pairwise complete IoU values
  255. for every element in boxes1 and boxes2
  256. """
  257. if not torch.jit.is_scripting() and not torch.jit.is_tracing():
  258. _log_api_usage_once(complete_box_iou)
  259. boxes1 = _upcast(boxes1)
  260. boxes2 = _upcast(boxes2)
  261. diou, iou = _box_diou_iou(boxes1, boxes2, eps)
  262. w_pred = boxes1[:, None, 2] - boxes1[:, None, 0]
  263. h_pred = boxes1[:, None, 3] - boxes1[:, None, 1]
  264. w_gt = boxes2[:, 2] - boxes2[:, 0]
  265. h_gt = boxes2[:, 3] - boxes2[:, 1]
  266. v = (4 / (torch.pi**2)) * torch.pow(torch.atan(w_pred / h_pred) - torch.atan(w_gt / h_gt), 2)
  267. with torch.no_grad():
  268. alpha = v / (1 - iou + v + eps)
  269. return diou - alpha * v
  270. def distance_box_iou(boxes1: Tensor, boxes2: Tensor, eps: float = 1e-7) -> Tensor:
  271. """
  272. Return distance intersection-over-union (Jaccard index) between two sets of boxes.
  273. Both sets of boxes are expected to be in ``(x1, y1, x2, y2)`` format with
  274. ``0 <= x1 < x2`` and ``0 <= y1 < y2``.
  275. Args:
  276. boxes1 (Tensor[N, 4]): first set of boxes
  277. boxes2 (Tensor[M, 4]): second set of boxes
  278. eps (float, optional): small number to prevent division by zero. Default: 1e-7
  279. Returns:
  280. Tensor[N, M]: the NxM matrix containing the pairwise distance IoU values
  281. for every element in boxes1 and boxes2
  282. """
  283. if not torch.jit.is_scripting() and not torch.jit.is_tracing():
  284. _log_api_usage_once(distance_box_iou)
  285. boxes1 = _upcast(boxes1)
  286. boxes2 = _upcast(boxes2)
  287. diou, _ = _box_diou_iou(boxes1, boxes2, eps=eps)
  288. return diou
  289. def _box_diou_iou(boxes1: Tensor, boxes2: Tensor, eps: float = 1e-7) -> Tuple[Tensor, Tensor]:
  290. iou = box_iou(boxes1, boxes2)
  291. lti = torch.min(boxes1[:, None, :2], boxes2[:, :2])
  292. rbi = torch.max(boxes1[:, None, 2:], boxes2[:, 2:])
  293. whi = _upcast(rbi - lti).clamp(min=0) # [N,M,2]
  294. diagonal_distance_squared = (whi[:, :, 0] ** 2) + (whi[:, :, 1] ** 2) + eps
  295. # centers of boxes
  296. x_p = (boxes1[:, 0] + boxes1[:, 2]) / 2
  297. y_p = (boxes1[:, 1] + boxes1[:, 3]) / 2
  298. x_g = (boxes2[:, 0] + boxes2[:, 2]) / 2
  299. y_g = (boxes2[:, 1] + boxes2[:, 3]) / 2
  300. # The distance between boxes' centers squared.
  301. centers_distance_squared = (_upcast((x_p[:, None] - x_g[None, :])) ** 2) + (
  302. _upcast((y_p[:, None] - y_g[None, :])) ** 2
  303. )
  304. # The distance IoU is the IoU penalized by a normalized
  305. # distance between boxes' centers squared.
  306. return iou - (centers_distance_squared / diagonal_distance_squared), iou
  307. def masks_to_boxes(masks: torch.Tensor) -> torch.Tensor:
  308. """
  309. Compute the bounding boxes around the provided masks.
  310. Returns a [N, 4] tensor containing bounding boxes. The boxes are in ``(x1, y1, x2, y2)`` format with
  311. ``0 <= x1 < x2`` and ``0 <= y1 < y2``.
  312. Args:
  313. masks (Tensor[N, H, W]): masks to transform where N is the number of masks
  314. and (H, W) are the spatial dimensions.
  315. Returns:
  316. Tensor[N, 4]: bounding boxes
  317. """
  318. if not torch.jit.is_scripting() and not torch.jit.is_tracing():
  319. _log_api_usage_once(masks_to_boxes)
  320. if masks.numel() == 0:
  321. return torch.zeros((0, 4), device=masks.device, dtype=torch.float)
  322. n = masks.shape[0]
  323. bounding_boxes = torch.zeros((n, 4), device=masks.device, dtype=torch.float)
  324. for index, mask in enumerate(masks):
  325. y, x = torch.where(mask != 0)
  326. bounding_boxes[index, 0] = torch.min(x)
  327. bounding_boxes[index, 1] = torch.min(y)
  328. bounding_boxes[index, 2] = torch.max(x)
  329. bounding_boxes[index, 3] = torch.max(y)
  330. return bounding_boxes