# roi_heads.py

from typing import Dict, List, Optional, Tuple

import torch
import torch.nn.functional as F
import torchvision
from torch import nn, Tensor
from torchvision.ops import boxes as box_ops, roi_align

from . import _utils as det_utils


def fastrcnn_loss(class_logits, box_regression, labels, regression_targets):
    # type: (Tensor, Tensor, List[Tensor], List[Tensor]) -> Tuple[Tensor, Tensor]
    """
    Computes the loss for Faster R-CNN.

    Args:
        class_logits (Tensor)
        box_regression (Tensor)
        labels (list[Tensor])
        regression_targets (list[Tensor])

    Returns:
        classification_loss (Tensor)
        box_loss (Tensor)
    """

    labels = torch.cat(labels, dim=0)
    regression_targets = torch.cat(regression_targets, dim=0)

    classification_loss = F.cross_entropy(class_logits, labels)

    # get indices that correspond to the regression targets for
    # the corresponding ground truth labels, to be used with
    # advanced indexing
    sampled_pos_inds_subset = torch.where(labels > 0)[0]
    labels_pos = labels[sampled_pos_inds_subset]
    N, num_classes = class_logits.shape
    box_regression = box_regression.reshape(N, box_regression.size(-1) // 4, 4)

    box_loss = F.smooth_l1_loss(
        box_regression[sampled_pos_inds_subset, labels_pos],
        regression_targets[sampled_pos_inds_subset],
        beta=1 / 9,
        reduction="sum",
    )
    box_loss = box_loss / labels.numel()

    return classification_loss, box_loss
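
# Illustrative sketch (not part of the original module): how fastrcnn_loss might be
# exercised with dummy tensors. The shapes are assumptions only: 2 images, 4 sampled
# proposals each, 3 classes with class 0 as background.
# >>> class_logits = torch.randn(8, 3)
# >>> box_regression = torch.randn(8, 3 * 4)
# >>> labels = [torch.tensor([0, 2, 1, 0]), torch.tensor([1, 0, 0, 2])]
# >>> regression_targets = [torch.randn(4, 4), torch.randn(4, 4)]
# >>> cls_loss, box_loss = fastrcnn_loss(class_logits, box_regression, labels, regression_targets)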


def maskrcnn_inference(x, labels):
    # type: (Tensor, List[Tensor]) -> List[Tensor]
    """
    From the results of the CNN, post process the masks
    by taking the mask corresponding to the class with max
    probability (which are of fixed size and directly output
    by the CNN) and return the per-image mask probabilities.

    Args:
        x (Tensor): the mask logits
        labels (list[Tensor]): predicted labels used as a reference,
            one tensor per image
    Returns:
        results (list[Tensor]): one tensor of mask probabilities per image
    """
    mask_prob = x.sigmoid()

    # select masks corresponding to the predicted classes
    num_masks = x.shape[0]
    boxes_per_image = [label.shape[0] for label in labels]
    labels = torch.cat(labels)
    index = torch.arange(num_masks, device=labels.device)
    mask_prob = mask_prob[index, labels][:, None]
    mask_prob = mask_prob.split(boxes_per_image, dim=0)
    return mask_prob
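
# Illustrative sketch (not part of the original module): selecting per-class mask
# probabilities for 2 images with 3 and 2 detections respectively, 5 classes and
# 28x28 mask logits (shape assumptions only).
# >>> mask_logits = torch.randn(5, 5, 28, 28)
# >>> labels = [torch.tensor([1, 3, 2]), torch.tensor([4, 1])]
# >>> probs = maskrcnn_inference(mask_logits, labels)
# >>> [p.shape for p in probs]
# [torch.Size([3, 1, 28, 28]), torch.Size([2, 1, 28, 28])]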


def project_masks_on_boxes(gt_masks, boxes, matched_idxs, M):
    # type: (Tensor, Tensor, Tensor, int) -> Tensor
    """
    Given segmentation masks and the bounding boxes corresponding
    to the location of the masks in the image, this function
    crops and resizes the masks in the position defined by the
    boxes. This prepares the masks for them to be fed to the
    loss computation as the targets.
    """
    matched_idxs = matched_idxs.to(boxes)
    rois = torch.cat([matched_idxs[:, None], boxes], dim=1)
    gt_masks = gt_masks[:, None].to(rois)
    return roi_align(gt_masks, rois, (M, M), 1.0)[:, 0]


def maskrcnn_loss(mask_logits, proposals, gt_masks, gt_labels, mask_matched_idxs):
    # type: (Tensor, List[Tensor], List[Tensor], List[Tensor], List[Tensor]) -> Tensor
    """
    Args:
        mask_logits (Tensor)
        proposals (list[Tensor])
        gt_masks (list[Tensor])
        gt_labels (list[Tensor])
        mask_matched_idxs (list[Tensor])

    Returns:
        mask_loss (Tensor): scalar tensor containing the loss
    """

    discretization_size = mask_logits.shape[-1]
    labels = [gt_label[idxs] for gt_label, idxs in zip(gt_labels, mask_matched_idxs)]
    mask_targets = [
        project_masks_on_boxes(m, p, i, discretization_size) for m, p, i in zip(gt_masks, proposals, mask_matched_idxs)
    ]

    labels = torch.cat(labels, dim=0)
    mask_targets = torch.cat(mask_targets, dim=0)

    # torch.mean (in binary_cross_entropy_with_logits) doesn't
    # accept empty tensors, so handle it separately
    if mask_targets.numel() == 0:
        return mask_logits.sum() * 0

    mask_loss = F.binary_cross_entropy_with_logits(
        mask_logits[torch.arange(labels.shape[0], device=labels.device), labels], mask_targets
    )
    return mask_loss
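
# Illustrative sketch (not part of the original module): dummy inputs for maskrcnn_loss.
# Assumptions: one image, 3 positive proposals, 2 ground-truth instances, 4 classes and
# 28x28 mask logits.
# >>> mask_logits = torch.randn(3, 4, 28, 28)
# >>> proposals = [torch.tensor([[0., 0., 20., 20.], [5., 5., 25., 25.], [2., 2., 18., 30.]])]
# >>> gt_masks = [torch.zeros(2, 50, 50, dtype=torch.uint8)]
# >>> gt_labels = [torch.tensor([1, 3])]
# >>> mask_matched_idxs = [torch.tensor([0, 1, 0])]
# >>> loss = maskrcnn_loss(mask_logits, proposals, gt_masks, gt_labels, mask_matched_idxs)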


def keypoints_to_heatmap(keypoints, rois, heatmap_size):
    # type: (Tensor, Tensor, int) -> Tuple[Tensor, Tensor]
    offset_x = rois[:, 0]
    offset_y = rois[:, 1]
    scale_x = heatmap_size / (rois[:, 2] - rois[:, 0])
    scale_y = heatmap_size / (rois[:, 3] - rois[:, 1])

    offset_x = offset_x[:, None]
    offset_y = offset_y[:, None]
    scale_x = scale_x[:, None]
    scale_y = scale_y[:, None]

    x = keypoints[..., 0]
    y = keypoints[..., 1]

    x_boundary_inds = x == rois[:, 2][:, None]
    y_boundary_inds = y == rois[:, 3][:, None]

    x = (x - offset_x) * scale_x
    x = x.floor().long()
    y = (y - offset_y) * scale_y
    y = y.floor().long()

    x[x_boundary_inds] = heatmap_size - 1
    y[y_boundary_inds] = heatmap_size - 1

    valid_loc = (x >= 0) & (y >= 0) & (x < heatmap_size) & (y < heatmap_size)
    vis = keypoints[..., 2] > 0
    valid = (valid_loc & vis).long()

    lin_ind = y * heatmap_size + x
    heatmaps = lin_ind * valid

    return heatmaps, valid
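
# Worked example (not part of the original module, numbers are assumptions): a keypoint
# at (15, 25) inside a 20x40 roi whose top-left corner is (10, 20), with heatmap_size = 56.
# x maps to floor((15 - 10) * 56 / 20) = 14 and y maps to floor((25 - 20) * 56 / 40) = 7,
# so the linearised heatmap target is 7 * 56 + 14 = 406, and valid = 1 as long as the
# keypoint is visible and lands inside the heatmap.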


def _onnx_heatmaps_to_keypoints(
    maps, maps_i, roi_map_width, roi_map_height, widths_i, heights_i, offset_x_i, offset_y_i
):
    num_keypoints = torch.scalar_tensor(maps.size(1), dtype=torch.int64)

    width_correction = widths_i / roi_map_width
    height_correction = heights_i / roi_map_height

    roi_map = F.interpolate(
        maps_i[:, None], size=(int(roi_map_height), int(roi_map_width)), mode="bicubic", align_corners=False
    )[:, 0]

    w = torch.scalar_tensor(roi_map.size(2), dtype=torch.int64)
    pos = roi_map.reshape(num_keypoints, -1).argmax(dim=1)

    x_int = pos % w
    y_int = (pos - x_int) // w

    x = (torch.tensor(0.5, dtype=torch.float32) + x_int.to(dtype=torch.float32)) * width_correction.to(
        dtype=torch.float32
    )
    y = (torch.tensor(0.5, dtype=torch.float32) + y_int.to(dtype=torch.float32)) * height_correction.to(
        dtype=torch.float32
    )

    xy_preds_i_0 = x + offset_x_i.to(dtype=torch.float32)
    xy_preds_i_1 = y + offset_y_i.to(dtype=torch.float32)
    xy_preds_i_2 = torch.ones(xy_preds_i_1.shape, dtype=torch.float32)
    xy_preds_i = torch.stack(
        [
            xy_preds_i_0.to(dtype=torch.float32),
            xy_preds_i_1.to(dtype=torch.float32),
            xy_preds_i_2.to(dtype=torch.float32),
        ],
        0,
    )

    # TODO: simplify when indexing without rank will be supported by ONNX
    base = num_keypoints * num_keypoints + num_keypoints + 1
    ind = torch.arange(num_keypoints)
    ind = ind.to(dtype=torch.int64) * base
    end_scores_i = (
        roi_map.index_select(1, y_int.to(dtype=torch.int64))
        .index_select(2, x_int.to(dtype=torch.int64))
        .view(-1)
        .index_select(0, ind.to(dtype=torch.int64))
    )

    return xy_preds_i, end_scores_i


@torch.jit._script_if_tracing
def _onnx_heatmaps_to_keypoints_loop(
    maps, rois, widths_ceil, heights_ceil, widths, heights, offset_x, offset_y, num_keypoints
):
    xy_preds = torch.zeros((0, 3, int(num_keypoints)), dtype=torch.float32, device=maps.device)
    end_scores = torch.zeros((0, int(num_keypoints)), dtype=torch.float32, device=maps.device)

    for i in range(int(rois.size(0))):
        xy_preds_i, end_scores_i = _onnx_heatmaps_to_keypoints(
            maps, maps[i], widths_ceil[i], heights_ceil[i], widths[i], heights[i], offset_x[i], offset_y[i]
        )
        xy_preds = torch.cat((xy_preds.to(dtype=torch.float32), xy_preds_i.unsqueeze(0).to(dtype=torch.float32)), 0)
        end_scores = torch.cat(
            (end_scores.to(dtype=torch.float32), end_scores_i.to(dtype=torch.float32).unsqueeze(0)), 0
        )
    return xy_preds, end_scores


def heatmaps_to_keypoints(maps, rois):
    """Extract predicted keypoint locations from heatmaps.

    Returns a tensor of shape (#rois, #keypoints, 3) holding (x, y, 1) for each keypoint,
    together with a tensor of shape (#rois, #keypoints) holding the heatmap score at the
    selected location.
    """
    # This function converts a discrete image coordinate in a HEATMAP_SIZE x
    # HEATMAP_SIZE image to a continuous keypoint coordinate. We maintain
    # consistency with keypoints_to_heatmap_labels by using the conversion from
    # Heckbert 1990: c = d + 0.5, where d is a discrete coordinate and c is a
    # continuous coordinate.
    offset_x = rois[:, 0]
    offset_y = rois[:, 1]

    widths = rois[:, 2] - rois[:, 0]
    heights = rois[:, 3] - rois[:, 1]
    widths = widths.clamp(min=1)
    heights = heights.clamp(min=1)
    widths_ceil = widths.ceil()
    heights_ceil = heights.ceil()

    num_keypoints = maps.shape[1]

    if torchvision._is_tracing():
        xy_preds, end_scores = _onnx_heatmaps_to_keypoints_loop(
            maps,
            rois,
            widths_ceil,
            heights_ceil,
            widths,
            heights,
            offset_x,
            offset_y,
            torch.scalar_tensor(num_keypoints, dtype=torch.int64),
        )
        return xy_preds.permute(0, 2, 1), end_scores

    xy_preds = torch.zeros((len(rois), 3, num_keypoints), dtype=torch.float32, device=maps.device)
    end_scores = torch.zeros((len(rois), num_keypoints), dtype=torch.float32, device=maps.device)
    for i in range(len(rois)):
        roi_map_width = int(widths_ceil[i].item())
        roi_map_height = int(heights_ceil[i].item())
        width_correction = widths[i] / roi_map_width
        height_correction = heights[i] / roi_map_height
        roi_map = F.interpolate(
            maps[i][:, None], size=(roi_map_height, roi_map_width), mode="bicubic", align_corners=False
        )[:, 0]
        # roi_map_probs = scores_to_probs(roi_map.copy())
        w = roi_map.shape[2]
        pos = roi_map.reshape(num_keypoints, -1).argmax(dim=1)

        x_int = pos % w
        y_int = torch.div(pos - x_int, w, rounding_mode="floor")
        # assert (roi_map_probs[k, y_int, x_int] ==
        #         roi_map_probs[k, :, :].max())
        x = (x_int.float() + 0.5) * width_correction
        y = (y_int.float() + 0.5) * height_correction
        xy_preds[i, 0, :] = x + offset_x[i]
        xy_preds[i, 1, :] = y + offset_y[i]
        xy_preds[i, 2, :] = 1
        end_scores[i, :] = roi_map[torch.arange(num_keypoints, device=roi_map.device), y_int, x_int]

    return xy_preds.permute(0, 2, 1), end_scores
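
# Illustrative sketch (not part of the original module): recovering keypoints for 2 rois
# with 17 keypoints and 56x56 heatmaps (shape assumptions only).
# >>> maps = torch.randn(2, 17, 56, 56)
# >>> rois = torch.tensor([[0., 0., 100., 200.], [50., 40., 150., 120.]])
# >>> xy_preds, scores = heatmaps_to_keypoints(maps, rois)
# >>> xy_preds.shape, scores.shape
# (torch.Size([2, 17, 3]), torch.Size([2, 17]))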


def keypointrcnn_loss(keypoint_logits, proposals, gt_keypoints, keypoint_matched_idxs):
    # type: (Tensor, List[Tensor], List[Tensor], List[Tensor]) -> Tensor
    N, K, H, W = keypoint_logits.shape
    if H != W:
        raise ValueError(
            f"keypoint_logits height and width (last two elements of shape) should be equal. Instead got H = {H} and W = {W}"
        )
    discretization_size = H
    heatmaps = []
    valid = []
    for proposals_per_image, gt_kp_in_image, midx in zip(proposals, gt_keypoints, keypoint_matched_idxs):
        kp = gt_kp_in_image[midx]
        heatmaps_per_image, valid_per_image = keypoints_to_heatmap(kp, proposals_per_image, discretization_size)
        heatmaps.append(heatmaps_per_image.view(-1))
        valid.append(valid_per_image.view(-1))

    keypoint_targets = torch.cat(heatmaps, dim=0)
    valid = torch.cat(valid, dim=0).to(dtype=torch.uint8)
    valid = torch.where(valid)[0]

    # torch.mean (in binary_cross_entropy_with_logits) doesn't
    # accept empty tensors, so handle it separately
    if keypoint_targets.numel() == 0 or len(valid) == 0:
        return keypoint_logits.sum() * 0

    keypoint_logits = keypoint_logits.view(N * K, H * W)

    keypoint_loss = F.cross_entropy(keypoint_logits[valid], keypoint_targets[valid])
    return keypoint_loss
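
# Illustrative sketch (not part of the original module): dummy inputs for keypointrcnn_loss.
# Assumptions: one image, 2 positive proposals, 17 keypoints, 56x56 keypoint logits.
# >>> keypoint_logits = torch.randn(2, 17, 56, 56)
# >>> proposals = [torch.tensor([[0., 0., 100., 100.], [20., 30., 80., 90.]])]
# >>> gt_keypoints = [torch.rand(1, 17, 3) * 50]
# >>> keypoint_matched_idxs = [torch.tensor([0, 0])]
# >>> loss = keypointrcnn_loss(keypoint_logits, proposals, gt_keypoints, keypoint_matched_idxs)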


def keypointrcnn_inference(x, boxes):
    # type: (Tensor, List[Tensor]) -> Tuple[List[Tensor], List[Tensor]]
    kp_probs = []
    kp_scores = []

    boxes_per_image = [box.size(0) for box in boxes]
    x2 = x.split(boxes_per_image, dim=0)

    for xx, bb in zip(x2, boxes):
        kp_prob, scores = heatmaps_to_keypoints(xx, bb)
        kp_probs.append(kp_prob)
        kp_scores.append(scores)

    return kp_probs, kp_scores


def _onnx_expand_boxes(boxes, scale):
    # type: (Tensor, float) -> Tensor
    w_half = (boxes[:, 2] - boxes[:, 0]) * 0.5
    h_half = (boxes[:, 3] - boxes[:, 1]) * 0.5
    x_c = (boxes[:, 2] + boxes[:, 0]) * 0.5
    y_c = (boxes[:, 3] + boxes[:, 1]) * 0.5

    w_half = w_half.to(dtype=torch.float32) * scale
    h_half = h_half.to(dtype=torch.float32) * scale

    boxes_exp0 = x_c - w_half
    boxes_exp1 = y_c - h_half
    boxes_exp2 = x_c + w_half
    boxes_exp3 = y_c + h_half
    boxes_exp = torch.stack((boxes_exp0, boxes_exp1, boxes_exp2, boxes_exp3), 1)
    return boxes_exp


# the next two functions should be merged inside Masker
# but are kept here for the moment while we need them
# temporarily for paste_mask_in_image
def expand_boxes(boxes, scale):
    # type: (Tensor, float) -> Tensor
    if torchvision._is_tracing():
        return _onnx_expand_boxes(boxes, scale)
    w_half = (boxes[:, 2] - boxes[:, 0]) * 0.5
    h_half = (boxes[:, 3] - boxes[:, 1]) * 0.5
    x_c = (boxes[:, 2] + boxes[:, 0]) * 0.5
    y_c = (boxes[:, 3] + boxes[:, 1]) * 0.5

    w_half *= scale
    h_half *= scale

    boxes_exp = torch.zeros_like(boxes)
    boxes_exp[:, 0] = x_c - w_half
    boxes_exp[:, 2] = x_c + w_half
    boxes_exp[:, 1] = y_c - h_half
    boxes_exp[:, 3] = y_c + h_half
    return boxes_exp
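
# Worked example (not part of the original module, numbers are assumptions): with a
# 28x28 mask and padding = 1, expand_masks returns scale = 30 / 28 ≈ 1.0714. Feeding a
# 10x20 box centred at (15, 20) through expand_boxes with that scale:
# >>> expand_boxes(torch.tensor([[10., 10., 20., 30.]]), 30 / 28)
# grows it to roughly [9.64, 9.29, 20.36, 30.71], i.e. the same centre with each side
# multiplied by the scale.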


@torch.jit.unused
def expand_masks_tracing_scale(M, padding):
    # type: (int, int) -> float
    return torch.tensor(M + 2 * padding).to(torch.float32) / torch.tensor(M).to(torch.float32)


def expand_masks(mask, padding):
    # type: (Tensor, int) -> Tuple[Tensor, float]
    M = mask.shape[-1]
    if torch._C._get_tracing_state():  # could not import is_tracing(), not sure why
        scale = expand_masks_tracing_scale(M, padding)
    else:
        scale = float(M + 2 * padding) / M
    padded_mask = F.pad(mask, (padding,) * 4)
    return padded_mask, scale


def paste_mask_in_image(mask, box, im_h, im_w):
    # type: (Tensor, Tensor, int, int) -> Tensor
    TO_REMOVE = 1
    w = int(box[2] - box[0] + TO_REMOVE)
    h = int(box[3] - box[1] + TO_REMOVE)
    w = max(w, 1)
    h = max(h, 1)

    # Set shape to [batchxCxHxW]
    mask = mask.expand((1, 1, -1, -1))

    # Resize mask
    mask = F.interpolate(mask, size=(h, w), mode="bilinear", align_corners=False)
    mask = mask[0][0]

    im_mask = torch.zeros((im_h, im_w), dtype=mask.dtype, device=mask.device)
    x_0 = max(box[0], 0)
    x_1 = min(box[2] + 1, im_w)
    y_0 = max(box[1], 0)
    y_1 = min(box[3] + 1, im_h)

    im_mask[y_0:y_1, x_0:x_1] = mask[(y_0 - box[1]) : (y_1 - box[1]), (x_0 - box[0]) : (x_1 - box[0])]
    return im_mask


def _onnx_paste_mask_in_image(mask, box, im_h, im_w):
    one = torch.ones(1, dtype=torch.int64)
    zero = torch.zeros(1, dtype=torch.int64)

    w = box[2] - box[0] + one
    h = box[3] - box[1] + one
    w = torch.max(torch.cat((w, one)))
    h = torch.max(torch.cat((h, one)))

    # Set shape to [batchxCxHxW]
    mask = mask.expand((1, 1, mask.size(0), mask.size(1)))

    # Resize mask
    mask = F.interpolate(mask, size=(int(h), int(w)), mode="bilinear", align_corners=False)
    mask = mask[0][0]

    x_0 = torch.max(torch.cat((box[0].unsqueeze(0), zero)))
    x_1 = torch.min(torch.cat((box[2].unsqueeze(0) + one, im_w.unsqueeze(0))))
    y_0 = torch.max(torch.cat((box[1].unsqueeze(0), zero)))
    y_1 = torch.min(torch.cat((box[3].unsqueeze(0) + one, im_h.unsqueeze(0))))

    unpaded_im_mask = mask[(y_0 - box[1]) : (y_1 - box[1]), (x_0 - box[0]) : (x_1 - box[0])]

    # TODO : replace below with a dynamic padding when support is added in ONNX
    # pad y
    zeros_y0 = torch.zeros(y_0, unpaded_im_mask.size(1))
    zeros_y1 = torch.zeros(im_h - y_1, unpaded_im_mask.size(1))
    concat_0 = torch.cat((zeros_y0, unpaded_im_mask.to(dtype=torch.float32), zeros_y1), 0)[0:im_h, :]
    # pad x
    zeros_x0 = torch.zeros(concat_0.size(0), x_0)
    zeros_x1 = torch.zeros(concat_0.size(0), im_w - x_1)
    im_mask = torch.cat((zeros_x0, concat_0, zeros_x1), 1)[:, :im_w]
    return im_mask


@torch.jit._script_if_tracing
def _onnx_paste_masks_in_image_loop(masks, boxes, im_h, im_w):
    res_append = torch.zeros(0, im_h, im_w)
    for i in range(masks.size(0)):
        mask_res = _onnx_paste_mask_in_image(masks[i][0], boxes[i], im_h, im_w)
        mask_res = mask_res.unsqueeze(0)
        res_append = torch.cat((res_append, mask_res))
    return res_append


def paste_masks_in_image(masks, boxes, img_shape, padding=1):
    # type: (Tensor, Tensor, Tuple[int, int], int) -> Tensor
    masks, scale = expand_masks(masks, padding=padding)
    boxes = expand_boxes(boxes, scale).to(dtype=torch.int64)
    im_h, im_w = img_shape

    if torchvision._is_tracing():
        return _onnx_paste_masks_in_image_loop(
            masks, boxes, torch.scalar_tensor(im_h, dtype=torch.int64), torch.scalar_tensor(im_w, dtype=torch.int64)
        )[:, None]
    res = [paste_mask_in_image(m[0], b, im_h, im_w) for m, b in zip(masks, boxes)]
    if len(res) > 0:
        ret = torch.stack(res, dim=0)[:, None]
    else:
        ret = masks.new_empty((0, 1, im_h, im_w))
    return ret
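
# Illustrative sketch (not part of the original module): pasting two 28x28 mask
# probabilities back into a 480x640 image (shapes are assumptions only).
# >>> masks = torch.rand(2, 1, 28, 28)
# >>> boxes = torch.tensor([[10., 20., 110., 220.], [300., 40., 420., 160.]])
# >>> pasted = paste_masks_in_image(masks, boxes, img_shape=(480, 640), padding=1)
# >>> pasted.shape
# torch.Size([2, 1, 480, 640])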


class RoIHeads(nn.Module):
    __annotations__ = {
        "box_coder": det_utils.BoxCoder,
        "proposal_matcher": det_utils.Matcher,
        "fg_bg_sampler": det_utils.BalancedPositiveNegativeSampler,
    }

    def __init__(
        self,
        box_roi_pool,
        box_head,
        box_predictor,
        # Faster R-CNN training
        fg_iou_thresh,
        bg_iou_thresh,
        batch_size_per_image,
        positive_fraction,
        bbox_reg_weights,
        # Faster R-CNN inference
        score_thresh,
        nms_thresh,
        detections_per_img,
        # Mask
        mask_roi_pool=None,
        mask_head=None,
        mask_predictor=None,
        keypoint_roi_pool=None,
        keypoint_head=None,
        keypoint_predictor=None,
    ):
        super().__init__()

        self.box_similarity = box_ops.box_iou
        # assign ground-truth boxes for each proposal
        self.proposal_matcher = det_utils.Matcher(fg_iou_thresh, bg_iou_thresh, allow_low_quality_matches=False)

        self.fg_bg_sampler = det_utils.BalancedPositiveNegativeSampler(batch_size_per_image, positive_fraction)

        if bbox_reg_weights is None:
            bbox_reg_weights = (10.0, 10.0, 5.0, 5.0)
        self.box_coder = det_utils.BoxCoder(bbox_reg_weights)

        self.box_roi_pool = box_roi_pool
        self.box_head = box_head
        self.box_predictor = box_predictor

        self.score_thresh = score_thresh
        self.nms_thresh = nms_thresh
        self.detections_per_img = detections_per_img

        self.mask_roi_pool = mask_roi_pool
        self.mask_head = mask_head
        self.mask_predictor = mask_predictor

        self.keypoint_roi_pool = keypoint_roi_pool
        self.keypoint_head = keypoint_head
        self.keypoint_predictor = keypoint_predictor

    def has_mask(self):
        if self.mask_roi_pool is None:
            return False
        if self.mask_head is None:
            return False
        if self.mask_predictor is None:
            return False
        return True

    def has_keypoint(self):
        if self.keypoint_roi_pool is None:
            return False
        if self.keypoint_head is None:
            return False
        if self.keypoint_predictor is None:
            return False
        return True

    def assign_targets_to_proposals(self, proposals, gt_boxes, gt_labels):
        # type: (List[Tensor], List[Tensor], List[Tensor]) -> Tuple[List[Tensor], List[Tensor]]
        matched_idxs = []
        labels = []
        for proposals_in_image, gt_boxes_in_image, gt_labels_in_image in zip(proposals, gt_boxes, gt_labels):

            if gt_boxes_in_image.numel() == 0:
                # Background image
                device = proposals_in_image.device
                clamped_matched_idxs_in_image = torch.zeros(
                    (proposals_in_image.shape[0],), dtype=torch.int64, device=device
                )
                labels_in_image = torch.zeros((proposals_in_image.shape[0],), dtype=torch.int64, device=device)
            else:
                # set to self.box_similarity when https://github.com/pytorch/pytorch/issues/27495 lands
                match_quality_matrix = box_ops.box_iou(gt_boxes_in_image, proposals_in_image)
                matched_idxs_in_image = self.proposal_matcher(match_quality_matrix)

                clamped_matched_idxs_in_image = matched_idxs_in_image.clamp(min=0)

                labels_in_image = gt_labels_in_image[clamped_matched_idxs_in_image]
                labels_in_image = labels_in_image.to(dtype=torch.int64)

                # Label background (below the low threshold)
                bg_inds = matched_idxs_in_image == self.proposal_matcher.BELOW_LOW_THRESHOLD
                labels_in_image[bg_inds] = 0

                # Label ignore proposals (between low and high thresholds)
                ignore_inds = matched_idxs_in_image == self.proposal_matcher.BETWEEN_THRESHOLDS
                labels_in_image[ignore_inds] = -1  # -1 is ignored by sampler

            matched_idxs.append(clamped_matched_idxs_in_image)
            labels.append(labels_in_image)
        return matched_idxs, labels

    def subsample(self, labels):
        # type: (List[Tensor]) -> List[Tensor]
        sampled_pos_inds, sampled_neg_inds = self.fg_bg_sampler(labels)
        sampled_inds = []
        for img_idx, (pos_inds_img, neg_inds_img) in enumerate(zip(sampled_pos_inds, sampled_neg_inds)):
            img_sampled_inds = torch.where(pos_inds_img | neg_inds_img)[0]
            sampled_inds.append(img_sampled_inds)
        return sampled_inds

    def add_gt_proposals(self, proposals, gt_boxes):
        # type: (List[Tensor], List[Tensor]) -> List[Tensor]
        proposals = [torch.cat((proposal, gt_box)) for proposal, gt_box in zip(proposals, gt_boxes)]
        return proposals

    def check_targets(self, targets):
        # type: (Optional[List[Dict[str, Tensor]]]) -> None
        if targets is None:
            raise ValueError("targets should not be None")
        if not all(["boxes" in t for t in targets]):
            raise ValueError("Every element of targets should have a boxes key")
        if not all(["labels" in t for t in targets]):
            raise ValueError("Every element of targets should have a labels key")
        if self.has_mask():
            if not all(["masks" in t for t in targets]):
                raise ValueError("Every element of targets should have a masks key")

    def select_training_samples(
        self,
        proposals,  # type: List[Tensor]
        targets,  # type: Optional[List[Dict[str, Tensor]]]
    ):
        # type: (...) -> Tuple[List[Tensor], List[Tensor], List[Tensor], List[Tensor]]
        self.check_targets(targets)
        if targets is None:
            raise ValueError("targets should not be None")
        dtype = proposals[0].dtype
        device = proposals[0].device

        gt_boxes = [t["boxes"].to(dtype) for t in targets]
        gt_labels = [t["labels"] for t in targets]

        # append ground-truth bboxes to proposals
        proposals = self.add_gt_proposals(proposals, gt_boxes)

        # get matching gt indices for each proposal
        matched_idxs, labels = self.assign_targets_to_proposals(proposals, gt_boxes, gt_labels)
        # sample a fixed proportion of positive-negative proposals
        sampled_inds = self.subsample(labels)
        matched_gt_boxes = []
        num_images = len(proposals)
        for img_id in range(num_images):
            img_sampled_inds = sampled_inds[img_id]
            proposals[img_id] = proposals[img_id][img_sampled_inds]
            labels[img_id] = labels[img_id][img_sampled_inds]
            matched_idxs[img_id] = matched_idxs[img_id][img_sampled_inds]

            gt_boxes_in_image = gt_boxes[img_id]
            if gt_boxes_in_image.numel() == 0:
                gt_boxes_in_image = torch.zeros((1, 4), dtype=dtype, device=device)
            matched_gt_boxes.append(gt_boxes_in_image[matched_idxs[img_id]])

        regression_targets = self.box_coder.encode(matched_gt_boxes, proposals)
        return proposals, matched_idxs, labels, regression_targets

    def postprocess_detections(
        self,
        class_logits,  # type: Tensor
        box_regression,  # type: Tensor
        proposals,  # type: List[Tensor]
        image_shapes,  # type: List[Tuple[int, int]]
    ):
        # type: (...) -> Tuple[List[Tensor], List[Tensor], List[Tensor]]
        device = class_logits.device
        num_classes = class_logits.shape[-1]

        boxes_per_image = [boxes_in_image.shape[0] for boxes_in_image in proposals]
        pred_boxes = self.box_coder.decode(box_regression, proposals)

        pred_scores = F.softmax(class_logits, -1)

        pred_boxes_list = pred_boxes.split(boxes_per_image, 0)
        pred_scores_list = pred_scores.split(boxes_per_image, 0)

        all_boxes = []
        all_scores = []
        all_labels = []
        for boxes, scores, image_shape in zip(pred_boxes_list, pred_scores_list, image_shapes):
            boxes = box_ops.clip_boxes_to_image(boxes, image_shape)

            # create labels for each prediction
            labels = torch.arange(num_classes, device=device)
            labels = labels.view(1, -1).expand_as(scores)

            # remove predictions with the background label
            boxes = boxes[:, 1:]
            scores = scores[:, 1:]
            labels = labels[:, 1:]

            # batch everything, by making every class prediction be a separate instance
            boxes = boxes.reshape(-1, 4)
            scores = scores.reshape(-1)
            labels = labels.reshape(-1)

            # remove low scoring boxes
            inds = torch.where(scores > self.score_thresh)[0]
            boxes, scores, labels = boxes[inds], scores[inds], labels[inds]

            # remove empty boxes
            keep = box_ops.remove_small_boxes(boxes, min_size=1e-2)
            boxes, scores, labels = boxes[keep], scores[keep], labels[keep]

            # non-maximum suppression, independently done per class
            keep = box_ops.batched_nms(boxes, scores, labels, self.nms_thresh)
            # keep only topk scoring predictions
            keep = keep[: self.detections_per_img]
            boxes, scores, labels = boxes[keep], scores[keep], labels[keep]

            all_boxes.append(boxes)
            all_scores.append(scores)
            all_labels.append(labels)

        return all_boxes, all_scores, all_labels
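
    # Illustrative note (not part of the original module): in torchvision's detection
    # wrappers these thresholds typically arrive as something like score_thresh=0.05,
    # nms_thresh=0.5 and detections_per_img=100 (values here are assumptions). A box then
    # survives only if its per-class score clears score_thresh, it is not suppressed by a
    # higher-scoring same-class box at IoU above nms_thresh, and it ranks inside the top
    # detections_per_img scores for its image.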

    def forward(
        self,
        features,  # type: Dict[str, Tensor]
        proposals,  # type: List[Tensor]
        image_shapes,  # type: List[Tuple[int, int]]
        targets=None,  # type: Optional[List[Dict[str, Tensor]]]
    ):
        # type: (...) -> Tuple[List[Dict[str, Tensor]], Dict[str, Tensor]]
        """
        Args:
            features (Dict[str, Tensor])
            proposals (List[Tensor[N, 4]])
            image_shapes (List[Tuple[H, W]])
            targets (List[Dict])
        """
        if targets is not None:
            for t in targets:
                # TODO: https://github.com/pytorch/pytorch/issues/26731
                floating_point_types = (torch.float, torch.double, torch.half)
                if not t["boxes"].dtype in floating_point_types:
                    raise TypeError(f"target boxes must be of float type, instead got {t['boxes'].dtype}")
                if not t["labels"].dtype == torch.int64:
                    raise TypeError(f"target labels must be of int64 type, instead got {t['labels'].dtype}")
                if self.has_keypoint():
                    if not t["keypoints"].dtype == torch.float32:
                        raise TypeError(f"target keypoints must be of float type, instead got {t['keypoints'].dtype}")

        if self.training:
            proposals, matched_idxs, labels, regression_targets = self.select_training_samples(proposals, targets)
        else:
            labels = None
            regression_targets = None
            matched_idxs = None

        box_features = self.box_roi_pool(features, proposals, image_shapes)
        box_features = self.box_head(box_features)
        class_logits, box_regression = self.box_predictor(box_features)

        result: List[Dict[str, torch.Tensor]] = []
        losses = {}
        if self.training:
            if labels is None:
                raise ValueError("labels cannot be None")
            if regression_targets is None:
                raise ValueError("regression_targets cannot be None")
            loss_classifier, loss_box_reg = fastrcnn_loss(class_logits, box_regression, labels, regression_targets)
            losses = {"loss_classifier": loss_classifier, "loss_box_reg": loss_box_reg}
        else:
            boxes, scores, labels = self.postprocess_detections(class_logits, box_regression, proposals, image_shapes)
            num_images = len(boxes)
            for i in range(num_images):
                result.append(
                    {
                        "boxes": boxes[i],
                        "labels": labels[i],
                        "scores": scores[i],
                    }
                )

        if self.has_mask():
            mask_proposals = [p["boxes"] for p in result]
            if self.training:
                if matched_idxs is None:
                    raise ValueError("if in training, matched_idxs should not be None")

                # during training, only focus on positive boxes
                num_images = len(proposals)
                mask_proposals = []
                pos_matched_idxs = []
                for img_id in range(num_images):
                    pos = torch.where(labels[img_id] > 0)[0]
                    mask_proposals.append(proposals[img_id][pos])
                    pos_matched_idxs.append(matched_idxs[img_id][pos])
            else:
                pos_matched_idxs = None

            if self.mask_roi_pool is not None:
                mask_features = self.mask_roi_pool(features, mask_proposals, image_shapes)
                mask_features = self.mask_head(mask_features)
                mask_logits = self.mask_predictor(mask_features)
            else:
                raise Exception("Expected mask_roi_pool to be not None")

            loss_mask = {}
            if self.training:
                if targets is None or pos_matched_idxs is None or mask_logits is None:
                    raise ValueError("targets, pos_matched_idxs, mask_logits cannot be None when training")

                gt_masks = [t["masks"] for t in targets]
                gt_labels = [t["labels"] for t in targets]
                rcnn_loss_mask = maskrcnn_loss(mask_logits, mask_proposals, gt_masks, gt_labels, pos_matched_idxs)
                loss_mask = {"loss_mask": rcnn_loss_mask}
            else:
                labels = [r["labels"] for r in result]
                masks_probs = maskrcnn_inference(mask_logits, labels)
                for mask_prob, r in zip(masks_probs, result):
                    r["masks"] = mask_prob

            losses.update(loss_mask)

        # keep none checks in if conditional so torchscript will conditionally
        # compile each branch
        if (
            self.keypoint_roi_pool is not None
            and self.keypoint_head is not None
            and self.keypoint_predictor is not None
        ):
            keypoint_proposals = [p["boxes"] for p in result]
            if self.training:
                # during training, only focus on positive boxes
                num_images = len(proposals)
                keypoint_proposals = []
                pos_matched_idxs = []
                if matched_idxs is None:
                    raise ValueError("if in training, matched_idxs should not be None")

                for img_id in range(num_images):
                    pos = torch.where(labels[img_id] > 0)[0]
                    keypoint_proposals.append(proposals[img_id][pos])
                    pos_matched_idxs.append(matched_idxs[img_id][pos])
            else:
                pos_matched_idxs = None

            keypoint_features = self.keypoint_roi_pool(features, keypoint_proposals, image_shapes)
            keypoint_features = self.keypoint_head(keypoint_features)
            keypoint_logits = self.keypoint_predictor(keypoint_features)

            loss_keypoint = {}
            if self.training:
                if targets is None or pos_matched_idxs is None:
                    raise ValueError("both targets and pos_matched_idxs should not be None when in training mode")

                gt_keypoints = [t["keypoints"] for t in targets]
                rcnn_loss_keypoint = keypointrcnn_loss(
                    keypoint_logits, keypoint_proposals, gt_keypoints, pos_matched_idxs
                )
                loss_keypoint = {"loss_keypoint": rcnn_loss_keypoint}
            else:
                if keypoint_logits is None or keypoint_proposals is None:
                    raise ValueError(
                        "both keypoint_logits and keypoint_proposals should not be None when not in training mode"
                    )

                keypoints_probs, kp_scores = keypointrcnn_inference(keypoint_logits, keypoint_proposals)
                for keypoint_prob, kps, r in zip(keypoints_probs, kp_scores, result):
                    r["keypoints"] = keypoint_prob
                    r["keypoints_scores"] = kps
            losses.update(loss_keypoint)

        return result, losses
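

# Illustrative sketch (not part of the original module): the inputs RoIHeads.forward
# expects when called from a detection model such as torchvision's FasterRCNN
# (names and shapes below are assumptions, not the library API):
# >>> # features: a dict of FPN feature maps, e.g. {"0": Tensor[2, 256, 200, 272], ...}
# >>> # proposals: one Tensor[num_proposals, 4] per image, in (x1, y1, x2, y2) format
# >>> # image_shapes: [(800, 1088), (800, 1066)] -- sizes after the model's transform
# >>> # targets: optional list of dicts with "boxes"/"labels" (plus "masks"/"keypoints")
# >>> # detections, losses = roi_heads(features, proposals, image_shapes, targets)
# In eval mode `detections` holds per-image dicts of boxes/labels/scores (and masks or
# keypoints when those heads are configured); in train mode `losses` holds the classifier,
# box-regression and optional mask/keypoint losses.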