import warnings
from collections import OrderedDict
from typing import Any, Dict, List, Optional, Tuple

import torch
import torch.nn.functional as F
from torch import nn, Tensor

from ...ops import boxes as box_ops
from ...transforms._presets import ObjectDetection
from ...utils import _log_api_usage_once
from .._api import register_model, Weights, WeightsEnum
from .._meta import _COCO_CATEGORIES
from .._utils import _ovewrite_value_param, handle_legacy_interface
from ..vgg import VGG, vgg16, VGG16_Weights
from . import _utils as det_utils
from .anchor_utils import DefaultBoxGenerator
from .backbone_utils import _validate_trainable_layers
from .transform import GeneralizedRCNNTransform


__all__ = [
    "SSD300_VGG16_Weights",
    "ssd300_vgg16",
]


class SSD300_VGG16_Weights(WeightsEnum):
    COCO_V1 = Weights(
        url="https://download.pytorch.org/models/ssd300_vgg16_coco-b556d3b4.pth",
        transforms=ObjectDetection,
        meta={
            "num_params": 35641826,
            "categories": _COCO_CATEGORIES,
            "min_size": (1, 1),
            "recipe": "https://github.com/pytorch/vision/tree/main/references/detection#ssd300-vgg16",
            "_metrics": {
                "COCO-val2017": {
                    "box_map": 25.1,
                }
            },
            "_ops": 34.858,
            "_file_size": 135.988,
            "_docs": """These weights were produced by following a training recipe similar to the one described in the paper.""",
        },
    )
    DEFAULT = COCO_V1


def _xavier_init(conv: nn.Module):
    for layer in conv.modules():
        if isinstance(layer, nn.Conv2d):
            torch.nn.init.xavier_uniform_(layer.weight)
            if layer.bias is not None:
                torch.nn.init.constant_(layer.bias, 0.0)


class SSDHead(nn.Module):
    def __init__(self, in_channels: List[int], num_anchors: List[int], num_classes: int):
        super().__init__()
        self.classification_head = SSDClassificationHead(in_channels, num_anchors, num_classes)
        self.regression_head = SSDRegressionHead(in_channels, num_anchors)

    def forward(self, x: List[Tensor]) -> Dict[str, Tensor]:
        return {
            "bbox_regression": self.regression_head(x),
            "cls_logits": self.classification_head(x),
        }


class SSDScoringHead(nn.Module):
    def __init__(self, module_list: nn.ModuleList, num_columns: int):
        super().__init__()
        self.module_list = module_list
        self.num_columns = num_columns

    def _get_result_from_module_list(self, x: Tensor, idx: int) -> Tensor:
        """
        This is equivalent to self.module_list[idx](x),
        but TorchScript doesn't support this yet.
        """
        num_blocks = len(self.module_list)
        if idx < 0:
            idx += num_blocks
        out = x
        for i, module in enumerate(self.module_list):
            if i == idx:
                out = module(x)
        return out

    def forward(self, x: List[Tensor]) -> Tensor:
        all_results = []
        for i, features in enumerate(x):
            results = self._get_result_from_module_list(features, i)

            # Permute the output from (N, A * K, H, W) to (N, HWA, K), where A is the
            # number of anchors per location and K is the number of columns. For
            # example, with K = 4 box coordinates and A = 6 anchors, a (N, 24, H, W)
            # map becomes (N, H * W * 6, 4).
            N, _, H, W = results.shape
            results = results.view(N, -1, self.num_columns, H, W)
            results = results.permute(0, 3, 4, 1, 2)
            results = results.reshape(N, -1, self.num_columns)  # Size=(N, HWA, K)

            all_results.append(results)
        return torch.cat(all_results, dim=1)


class SSDClassificationHead(SSDScoringHead):
    def __init__(self, in_channels: List[int], num_anchors: List[int], num_classes: int):
        cls_logits = nn.ModuleList()
        for channels, anchors in zip(in_channels, num_anchors):
            cls_logits.append(nn.Conv2d(channels, num_classes * anchors, kernel_size=3, padding=1))
        _xavier_init(cls_logits)
        super().__init__(cls_logits, num_classes)


class SSDRegressionHead(SSDScoringHead):
    def __init__(self, in_channels: List[int], num_anchors: List[int]):
        bbox_reg = nn.ModuleList()
        for channels, anchors in zip(in_channels, num_anchors):
            bbox_reg.append(nn.Conv2d(channels, 4 * anchors, kernel_size=3, padding=1))
        _xavier_init(bbox_reg)
        super().__init__(bbox_reg, 4)


class SSD(nn.Module):
    """
    Implements the SSD architecture from `"SSD: Single Shot MultiBox Detector" <https://arxiv.org/abs/1512.02325>`_.

    The input to the model is expected to be a list of tensors, each of shape [C, H, W], one for each
    image, and should be in 0-1 range. Different images can have different sizes, but they will be
    resized to a fixed size before being passed to the backbone.

    The behavior of the model changes depending on whether it is in training or evaluation mode.

    During training, the model expects both the input tensors and targets (list of dictionaries),
    containing:

        - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
          ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
        - labels (Int64Tensor[N]): the class label for each ground-truth box

    The model returns a Dict[Tensor] during training, containing the classification and regression
    losses.

    During inference, the model requires only the input tensors, and returns the post-processed
    predictions as a List[Dict[Tensor]], one for each input image. The fields of the Dict are as
    follows, where ``N`` is the number of detections:

        - boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with
          ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
        - labels (Int64Tensor[N]): the predicted labels for each detection
        - scores (Tensor[N]): the scores for each detection

    Args:
        backbone (nn.Module): the network used to compute the features for the model.
            It should contain an out_channels attribute with the list of the output channels of
            each feature map. The backbone should return a single Tensor or an OrderedDict[Tensor].
        anchor_generator (DefaultBoxGenerator): module that generates the default boxes for a
            set of feature maps.
        size (Tuple[int, int]): the width and height to which images will be rescaled before feeding them
            to the backbone.
        num_classes (int): number of output classes of the model (including the background).
        image_mean (Tuple[float, float, float]): mean values used for input normalization.
            They are generally the mean values of the dataset on which the backbone has been trained.
        image_std (Tuple[float, float, float]): std values used for input normalization.
            They are generally the std values of the dataset on which the backbone has been trained.
        head (nn.Module, optional): Module run on top of the backbone features. Defaults to a module containing
            a classification and regression module.
        score_thresh (float): Score threshold used for postprocessing the detections.
        nms_thresh (float): NMS threshold used for postprocessing the detections.
        detections_per_img (int): Number of best detections to keep after NMS.
        iou_thresh (float): minimum IoU between an anchor and a GT box for the pair to be
            considered a positive match during training.
        topk_candidates (int): Number of best detections to keep before NMS.
        positive_fraction (float): a number between 0 and 1 which indicates the proportion of positive
            proposals used during the training of the classification head. It is used to estimate the
            negative to positive ratio.
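
    Example (a minimal construction sketch; the MobileNetV2 backbone and the manually
    attached ``out_channels`` attribute below are illustrative choices, not part of this module):

        >>> import torch
        >>> import torchvision
        >>> from torchvision.models.detection.ssd import SSD
        >>> from torchvision.models.detection.anchor_utils import DefaultBoxGenerator
        >>>
        >>> # Any backbone works as long as it exposes the per-feature-map channel
        >>> # counts; here a single feature map with 1280 channels is assumed.
        >>> backbone = torchvision.models.mobilenet_v2(weights="DEFAULT").features
        >>> backbone.out_channels = [1280]
        >>> # One set of aspect ratios per feature map.
        >>> anchor_generator = DefaultBoxGenerator([[2]])
        >>> model = SSD(backbone, anchor_generator, size=(300, 300), num_classes=91)
        >>> model.eval()
        >>> predictions = model([torch.rand(3, 300, 300)])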
- """

    __annotations__ = {
        "box_coder": det_utils.BoxCoder,
        "proposal_matcher": det_utils.Matcher,
    }

    def __init__(
        self,
        backbone: nn.Module,
        anchor_generator: DefaultBoxGenerator,
        size: Tuple[int, int],
        num_classes: int,
        image_mean: Optional[List[float]] = None,
        image_std: Optional[List[float]] = None,
        head: Optional[nn.Module] = None,
        score_thresh: float = 0.01,
        nms_thresh: float = 0.45,
        detections_per_img: int = 200,
        iou_thresh: float = 0.5,
        topk_candidates: int = 400,
        positive_fraction: float = 0.25,
        **kwargs: Any,
    ):
        super().__init__()
        _log_api_usage_once(self)

        self.backbone = backbone
        self.anchor_generator = anchor_generator
        self.box_coder = det_utils.BoxCoder(weights=(10.0, 10.0, 5.0, 5.0))

        if head is None:
            if hasattr(backbone, "out_channels"):
                out_channels = backbone.out_channels
            else:
                out_channels = det_utils.retrieve_out_channels(backbone, size)

            if len(out_channels) != len(anchor_generator.aspect_ratios):
                raise ValueError(
                    f"The length of the output channels from the backbone ({len(out_channels)}) does not match "
                    f"the length of the anchor generator aspect ratios ({len(anchor_generator.aspect_ratios)})"
                )

            num_anchors = self.anchor_generator.num_anchors_per_location()
            head = SSDHead(out_channels, num_anchors, num_classes)
        self.head = head

        self.proposal_matcher = det_utils.SSDMatcher(iou_thresh)

        if image_mean is None:
            image_mean = [0.485, 0.456, 0.406]
        if image_std is None:
            image_std = [0.229, 0.224, 0.225]
        self.transform = GeneralizedRCNNTransform(
            min(size), max(size), image_mean, image_std, size_divisible=1, fixed_size=size, **kwargs
        )

        self.score_thresh = score_thresh
        self.nms_thresh = nms_thresh
        self.detections_per_img = detections_per_img
        self.topk_candidates = topk_candidates
        self.neg_to_pos_ratio = (1.0 - positive_fraction) / positive_fraction

        # used only on torchscript mode
        self._has_warned = False

    @torch.jit.unused
    def eager_outputs(
        self, losses: Dict[str, Tensor], detections: List[Dict[str, Tensor]]
    ) -> Tuple[Dict[str, Tensor], List[Dict[str, Tensor]]]:
        if self.training:
            return losses

        return detections

    def compute_loss(
        self,
        targets: List[Dict[str, Tensor]],
        head_outputs: Dict[str, Tensor],
        anchors: List[Tensor],
        matched_idxs: List[Tensor],
    ) -> Dict[str, Tensor]:
        bbox_regression = head_outputs["bbox_regression"]
        cls_logits = head_outputs["cls_logits"]

        # Match original targets with default boxes
        num_foreground = 0
        bbox_loss = []
        cls_targets = []
        for (
            targets_per_image,
            bbox_regression_per_image,
            cls_logits_per_image,
            anchors_per_image,
            matched_idxs_per_image,
        ) in zip(targets, bbox_regression, cls_logits, anchors, matched_idxs):
            # produce the matching between boxes and targets
            foreground_idxs_per_image = torch.where(matched_idxs_per_image >= 0)[0]
            foreground_matched_idxs_per_image = matched_idxs_per_image[foreground_idxs_per_image]
            num_foreground += foreground_matched_idxs_per_image.numel()

            # Calculate regression loss
            matched_gt_boxes_per_image = targets_per_image["boxes"][foreground_matched_idxs_per_image]
            bbox_regression_per_image = bbox_regression_per_image[foreground_idxs_per_image, :]
            anchors_per_image = anchors_per_image[foreground_idxs_per_image, :]
            target_regression = self.box_coder.encode_single(matched_gt_boxes_per_image, anchors_per_image)
            bbox_loss.append(
                torch.nn.functional.smooth_l1_loss(bbox_regression_per_image, target_regression, reduction="sum")
            )

            # Estimate ground truth for class targets
            gt_classes_target = torch.zeros(
                (cls_logits_per_image.size(0),),
                dtype=targets_per_image["labels"].dtype,
                device=targets_per_image["labels"].device,
            )
            gt_classes_target[foreground_idxs_per_image] = targets_per_image["labels"][
                foreground_matched_idxs_per_image
            ]
            cls_targets.append(gt_classes_target)

        bbox_loss = torch.stack(bbox_loss)
        cls_targets = torch.stack(cls_targets)

        # Calculate classification loss
        num_classes = cls_logits.size(-1)
        cls_loss = F.cross_entropy(cls_logits.view(-1, num_classes), cls_targets.view(-1), reduction="none").view(
            cls_targets.size()
        )
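
        # Hard-negative mining keeps the classification loss balanced: with the
        # default positive_fraction of 0.25, neg_to_pos_ratio is
        # (1 - 0.25) / 0.25 = 3, so at most three negative anchors are kept for
        # every positive one, matching the 3:1 ratio used in the SSD paper.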
        # Hard Negative Sampling
        foreground_idxs = cls_targets > 0
        num_negative = self.neg_to_pos_ratio * foreground_idxs.sum(1, keepdim=True)
        # num_negative[num_negative < self.neg_to_pos_ratio] = self.neg_to_pos_ratio
        negative_loss = cls_loss.clone()
        negative_loss[foreground_idxs] = -float("inf")  # use -inf to detect positive values that crept into the sample
        values, idx = negative_loss.sort(1, descending=True)
        # background_idxs = torch.logical_and(idx.sort(1)[1] < num_negative, torch.isfinite(values))
        background_idxs = idx.sort(1)[1] < num_negative

        N = max(1, num_foreground)
        return {
            "bbox_regression": bbox_loss.sum() / N,
            "classification": (cls_loss[foreground_idxs].sum() + cls_loss[background_idxs].sum()) / N,
        }

    def forward(
        self, images: List[Tensor], targets: Optional[List[Dict[str, Tensor]]] = None
    ) -> Tuple[Dict[str, Tensor], List[Dict[str, Tensor]]]:
        if self.training:
            if targets is None:
                torch._assert(False, "targets should not be None when in training mode")
            else:
                for target in targets:
                    boxes = target["boxes"]
                    if isinstance(boxes, torch.Tensor):
                        torch._assert(
                            len(boxes.shape) == 2 and boxes.shape[-1] == 4,
                            f"Expected target boxes to be a tensor of shape [N, 4], got {boxes.shape}.",
                        )
                    else:
                        torch._assert(False, f"Expected target boxes to be of type Tensor, got {type(boxes)}.")

        # get the original image sizes
        original_image_sizes: List[Tuple[int, int]] = []
        for img in images:
            val = img.shape[-2:]
            torch._assert(
                len(val) == 2,
                f"expecting the last two dimensions of the Tensor to be H and W instead got {img.shape[-2:]}",
            )
            original_image_sizes.append((val[0], val[1]))

        # transform the input
        images, targets = self.transform(images, targets)

        # Check for degenerate boxes
        if targets is not None:
            for target_idx, target in enumerate(targets):
                boxes = target["boxes"]
                degenerate_boxes = boxes[:, 2:] <= boxes[:, :2]
                if degenerate_boxes.any():
                    bb_idx = torch.where(degenerate_boxes.any(dim=1))[0][0]
                    degen_bb: List[float] = boxes[bb_idx].tolist()
                    torch._assert(
                        False,
                        "All bounding boxes should have positive height and width."
                        f" Found invalid box {degen_bb} for target at index {target_idx}.",
                    )

        # get the features from the backbone
        features = self.backbone(images.tensors)
        if isinstance(features, torch.Tensor):
            features = OrderedDict([("0", features)])

        features = list(features.values())

        # compute the ssd heads outputs using the features
        head_outputs = self.head(features)

        # create the set of anchors
        anchors = self.anchor_generator(images, features)

        losses = {}
        detections: List[Dict[str, Tensor]] = []
        if self.training:
            matched_idxs = []
            if targets is None:
                torch._assert(False, "targets should not be None when in training mode")
            else:
                for anchors_per_image, targets_per_image in zip(anchors, targets):
                    if targets_per_image["boxes"].numel() == 0:
                        matched_idxs.append(
                            torch.full(
                                (anchors_per_image.size(0),), -1, dtype=torch.int64, device=anchors_per_image.device
                            )
                        )
                        continue

                    match_quality_matrix = box_ops.box_iou(targets_per_image["boxes"], anchors_per_image)
                    matched_idxs.append(self.proposal_matcher(match_quality_matrix))

                losses = self.compute_loss(targets, head_outputs, anchors, matched_idxs)
        else:
            detections = self.postprocess_detections(head_outputs, anchors, images.image_sizes)
            detections = self.transform.postprocess(detections, images.image_sizes, original_image_sizes)

        if torch.jit.is_scripting():
            if not self._has_warned:
                warnings.warn("SSD always returns a (Losses, Detections) tuple in scripting")
                self._has_warned = True
            return losses, detections
        return self.eager_outputs(losses, detections)

    def postprocess_detections(
        self, head_outputs: Dict[str, Tensor], image_anchors: List[Tensor], image_shapes: List[Tuple[int, int]]
    ) -> List[Dict[str, Tensor]]:
        bbox_regression = head_outputs["bbox_regression"]
        pred_scores = F.softmax(head_outputs["cls_logits"], dim=-1)

        num_classes = pred_scores.size(-1)
        device = pred_scores.device

        detections: List[Dict[str, Tensor]] = []

        for boxes, scores, anchors, image_shape in zip(bbox_regression, pred_scores, image_anchors, image_shapes):
            boxes = self.box_coder.decode_single(boxes, anchors)
            boxes = box_ops.clip_boxes_to_image(boxes, image_shape)

            image_boxes = []
            image_scores = []
            image_labels = []
            for label in range(1, num_classes):
                score = scores[:, label]

                keep_idxs = score > self.score_thresh
                score = score[keep_idxs]
                box = boxes[keep_idxs]

                # keep only topk scoring predictions
                num_topk = det_utils._topk_min(score, self.topk_candidates, 0)
                score, idxs = score.topk(num_topk)
                box = box[idxs]

                image_boxes.append(box)
                image_scores.append(score)
                image_labels.append(torch.full_like(score, fill_value=label, dtype=torch.int64, device=device))

            image_boxes = torch.cat(image_boxes, dim=0)
            image_scores = torch.cat(image_scores, dim=0)
            image_labels = torch.cat(image_labels, dim=0)

            # non-maximum suppression
            keep = box_ops.batched_nms(image_boxes, image_scores, image_labels, self.nms_thresh)
            keep = keep[: self.detections_per_img]

            detections.append(
                {
                    "boxes": image_boxes[keep],
                    "scores": image_scores[keep],
                    "labels": image_labels[keep],
                }
            )
        return detections


class SSDFeatureExtractorVGG(nn.Module):
    def __init__(self, backbone: nn.Module, highres: bool):
        super().__init__()

        _, _, maxpool3_pos, maxpool4_pos, _ = (i for i, layer in enumerate(backbone) if isinstance(layer, nn.MaxPool2d))

        # Patch ceil_mode for maxpool3 to get the same WxH output sizes as the paper
        backbone[maxpool3_pos].ceil_mode = True

        # parameters used for L2 normalization + rescaling
        self.scale_weight = nn.Parameter(torch.ones(512) * 20)

        # Multiple Feature maps - page 4, Fig 2 of SSD paper
        self.features = nn.Sequential(*backbone[:maxpool4_pos])  # until conv4_3

        # SSD300 case - page 4, Fig 2 of SSD paper
        extra = nn.ModuleList(
            [
                nn.Sequential(
                    nn.Conv2d(1024, 256, kernel_size=1),
                    nn.ReLU(inplace=True),
                    nn.Conv2d(256, 512, kernel_size=3, padding=1, stride=2),  # conv8_2
                    nn.ReLU(inplace=True),
                ),
                nn.Sequential(
                    nn.Conv2d(512, 128, kernel_size=1),
                    nn.ReLU(inplace=True),
                    nn.Conv2d(128, 256, kernel_size=3, padding=1, stride=2),  # conv9_2
                    nn.ReLU(inplace=True),
                ),
                nn.Sequential(
                    nn.Conv2d(256, 128, kernel_size=1),
                    nn.ReLU(inplace=True),
                    nn.Conv2d(128, 256, kernel_size=3),  # conv10_2
                    nn.ReLU(inplace=True),
                ),
                nn.Sequential(
                    nn.Conv2d(256, 128, kernel_size=1),
                    nn.ReLU(inplace=True),
                    nn.Conv2d(128, 256, kernel_size=3),  # conv11_2
                    nn.ReLU(inplace=True),
                ),
            ]
        )
        if highres:
            # Additional layers for the SSD512 case. See page 11, footnote 5.
            extra.append(
                nn.Sequential(
                    nn.Conv2d(256, 128, kernel_size=1),
                    nn.ReLU(inplace=True),
                    nn.Conv2d(128, 256, kernel_size=4),  # conv12_2
                    nn.ReLU(inplace=True),
                )
            )
        _xavier_init(extra)

        fc = nn.Sequential(
            nn.MaxPool2d(kernel_size=3, stride=1, padding=1, ceil_mode=False),  # add modified maxpool5
            nn.Conv2d(in_channels=512, out_channels=1024, kernel_size=3, padding=6, dilation=6),  # FC6 with atrous
            nn.ReLU(inplace=True),
            nn.Conv2d(in_channels=1024, out_channels=1024, kernel_size=1),  # FC7
            nn.ReLU(inplace=True),
        )
        _xavier_init(fc)
        extra.insert(
            0,
            nn.Sequential(
                *backbone[maxpool4_pos:-1],  # until conv5_3, skip maxpool5
                fc,
            ),
        )
        self.extra = extra

    def forward(self, x: Tensor) -> Dict[str, Tensor]:
        # L2 normalization + rescaling of the first block's feature map (conv4_3)
        x = self.features(x)
        rescaled = self.scale_weight.view(1, -1, 1, 1) * F.normalize(x)
        output = [rescaled]

        # Compute the feature maps of the remaining blocks
        for block in self.extra:
            x = block(x)
            output.append(x)

        return OrderedDict([(str(i), v) for i, v in enumerate(output)])


def _vgg_extractor(backbone: VGG, highres: bool, trainable_layers: int):
    backbone = backbone.features
    # Gather the indices of maxpools. These are the locations of output blocks.
    stage_indices = [0] + [i for i, b in enumerate(backbone) if isinstance(b, nn.MaxPool2d)][:-1]
    num_stages = len(stage_indices)

    # find the index of the first layer that will not be frozen
    torch._assert(
        0 <= trainable_layers <= num_stages,
        f"trainable_layers should be in the range [0, {num_stages}]. Instead got {trainable_layers}",
    )
    freeze_before = len(backbone) if trainable_layers == 0 else stage_indices[num_stages - trainable_layers]

    for b in backbone[:freeze_before]:
        for parameter in b.parameters():
            parameter.requires_grad_(False)

    return SSDFeatureExtractorVGG(backbone, highres)


@register_model()
@handle_legacy_interface(
    weights=("pretrained", SSD300_VGG16_Weights.COCO_V1),
    weights_backbone=("pretrained_backbone", VGG16_Weights.IMAGENET1K_FEATURES),
)
def ssd300_vgg16(
    *,
    weights: Optional[SSD300_VGG16_Weights] = None,
    progress: bool = True,
    num_classes: Optional[int] = None,
    weights_backbone: Optional[VGG16_Weights] = VGG16_Weights.IMAGENET1K_FEATURES,
    trainable_backbone_layers: Optional[int] = None,
    **kwargs: Any,
) -> SSD:
- """The SSD300 model is based on the `SSD: Single Shot MultiBox Detector
- <https://arxiv.org/abs/1512.02325>`_ paper.
- .. betastatus:: detection module
- The input to the model is expected to be a list of tensors, each of shape [C, H, W], one for each
- image, and should be in 0-1 range. Different images can have different sizes, but they will be resized
- to a fixed size before passing it to the backbone.
- The behavior of the model changes depending on if it is in training or evaluation mode.
- During training, the model expects both the input tensors and targets (list of dictionary),
- containing:
- - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
- ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
- - labels (Int64Tensor[N]): the class label for each ground-truth box
- The model returns a Dict[Tensor] during training, containing the classification and regression
- losses.
- During inference, the model requires only the input tensors, and returns the post-processed
- predictions as a List[Dict[Tensor]], one for each input image. The fields of the Dict are as
- follows, where ``N`` is the number of detections:
- - boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with
- ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
- - labels (Int64Tensor[N]): the predicted labels for each detection
- - scores (Tensor[N]): the scores for each detection

    Example:

        >>> model = torchvision.models.detection.ssd300_vgg16(weights=SSD300_VGG16_Weights.DEFAULT)
        >>> model.eval()
        >>> x = [torch.rand(3, 300, 300), torch.rand(3, 500, 400)]
        >>> predictions = model(x)
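        >>>
        >>> # Optionally, run the preprocessing transforms bundled with the weights
        >>> # (a sketch; inputs only need to be 0-1 tensors, so this step is not required):
        >>> weights = SSD300_VGG16_Weights.DEFAULT
        >>> preprocess = weights.transforms()
        >>> x = [preprocess(img) for img in x]
        >>> predictions = model(x)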

    Args:
        weights (:class:`~torchvision.models.detection.SSD300_VGG16_Weights`, optional): The pretrained
            weights to use. See
            :class:`~torchvision.models.detection.SSD300_VGG16_Weights`
            below for more details, and possible values. By default, no
            pre-trained weights are used.
        progress (bool, optional): If True, displays a progress bar of the download to stderr.
            Default is True.
        num_classes (int, optional): number of output classes of the model (including the background)
        weights_backbone (:class:`~torchvision.models.VGG16_Weights`, optional): The pretrained weights for the
            backbone.
        trainable_backbone_layers (int, optional): number of trainable (not frozen) layers starting from final block.
            Valid values are between 0 and 5, with 5 meaning all backbone layers are trainable. If ``None`` is
            passed (the default) this value is set to 4.
        **kwargs: parameters passed to the ``torchvision.models.detection.SSD``
            base class. Please refer to the `source code
            <https://github.com/pytorch/vision/blob/main/torchvision/models/detection/ssd.py>`_
            for more details about this class.

    .. autoclass:: torchvision.models.detection.SSD300_VGG16_Weights
        :members:
    """
    weights = SSD300_VGG16_Weights.verify(weights)
    weights_backbone = VGG16_Weights.verify(weights_backbone)

    if "size" in kwargs:
        warnings.warn("The size of the model is already fixed; ignoring the parameter.")

    if weights is not None:
        weights_backbone = None
        num_classes = _ovewrite_value_param("num_classes", num_classes, len(weights.meta["categories"]))
    elif num_classes is None:
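        # default to the COCO convention: 91 classes (90 categories + background)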
        num_classes = 91

    trainable_backbone_layers = _validate_trainable_layers(
        weights is not None or weights_backbone is not None, trainable_backbone_layers, 5, 4
    )

    # Use custom backbones more appropriate for SSD
    backbone = vgg16(weights=weights_backbone, progress=progress)
    backbone = _vgg_extractor(backbone, False, trainable_backbone_layers)
    anchor_generator = DefaultBoxGenerator(
        [[2], [2, 3], [2, 3], [2, 3], [2], [2]],
        scales=[0.07, 0.15, 0.33, 0.51, 0.69, 0.87, 1.05],
        steps=[8, 16, 32, 64, 100, 300],
    )

    defaults = {
        # Rescale the input in a way compatible with the backbone
        "image_mean": [0.48235, 0.45882, 0.40784],
        "image_std": [1.0 / 255.0, 1.0 / 255.0, 1.0 / 255.0],  # undo the 0-1 scaling of ToTensor
    }
    kwargs: Any = {**defaults, **kwargs}
    model = SSD(backbone, anchor_generator, (300, 300), num_classes, **kwargs)

    if weights is not None:
        model.load_state_dict(weights.get_state_dict(progress=progress, check_hash=True))

    return model