- """
- =======================
- Visualization utilities
- =======================
- .. note::
    Try on `Colab <https://colab.research.google.com/github/pytorch/vision/blob/gh-pages/main/_generated_ipynb_notebooks/plot_visualization_utils.ipynb>`_
    or :ref:`go to the end <sphx_glr_download_auto_examples_others_plot_visualization_utils.py>` to download the full example code.

This example illustrates some of the utilities that torchvision offers for
visualizing images, bounding boxes, segmentation masks and keypoints.
"""

# sphinx_gallery_thumbnail_path = "../../gallery/assets/visualization_utils_thumbnail2.png"

import torch
import numpy as np
import matplotlib.pyplot as plt
import torchvision.transforms.functional as F

plt.rcParams["savefig.bbox"] = 'tight'

def show(imgs):
    if not isinstance(imgs, list):
        imgs = [imgs]
    fig, axs = plt.subplots(ncols=len(imgs), squeeze=False)
    for i, img in enumerate(imgs):
        img = img.detach()
        img = F.to_pil_image(img)
        axs[0, i].imshow(np.asarray(img))
        axs[0, i].set(xticklabels=[], yticklabels=[], xticks=[], yticks=[])
# %%
# Visualizing a grid of images
# ----------------------------
# The :func:`~torchvision.utils.make_grid` function can be used to create a
# tensor that represents multiple images in a grid. This util requires images
# of dtype ``uint8`` as input.
from torchvision.utils import make_grid
from torchvision.io import read_image
from pathlib import Path

dog1_int = read_image(str(Path('../assets') / 'dog1.jpg'))
dog2_int = read_image(str(Path('../assets') / 'dog2.jpg'))
dog_list = [dog1_int, dog2_int]

grid = make_grid(dog_list)
show(grid)
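# %%
# ``make_grid`` also accepts layout options such as ``nrow`` (images per row) and
# ``padding`` (pixels between images). A small illustrative sketch, reusing the
# same two images as above:
grid_one_per_row = make_grid(dog_list, nrow=1, padding=10)
show(grid_one_per_row)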
# %%
# Visualizing bounding boxes
# --------------------------
# We can use :func:`~torchvision.utils.draw_bounding_boxes` to draw boxes on an
# image. We can set the colors, labels, and width, as well as the font and font
# size. The boxes are in ``(xmin, ymin, xmax, ymax)`` format.
from torchvision.utils import draw_bounding_boxes

boxes = torch.tensor([[50, 50, 100, 200], [210, 150, 350, 430]], dtype=torch.float)
colors = ["blue", "yellow"]
result = draw_bounding_boxes(dog1_int, boxes, colors=colors, width=5)
show(result)
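# %%
# The same call can also attach a text label to each box. The label strings below
# are arbitrary placeholders (not model predictions); passing ``font`` and
# ``font_size`` additionally requires a TTF font file available on your system,
# so we omit them here.
result_with_labels = draw_bounding_boxes(
    dog1_int, boxes, labels=["box 1", "box 2"], colors=colors, width=5
)
show(result_with_labels)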
# %%
# Naturally, we can also plot bounding boxes produced by torchvision detection
# models. Here is a demo with a Faster R-CNN model loaded from
# :func:`~torchvision.models.detection.fasterrcnn_resnet50_fpn`.
# For more details on the output of such models, you may
# refer to :ref:`instance_seg_output`.
from torchvision.models.detection import fasterrcnn_resnet50_fpn, FasterRCNN_ResNet50_FPN_Weights

weights = FasterRCNN_ResNet50_FPN_Weights.DEFAULT
transforms = weights.transforms()

images = [transforms(d) for d in dog_list]

model = fasterrcnn_resnet50_fpn(weights=weights, progress=False)
model = model.eval()

outputs = model(images)
print(outputs)

# %%
# Let's plot the boxes detected by our model. We will only plot the boxes with a
# score greater than a given threshold.
score_threshold = .8
dogs_with_boxes = [
    draw_bounding_boxes(dog_int, boxes=output['boxes'][output['scores'] > score_threshold], width=4)
    for dog_int, output in zip(dog_list, outputs)
]
show(dogs_with_boxes)
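# %%
# A small extension of the example above (not part of the original output
# handling): each kept box can also be labeled with its predicted class name,
# which is available in ``weights.meta["categories"]``.
dogs_with_labeled_boxes = []
for dog_int, out in zip(dog_list, outputs):
    keep = out['scores'] > score_threshold
    labels = [weights.meta["categories"][label] for label in out['labels'][keep]]
    dogs_with_labeled_boxes.append(
        draw_bounding_boxes(dog_int, boxes=out['boxes'][keep], labels=labels, width=4)
    )
show(dogs_with_labeled_boxes)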
# %%
# Visualizing segmentation masks
# ------------------------------
# The :func:`~torchvision.utils.draw_segmentation_masks` function can be used to
# draw segmentation masks on images. Semantic segmentation and instance
# segmentation models have different outputs, so we will treat each
# independently.
#
# .. _semantic_seg_output:
#
# Semantic segmentation models
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
#
# We will see how to use it with torchvision's FCN Resnet-50, loaded with
# :func:`~torchvision.models.segmentation.fcn_resnet50`. Let's start by looking
# at the output of the model.
from torchvision.models.segmentation import fcn_resnet50, FCN_ResNet50_Weights

weights = FCN_ResNet50_Weights.DEFAULT
transforms = weights.transforms(resize_size=None)

model = fcn_resnet50(weights=weights, progress=False)
model = model.eval()

batch = torch.stack([transforms(d) for d in dog_list])
output = model(batch)['out']
print(output.shape, output.min().item(), output.max().item())

# %%
# As we can see above, the output of the segmentation model is a tensor of shape
# ``(batch_size, num_classes, H, W)``. Each value is a non-normalized score, and
# we can normalize them into ``[0, 1]`` by using a softmax. After the softmax,
# we can interpret each value as a probability indicating how likely a given
# pixel is to belong to a given class.
#
# Let's plot the masks that have been detected for the dog class and for the
# boat class:
sem_class_to_idx = {cls: idx for (idx, cls) in enumerate(weights.meta["categories"])}

normalized_masks = torch.nn.functional.softmax(output, dim=1)

dog_and_boat_masks = [
    normalized_masks[img_idx, sem_class_to_idx[cls]]
    for img_idx in range(len(dog_list))
    for cls in ('dog', 'boat')
]

show(dog_and_boat_masks)
# %%
# As expected, the model is confident about the dog class, but not so much for
# the boat class.
#
# The :func:`~torchvision.utils.draw_segmentation_masks` function can be used to
# plot those masks on top of the original image. This function expects the
# masks to be boolean masks, but our masks above contain probabilities in ``[0,
# 1]``. To get boolean masks, we can do the following:
class_dim = 1
boolean_dog_masks = (normalized_masks.argmax(class_dim) == sem_class_to_idx['dog'])
print(f"shape = {boolean_dog_masks.shape}, dtype = {boolean_dog_masks.dtype}")
show([m.float() for m in boolean_dog_masks])

# %%
# The line above where we define ``boolean_dog_masks`` is a bit cryptic, but you
# can read it as the following query: "For which pixels is 'dog' the most likely
# class?"
#
# .. note::
#     While we're using the ``normalized_masks`` here, we would have
#     gotten the same result by using the non-normalized scores of the model
#     directly (as the softmax operation preserves the order).
#
# Now that we have boolean masks, we can use them with
# :func:`~torchvision.utils.draw_segmentation_masks` to plot them on top of the
# original images:
from torchvision.utils import draw_segmentation_masks

dogs_with_masks = [
    draw_segmentation_masks(img, masks=mask, alpha=0.7)
    for img, mask in zip(dog_list, boolean_dog_masks)
]
show(dogs_with_masks)
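# %%
# A quick sanity check of the note above: since softmax preserves ordering, the
# per-pixel argmax over the raw scores matches the argmax over
# ``normalized_masks`` (the assertion below is purely illustrative).
assert torch.equal(output.argmax(class_dim), normalized_masks.argmax(class_dim))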
# %%
# We can plot more than one mask per image! Remember that the model returned as
# many masks as there are classes. Let's ask the same query as above, but this
# time for *all* classes, not just the dog class: "For each pixel and each class
# C, is class C the most likely class?"
#
# This one is a bit more involved, so we'll first show how to do it with a
# single image, and then we'll generalize to the whole batch.
num_classes = normalized_masks.shape[1]
dog1_masks = normalized_masks[0]
class_dim = 0
dog1_all_classes_masks = dog1_masks.argmax(class_dim) == torch.arange(num_classes)[:, None, None]

print(f"dog1_masks shape = {dog1_masks.shape}, dtype = {dog1_masks.dtype}")
print(f"dog1_all_classes_masks = {dog1_all_classes_masks.shape}, dtype = {dog1_all_classes_masks.dtype}")

dog_with_all_masks = draw_segmentation_masks(dog1_int, masks=dog1_all_classes_masks, alpha=.6)
show(dog_with_all_masks)

# %%
# We can see in the image above that only 2 masks were drawn: the mask for the
# background and the mask for the dog. This is because the model thinks that
# only these 2 classes are the most likely ones across all the pixels. If the
# model had detected another class as the most likely among other pixels, we
# would have seen its mask above.
#
# Removing the background mask is as simple as passing
# ``masks=dog1_all_classes_masks[1:]``, because the background class is the
# class with index 0 (there is a short example of this right after the batch
# version below).
#
# Let's now do the same but for an entire batch of images. The code is similar
# but involves a bit more juggling with the dimensions.
class_dim = 1
all_classes_masks = normalized_masks.argmax(class_dim) == torch.arange(num_classes)[:, None, None, None]
print(f"shape = {all_classes_masks.shape}, dtype = {all_classes_masks.dtype}")

# The first dimension is the classes now, so we need to swap it
all_classes_masks = all_classes_masks.swapaxes(0, 1)

dogs_with_masks = [
    draw_segmentation_masks(img, masks=mask, alpha=.6)
    for img, mask in zip(dog_list, all_classes_masks)
]
show(dogs_with_masks)
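# %%
# As mentioned above, dropping the background mask only requires slicing it out,
# since the background class has index 0. A minimal sketch for the first image,
# reusing ``dog1_all_classes_masks`` from before:
show(draw_segmentation_masks(dog1_int, masks=dog1_all_classes_masks[1:], alpha=.6))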
# %%
# .. _instance_seg_output:
#
# Instance segmentation models
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
#
# Instance segmentation models have a significantly different output from the
# semantic segmentation models. We will see here how to plot the masks for such
# models. Let's start by analyzing the output of a Mask R-CNN model. Note that
# these models don't require the images to be normalized, so we don't need to
# use the normalized batch.
#
# .. note::
#
#     Here we describe the output of a Mask R-CNN model. The models in
#     :ref:`object_det_inst_seg_pers_keypoint_det` all have a similar output
#     format, but some of them may have extra info like keypoints for
#     :func:`~torchvision.models.detection.keypointrcnn_resnet50_fpn`, and some
#     of them may not have masks, like
#     :func:`~torchvision.models.detection.fasterrcnn_resnet50_fpn`.
from torchvision.models.detection import maskrcnn_resnet50_fpn, MaskRCNN_ResNet50_FPN_Weights

weights = MaskRCNN_ResNet50_FPN_Weights.DEFAULT
transforms = weights.transforms()

images = [transforms(d) for d in dog_list]

model = maskrcnn_resnet50_fpn(weights=weights, progress=False)
model = model.eval()

output = model(images)
print(output)
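# %%
# The raw print above is fairly verbose; a shapes-only summary (purely a
# convenience, not part of the model API) makes the structure described below
# easier to see at a glance.
for img_idx, img_output in enumerate(output):
    print(img_idx, {name: val.shape for name, val in img_output.items()})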
# %%
# Let's break this down. For each image in the batch, the model outputs some
# detections (or instances). The number of detections varies for each input
# image. Each instance is described by its bounding box, its label, its score
# and its mask.
#
# The way the output is organized is as follows: the output is a list of length
# ``batch_size``. Each entry in the list corresponds to an input image, and it
# is a dict with keys 'boxes', 'labels', 'scores', and 'masks'. Each value
# associated with those keys has ``num_instances`` elements in it. In our case
# above there are 3 instances detected in the first image, and 2 instances in
# the second one.
#
# The boxes can be plotted with :func:`~torchvision.utils.draw_bounding_boxes`
# as above, but here we're more interested in the masks. These masks are quite
# different from the masks that we saw above for the semantic segmentation
# models.
dog1_output = output[0]
dog1_masks = dog1_output['masks']
print(f"shape = {dog1_masks.shape}, dtype = {dog1_masks.dtype}, "
      f"min = {dog1_masks.min()}, max = {dog1_masks.max()}")

# %%
# Here the masks correspond to probabilities indicating, for each pixel, how
# likely it is to belong to the predicted label of that instance. Those
# predicted labels correspond to the 'labels' element in the same output dict.
# Let's see which labels were predicted for the instances of the first image.
print("For the first dog, the following instances were detected:")
print([weights.meta["categories"][label] for label in dog1_output['labels']])
# %%
# Interestingly, the model detects two persons in the image. Let's go ahead and
# plot those masks. Since :func:`~torchvision.utils.draw_segmentation_masks`
# expects boolean masks, we need to convert those probabilities into boolean
# values. Remember that the semantics of these masks is "How likely is this
# pixel to belong to the predicted class?". As a result, a natural way of
# converting them into boolean values is to threshold them at a probability of
# 0.5 (one could also choose a different threshold).
proba_threshold = 0.5
dog1_bool_masks = dog1_output['masks'] > proba_threshold
print(f"shape = {dog1_bool_masks.shape}, dtype = {dog1_bool_masks.dtype}")

# There's an extra dimension (1) to the masks. We need to remove it
dog1_bool_masks = dog1_bool_masks.squeeze(1)

show(draw_segmentation_masks(dog1_int, dog1_bool_masks, alpha=0.9))

# %%
# The model seems to have properly detected the dog, but it also confused trees
# with people. Looking more closely at the scores will help us plot more
# relevant masks:
print(dog1_output['scores'])

# %%
# Clearly the model is more confident about the dog detection than it is about
# the people detections. That's good news. When plotting the masks, we can ask
# for only those that have a good score. Let's use a score threshold of .75
# here, and also plot the masks of the second dog.
score_threshold = .75

boolean_masks = [
    out['masks'][out['scores'] > score_threshold] > proba_threshold
    for out in output
]

dogs_with_masks = [
    draw_segmentation_masks(img, mask.squeeze(1))
    for img, mask in zip(dog_list, boolean_masks)
]
show(dogs_with_masks)
# %%
# The two 'people' masks in the first image were not selected because they have
# a lower score than the score threshold. Similarly, in the second image, the
# instance with class 15 (which corresponds to 'bench') was not selected.
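# %%
# Nothing prevents us from combining the utilities: as a small sketch (not part
# of the original example), we can overlay the high-scoring masks and then draw
# the corresponding boxes on top of the same image.
dog1_keep = dog1_output['scores'] > score_threshold
dog1_combined = draw_segmentation_masks(
    dog1_int, dog1_output['masks'][dog1_keep].squeeze(1) > proba_threshold, alpha=0.7
)
dog1_combined = draw_bounding_boxes(dog1_combined, dog1_output['boxes'][dog1_keep], width=4)
show(dog1_combined)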
# %%
# .. _keypoint_output:
#
# Visualizing keypoints
# ---------------------
# The :func:`~torchvision.utils.draw_keypoints` function can be used to
# draw keypoints on images. We will see how to use it with
# torchvision's KeypointRCNN loaded with :func:`~torchvision.models.detection.keypointrcnn_resnet50_fpn`.
# We will first have a look at the output of the model.
#
from torchvision.models.detection import keypointrcnn_resnet50_fpn, KeypointRCNN_ResNet50_FPN_Weights
from torchvision.io import read_image

person_int = read_image(str(Path("../assets") / "person1.jpg"))

weights = KeypointRCNN_ResNet50_FPN_Weights.DEFAULT
transforms = weights.transforms()

person_float = transforms(person_int)

model = keypointrcnn_resnet50_fpn(weights=weights, progress=False)
model = model.eval()

outputs = model([person_float])
print(outputs)
# %%
# As we can see, the output is a list of dictionaries of length ``batch_size``.
# Since we passed a single image, the list has length 1.
# Each entry in the list corresponds to an input image,
# and it is a dict with keys ``boxes``, ``labels``, ``scores``, ``keypoints`` and ``keypoint_scores``.
# Each value associated with those keys has ``num_instances`` elements in it.
# In our case above there are 2 instances detected in the image.
kpts = outputs[0]['keypoints']
scores = outputs[0]['scores']

print(kpts)
print(scores)
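# %%
# The per-keypoint confidences mentioned above live under ``keypoint_scores``
# (one value per keypoint and per instance); we only print their shape here,
# as a quick illustration.
print(outputs[0]['keypoint_scores'].shape)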
# %%
# The KeypointRCNN model detects two instances in the image.
# If you plot the boxes using :func:`~torchvision.utils.draw_bounding_boxes`
# (as we do right after the filtering step below), you will recognize that they
# are the person and the surfboard.
# If we look at the scores, we will realize that the model is much more confident
# about the person than about the surfboard.
# We could now set a confidence threshold and only plot the instances we are
# confident enough about. Let us set a threshold of 0.75 and keep the keypoints
# corresponding to the person.
detect_threshold = 0.75
idx = torch.where(scores > detect_threshold)
keypoints = kpts[idx]

print(keypoints)
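# %%
# As suggested above, we can quickly draw the two detected boxes to see what
# they correspond to (a small illustrative detour; the boxes come straight from
# the model output above).
show(draw_bounding_boxes(person_int, outputs[0]['boxes'], width=3))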
# %%
# Great, now we have the keypoints corresponding to the person.
# Each keypoint is represented by its x and y coordinates and a visibility value.
# We can now use the :func:`~torchvision.utils.draw_keypoints` function to draw keypoints.
# Note that the utility expects uint8 images.
from torchvision.utils import draw_keypoints

res = draw_keypoints(person_int, keypoints, colors="blue", radius=3)
show(res)
# %%
# As we can see, the keypoints appear as colored circles over the image.
# The COCO keypoints for a person are ordered and represent the following list.
coco_keypoints = [
    "nose", "left_eye", "right_eye", "left_ear", "right_ear",
    "left_shoulder", "right_shoulder", "left_elbow", "right_elbow",
    "left_wrist", "right_wrist", "left_hip", "right_hip",
    "left_knee", "right_knee", "left_ankle", "right_ankle",
]
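# %%
# With this ordering we can, for example, pair each detected coordinate with its
# keypoint name (purely illustrative; it simply zips the list above with the
# person's keypoints).
for name, (x, y, _) in zip(coco_keypoints, keypoints[0]):
    print(f"{name:>15}: ({x.item():.1f}, {y.item():.1f})")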
# %%
# What if we are interested in joining the keypoints?
# This is especially useful for tasks like pose estimation or action recognition.
# We can join the keypoints easily using the ``connectivity`` parameter.
# A closer look reveals that we need to join the points in the following order
# to construct a human skeleton:
#
# nose -> left_eye -> left_ear. (0, 1), (1, 3)
#
# nose -> right_eye -> right_ear. (0, 2), (2, 4)
#
# nose -> left_shoulder -> left_elbow -> left_wrist. (0, 5), (5, 7), (7, 9)
#
# nose -> right_shoulder -> right_elbow -> right_wrist. (0, 6), (6, 8), (8, 10)
#
# left_shoulder -> left_hip -> left_knee -> left_ankle. (5, 11), (11, 13), (13, 15)
#
# right_shoulder -> right_hip -> right_knee -> right_ankle. (6, 12), (12, 14), (14, 16)
#
# We will create a list containing these keypoint ids to be connected.
connect_skeleton = [
    (0, 1), (0, 2), (1, 3), (2, 4), (0, 5), (0, 6), (5, 7), (6, 8),
    (7, 9), (8, 10), (5, 11), (6, 12), (11, 13), (12, 14), (13, 15), (14, 16)
]

# %%
# We pass the above list to the ``connectivity`` parameter to connect the keypoints.
#
res = draw_keypoints(person_int, keypoints, connectivity=connect_skeleton, colors="blue", radius=4, width=3)
show(res)