- """
- =========
- Video API
- =========
- .. note::
-     Try on `Colab <https://colab.research.google.com/github/pytorch/vision/blob/gh-pages/main/_generated_ipynb_notebooks/plot_video_api.ipynb>`_
-     or :ref:`go to the end <sphx_glr_download_auto_examples_others_plot_video_api.py>` to download the full example code.
- This example illustrates some of the APIs that torchvision offers for
- videos, together with examples of how to build datasets and more.
- """
- # %%
- # 1. Introduction: building a new video object and examining the properties
- # -------------------------------------------------------------------------
- # First we select a video to test the object out. For the sake of argument
- # we're using one from the Kinetics-400 dataset.
- # To create the video object, we need to define the path and the stream we want to use.
- # %%
- # Chosen video statistics:
- #
- # - WUzgd7C1pWA.mp4
- # - source:
- # - kinetics-400
- # - video:
- # - H-264
- # - MPEG-4 AVC (part 10) (avc1)
- # - fps: 29.97
- # - audio:
- # - MPEG AAC audio (mp4a)
- #   - sample rate: 48 kHz
- #
- import torch
- import torchvision
- from torchvision.datasets.utils import download_url
- torchvision.set_video_backend("video_reader")
- # Download the sample video
- download_url(
-     "https://github.com/pytorch/vision/blob/main/test/assets/videos/WUzgd7C1pWA.mp4?raw=true",
-     ".",
-     "WUzgd7C1pWA.mp4"
- )
- video_path = "./WUzgd7C1pWA.mp4"
- # %%
- # Streams are defined in a similar fashion to torch devices. We encode them as strings in the form
- # ``stream_type:stream_id``, where ``stream_type`` is a string and ``stream_id`` is a long int.
- # The constructor accepts passing a ``stream_type`` only, in which case the stream is auto-discovered.
- # Firstly, let's get the metadata for our particular video:
- stream = "video"
- video = torchvision.io.VideoReader(video_path, stream)
- video.get_metadata()
- # %%
- # Here we can see that the video has two streams - a video and an audio stream.
- # Currently available stream types include ['video', 'audio'].
- # Each descriptor consists of two parts: the stream type (e.g. 'video') and a unique stream id
- # (which is determined by the video encoding).
- # In this way, if the video container contains multiple streams of the same type,
- # users can access the one they want.
- # If only the stream type is passed, the decoder auto-detects the first stream of that type and returns it.
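- # %%
- # For illustration, we can also request a specific stream explicitly by passing the full
- # ``stream_type:stream_id`` descriptor. Note that the id ``0`` below is only an assumption
- # about this particular file; check the metadata above for the streams that actually exist.
- video_stream_zero = torchvision.io.VideoReader(video_path, "video:0")
- print(video_stream_zero.get_metadata())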
- # %%
- # Let's read all the frames from the audio stream (selected below). The return value of
- # ``next(video_reader)`` is a dict with the following fields:
- #
- # - ``data``: a ``torch.Tensor`` containing the frame data
- # - ``pts``: a float presentation timestamp of this particular frame, in seconds
- metadata = video.get_metadata()
- video.set_current_stream("audio")
- frames = [] # we are going to save the frames here.
- ptss = [] # pts is a presentation timestamp in seconds (float) of each frame
- for frame in video:
-     frames.append(frame['data'])
-     ptss.append(frame['pts'])
- print("PTS for first five frames ", ptss[:5])
- print("Total number of frames: ", len(frames))
- approx_nf = metadata['audio']['duration'][0] * metadata['audio']['framerate'][0]
- print("Approx total number of datapoints we can expect: ", approx_nf)
- print("Read data size: ", frames[0].size(0) * len(frames))
- # %%
- # But what if we only want to read a certain time segment of the video?
- # That can be done easily using the combination of our ``seek`` function, and the fact that each call
- # to ``next`` returns the presentation timestamp of the returned frame in seconds.
- #
- # Given that our implementation relies on Python iterators,
- # we can leverage itertools to simplify the process and make it more Pythonic.
- #
- # For example, if we wanted to read ten frames starting from the 2nd second:
- import itertools
- video.set_current_stream("video")
- frames = [] # we are going to save the frames here.
- # We seek into the 2nd second of the video and use islice to get the 10 frames that follow
- for frame in itertools.islice(video.seek(2), 10):
-     frames.append(frame['data'])
- print("Total number of frames: ", len(frames))
- # %%
- # Or if we wanted to read from the 2nd to the 5th second,
- # we seek into the 2nd second of the video,
- # then utilize ``itertools.takewhile`` to get the
- # correct number of frames:
- video.set_current_stream("video")
- frames = [] # we are going to save the frames here.
- video = video.seek(2)
- for frame in itertools.takewhile(lambda x: x['pts'] <= 5, video):
-     frames.append(frame['data'])
- print("Total number of frames: ", len(frames))
- approx_nf = (5 - 2) * video.get_metadata()['video']['fps'][0]
- print("We can expect approx: ", approx_nf)
- print("Tensor size: ", frames[0].size())
- # %%
- # 2. Building a sample read_video function
- # ----------------------------------------------------------------------------------------
- # We can utilize the methods above to build a read video function that follows
- # the same API as the existing ``read_video`` function.
- def example_read_video(video_object, start=0, end=None, read_video=True, read_audio=True):
-     if end is None:
-         end = float("inf")
-     if end < start:
-         raise ValueError(
-             "end time should be larger than start time, got "
-             f"start time={start} and end time={end}"
-         )
-     video_frames = torch.empty(0)
-     video_pts = []
-     if read_video:
-         video_object.set_current_stream("video")
-         frames = []
-         for frame in itertools.takewhile(lambda x: x['pts'] <= end, video_object.seek(start)):
-             frames.append(frame['data'])
-             video_pts.append(frame['pts'])
-         if len(frames) > 0:
-             video_frames = torch.stack(frames, 0)
-     audio_frames = torch.empty(0)
-     audio_pts = []
-     if read_audio:
-         video_object.set_current_stream("audio")
-         frames = []
-         for frame in itertools.takewhile(lambda x: x['pts'] <= end, video_object.seek(start)):
-             frames.append(frame['data'])
-             audio_pts.append(frame['pts'])
-         if len(frames) > 0:
-             audio_frames = torch.cat(frames, 0)
-     return video_frames, audio_frames, (video_pts, audio_pts), video_object.get_metadata()
- # Total number of frames should be 327 for video and 523264 datapoints for audio
- vf, af, info, meta = example_read_video(video)
- print(vf.size(), af.size())
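- # %%
- # The same helper can read just a segment. A small usage sketch (the time window below is
- # illustrative): decode only the video frames between the 2nd and 5th second and skip audio.
- vf_segment, _, _, _ = example_read_video(video, start=2, end=5, read_audio=False)
- print(vf_segment.size())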
- # %%
- # 3. Building an example randomly sampled dataset (can be applied to training dataset of kinetics400)
- # -------------------------------------------------------------------------------------------------------
- # Cool, so now we can use the same principle to make a sample dataset.
- # We suggest trying out an iterable dataset for this purpose.
- # Here, we are going to build an example dataset that reads a randomly positioned clip of frames from each video.
- # %%
- # Make sample dataset
- import os
- os.makedirs("./dataset", exist_ok=True)
- os.makedirs("./dataset/1", exist_ok=True)
- os.makedirs("./dataset/2", exist_ok=True)
- # %%
- # Download the videos
- from torchvision.datasets.utils import download_url
- download_url(
-     "https://github.com/pytorch/vision/blob/main/test/assets/videos/WUzgd7C1pWA.mp4?raw=true",
-     "./dataset/1",
-     "WUzgd7C1pWA.mp4"
- )
- download_url(
-     "https://github.com/pytorch/vision/blob/main/test/assets/videos/RATRACE_wave_f_nm_np1_fr_goo_37.avi?raw=true",
-     "./dataset/1",
-     "RATRACE_wave_f_nm_np1_fr_goo_37.avi"
- )
- download_url(
-     "https://github.com/pytorch/vision/blob/main/test/assets/videos/SOX5yA1l24A.mp4?raw=true",
-     "./dataset/2",
-     "SOX5yA1l24A.mp4"
- )
- download_url(
-     "https://github.com/pytorch/vision/blob/main/test/assets/videos/v_SoccerJuggling_g23_c01.avi?raw=true",
-     "./dataset/2",
-     "v_SoccerJuggling_g23_c01.avi"
- )
- download_url(
-     "https://github.com/pytorch/vision/blob/main/test/assets/videos/v_SoccerJuggling_g24_c01.avi?raw=true",
-     "./dataset/2",
-     "v_SoccerJuggling_g24_c01.avi"
- )
- # %%
- # Housekeeping and utilities
- import os
- import random
- from torchvision.datasets.folder import make_dataset
- from torchvision import transforms as t
- def _find_classes(dir):
-     classes = [d.name for d in os.scandir(dir) if d.is_dir()]
-     classes.sort()
-     class_to_idx = {cls_name: i for i, cls_name in enumerate(classes)}
-     return classes, class_to_idx
- def get_samples(root, extensions=(".mp4", ".avi")):
-     _, class_to_idx = _find_classes(root)
-     return make_dataset(root, class_to_idx, extensions=extensions)
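- # %%
- # A quick sketch of what these helpers produce: ``_find_classes`` maps folder names to indices,
- # and ``get_samples`` returns ``(path, class_index)`` pairs for the videos downloaded above.
- print(_find_classes("./dataset"))
- print(get_samples("./dataset")[:2])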
- # %%
- # We are going to define the dataset and some basic arguments.
- # We assume the structure of the FolderDataset, and add the following parameters:
- #
- # - ``clip_len``: length of a clip in frames
- # - ``frame_transform``: transform for every frame individually
- # - ``video_transform``: transform on a video sequence
- #
- # .. note::
- #   We also add an ``epoch_size`` parameter, since using the :class:`~torch.utils.data.IterableDataset`
- #   class allows us to naturally oversample clips or images from each video if needed.
- class RandomDataset(torch.utils.data.IterableDataset):
-     def __init__(self, root, epoch_size=None, frame_transform=None, video_transform=None, clip_len=16):
-         super().__init__()
-         self.samples = get_samples(root)
-         # Allow for temporal jittering
-         if epoch_size is None:
-             epoch_size = len(self.samples)
-         self.epoch_size = epoch_size
-         self.clip_len = clip_len
-         self.frame_transform = frame_transform
-         self.video_transform = video_transform
-     def __iter__(self):
-         for i in range(self.epoch_size):
-             # Get random sample
-             path, target = random.choice(self.samples)
-             # Get video object
-             vid = torchvision.io.VideoReader(path, "video")
-             metadata = vid.get_metadata()
-             video_frames = []  # video frame buffer
-             # Seek and return frames
-             max_seek = metadata["video"]['duration'][0] - (self.clip_len / metadata["video"]['fps'][0])
-             start = random.uniform(0., max_seek)
-             for frame in itertools.islice(vid.seek(start), self.clip_len):
-                 video_frames.append(self.frame_transform(frame['data']))
-                 current_pts = frame['pts']
-             # Stack it into a tensor
-             video = torch.stack(video_frames, 0)
-             if self.video_transform:
-                 video = self.video_transform(video)
-             output = {
-                 'path': path,
-                 'video': video,
-                 'target': target,
-                 'start': start,
-                 'end': current_pts}
-             yield output
- # %%
- # Given a path of videos in a folder structure, i.e.:
- #
- # - dataset
- #   - class 1
- #     - file 0
- #     - file 1
- #     - ...
- #   - class 2
- #     - file 0
- #     - file 1
- #     - ...
- #   - ...
- #
- # We can generate a dataloader and test the dataset.
- transforms = [t.Resize((112, 112))]
- frame_transform = t.Compose(transforms)
- dataset = RandomDataset("./dataset", epoch_size=None, frame_transform=frame_transform)
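- # %%
- # As a small smoke test (illustrative only), pull a single clip straight from the dataset before
- # wrapping it in a DataLoader; each item is a dict with the clip tensor and bookkeeping fields.
- sample = next(iter(dataset))
- print(sample['path'], sample['video'].size(), sample['start'], sample['end'])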
- # %%
- from torch.utils.data import DataLoader
- loader = DataLoader(dataset, batch_size=12)
- data = {"video": [], 'start': [], 'end': [], 'tensorsize': []}
- for batch in loader:
-     for i in range(len(batch['path'])):
-         data['video'].append(batch['path'][i])
-         data['start'].append(batch['start'][i].item())
-         data['end'].append(batch['end'][i].item())
-         data['tensorsize'].append(batch['video'][i].size())
- print(data)
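- # %%
- # Because the dataset is iterable, ``epoch_size`` controls how many clips a single pass yields,
- # independently of the number of files. A sketch of oversampling the same five videos:
- oversampled = RandomDataset("./dataset", epoch_size=32, frame_transform=frame_transform)
- print("Clips per epoch: ", sum(1 for _ in oversampled))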
- # %%
- # 4. Data Visualization
- # ----------------------------------
- # An example of visualizing a sampled video clip
- import matplotlib.pyplot as plt
- plt.figure(figsize=(12, 12))
- for i in range(16):
-     plt.subplot(4, 4, i + 1)
-     plt.imshow(batch["video"][0, i, ...].permute(1, 2, 0))
-     plt.axis("off")
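- # %%
- # An alternative view of the same clip, sketched with :func:`torchvision.utils.make_grid`, which
- # tiles the frames into a single image:
- from torchvision.utils import make_grid
- grid = make_grid(batch["video"][0], nrow=4)
- plt.figure(figsize=(12, 12))
- plt.imshow(grid.permute(1, 2, 0))
- plt.axis("off")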
- # %%
- # Cleanup the video and dataset:
- import os
- import shutil
- os.remove("./WUzgd7C1pWA.mp4")
- shutil.rmtree("./dataset")