from typing import Tuple import torchvision from torch import Tensor class KineticsWithVideoId(torchvision.datasets.Kinetics): def __getitem__(self, idx: int) -> Tuple[Tensor, Tensor, int]: video, audio, info, video_idx = self.video_clips.get_clip(idx) label = self.samples[video_idx][1] if self.transform is not None: video = self.transform(video) return video, audio, label, video_idx