loaders.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402
  1. # Ultralytics YOLO 🚀, AGPL-3.0 license
  2. import glob
  3. import math
  4. import os
  5. import time
  6. from dataclasses import dataclass
  7. from pathlib import Path
  8. from threading import Thread
  9. from urllib.parse import urlparse
  10. import cv2
  11. import numpy as np
  12. import requests
  13. import torch
  14. from PIL import Image
  15. from ultralytics.data.utils import IMG_FORMATS, VID_FORMATS
  16. from ultralytics.utils import LOGGER, is_colab, is_kaggle, ops
  17. from ultralytics.utils.checks import check_requirements
  18. @dataclass
  19. class SourceTypes:
  20. webcam: bool = False
  21. screenshot: bool = False
  22. from_img: bool = False
  23. tensor: bool = False
  24. class LoadStreams:
  25. """YOLOv8 streamloader, i.e. `yolo predict source='rtsp://example.com/media.mp4' # RTSP, RTMP, HTTP streams`."""
  26. def __init__(self, sources='file.streams', imgsz=640, vid_stride=1):
  27. """Initialize instance variables and check for consistent input stream shapes."""
  28. torch.backends.cudnn.benchmark = True # faster for fixed-size inference
  29. self.running = True # running flag for Thread
  30. self.mode = 'stream'
  31. self.imgsz = imgsz
  32. self.vid_stride = vid_stride # video frame-rate stride
  33. sources = Path(sources).read_text().rsplit() if os.path.isfile(sources) else [sources]
  34. n = len(sources)
  35. self.sources = [ops.clean_str(x) for x in sources] # clean source names for later
  36. self.imgs, self.fps, self.frames, self.threads, self.shape = [[]] * n, [0] * n, [0] * n, [None] * n, [None] * n
  37. self.caps = [None] * n # video capture objects
  38. for i, s in enumerate(sources): # index, source
  39. # Start thread to read frames from video stream
  40. st = f'{i + 1}/{n}: {s}... '
  41. if urlparse(s).hostname in ('www.youtube.com', 'youtube.com', 'youtu.be'): # if source is YouTube video
  42. # YouTube format i.e. 'https://www.youtube.com/watch?v=Zgi9g1ksQHc' or 'https://youtu.be/Zgi9g1ksQHc'
  43. s = get_best_youtube_url(s)
  44. s = eval(s) if s.isnumeric() else s # i.e. s = '0' local webcam
  45. if s == 0 and (is_colab() or is_kaggle()):
  46. raise NotImplementedError("'source=0' webcam not supported in Colab and Kaggle notebooks. "
  47. "Try running 'source=0' in a local environment.")
  48. self.caps[i] = cv2.VideoCapture(s) # store video capture object
  49. if not self.caps[i].isOpened():
  50. raise ConnectionError(f'{st}Failed to open {s}')
  51. w = int(self.caps[i].get(cv2.CAP_PROP_FRAME_WIDTH))
  52. h = int(self.caps[i].get(cv2.CAP_PROP_FRAME_HEIGHT))
  53. fps = self.caps[i].get(cv2.CAP_PROP_FPS) # warning: may return 0 or nan
  54. self.frames[i] = max(int(self.caps[i].get(cv2.CAP_PROP_FRAME_COUNT)), 0) or float(
  55. 'inf') # infinite stream fallback
  56. self.fps[i] = max((fps if math.isfinite(fps) else 0) % 100, 0) or 30 # 30 FPS fallback
  57. success, im = self.caps[i].read() # guarantee first frame
  58. if not success or im is None:
  59. raise ConnectionError(f'{st}Failed to read images from {s}')
  60. self.imgs[i].append(im)
  61. self.shape[i] = im.shape
  62. self.threads[i] = Thread(target=self.update, args=([i, self.caps[i], s]), daemon=True)
  63. LOGGER.info(f'{st}Success ✅ ({self.frames[i]} frames of shape {w}x{h} at {self.fps[i]:.2f} FPS)')
  64. self.threads[i].start()
  65. LOGGER.info('') # newline
  66. # Check for common shapes
  67. self.bs = self.__len__()
  68. def update(self, i, cap, stream):
  69. """Read stream `i` frames in daemon thread."""
  70. n, f = 0, self.frames[i] # frame number, frame array
  71. while self.running and cap.isOpened() and n < (f - 1):
  72. # Only read a new frame if the buffer is empty
  73. if not self.imgs[i]:
  74. n += 1
  75. cap.grab() # .read() = .grab() followed by .retrieve()
  76. if n % self.vid_stride == 0:
  77. success, im = cap.retrieve()
  78. if not success:
  79. im = np.zeros(self.shape[i], dtype=np.uint8)
  80. LOGGER.warning('WARNING ⚠️ Video stream unresponsive, please check your IP camera connection.')
  81. cap.open(stream) # re-open stream if signal was lost
  82. self.imgs[i].append(im) # add image to buffer
  83. else:
  84. time.sleep(0.01) # wait until the buffer is empty
  85. def close(self):
  86. """Close stream loader and release resources."""
  87. self.running = False # stop flag for Thread
  88. for thread in self.threads:
  89. if thread.is_alive():
  90. thread.join(timeout=5) # Add timeout
  91. for cap in self.caps: # Iterate through the stored VideoCapture objects
  92. try:
  93. cap.release() # release video capture
  94. except Exception as e:
  95. LOGGER.warning(f'WARNING ⚠️ Could not release VideoCapture object: {e}')
  96. cv2.destroyAllWindows()
  97. def __iter__(self):
  98. """Iterates through YOLO image feed and re-opens unresponsive streams."""
  99. self.count = -1
  100. return self
  101. def __next__(self):
  102. """Returns source paths, transformed and original images for processing."""
  103. self.count += 1
  104. # Wait until a frame is available in each buffer
  105. while not all(self.imgs):
  106. if not all(x.is_alive() for x in self.threads) or cv2.waitKey(1) == ord('q'): # q to quit
  107. cv2.destroyAllWindows()
  108. raise StopIteration
  109. time.sleep(1 / min(self.fps))
  110. # Get and remove the next frame from imgs buffer
  111. return self.sources, [x.pop(0) for x in self.imgs], None, ''
  112. def __len__(self):
  113. """Return the length of the sources object."""
  114. return len(self.sources) # 1E12 frames = 32 streams at 30 FPS for 30 years
  115. class LoadScreenshots:
  116. """YOLOv8 screenshot dataloader, i.e. `yolo predict source=screen`."""
  117. def __init__(self, source, imgsz=640):
  118. """source = [screen_number left top width height] (pixels)."""
  119. check_requirements('mss')
  120. import mss # noqa
  121. source, *params = source.split()
  122. self.screen, left, top, width, height = 0, None, None, None, None # default to full screen 0
  123. if len(params) == 1:
  124. self.screen = int(params[0])
  125. elif len(params) == 4:
  126. left, top, width, height = (int(x) for x in params)
  127. elif len(params) == 5:
  128. self.screen, left, top, width, height = (int(x) for x in params)
  129. self.imgsz = imgsz
  130. self.mode = 'stream'
  131. self.frame = 0
  132. self.sct = mss.mss()
  133. self.bs = 1
  134. # Parse monitor shape
  135. monitor = self.sct.monitors[self.screen]
  136. self.top = monitor['top'] if top is None else (monitor['top'] + top)
  137. self.left = monitor['left'] if left is None else (monitor['left'] + left)
  138. self.width = width or monitor['width']
  139. self.height = height or monitor['height']
  140. self.monitor = {'left': self.left, 'top': self.top, 'width': self.width, 'height': self.height}
  141. def __iter__(self):
  142. """Returns an iterator of the object."""
  143. return self
  144. def __next__(self):
  145. """mss screen capture: get raw pixels from the screen as np array."""
  146. im0 = np.asarray(self.sct.grab(self.monitor))[:, :, :3] # BGRA to BGR
  147. s = f'screen {self.screen} (LTWH): {self.left},{self.top},{self.width},{self.height}: '
  148. self.frame += 1
  149. return [str(self.screen)], [im0], None, s # screen, img, vid_cap, string
  150. class LoadImages:
  151. """YOLOv8 image/video dataloader, i.e. `yolo predict source=image.jpg/vid.mp4`."""
  152. def __init__(self, path, imgsz=640, vid_stride=1):
  153. """Initialize the Dataloader and raise FileNotFoundError if file not found."""
  154. parent = None
  155. if isinstance(path, str) and Path(path).suffix == '.txt': # *.txt file with img/vid/dir on each line
  156. parent = Path(path).parent
  157. path = Path(path).read_text().splitlines() # list of sources
  158. files = []
  159. for p in sorted(path) if isinstance(path, (list, tuple)) else [path]:
  160. a = str(Path(p).absolute()) # do not use .resolve() https://github.com/ultralytics/ultralytics/issues/2912
  161. if '*' in a:
  162. files.extend(sorted(glob.glob(a, recursive=True))) # glob
  163. elif os.path.isdir(a):
  164. files.extend(sorted(glob.glob(os.path.join(a, '*.*')))) # dir
  165. elif os.path.isfile(a):
  166. files.append(a) # files (absolute or relative to CWD)
  167. elif parent and (parent / p).is_file():
  168. files.append(str((parent / p).absolute())) # files (relative to *.txt file parent)
  169. else:
  170. raise FileNotFoundError(f'{p} does not exist')
  171. images = [x for x in files if x.split('.')[-1].lower() in IMG_FORMATS]
  172. videos = [x for x in files if x.split('.')[-1].lower() in VID_FORMATS]
  173. ni, nv = len(images), len(videos)
  174. self.imgsz = imgsz
  175. self.files = images + videos
  176. self.nf = ni + nv # number of files
  177. self.video_flag = [False] * ni + [True] * nv
  178. self.mode = 'image'
  179. self.vid_stride = vid_stride # video frame-rate stride
  180. self.bs = 1
  181. if any(videos):
  182. self._new_video(videos[0]) # new video
  183. else:
  184. self.cap = None
  185. if self.nf == 0:
  186. raise FileNotFoundError(f'No images or videos found in {p}. '
  187. f'Supported formats are:\nimages: {IMG_FORMATS}\nvideos: {VID_FORMATS}')
  188. def __iter__(self):
  189. """Returns an iterator object for VideoStream or ImageFolder."""
  190. self.count = 0
  191. return self
  192. def __next__(self):
  193. """Return next image, path and metadata from dataset."""
  194. if self.count == self.nf:
  195. raise StopIteration
  196. path = self.files[self.count]
  197. if self.video_flag[self.count]:
  198. # Read video
  199. self.mode = 'video'
  200. for _ in range(self.vid_stride):
  201. self.cap.grab()
  202. success, im0 = self.cap.retrieve()
  203. while not success:
  204. self.count += 1
  205. self.cap.release()
  206. if self.count == self.nf: # last video
  207. raise StopIteration
  208. path = self.files[self.count]
  209. self._new_video(path)
  210. success, im0 = self.cap.read()
  211. self.frame += 1
  212. # im0 = self._cv2_rotate(im0) # for use if cv2 autorotation is False
  213. s = f'video {self.count + 1}/{self.nf} ({self.frame}/{self.frames}) {path}: '
  214. else:
  215. # Read image
  216. self.count += 1
  217. im0 = cv2.imread(path) # BGR
  218. if im0 is None:
  219. raise FileNotFoundError(f'Image Not Found {path}')
  220. s = f'image {self.count}/{self.nf} {path}: '
  221. return [path], [im0], self.cap, s
  222. def _new_video(self, path):
  223. """Create a new video capture object."""
  224. self.frame = 0
  225. self.cap = cv2.VideoCapture(path)
  226. self.frames = int(self.cap.get(cv2.CAP_PROP_FRAME_COUNT) / self.vid_stride)
  227. def __len__(self):
  228. """Returns the number of files in the object."""
  229. return self.nf # number of files
  230. class LoadPilAndNumpy:
  231. def __init__(self, im0, imgsz=640):
  232. """Initialize PIL and Numpy Dataloader."""
  233. if not isinstance(im0, list):
  234. im0 = [im0]
  235. self.paths = [getattr(im, 'filename', f'image{i}.jpg') for i, im in enumerate(im0)]
  236. self.im0 = [self._single_check(im) for im in im0]
  237. self.imgsz = imgsz
  238. self.mode = 'image'
  239. # Generate fake paths
  240. self.bs = len(self.im0)
  241. @staticmethod
  242. def _single_check(im):
  243. """Validate and format an image to numpy array."""
  244. assert isinstance(im, (Image.Image, np.ndarray)), f'Expected PIL/np.ndarray image type, but got {type(im)}'
  245. if isinstance(im, Image.Image):
  246. if im.mode != 'RGB':
  247. im = im.convert('RGB')
  248. im = np.asarray(im)[:, :, ::-1]
  249. im = np.ascontiguousarray(im) # contiguous
  250. return im
  251. def __len__(self):
  252. """Returns the length of the 'im0' attribute."""
  253. return len(self.im0)
  254. def __next__(self):
  255. """Returns batch paths, images, processed images, None, ''."""
  256. if self.count == 1: # loop only once as it's batch inference
  257. raise StopIteration
  258. self.count += 1
  259. return self.paths, self.im0, None, ''
  260. def __iter__(self):
  261. """Enables iteration for class LoadPilAndNumpy."""
  262. self.count = 0
  263. return self
  264. class LoadTensor:
  265. def __init__(self, im0) -> None:
  266. self.im0 = self._single_check(im0)
  267. self.bs = self.im0.shape[0]
  268. self.mode = 'image'
  269. self.paths = [getattr(im, 'filename', f'image{i}.jpg') for i, im in enumerate(im0)]
  270. @staticmethod
  271. def _single_check(im, stride=32):
  272. """Validate and format an image to torch.Tensor."""
  273. s = f'WARNING ⚠️ torch.Tensor inputs should be BCHW i.e. shape(1, 3, 640, 640) ' \
  274. f'divisible by stride {stride}. Input shape{tuple(im.shape)} is incompatible.'
  275. if len(im.shape) != 4:
  276. if len(im.shape) != 3:
  277. raise ValueError(s)
  278. LOGGER.warning(s)
  279. im = im.unsqueeze(0)
  280. if im.shape[2] % stride or im.shape[3] % stride:
  281. raise ValueError(s)
  282. if im.max() > 1.0:
  283. LOGGER.warning(f'WARNING ⚠️ torch.Tensor inputs should be normalized 0.0-1.0 but max value is {im.max()}. '
  284. f'Dividing input by 255.')
  285. im = im.float() / 255.0
  286. return im
  287. def __iter__(self):
  288. """Returns an iterator object."""
  289. self.count = 0
  290. return self
  291. def __next__(self):
  292. """Return next item in the iterator."""
  293. if self.count == 1:
  294. raise StopIteration
  295. self.count += 1
  296. return self.paths, self.im0, None, ''
  297. def __len__(self):
  298. """Returns the batch size."""
  299. return self.bs
  300. def autocast_list(source):
  301. """
  302. Merges a list of source of different types into a list of numpy arrays or PIL images
  303. """
  304. files = []
  305. for im in source:
  306. if isinstance(im, (str, Path)): # filename or uri
  307. files.append(Image.open(requests.get(im, stream=True).raw if str(im).startswith('http') else im))
  308. elif isinstance(im, (Image.Image, np.ndarray)): # PIL or np Image
  309. files.append(im)
  310. else:
  311. raise TypeError(f'type {type(im).__name__} is not a supported Ultralytics prediction source type. \n'
  312. f'See https://docs.ultralytics.com/modes/predict for supported source types.')
  313. return files
  314. LOADERS = LoadStreams, LoadPilAndNumpy, LoadImages, LoadScreenshots # tuple
  315. def get_best_youtube_url(url, use_pafy=False):
  316. """
  317. Retrieves the URL of the best quality MP4 video stream from a given YouTube video.
  318. This function uses the pafy or yt_dlp library to extract the video info from YouTube. It then finds the highest
  319. quality MP4 format that has video codec but no audio codec, and returns the URL of this video stream.
  320. Args:
  321. url (str): The URL of the YouTube video.
  322. use_pafy (bool): Use the pafy package, default=True, otherwise use yt_dlp package.
  323. Returns:
  324. (str): The URL of the best quality MP4 video stream, or None if no suitable stream is found.
  325. """
  326. if use_pafy:
  327. check_requirements(('pafy', 'youtube_dl==2020.12.2'))
  328. import pafy # noqa
  329. return pafy.new(url).getbestvideo(preftype='mp4').url
  330. else:
  331. check_requirements('yt-dlp')
  332. import yt_dlp
  333. with yt_dlp.YoutubeDL({'quiet': True}) as ydl:
  334. info_dict = ydl.extract_info(url, download=False) # extract info
  335. for f in reversed(info_dict.get('formats', [])): # reversed because best is usually last
  336. # Find a format with video codec, no audio, *.mp4 extension at least 1920x1080 size
  337. good_size = (f.get('width') or 0) >= 1920 or (f.get('height') or 0) >= 1080
  338. if good_size and f['vcodec'] != 'none' and f['acodec'] == 'none' and f['ext'] == 'mp4':
  339. return f.get('url')