# Ultralytics YOLO 🚀, AGPL-3.0 license
"""
Benchmark a YOLO model across different formats for speed and accuracy.

Usage:
    from ultralytics.utils.benchmarks import ProfileModels, benchmark

    ProfileModels(['yolov8n.yaml', 'yolov8s.yaml']).profile()
    benchmark(model='yolov8n.pt', imgsz=160)

Format                  | `format=argument` | Model
---                     | ---               | ---
PyTorch                 | -                 | yolov8n.pt
TorchScript             | `torchscript`     | yolov8n.torchscript
ONNX                    | `onnx`            | yolov8n.onnx
OpenVINO                | `openvino`        | yolov8n_openvino_model/
TensorRT                | `engine`          | yolov8n.engine
CoreML                  | `coreml`          | yolov8n.mlpackage
TensorFlow SavedModel   | `saved_model`     | yolov8n_saved_model/
TensorFlow GraphDef     | `pb`              | yolov8n.pb
TensorFlow Lite         | `tflite`          | yolov8n.tflite
TensorFlow Edge TPU     | `edgetpu`         | yolov8n_edgetpu.tflite
TensorFlow.js           | `tfjs`            | yolov8n_web_model/
PaddlePaddle            | `paddle`          | yolov8n_paddle_model/
ncnn                    | `ncnn`            | yolov8n_ncnn_model/
"""
import glob
import platform
import sys
import time
from pathlib import Path

import numpy as np
import torch.cuda
from tqdm import tqdm

from ultralytics import YOLO
from ultralytics.cfg import TASK2DATA, TASK2METRIC
from ultralytics.engine.exporter import export_formats
from ultralytics.utils import ASSETS, LINUX, LOGGER, MACOS, SETTINGS
from ultralytics.utils.checks import check_requirements, check_yolo
from ultralytics.utils.files import file_size
from ultralytics.utils.torch_utils import select_device


def benchmark(model=Path(SETTINGS['weights_dir']) / 'yolov8n.pt',
              data=None,
              imgsz=160,
              half=False,
              int8=False,
              device='cpu',
              verbose=False):
  46. """
  47. Benchmark a YOLO model across different formats for speed and accuracy.
  48. Args:
  49. model (str | Path | optional): Path to the model file or directory. Default is
  50. Path(SETTINGS['weights_dir']) / 'yolov8n.pt'.
  51. data (str, optional): Dataset to evaluate on, inherited from TASK2DATA if not passed. Default is None.
  52. imgsz (int, optional): Image size for the benchmark. Default is 160.
  53. half (bool, optional): Use half-precision for the model if True. Default is False.
  54. int8 (bool, optional): Use int8-precision for the model if True. Default is False.
  55. device (str, optional): Device to run the benchmark on, either 'cpu' or 'cuda'. Default is 'cpu'.
  56. verbose (bool | float | optional): If True or a float, assert benchmarks pass with given metric.
  57. Default is False.
  58. Returns:
  59. df (pandas.DataFrame): A pandas DataFrame with benchmark results for each format, including file size,
  60. metric, and inference time.
  61. Example:
  62. ```python
  63. from ultralytics.utils.benchmarks import benchmark
  64. benchmark(model='yolov8n.pt', imgsz=640)
  65. ```
  66. """
    import pandas as pd
    pd.options.display.max_columns = 10
    pd.options.display.width = 120
    device = select_device(device, verbose=False)
    if isinstance(model, (str, Path)):
        model = YOLO(model)

    y = []
    t0 = time.time()
    for i, (name, format, suffix, cpu, gpu) in export_formats().iterrows():  # index, (name, format, suffix, CPU, GPU)
        emoji, filename = '❌', None  # export defaults
        try:
            assert i != 9 or LINUX, 'Edge TPU export only supported on Linux'
            if i == 10:
                assert MACOS or LINUX, 'TF.js export only supported on macOS and Linux'
            elif i == 11:
                assert sys.version_info < (3, 11), 'PaddlePaddle export only supported on Python<=3.10'
            if 'cpu' in device.type:
                assert cpu, 'inference not supported on CPU'
            if 'cuda' in device.type:
                assert gpu, 'inference not supported on GPU'

            # Export
            if format == '-':
                filename = model.ckpt_path or model.cfg
                export = model  # PyTorch format
            else:
                filename = model.export(imgsz=imgsz, format=format, half=half, int8=int8, device=device, verbose=False)
                export = YOLO(filename, task=model.task)
                assert suffix in str(filename), 'export failed'
            emoji = '❎'  # indicates export succeeded

            # Predict
            assert model.task != 'pose' or i != 7, 'GraphDef Pose inference is not supported'
            assert i not in (9, 10), 'inference not supported'  # Edge TPU and TF.js are unsupported
            assert i != 5 or platform.system() == 'Darwin', 'inference only supported on macOS>=10.13'  # CoreML
            export.predict(ASSETS / 'bus.jpg', imgsz=imgsz, device=device, half=half)

            # Validate
            data = data or TASK2DATA[model.task]  # task to dataset, i.e. coco8.yaml for task=detect
            key = TASK2METRIC[model.task]  # task to metric, i.e. metrics/mAP50-95(B) for task=detect
            results = export.val(data=data,
                                 batch=1,
                                 imgsz=imgsz,
                                 plots=False,
                                 device=device,
                                 half=half,
                                 int8=int8,
                                 verbose=False)
            metric, speed = results.results_dict[key], results.speed['inference']
            y.append([name, '✅', round(file_size(filename), 1), round(metric, 4), round(speed, 2)])
        except Exception as e:
            if verbose:
                assert type(e) is AssertionError, f'Benchmark failure for {name}: {e}'
            LOGGER.warning(f'ERROR ❌️ Benchmark failure for {name}: {e}')
            y.append([name, emoji, round(file_size(filename), 1), None, None])  # mAP, t_inference

    # Print results
    check_yolo(device=device)  # print system info
    df = pd.DataFrame(y, columns=['Format', 'Status❔', 'Size (MB)', key, 'Inference time (ms/im)'])

    name = Path(model.ckpt_path).name
    s = f'\nBenchmarks complete for {name} on {data} at imgsz={imgsz} ({time.time() - t0:.2f}s)\n{df}\n'
    LOGGER.info(s)
    with open('benchmarks.log', 'a', errors='ignore', encoding='utf-8') as f:
        f.write(s)

    if verbose and isinstance(verbose, float):
        metrics = df[key].array  # values to compare to floor
        floor = verbose  # minimum metric floor to pass, i.e. = 0.29 mAP for YOLOv5n
        assert all(x > floor for x in metrics if pd.notna(x)), f'Benchmark failure: metric(s) < floor {floor}'

    return df


class ProfileModels:
    """
    ProfileModels class for profiling different models on ONNX and TensorRT.

    This class profiles the performance of different models, provided their paths. The profiling includes parameters
    such as model speed and FLOPs.

    Attributes:
        paths (list): Paths of the models to profile.
        num_timed_runs (int): Number of timed runs for the profiling. Default is 100.
        num_warmup_runs (int): Number of warmup runs before profiling. Default is 10.
        min_time (float): Minimum number of seconds to profile for. Default is 60.
        imgsz (int): Image size used in the models. Default is 640.

    Methods:
        profile(): Profiles the models and prints the result.

    Example:
        ```python
        from ultralytics.utils.benchmarks import ProfileModels

        ProfileModels(['yolov8n.yaml', 'yolov8s.yaml'], imgsz=640).profile()
        ```
    """
    def __init__(self,
                 paths: list,
                 num_timed_runs=100,
                 num_warmup_runs=10,
                 min_time=60,
                 imgsz=640,
                 trt=True,
                 device=None):
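        """
        Initialize the ProfileModels class for profiling models.

        Args:
            paths (list): Paths of the models to profile.
            num_timed_runs (int, optional): Number of timed runs for the profiling. Default is 100.
            num_warmup_runs (int, optional): Number of warmup runs before the timed runs start. Default is 10.
            min_time (float, optional): Minimum number of seconds to profile for. Default is 60.
            imgsz (int, optional): Image size used during profiling. Default is 640.
            trt (bool, optional): If True, also profile an exported TensorRT engine. Default is True.
            device (torch.device | None, optional): Device to profile on. If None, uses CUDA device 0 when
                available, else CPU.
        """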
        self.paths = paths
        self.num_timed_runs = num_timed_runs
        self.num_warmup_runs = num_warmup_runs
        self.min_time = min_time
        self.imgsz = imgsz
        self.trt = trt  # run TensorRT profiling
        self.device = device or torch.device(0 if torch.cuda.is_available() else 'cpu')

    def profile(self):
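        """Profile each model on ONNX and, where available, TensorRT, print a summary table and return result dicts."""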
        files = self.get_files()

        if not files:
            print('No matching *.pt or *.onnx files found.')
            return

        table_rows = []
        output = []
        for file in files:
            engine_file = file.with_suffix('.engine')
            if file.suffix in ('.pt', '.yaml', '.yml'):
                model = YOLO(str(file))
                model.fuse()  # to report correct params and GFLOPs in model.info()
                model_info = model.info()
                if self.trt and self.device.type != 'cpu' and not engine_file.is_file():
                    engine_file = model.export(format='engine',
                                               half=True,
                                               imgsz=self.imgsz,
                                               device=self.device,
                                               verbose=False)
                onnx_file = model.export(format='onnx',
                                         half=True,
                                         imgsz=self.imgsz,
                                         simplify=True,
                                         device=self.device,
                                         verbose=False)
            elif file.suffix == '.onnx':
                model_info = self.get_onnx_model_info(file)
                onnx_file = file
            else:
                continue

            t_engine = self.profile_tensorrt_model(str(engine_file))
            t_onnx = self.profile_onnx_model(str(onnx_file))
            table_rows.append(self.generate_table_row(file.stem, t_onnx, t_engine, model_info))
            output.append(self.generate_results_dict(file.stem, t_onnx, t_engine, model_info))

        self.print_table(table_rows)
        return output

    def get_files(self):
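        """Return a sorted list of model file paths resolved from the given paths, directories, or glob patterns."""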
        files = []
        for path in self.paths:
            path = Path(path)
            if path.is_dir():
                extensions = ['*.pt', '*.onnx', '*.yaml']
                files.extend([file for ext in extensions for file in glob.glob(str(path / ext))])
            elif path.suffix in {'.pt', '.yaml', '.yml'}:  # add non-existing
                files.append(str(path))
            else:
                files.extend(glob.glob(str(path)))

        print(f'Profiling: {sorted(files)}')
        return [Path(file) for file in sorted(files)]

    def get_onnx_model_info(self, onnx_file: str):
        """Placeholder: return (num_layers, num_params, num_gradients, num_flops) for an ONNX model."""
        return 0.0, 0.0, 0.0, 0.0

    def iterative_sigma_clipping(self, data, sigma=2, max_iters=3):
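        """Iteratively drop values more than `sigma` standard deviations from the mean, up to `max_iters` passes."""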
        data = np.array(data)
        for _ in range(max_iters):
            mean, std = np.mean(data), np.std(data)
            clipped_data = data[(data > mean - sigma * std) & (data < mean + sigma * std)]
            if len(clipped_data) == len(data):
                break
            data = clipped_data
        return data

    def profile_tensorrt_model(self, engine_file: str, eps: float = 1e-7):
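        """Profile a TensorRT engine with warmup and timed runs, returning (mean, std) inference time in ms."""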
        if not self.trt or not Path(engine_file).is_file():
            return 0.0, 0.0

        # Model and input
        model = YOLO(engine_file)
        input_data = np.random.rand(self.imgsz, self.imgsz, 3).astype(np.float32)  # must be FP32

        # Warmup runs
        elapsed = 0.0
        for _ in range(3):
            start_time = time.time()
            for _ in range(self.num_warmup_runs):
                model(input_data, imgsz=self.imgsz, verbose=False)
            elapsed = time.time() - start_time

        # Compute number of runs as higher of min_time or num_timed_runs
        num_runs = max(round(self.min_time / (elapsed + eps) * self.num_warmup_runs), self.num_timed_runs * 50)

        # Timed runs
        run_times = []
        for _ in tqdm(range(num_runs), desc=engine_file):
            results = model(input_data, imgsz=self.imgsz, verbose=False)
            run_times.append(results[0].speed['inference'])  # already reported in milliseconds
        run_times = self.iterative_sigma_clipping(np.array(run_times), sigma=2, max_iters=3)  # sigma clipping
        return np.mean(run_times), np.std(run_times)

    def profile_onnx_model(self, onnx_file: str, eps: float = 1e-7):
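        """Profile an ONNX model via onnxruntime with warmup and timed runs, returning (mean, std) time in ms."""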
        check_requirements('onnxruntime')
        import onnxruntime as ort

        # Session with either 'TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider'
        sess_options = ort.SessionOptions()
        sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
        sess_options.intra_op_num_threads = 8  # Limit the number of threads
        sess = ort.InferenceSession(onnx_file, sess_options, providers=['CPUExecutionProvider'])

        input_tensor = sess.get_inputs()[0]
        input_type = input_tensor.type

        # Mapping ONNX datatype to numpy datatype
        if 'float16' in input_type:
            input_dtype = np.float16
        elif 'float' in input_type:
            input_dtype = np.float32
        elif 'double' in input_type:
            input_dtype = np.float64
        elif 'int64' in input_type:
            input_dtype = np.int64
        elif 'int32' in input_type:
            input_dtype = np.int32
        else:
            raise ValueError(f'Unsupported ONNX datatype {input_type}')

        input_data = np.random.rand(*input_tensor.shape).astype(input_dtype)
        input_name = input_tensor.name
        output_name = sess.get_outputs()[0].name

        # Warmup runs
        elapsed = 0.0
        for _ in range(3):
            start_time = time.time()
            for _ in range(self.num_warmup_runs):
                sess.run([output_name], {input_name: input_data})
            elapsed = time.time() - start_time

        # Compute number of runs as higher of min_time or num_timed_runs
        num_runs = max(round(self.min_time / (elapsed + eps) * self.num_warmup_runs), self.num_timed_runs)

        # Timed runs
        run_times = []
        for _ in tqdm(range(num_runs), desc=onnx_file):
            start_time = time.time()
            sess.run([output_name], {input_name: input_data})
            run_times.append((time.time() - start_time) * 1000)  # Convert to milliseconds
        run_times = self.iterative_sigma_clipping(np.array(run_times), sigma=2, max_iters=5)  # sigma clipping
        return np.mean(run_times), np.std(run_times)

    def generate_table_row(self, model_name, t_onnx, t_engine, model_info):
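        """Format a Markdown table row summarizing speed, parameter count, and FLOPs for a single model."""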
        layers, params, gradients, flops = model_info
        return f'| {model_name:18s} | {self.imgsz} | - | {t_onnx[0]:.2f} ± {t_onnx[1]:.2f} ms | {t_engine[0]:.2f} ± {t_engine[1]:.2f} ms | {params / 1e6:.1f} | {flops:.1f} |'

    def generate_results_dict(self, model_name, t_onnx, t_engine, model_info):
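        """Return a dict of profiling results: model name, parameters, GFLOPs, and ONNX/TensorRT speeds in ms."""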
        layers, params, gradients, flops = model_info
        return {
            'model/name': model_name,
            'model/parameters': params,
            'model/GFLOPs': round(flops, 3),
            'model/speed_ONNX(ms)': round(t_onnx[0], 3),
            'model/speed_TensorRT(ms)': round(t_engine[0], 3)}

    def print_table(self, table_rows):
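        """Print a Markdown table of the collected profiling results, one row per model."""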
        gpu = torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'GPU'
        header = f'| Model | size<br><sup>(pixels) | mAP<sup>val<br>50-95 | Speed<br><sup>CPU ONNX<br>(ms) | Speed<br><sup>{gpu} TensorRT<br>(ms) | params<br><sup>(M) | FLOPs<br><sup>(B) |'
        separator = '|-------------|---------------------|--------------------|------------------------------|-----------------------------------|------------------|-----------------|'

        print(f'\n\n{header}')
        print(separator)
        for row in table_rows:
            print(row)