# builtin_dataset_mocks.py

import bz2
import collections.abc
import csv
import functools
import gzip
import io
import itertools
import json
import lzma
import pathlib
import pickle
import random
import shutil
import unittest.mock
import xml.etree.ElementTree as ET
from collections import Counter, defaultdict

import numpy as np
import pytest
import torch
from common_utils import combinations_grid
from datasets_utils import create_image_file, create_image_folder, make_tar, make_zip
from torch.nn.functional import one_hot
from torch.testing import make_tensor as _make_tensor
from torchvision.prototype import datasets

make_tensor = functools.partial(_make_tensor, device="cpu")
make_scalar = functools.partial(make_tensor, ())


__all__ = ["DATASET_MOCKS", "parametrize_dataset_mocks"]


class DatasetMock:
    def __init__(self, name, *, mock_data_fn, configs):
        # FIXME: error handling for unknown names
        self.name = name
        self.mock_data_fn = mock_data_fn
        self.configs = configs

    def _parse_mock_info(self, mock_info):
        if mock_info is None:
            raise pytest.UsageError(
                f"The mock data function for dataset '{self.name}' returned nothing. It needs to at least return an "
                f"integer indicating the number of samples for the current `config`."
            )
        elif isinstance(mock_info, int):
            mock_info = dict(num_samples=mock_info)
        elif not isinstance(mock_info, dict):
            raise pytest.UsageError(
                f"The mock data function for dataset '{self.name}' returned a {type(mock_info)}. The returned object "
                f"should be a dictionary containing at least the number of samples for the key `'num_samples'`. If no "
                f"additional information is required for specific tests, the number of samples can also be returned as "
                f"an integer."
            )
        elif "num_samples" not in mock_info:
            raise pytest.UsageError(
                f"The dictionary returned by the mock data function for dataset '{self.name}' has to contain a "
                f"`'num_samples'` entry indicating the number of samples."
            )

        return mock_info

    def load(self, config):
        # `datasets.home()` is patched to a temporary directory through the autouse fixture `test_home` in
        # test/test_prototype_builtin_datasets.py
        root = pathlib.Path(datasets.home()) / self.name
        # We cannot place the mock data upfront in `root`. Loading a dataset calls `OnlineResource.load`. In turn,
        # this will only download **and** preprocess if the file is not present. In other words, if we already place
        # the file in `root` before the resource is loaded, we are effectively skipping the preprocessing.
        # To avoid that, we first place the mock data in a temporary directory and patch the download logic to move it
        # to `root` only when it is requested.
        tmp_mock_data_folder = root / "__mock__"
        tmp_mock_data_folder.mkdir(parents=True)

        mock_info = self._parse_mock_info(self.mock_data_fn(tmp_mock_data_folder, config))

        def patched_download(resource, root, **kwargs):
            src = tmp_mock_data_folder / resource.file_name
            if not src.exists():
                raise pytest.UsageError(
                    f"Dataset '{self.name}' requires the file {resource.file_name} for {config}, "
                    f"but it was not created by the mock data function."
                )

            dst = root / resource.file_name
            shutil.move(str(src), str(root))

            return dst

        with unittest.mock.patch(
            "torchvision.prototype.datasets.utils._resource.OnlineResource.download", new=patched_download
        ):
            dataset = datasets.load(self.name, **config)

        extra_files = list(tmp_mock_data_folder.glob("**/*"))
        if extra_files:
            raise pytest.UsageError(
                (
                    f"Dataset '{self.name}' created the following files for {config} in the mock data function, "
                    f"but they were not loaded:\n\n"
                )
                + "\n".join(str(file.relative_to(tmp_mock_data_folder)) for file in extra_files)
            )

        tmp_mock_data_folder.rmdir()

        return dataset, mock_info


def config_id(name, config):
    parts = [name]
    # Use a loop variable that does not shadow the `name` parameter above.
    for key, value in config.items():
        if isinstance(value, bool):
            part = ("" if value else "no_") + key
        else:
            part = str(value)
        parts.append(part)
    return "-".join(parts)
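

# For example, `config_id("mnist", dict(split="train"))` yields the test id "mnist-train"; a boolean
# value contributes the key itself when true and a "no_" prefixed key when false (e.g. a hypothetical
# option `shuffle=False` would contribute "no_shuffle").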


def parametrize_dataset_mocks(*dataset_mocks, marks=None):
    mocks = {}
    for mock in dataset_mocks:
        if isinstance(mock, DatasetMock):
            mocks[mock.name] = mock
        elif isinstance(mock, collections.abc.Mapping):
            mocks.update(mock)
        else:
            raise pytest.UsageError(
                f"The positional arguments passed to `parametrize_dataset_mocks` can either be a `DatasetMock`, "
                f"a sequence of `DatasetMock`'s, or a mapping of names to `DatasetMock`'s, "
                f"but got {mock} instead."
            )
    dataset_mocks = mocks

    if marks is None:
        marks = {}
    elif not isinstance(marks, collections.abc.Mapping):
        raise pytest.UsageError("`marks` has to be a mapping of dataset names to pytest marks.")

    return pytest.mark.parametrize(
        ("dataset_mock", "config"),
        [
            pytest.param(dataset_mock, config, id=config_id(name, config), marks=marks.get(name, ()))
            for name, dataset_mock in dataset_mocks.items()
            for config in dataset_mock.configs
        ],
    )
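

# A minimal usage sketch (the test function below is illustrative, not part of this module):
#
#     @parametrize_dataset_mocks(DATASET_MOCKS["mnist"])
#     def test_smoke(dataset_mock, config):
#         dataset, mock_info = dataset_mock.load(config)
#         assert len(list(dataset)) == mock_info["num_samples"]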


DATASET_MOCKS = {}


def register_mock(name=None, *, configs):
    def wrapper(mock_data_fn):
        nonlocal name
        if name is None:
            name = mock_data_fn.__name__
        DATASET_MOCKS[name] = DatasetMock(name, mock_data_fn=mock_data_fn, configs=configs)

        return mock_data_fn

    return wrapper
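

# When `name` is omitted, the name of the decorated mock data function is used as the dataset key,
# which is how the `@register_mock(configs=...)` functions below end up registered under names like
# "emnist" and "qmnist".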


class MNISTMockData:
    _DTYPES_ID = {
        torch.uint8: 8,
        torch.int8: 9,
        torch.int16: 11,
        torch.int32: 12,
        torch.float32: 13,
        torch.float64: 14,
    }
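
    # In the IDX format used by the MNIST family, the third byte of the magic number encodes the
    # element dtype (see `_DTYPES_ID`) and the fourth byte the number of dimensions. Multiplying by
    # 256 shifts the dtype code into place, and `ndim + 1` accounts for the leading `num_samples`
    # dimension that `_create_binary_file` prepends to `shape`.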
    @classmethod
    def _magic(cls, dtype, ndim):
        return cls._DTYPES_ID[dtype] * 256 + ndim + 1

    @staticmethod
    def _encode(t):
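        # Reversing the little-endian bytes of the int32 yields the big-endian encoding the IDX
        # header requires (this assumes a little-endian host).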
        return torch.tensor(t, dtype=torch.int32).numpy().tobytes()[::-1]

    @staticmethod
    def _big_endian_dtype(dtype):
        np_dtype = getattr(np, str(dtype).replace("torch.", ""))().dtype
        return np.dtype(f">{np_dtype.kind}{np_dtype.itemsize}")

    @classmethod
    def _create_binary_file(cls, root, filename, *, num_samples, shape, dtype, compressor, low=0, high):
        with compressor(root / filename, "wb") as fh:
            for meta in (cls._magic(dtype, len(shape)), num_samples, *shape):
                fh.write(cls._encode(meta))

            data = make_tensor((num_samples, *shape), dtype=dtype, low=low, high=high)
            fh.write(data.numpy().astype(cls._big_endian_dtype(dtype)).tobytes())

    @classmethod
    def generate(
        cls,
        root,
        *,
        num_categories,
        num_samples=None,
        images_file,
        labels_file,
        image_size=(28, 28),
        image_dtype=torch.uint8,
        label_size=(),
        label_dtype=torch.uint8,
        compressor=None,
    ):
        if num_samples is None:
            num_samples = num_categories
        if compressor is None:
            compressor = gzip.open

        cls._create_binary_file(
            root,
            images_file,
            num_samples=num_samples,
            shape=image_size,
            dtype=image_dtype,
            compressor=compressor,
            high=float("inf"),
        )
        cls._create_binary_file(
            root,
            labels_file,
            num_samples=num_samples,
            shape=label_size,
            dtype=label_dtype,
            compressor=compressor,
            high=num_categories,
        )

        return num_samples


def mnist(root, config):
    prefix = "train" if config["split"] == "train" else "t10k"
    return MNISTMockData.generate(
        root,
        num_categories=10,
        images_file=f"{prefix}-images-idx3-ubyte.gz",
        labels_file=f"{prefix}-labels-idx1-ubyte.gz",
    )


DATASET_MOCKS.update(
    {
        name: DatasetMock(name, mock_data_fn=mnist, configs=combinations_grid(split=("train", "test")))
        for name in ["mnist", "fashionmnist", "kmnist"]
    }
)


@register_mock(
    configs=combinations_grid(
        split=("train", "test"),
        image_set=("Balanced", "By_Merge", "By_Class", "Letters", "Digits", "MNIST"),
    )
)
def emnist(root, config):
    num_samples_map = {}
    file_names = set()
    for split, image_set in itertools.product(
        ("train", "test"),
        ("Balanced", "By_Merge", "By_Class", "Letters", "Digits", "MNIST"),
    ):
        prefix = f"emnist-{image_set.replace('_', '').lower()}-{split}"
        images_file = f"{prefix}-images-idx3-ubyte.gz"
        labels_file = f"{prefix}-labels-idx1-ubyte.gz"
        file_names.update({images_file, labels_file})
        num_samples_map[(split, image_set)] = MNISTMockData.generate(
            root,
            # The image sets that merge some lower case letters into their respective upper case
            # variants still use dense labels in the data files. Thus, num_categories != len(categories) there.
            num_categories=47 if config["image_set"] in ("Balanced", "By_Merge") else 62,
            images_file=images_file,
            labels_file=labels_file,
        )

    make_zip(root, "emnist-gzip.zip", *file_names)

    return num_samples_map[(config["split"], config["image_set"])]


@register_mock(configs=combinations_grid(split=("train", "test", "test10k", "test50k", "nist")))
def qmnist(root, config):
    num_categories = 10
    if config["split"] == "train":
        num_samples = num_samples_gen = num_categories + 2
        prefix = "qmnist-train"
        suffix = ".gz"
        compressor = gzip.open
    elif config["split"].startswith("test"):
        # The split 'test50k' is defined as the last 50k images beginning at index 10000. Thus, we need to create
        # more than 10000 images for the dataset to not be empty.
        num_samples_gen = 10001
        num_samples = {
            "test": num_samples_gen,
            "test10k": min(num_samples_gen, 10_000),
            "test50k": num_samples_gen - 10_000,
        }[config["split"]]
        prefix = "qmnist-test"
        suffix = ".gz"
        compressor = gzip.open
    else:  # config["split"] == "nist"
        num_samples = num_samples_gen = num_categories + 3
        prefix = "xnist"
        suffix = ".xz"
        compressor = lzma.open

    MNISTMockData.generate(
        root,
        num_categories=num_categories,
        num_samples=num_samples_gen,
        images_file=f"{prefix}-images-idx3-ubyte{suffix}",
        labels_file=f"{prefix}-labels-idx2-int{suffix}",
        label_size=(8,),
        label_dtype=torch.int32,
        compressor=compressor,
    )

    return num_samples


class CIFARMockData:
    NUM_PIXELS = 32 * 32 * 3
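
    # Mirrors the layout of the real CIFAR python batches: a pickled dict with a
    # (num_samples, 3072) uint8 "data" array and a list of integer labels under `labels_key`
    # ("labels" for CIFAR-10, "fine_labels" for CIFAR-100).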
    @classmethod
    def _create_batch_file(cls, root, name, *, num_categories, labels_key, num_samples=1):
        content = {
            "data": make_tensor((num_samples, cls.NUM_PIXELS), dtype=torch.uint8).numpy(),
            labels_key: torch.randint(0, num_categories, size=(num_samples,)).tolist(),
        }
        with open(pathlib.Path(root) / name, "wb") as fh:
            pickle.dump(content, fh)

    @classmethod
    def generate(
        cls,
        root,
        name,
        *,
        folder,
        train_files,
        test_files,
        num_categories,
        labels_key,
    ):
        folder = root / folder
        folder.mkdir()
        files = (*train_files, *test_files)
        for file in files:
            cls._create_batch_file(
                folder,
                file,
                num_categories=num_categories,
                labels_key=labels_key,
            )

        make_tar(root, name, folder, compression="gz")


@register_mock(configs=combinations_grid(split=("train", "test")))
def cifar10(root, config):
    train_files = [f"data_batch_{idx}" for idx in range(1, 6)]
    test_files = ["test_batch"]

    CIFARMockData.generate(
        root=root,
        name="cifar-10-python.tar.gz",
        folder=pathlib.Path("cifar-10-batches-py"),
        train_files=train_files,
        test_files=test_files,
        num_categories=10,
        labels_key="labels",
    )

    return len(train_files if config["split"] == "train" else test_files)


@register_mock(configs=combinations_grid(split=("train", "test")))
def cifar100(root, config):
    train_files = ["train"]
    test_files = ["test"]

    CIFARMockData.generate(
        root=root,
        name="cifar-100-python.tar.gz",
        folder=pathlib.Path("cifar-100-python"),
        train_files=train_files,
        test_files=test_files,
        num_categories=100,
        labels_key="fine_labels",
    )

    return len(train_files if config["split"] == "train" else test_files)


@register_mock(configs=[dict()])
def caltech101(root, config):
    def create_ann_file(root, name):
        import scipy.io

        box_coord = make_tensor((1, 4), dtype=torch.int32, low=0).numpy().astype(np.uint16)
        obj_contour = make_tensor((2, int(torch.randint(3, 6, size=()))), dtype=torch.float64, low=0).numpy()

        scipy.io.savemat(str(pathlib.Path(root) / name), dict(box_coord=box_coord, obj_contour=obj_contour))

    def create_ann_folder(root, name, file_name_fn, num_examples):
        root = pathlib.Path(root) / name
        root.mkdir(parents=True)

        for idx in range(num_examples):
            create_ann_file(root, file_name_fn(idx))

    images_root = root / "101_ObjectCategories"
    anns_root = root / "Annotations"

    image_category_map = {
        "Faces": "Faces_2",
        "Faces_easy": "Faces_3",
        "Motorbikes": "Motorbikes_16",
        "airplanes": "Airplanes_Side_2",
    }

    categories = ["Faces", "Faces_easy", "Motorbikes", "airplanes", "yin_yang"]

    num_images_per_category = 2
    for category in categories:
        create_image_folder(
            root=images_root,
            name=category,
            file_name_fn=lambda idx: f"image_{idx + 1:04d}.jpg",
            num_examples=num_images_per_category,
        )
        create_ann_folder(
            root=anns_root,
            name=image_category_map.get(category, category),
            file_name_fn=lambda idx: f"annotation_{idx + 1:04d}.mat",
            num_examples=num_images_per_category,
        )

    (images_root / "BACKGROUND_Google").mkdir()
    make_tar(root, f"{images_root.name}.tar.gz", images_root, compression="gz")

    make_tar(root, f"{anns_root.name}.tar", anns_root)

    return num_images_per_category * len(categories)


@register_mock(configs=[dict()])
def caltech256(root, config):
    dir = root / "256_ObjectCategories"
    num_images_per_category = 2

    categories = [
        (1, "ak47"),
        (127, "laptop-101"),
        (198, "spider"),
        (257, "clutter"),
    ]

    for category_idx, category in categories:
        files = create_image_folder(
            dir,
            name=f"{category_idx:03d}.{category}",
            file_name_fn=lambda image_idx: f"{category_idx:03d}_{image_idx + 1:04d}.jpg",
            num_examples=num_images_per_category,
        )
        if category == "spider":
            open(files[0].parent / "RENAME2", "w").close()

    make_tar(root, f"{dir.name}.tar", dir)

    return num_images_per_category * len(categories)


@register_mock(configs=combinations_grid(split=("train", "val", "test")))
def imagenet(root, config):
    from scipy.io import savemat

    info = datasets.info("imagenet")

    if config["split"] == "train":
        num_samples = len(info["wnids"])
        archive_name = "ILSVRC2012_img_train.tar"

        files = []
        for wnid in info["wnids"]:
            create_image_folder(
                root=root,
                name=wnid,
                file_name_fn=lambda image_idx: f"{wnid}_{image_idx:04d}.JPEG",
                num_examples=1,
            )
            files.append(make_tar(root, f"{wnid}.tar"))
    elif config["split"] == "val":
        num_samples = 3
        archive_name = "ILSVRC2012_img_val.tar"
        files = [create_image_file(root, f"ILSVRC2012_val_{idx + 1:08d}.JPEG") for idx in range(num_samples)]

        devkit_root = root / "ILSVRC2012_devkit_t12"
        data_root = devkit_root / "data"
        data_root.mkdir(parents=True)

        with open(data_root / "ILSVRC2012_validation_ground_truth.txt", "w") as file:
            for label in torch.randint(0, len(info["wnids"]), (num_samples,)).tolist():
                file.write(f"{label}\n")

        num_children = 0
        synsets = [
            (idx, wnid, category, "", num_children, [], 0, 0)
            for idx, (category, wnid) in enumerate(zip(info["categories"], info["wnids"]), 1)
        ]
        num_children = 1
        synsets.extend((0, "", "", "", num_children, [], 0, 0) for _ in range(5))
        synsets = np.array(
            synsets,
            dtype=np.dtype(
                [
                    ("ILSVRC2012_ID", "O"),
                    ("WNID", "O"),
                    ("words", "O"),
                    ("gloss", "O"),
                    ("num_children", "O"),
                    ("children", "O"),
                    ("wordnet_height", "O"),
                    ("num_train_images", "O"),
                ]
            ),
        )
        savemat(data_root / "meta.mat", dict(synsets=synsets))

        make_tar(root, devkit_root.with_suffix(".tar.gz").name, compression="gz")
    else:  # config["split"] == "test"
        num_samples = 5
        archive_name = "ILSVRC2012_img_test_v10102019.tar"
        files = [create_image_file(root, f"ILSVRC2012_test_{idx + 1:08d}.JPEG") for idx in range(num_samples)]

    make_tar(root, archive_name, *files)

    return num_samples


class CocoMockData:
    @classmethod
    def _make_annotations_json(
        cls,
        root,
        name,
        *,
        images_meta,
        fn,
    ):
        num_anns_per_image = torch.randint(1, 5, (len(images_meta),))
        num_anns_total = int(num_anns_per_image.sum())
        ann_ids_iter = iter(torch.arange(num_anns_total)[torch.randperm(num_anns_total)])

        anns_meta = []
        for image_meta, num_anns in zip(images_meta, num_anns_per_image):
            for _ in range(num_anns):
                ann_id = int(next(ann_ids_iter))
                anns_meta.append(dict(fn(ann_id, image_meta), id=ann_id, image_id=image_meta["id"]))
        anns_meta.sort(key=lambda ann: ann["id"])

        with open(root / name, "w") as file:
            json.dump(dict(images=images_meta, annotations=anns_meta), file)

        return num_anns_per_image

    @staticmethod
    def _make_instances_data(ann_id, image_meta):
        def make_rle_segmentation():
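            # COCO stores crowd segmentations as uncompressed RLE: `counts` alternates run lengths
            # of background and foreground pixels and has to sum to exactly height * width. The loop
            # below overshoots on purpose and the final run is trimmed so the sum matches.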
            height, width = image_meta["height"], image_meta["width"]
            numel = height * width
            counts = []
            while sum(counts) <= numel:
                counts.append(int(torch.randint(5, 8, ())))
            if sum(counts) > numel:
                counts[-1] -= sum(counts) - numel
            return dict(counts=counts, size=[height, width])

        return dict(
            segmentation=make_rle_segmentation(),
            bbox=make_tensor((4,), dtype=torch.float32, low=0).tolist(),
            iscrowd=True,
            area=float(make_scalar(dtype=torch.float32)),
            category_id=int(make_scalar(dtype=torch.int64)),
        )

    @staticmethod
    def _make_captions_data(ann_id, image_meta):
        return dict(caption=f"Caption {ann_id} describing image {image_meta['id']}.")

    @classmethod
    def _make_annotations(cls, root, name, *, images_meta):
        num_anns_per_image = torch.zeros((len(images_meta),), dtype=torch.int64)
        for annotations, fn in (
            ("instances", cls._make_instances_data),
            ("captions", cls._make_captions_data),
        ):
            num_anns_per_image += cls._make_annotations_json(
                root, f"{annotations}_{name}.json", images_meta=images_meta, fn=fn
            )

        return int(num_anns_per_image.sum())

    @classmethod
    def generate(
        cls,
        root,
        *,
        split,
        year,
        num_samples,
    ):
        annotations_dir = root / "annotations"
        annotations_dir.mkdir()

        for split_ in ("train", "val"):
            config_name = f"{split_}{year}"

            images_meta = [
                dict(
                    file_name=f"{idx:012d}.jpg",
                    id=idx,
                    width=width,
                    height=height,
                )
                for idx, (height, width) in enumerate(
                    torch.randint(3, 11, size=(num_samples, 2), dtype=torch.int).tolist()
                )
            ]

            if split_ == split:
                create_image_folder(
                    root,
                    config_name,
                    file_name_fn=lambda idx: images_meta[idx]["file_name"],
                    num_examples=num_samples,
                    size=lambda idx: (3, images_meta[idx]["height"], images_meta[idx]["width"]),
                )
                make_zip(root, f"{config_name}.zip")

            cls._make_annotations(
                annotations_dir,
                config_name,
                images_meta=images_meta,
            )

        make_zip(root, f"annotations_trainval{year}.zip", annotations_dir)

        return num_samples


@register_mock(
    configs=combinations_grid(
        split=("train", "val"),
        year=("2017", "2014"),
        annotations=("instances", "captions", None),
    )
)
def coco(root, config):
    return CocoMockData.generate(root, split=config["split"], year=config["year"], num_samples=5)


class SBDMockData:
    _NUM_CATEGORIES = 20

    @classmethod
    def _make_split_files(cls, root_map, *, split):
        splits_and_idcs = [
            ("train", [0, 1, 2]),
            ("val", [3]),
        ]
        if split == "train_noval":
            splits_and_idcs.append(("train_noval", [0, 2]))

        ids_map = {split: [f"2008_{idx:06d}" for idx in idcs] for split, idcs in splits_and_idcs}

        for split, ids in ids_map.items():
            with open(root_map[split] / f"{split}.txt", "w") as fh:
                fh.writelines(f"{id}\n" for id in ids)

        return sorted(set(itertools.chain(*ids_map.values()))), {split: len(ids) for split, ids in ids_map.items()}

    @classmethod
    def _make_anns_folder(cls, root, name, ids):
        from scipy.io import savemat

        anns_folder = root / name
        anns_folder.mkdir()

        sizes = torch.randint(1, 9, size=(len(ids), 2)).tolist()
        for id, size in zip(ids, sizes):
            savemat(
                anns_folder / f"{id}.mat",
                {
                    "GTcls": {
                        "Boundaries": cls._make_boundaries(size),
                        "Segmentation": cls._make_segmentation(size),
                    }
                },
            )
        return sizes

    @classmethod
    def _make_boundaries(cls, size):
        from scipy.sparse import csc_matrix

        return [
            [csc_matrix(torch.randint(0, 2, size=size, dtype=torch.uint8).numpy())] for _ in range(cls._NUM_CATEGORIES)
        ]

    @classmethod
    def _make_segmentation(cls, size):
        return torch.randint(0, cls._NUM_CATEGORIES + 1, size=size, dtype=torch.uint8).numpy()

    @classmethod
    def generate(cls, root, *, split):
        archive_folder = root / "benchmark_RELEASE"
        dataset_folder = archive_folder / "dataset"
        dataset_folder.mkdir(parents=True, exist_ok=True)

        ids, num_samples_map = cls._make_split_files(
            defaultdict(lambda: dataset_folder, {"train_noval": root}), split=split
        )
        sizes = cls._make_anns_folder(dataset_folder, "cls", ids)
        create_image_folder(
            dataset_folder, "img", lambda idx: f"{ids[idx]}.jpg", num_examples=len(ids), size=lambda idx: sizes[idx]
        )

        make_tar(root, "benchmark.tgz", archive_folder, compression="gz")

        return num_samples_map[split]


@register_mock(configs=combinations_grid(split=("train", "val", "train_noval")))
def sbd(root, config):
    return SBDMockData.generate(root, split=config["split"])


@register_mock(configs=[dict()])
def semeion(root, config):
    num_samples = 3
    num_categories = 10

    images = torch.rand(num_samples, 256)
    labels = one_hot(torch.randint(num_categories, size=(num_samples,)), num_classes=num_categories)
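    # Each written line holds the 256 pixel values followed by the ten digits of the one-hot label.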
    with open(root / "semeion.data", "w") as fh:
        for image, one_hot_label in zip(images, labels):
            image_columns = " ".join([f"{pixel.item():.4f}" for pixel in image])
            labels_columns = " ".join([str(label.item()) for label in one_hot_label])
            fh.write(f"{image_columns} {labels_columns} \n")

    return num_samples


class VOCMockData:
    _TRAIN_VAL_FILE_NAMES = {
        "2007": "VOCtrainval_06-Nov-2007.tar",
        "2008": "VOCtrainval_14-Jul-2008.tar",
        "2009": "VOCtrainval_11-May-2009.tar",
        "2010": "VOCtrainval_03-May-2010.tar",
        "2011": "VOCtrainval_25-May-2011.tar",
        "2012": "VOCtrainval_11-May-2012.tar",
    }
    _TEST_FILE_NAMES = {
        "2007": "VOCtest_06-Nov-2007.tar",
    }

    @classmethod
    def _make_split_files(cls, root, *, year, trainval):
        split_folder = root / "ImageSets"

        if trainval:
            idcs_map = {
                "train": [0, 1, 2],
                "val": [3, 4],
            }
            idcs_map["trainval"] = [*idcs_map["train"], *idcs_map["val"]]
        else:
            idcs_map = {
                "test": [5],
            }
        ids_map = {split: [f"{year}_{idx:06d}" for idx in idcs] for split, idcs in idcs_map.items()}

        for task_sub_folder in ("Main", "Segmentation"):
            task_folder = split_folder / task_sub_folder
            task_folder.mkdir(parents=True, exist_ok=True)
            for split, ids in ids_map.items():
                with open(task_folder / f"{split}.txt", "w") as fh:
                    fh.writelines(f"{id}\n" for id in ids)

        return sorted(set(itertools.chain(*ids_map.values()))), {split: len(ids) for split, ids in ids_map.items()}

    @classmethod
    def _make_detection_anns_folder(cls, root, name, *, file_name_fn, num_examples):
        folder = root / name
        folder.mkdir(parents=True, exist_ok=True)

        for idx in range(num_examples):
            cls._make_detection_ann_file(folder, file_name_fn(idx))

    @classmethod
    def _make_detection_ann_file(cls, root, name):
        def add_child(parent, name, text=None):
            child = ET.SubElement(parent, name)
            child.text = str(text)
            return child

        def add_name(obj, name="dog"):
            add_child(obj, "name", name)

        def add_size(obj):
            obj = add_child(obj, "size")
            size = {"width": 0, "height": 0, "depth": 3}
            for name, text in size.items():
                add_child(obj, name, text)

        def add_bndbox(obj):
            obj = add_child(obj, "bndbox")
            bndbox = {"xmin": 1, "xmax": 2, "ymin": 3, "ymax": 4}
            for name, text in bndbox.items():
                add_child(obj, name, text)

        annotation = ET.Element("annotation")
        add_size(annotation)
        obj = add_child(annotation, "object")
        add_name(obj)
        add_bndbox(obj)

        with open(root / name, "wb") as fh:
            fh.write(ET.tostring(annotation))

    @classmethod
    def generate(cls, root, *, year, trainval):
        archive_folder = root
        if year == "2011":
            archive_folder = root / "TrainVal"
            data_folder = archive_folder / "VOCdevkit"
        else:
            archive_folder = data_folder = root / "VOCdevkit"
        data_folder = data_folder / f"VOC{year}"
        data_folder.mkdir(parents=True, exist_ok=True)

        ids, num_samples_map = cls._make_split_files(data_folder, year=year, trainval=trainval)
        for make_folder_fn, name, suffix in [
            (create_image_folder, "JPEGImages", ".jpg"),
            (create_image_folder, "SegmentationClass", ".png"),
            (cls._make_detection_anns_folder, "Annotations", ".xml"),
        ]:
            make_folder_fn(data_folder, name, file_name_fn=lambda idx: ids[idx] + suffix, num_examples=len(ids))
        make_tar(root, (cls._TRAIN_VAL_FILE_NAMES if trainval else cls._TEST_FILE_NAMES)[year], archive_folder)

        return num_samples_map


@register_mock(
    configs=[
        *combinations_grid(
            split=("train", "val", "trainval"),
            year=("2007", "2008", "2009", "2010", "2011", "2012"),
            task=("detection", "segmentation"),
        ),
        *combinations_grid(
            split=("test",),
            year=("2007",),
            task=("detection", "segmentation"),
        ),
    ],
)
def voc(root, config):
    trainval = config["split"] != "test"
    return VOCMockData.generate(root, year=config["year"], trainval=trainval)[config["split"]]


class CelebAMockData:
    @classmethod
    def _make_ann_file(cls, root, name, data, *, field_names=None):
        with open(root / name, "w") as file:
            if field_names:
                file.write(f"{len(data)}\r\n")
                file.write(" ".join(field_names) + "\r\n")
            file.writelines(" ".join(str(item) for item in row) + "\r\n" for row in data)

    _SPLIT_TO_IDX = {
        "train": 0,
        "val": 1,
        "test": 2,
    }

    @classmethod
    def _make_split_file(cls, root):
        num_samples_map = {"train": 4, "val": 3, "test": 2}

        data = [
            (f"{idx:06d}.jpg", cls._SPLIT_TO_IDX[split])
            for split, num_samples in num_samples_map.items()
            for idx in range(num_samples)
        ]
        cls._make_ann_file(root, "list_eval_partition.txt", data)

        image_file_names, _ = zip(*data)
        return image_file_names, num_samples_map

    @classmethod
    def _make_identity_file(cls, root, image_file_names):
        cls._make_ann_file(
            root, "identity_CelebA.txt", [(name, int(make_scalar(low=1, dtype=torch.int))) for name in image_file_names]
        )

    @classmethod
    def _make_attributes_file(cls, root, image_file_names):
        field_names = ("5_o_Clock_Shadow", "Young")
        data = [
            [name, *[" 1" if attr else "-1" for attr in make_tensor((len(field_names),), dtype=torch.bool)]]
            for name in image_file_names
        ]
        cls._make_ann_file(root, "list_attr_celeba.txt", data, field_names=(*field_names, ""))

    @classmethod
    def _make_bounding_boxes_file(cls, root, image_file_names):
        field_names = ("image_id", "x_1", "y_1", "width", "height")
        data = [
            [f"{name} ", *[f"{coord:3d}" for coord in make_tensor((4,), low=0, dtype=torch.int).tolist()]]
            for name in image_file_names
        ]
        cls._make_ann_file(root, "list_bbox_celeba.txt", data, field_names=field_names)

    @classmethod
    def _make_landmarks_file(cls, root, image_file_names):
        field_names = ("lefteye_x", "lefteye_y", "rightmouth_x", "rightmouth_y")
        data = [
            [
                name,
                *[
                    f"{coord:4d}" if idx else coord
                    for idx, coord in enumerate(make_tensor((len(field_names),), low=0, dtype=torch.int).tolist())
                ],
            ]
            for name in image_file_names
        ]
        cls._make_ann_file(root, "list_landmarks_align_celeba.txt", data, field_names=field_names)

    @classmethod
    def generate(cls, root):
        image_file_names, num_samples_map = cls._make_split_file(root)

        image_files = create_image_folder(
            root, "img_align_celeba", file_name_fn=lambda idx: image_file_names[idx], num_examples=len(image_file_names)
        )
        make_zip(root, image_files[0].parent.with_suffix(".zip").name)

        for make_ann_file_fn in (
            cls._make_identity_file,
            cls._make_attributes_file,
            cls._make_bounding_boxes_file,
            cls._make_landmarks_file,
        ):
            make_ann_file_fn(root, image_file_names)

        return num_samples_map


@register_mock(configs=combinations_grid(split=("train", "val", "test")))
def celeba(root, config):
    return CelebAMockData.generate(root)[config["split"]]


@register_mock(configs=combinations_grid(split=("train", "val", "test")))
def country211(root, config):
    split_folder = pathlib.Path(root, "country211", "valid" if config["split"] == "val" else config["split"])
    split_folder.mkdir(parents=True, exist_ok=True)

    num_examples = {
        "train": 3,
        "val": 4,
        "test": 5,
    }[config["split"]]

    classes = ("AD", "BS", "GR")
    for cls in classes:
        create_image_folder(
            split_folder,
            name=cls,
            file_name_fn=lambda idx: f"{idx}.jpg",
            num_examples=num_examples,
        )
    make_tar(root, f"{split_folder.parent.name}.tgz", split_folder.parent, compression="gz")
    return num_examples * len(classes)


@register_mock(configs=combinations_grid(split=("train", "test")))
def food101(root, config):
    data_folder = root / "food-101"

    num_images_per_class = 3
    image_folder = data_folder / "images"
    categories = ["apple_pie", "baby_back_ribs", "waffles"]
    image_ids = []
    for category in categories:
        image_files = create_image_folder(
            image_folder,
            category,
            file_name_fn=lambda idx: f"{idx:04d}.jpg",
            num_examples=num_images_per_class,
        )
        image_ids.extend(path.relative_to(path.parents[1]).with_suffix("").as_posix() for path in image_files)

    meta_folder = data_folder / "meta"
    meta_folder.mkdir()

    with open(meta_folder / "classes.txt", "w") as file:
        for category in categories:
            file.write(f"{category}\n")

    splits = ["train", "test"]
    num_samples_map = {}
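    # Deal the image ids out round-robin over the splits so that both splits are non-empty.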
    for offset, split in enumerate(splits):
        image_ids_in_split = image_ids[offset :: len(splits)]
        num_samples_map[split] = len(image_ids_in_split)
        with open(meta_folder / f"{split}.txt", "w") as file:
            for image_id in image_ids_in_split:
                file.write(f"{image_id}\n")

    make_tar(root, f"{data_folder.name}.tar.gz", compression="gz")

    return num_samples_map[config["split"]]


@register_mock(configs=combinations_grid(split=("train", "val", "test"), fold=(1, 4, 10)))
def dtd(root, config):
    data_folder = root / "dtd"

    num_images_per_class = 3
    image_folder = data_folder / "images"
    categories = {"banded", "marbled", "zigzagged"}
    image_ids_per_category = {
        category: [
            str(path.relative_to(path.parents[1]).as_posix())
            for path in create_image_folder(
                image_folder,
                category,
                file_name_fn=lambda idx: f"{category}_{idx:04d}.jpg",
                num_examples=num_images_per_class,
            )
        ]
        for category in categories
    }

    meta_folder = data_folder / "labels"
    meta_folder.mkdir()

    with open(meta_folder / "labels_joint_anno.txt", "w") as file:
        for cls, image_ids in image_ids_per_category.items():
            for image_id in image_ids:
                joint_categories = random.choices(
                    list(categories - {cls}), k=int(torch.randint(len(categories) - 1, ()))
                )
                file.write(" ".join([image_id, *sorted([cls, *joint_categories])]) + "\n")

    image_ids = list(itertools.chain(*image_ids_per_category.values()))
    splits = ("train", "val", "test")
    num_samples_map = {}
    for fold in range(1, 11):
        random.shuffle(image_ids)
        for offset, split in enumerate(splits):
            image_ids_in_config = image_ids[offset :: len(splits)]
            with open(meta_folder / f"{split}{fold}.txt", "w") as file:
                file.write("\n".join(image_ids_in_config) + "\n")

            num_samples_map[(split, fold)] = len(image_ids_in_config)

    make_tar(root, "dtd-r1.0.1.tar.gz", data_folder, compression="gz")

    return num_samples_map[config["split"], config["fold"]]


@register_mock(configs=combinations_grid(split=("train", "test")))
def fer2013(root, config):
    split = config["split"]
    num_samples = 5 if split == "train" else 3

    path = root / f"{split}.csv"
    with open(path, "w", newline="") as file:
        field_names = ["emotion"] if split == "train" else []
        field_names.append("pixels")

        file.write(",".join(field_names) + "\n")

        writer = csv.DictWriter(file, fieldnames=field_names, quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
        for _ in range(num_samples):
            rowdict = {
                "pixels": " ".join([str(int(pixel)) for pixel in torch.randint(256, (48 * 48,), dtype=torch.uint8)])
            }
            if split == "train":
                rowdict["emotion"] = int(torch.randint(7, ()))
            writer.writerow(rowdict)

    make_zip(root, f"{path.name}.zip", path)

    return num_samples


@register_mock(configs=combinations_grid(split=("train", "test")))
def gtsrb(root, config):
    num_examples_per_class = 5 if config["split"] == "train" else 3
    classes = ("00000", "00042", "00012")
    num_examples = num_examples_per_class * len(classes)

    csv_columns = ["Filename", "Width", "Height", "Roi.X1", "Roi.Y1", "Roi.X2", "Roi.Y2", "ClassId"]

    def _make_ann_file(path, num_examples, class_idx):
        if class_idx == "random":
            class_idx = torch.randint(1, len(classes) + 1, size=(1,)).item()

        with open(path, "w") as csv_file:
            writer = csv.DictWriter(csv_file, fieldnames=csv_columns, delimiter=";")
            writer.writeheader()
            for image_idx in range(num_examples):
                writer.writerow(
                    {
                        "Filename": f"{image_idx:05d}.ppm",
                        "Width": torch.randint(1, 100, size=()).item(),
                        "Height": torch.randint(1, 100, size=()).item(),
                        "Roi.X1": torch.randint(1, 100, size=()).item(),
                        "Roi.Y1": torch.randint(1, 100, size=()).item(),
                        "Roi.X2": torch.randint(1, 100, size=()).item(),
                        "Roi.Y2": torch.randint(1, 100, size=()).item(),
                        "ClassId": class_idx,
                    }
                )

    archive_folder = root / "GTSRB"

    if config["split"] == "train":
        train_folder = archive_folder / "Training"
        train_folder.mkdir(parents=True)

        for class_idx in classes:
            create_image_folder(
                train_folder,
                name=class_idx,
                file_name_fn=lambda image_idx: f"{class_idx}_{image_idx:05d}.ppm",
                num_examples=num_examples_per_class,
            )
            _make_ann_file(
                path=train_folder / class_idx / f"GT-{class_idx}.csv",
                num_examples=num_examples_per_class,
                class_idx=int(class_idx),
            )
        make_zip(root, "GTSRB-Training_fixed.zip", archive_folder)
    else:
        test_folder = archive_folder / "Final_Test"
        test_folder.mkdir(parents=True)

        create_image_folder(
            test_folder,
            name="Images",
            file_name_fn=lambda image_idx: f"{image_idx:05d}.ppm",
            num_examples=num_examples,
        )

        make_zip(root, "GTSRB_Final_Test_Images.zip", archive_folder)

        _make_ann_file(
            path=root / "GT-final_test.csv",
            num_examples=num_examples,
            class_idx="random",
        )

        make_zip(root, "GTSRB_Final_Test_GT.zip", "GT-final_test.csv")

    return num_examples


@register_mock(configs=combinations_grid(split=("train", "val", "test")))
def clevr(root, config):
    data_folder = root / "CLEVR_v1.0"

    num_samples_map = {
        "train": 3,
        "val": 2,
        "test": 1,
    }

    images_folder = data_folder / "images"
    image_files = {
        split: create_image_folder(
            images_folder,
            split,
            file_name_fn=lambda idx: f"CLEVR_{split}_{idx:06d}.jpg",
            num_examples=num_samples,
        )
        for split, num_samples in num_samples_map.items()
    }

    scenes_folder = data_folder / "scenes"
    scenes_folder.mkdir()
    for split in ["train", "val"]:
        with open(scenes_folder / f"CLEVR_{split}_scenes.json", "w") as file:
            json.dump(
                {
                    "scenes": [
                        {
                            "image_filename": image_file.name,
                            # We currently only return the number of objects in a scene.
                            # Thus, it is sufficient for now to only mock the number of elements.
                            "objects": [None] * int(torch.randint(1, 5, ())),
                        }
                        for image_file in image_files[split]
                    ]
                },
                file,
            )

    make_zip(root, f"{data_folder.name}.zip", data_folder)

    return num_samples_map[config["split"]]


class OxfordIIITPetMockData:
    @classmethod
    def _meta_to_split_and_classification_ann(cls, meta, idx):
        image_id = "_".join(
            [
                *[(str.title if meta["species"] == "cat" else str.lower)(part) for part in meta["cls"].split()],
                str(idx),
            ]
        )
        class_id = str(meta["label"] + 1)
        species = "1" if meta["species"] == "cat" else "2"
        breed_id = "-1"
        return (image_id, class_id, species, breed_id)

    @classmethod
    def generate(cls, root):
        classification_anns_meta = (
            dict(cls="Abyssinian", label=0, species="cat"),
            dict(cls="Keeshond", label=18, species="dog"),
            dict(cls="Yorkshire Terrier", label=36, species="dog"),
        )
        split_and_classification_anns = [
            cls._meta_to_split_and_classification_ann(meta, idx)
            for meta, idx in itertools.product(classification_anns_meta, (1, 2, 10))
        ]
        image_ids, *_ = zip(*split_and_classification_anns)

        image_files = create_image_folder(
            root, "images", file_name_fn=lambda idx: f"{image_ids[idx]}.jpg", num_examples=len(image_ids)
        )

        anns_folder = root / "annotations"
        anns_folder.mkdir()
        random.shuffle(split_and_classification_anns)
        splits = ("trainval", "test")
        num_samples_map = {}
        for offset, split in enumerate(splits):
            split_and_classification_anns_in_split = split_and_classification_anns[offset :: len(splits)]
            with open(anns_folder / f"{split}.txt", "w") as file:
                writer = csv.writer(file, delimiter=" ")
                for split_and_classification_ann in split_and_classification_anns_in_split:
                    writer.writerow(split_and_classification_ann)
            num_samples_map[split] = len(split_and_classification_anns_in_split)

        segmentation_files = create_image_folder(
            anns_folder, "trimaps", file_name_fn=lambda idx: f"{image_ids[idx]}.png", num_examples=len(image_ids)
        )

        # The dataset has some rogue files
        for path in image_files[:3]:
            path.with_suffix(".mat").touch()
        for path in segmentation_files:
            path.with_name(f".{path.name}").touch()

        make_tar(root, "images.tar.gz", compression="gz")
        make_tar(root, anns_folder.with_suffix(".tar.gz").name, compression="gz")

        return num_samples_map


@register_mock(name="oxford-iiit-pet", configs=combinations_grid(split=("trainval", "test")))
def oxford_iiit_pet(root, config):
    return OxfordIIITPetMockData.generate(root)[config["split"]]


class _CUB200MockData:
    @classmethod
    def _category_folder(cls, category, idx):
        return f"{idx:03d}.{category}"

    @classmethod
    def _file_stem(cls, category, idx):
        return f"{category}_{idx:04d}"

    @classmethod
    def _make_images(cls, images_folder):
        image_files = []
        for category_idx, category in [
            (1, "Black_footed_Albatross"),
            (100, "Brown_Pelican"),
            (200, "Common_Yellowthroat"),
        ]:
            image_files.extend(
                create_image_folder(
                    images_folder,
                    cls._category_folder(category, category_idx),
                    lambda image_idx: f"{cls._file_stem(category, image_idx)}.jpg",
                    num_examples=5,
                )
            )

        return image_files


class CUB2002011MockData(_CUB200MockData):
    @classmethod
    def _make_archive(cls, root):
        archive_folder = root / "CUB_200_2011"

        images_folder = archive_folder / "images"
        image_files = cls._make_images(images_folder)
        image_ids = list(range(1, len(image_files) + 1))

        with open(archive_folder / "images.txt", "w") as file:
            file.write(
                "\n".join(
                    f"{id} {path.relative_to(images_folder).as_posix()}" for id, path in zip(image_ids, image_files)
                )
            )
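
        # In train_test_split.txt a split id of 1 marks a training image and 0 a test image, which
        # is why the counts below map 1 -> "train" and 0 -> "test".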
        split_ids = torch.randint(2, (len(image_ids),)).tolist()
        counts = Counter(split_ids)
        num_samples_map = {"train": counts[1], "test": counts[0]}
        with open(archive_folder / "train_test_split.txt", "w") as file:
            file.write("\n".join(f"{image_id} {split_id}" for image_id, split_id in zip(image_ids, split_ids)))

        with open(archive_folder / "bounding_boxes.txt", "w") as file:
            file.write(
                "\n".join(
                    " ".join(
                        str(item)
                        for item in [image_id, *make_tensor((4,), dtype=torch.int, low=0).to(torch.float).tolist()]
                    )
                    for image_id in image_ids
                )
            )

        make_tar(root, archive_folder.with_suffix(".tgz").name, compression="gz")

        return image_files, num_samples_map

    @classmethod
    def _make_segmentations(cls, root, image_files):
        segmentations_folder = root / "segmentations"
        for image_file in image_files:
            folder = segmentations_folder.joinpath(image_file.relative_to(image_file.parents[1]))
            folder.mkdir(exist_ok=True, parents=True)
            create_image_file(
                folder,
                image_file.with_suffix(".png").name,
                size=[1, *make_tensor((2,), low=3, dtype=torch.int).tolist()],
            )

        make_tar(root, segmentations_folder.with_suffix(".tgz").name, compression="gz")

    @classmethod
    def generate(cls, root):
        image_files, num_samples_map = cls._make_archive(root)
        cls._make_segmentations(root, image_files)
        return num_samples_map


class CUB2002010MockData(_CUB200MockData):
    @classmethod
    def _make_hidden_rogue_file(cls, *files):
        for file in files:
            (file.parent / f"._{file.name}").touch()

    @classmethod
    def _make_splits(cls, root, image_files):
        split_folder = root / "lists"
        split_folder.mkdir()
        random.shuffle(image_files)
        splits = ("train", "test")
        num_samples_map = {}
        for offset, split in enumerate(splits):
            image_files_in_split = image_files[offset :: len(splits)]

            split_file = split_folder / f"{split}.txt"
            with open(split_file, "w") as file:
                file.write(
                    "\n".join(
                        sorted(
                            str(image_file.relative_to(image_file.parents[1]).as_posix())
                            for image_file in image_files_in_split
                        )
                    )
                )

            cls._make_hidden_rogue_file(split_file)
            num_samples_map[split] = len(image_files_in_split)

        make_tar(root, split_folder.with_suffix(".tgz").name, compression="gz")

        return num_samples_map

    @classmethod
    def _make_anns(cls, root, image_files):
        from scipy.io import savemat

        anns_folder = root / "annotations-mat"
        for image_file in image_files:
            ann_file = anns_folder / image_file.with_suffix(".mat").relative_to(image_file.parents[1])
            ann_file.parent.mkdir(parents=True, exist_ok=True)
            savemat(
                ann_file,
                {
                    "seg": torch.randint(
                        256, make_tensor((2,), low=3, dtype=torch.int).tolist(), dtype=torch.uint8
                    ).numpy(),
                    "bbox": dict(
                        zip(("left", "top", "right", "bottom"), make_tensor((4,), dtype=torch.uint8).tolist())
                    ),
                },
            )

        readme_file = anns_folder / "README.txt"
        readme_file.touch()
        cls._make_hidden_rogue_file(readme_file)

        make_tar(root, "annotations.tgz", anns_folder, compression="gz")

    @classmethod
    def generate(cls, root):
        images_folder = root / "images"
        image_files = cls._make_images(images_folder)
        cls._make_hidden_rogue_file(*image_files)
        make_tar(root, images_folder.with_suffix(".tgz").name, compression="gz")

        num_samples_map = cls._make_splits(root, image_files)
        cls._make_anns(root, image_files)

        return num_samples_map


@register_mock(configs=combinations_grid(split=("train", "test"), year=("2010", "2011")))
def cub200(root, config):
    num_samples_map = (CUB2002011MockData if config["year"] == "2011" else CUB2002010MockData).generate(root)
    return num_samples_map[config["split"]]


@register_mock(configs=[dict()])
def eurosat(root, config):
    data_folder = root / "2750"
    data_folder.mkdir(parents=True)

    num_examples_per_class = 3
    categories = ["AnnualCrop", "Forest"]
    for category in categories:
        create_image_folder(
            root=data_folder,
            name=category,
            file_name_fn=lambda idx: f"{category}_{idx + 1}.jpg",
            num_examples=num_examples_per_class,
        )

    make_zip(root, "EuroSAT.zip", data_folder)
    return len(categories) * num_examples_per_class


@register_mock(configs=combinations_grid(split=("train", "test", "extra")))
def svhn(root, config):
    import scipy.io as sio

    num_samples = {
        "train": 2,
        "test": 3,
        "extra": 4,
    }[config["split"]]

    sio.savemat(
        root / f"{config['split']}_32x32.mat",
        {
            "X": np.random.randint(256, size=(32, 32, 3, num_samples), dtype=np.uint8),
            "y": np.random.randint(10, size=(num_samples,), dtype=np.uint8),
        },
    )
    return num_samples


@register_mock(configs=combinations_grid(split=("train", "val", "test")))
def pcam(root, config):
    import h5py

    num_images = {"train": 2, "test": 3, "val": 4}[config["split"]]

    split = "valid" if config["split"] == "val" else config["split"]

    images_io = io.BytesIO()
    with h5py.File(images_io, "w") as f:
        f["x"] = np.random.randint(0, 256, size=(num_images, 10, 10, 3), dtype=np.uint8)

    targets_io = io.BytesIO()
    with h5py.File(targets_io, "w") as f:
        f["y"] = np.random.randint(0, 2, size=(num_images, 1, 1, 1), dtype=np.uint8)

    # Create .gz compressed files
    images_file = root / f"camelyonpatch_level_2_split_{split}_x.h5.gz"
    targets_file = root / f"camelyonpatch_level_2_split_{split}_y.h5.gz"
    for compressed_file_name, uncompressed_file_io in ((images_file, images_io), (targets_file, targets_io)):
        compressed_data = gzip.compress(uncompressed_file_io.getbuffer())
        with open(compressed_file_name, "wb") as compressed_file:
            compressed_file.write(compressed_data)

    return num_images


@register_mock(name="stanford-cars", configs=combinations_grid(split=("train", "test")))
def stanford_cars(root, config):
    import scipy.io as io
    from numpy.core.records import fromarrays

    split = config["split"]
    num_samples = {"train": 5, "test": 7}[split]
    num_categories = 3

    if split == "train":
        images_folder_name = "cars_train"
        devkit = root / "devkit"
        devkit.mkdir()
        annotations_mat_path = devkit / "cars_train_annos.mat"
    else:
        images_folder_name = "cars_test"
        annotations_mat_path = root / "cars_test_annos_withlabels.mat"

    create_image_folder(
        root=root,
        name=images_folder_name,
        file_name_fn=lambda image_index: f"{image_index:5d}.jpg",
        num_examples=num_samples,
    )

    make_tar(root, f"cars_{split}.tgz", images_folder_name)
    bbox = np.random.randint(1, 200, num_samples, dtype=np.uint8)
    classes = np.random.randint(1, num_categories + 1, num_samples, dtype=np.uint8)
    fnames = [f"{i:5d}.jpg" for i in range(num_samples)]
    rec_array = fromarrays(
        [bbox, bbox, bbox, bbox, classes, fnames],
        names=["bbox_x1", "bbox_y1", "bbox_x2", "bbox_y2", "class", "fname"],
    )

    io.savemat(annotations_mat_path, {"annotations": rec_array})
    if split == "train":
        make_tar(root, "car_devkit.tgz", devkit, compression="gz")

    return num_samples


@register_mock(configs=combinations_grid(split=("train", "test")))
def usps(root, config):
    num_samples = {"train": 15, "test": 7}[config["split"]]
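    # The files are written in the libsvm format the USPS dataset uses: one sample per line,
    # "<label> <index>:<value> ...", with 1-based feature indices.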
    with bz2.open(root / f"usps{'.t' if config['split'] != 'train' else ''}.bz2", "wb") as fh:
        lines = []
        for _ in range(num_samples):
            label = make_tensor(1, low=1, high=11, dtype=torch.int)
            values = make_tensor(256, low=-1, high=1, dtype=torch.float)
            lines.append(
                " ".join([f"{int(label)}", *(f"{idx}:{float(value):.6f}" for idx, value in enumerate(values, 1))])
            )

        fh.write("\n".join(lines).encode())

    return num_samples