- import bz2
- import collections.abc
- import csv
- import functools
- import gzip
- import io
- import itertools
- import json
- import lzma
- import pathlib
- import pickle
- import random
- import shutil
- import unittest.mock
- import xml.etree.ElementTree as ET
- from collections import Counter, defaultdict
- import numpy as np
- import pytest
- import torch
- from common_utils import combinations_grid
- from datasets_utils import create_image_file, create_image_folder, make_tar, make_zip
- from torch.nn.functional import one_hot
- from torch.testing import make_tensor as _make_tensor
- from torchvision.prototype import datasets
- make_tensor = functools.partial(_make_tensor, device="cpu")
- make_scalar = functools.partial(make_tensor, ())
- __all__ = ["DATASET_MOCKS", "parametrize_dataset_mocks"]
- class DatasetMock:
- def __init__(self, name, *, mock_data_fn, configs):
- # FIXME: error handling for unknown names
- self.name = name
- self.mock_data_fn = mock_data_fn
- self.configs = configs
- def _parse_mock_info(self, mock_info):
- if mock_info is None:
- raise pytest.UsageError(
- f"The mock data function for dataset '{self.name}' returned nothing. It needs to at least return an "
- f"integer indicating the number of samples for the current `config`."
- )
- elif isinstance(mock_info, int):
- mock_info = dict(num_samples=mock_info)
- elif not isinstance(mock_info, dict):
- raise pytest.UsageError(
- f"The mock data function for dataset '{self.name}' returned a {type(mock_info)}. The returned object "
- f"should be a dictionary containing at least the number of samples for the key `'num_samples'`. If no "
- f"additional information is required for specific tests, the number of samples can also be returned as "
- f"an integer."
- )
- elif "num_samples" not in mock_info:
- raise pytest.UsageError(
- f"The dictionary returned by the mock data function for dataset '{self.name}' has to contain a "
- f"`'num_samples'` entry indicating the number of samples."
- )
- return mock_info
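- # In short, a mock data function may return e.g. `5` or `dict(num_samples=5)` (optionally with extra entries for
- # individual tests to inspect); both forms are normalized to a dict here.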
- def load(self, config):
- # `datasets.home()` is patched to a temporary directory through the autouse fixture `test_home` in
- # test/test_prototype_builtin_datasets.py
- root = pathlib.Path(datasets.home()) / self.name
- # We cannot place the mock data upfront in `root`. Loading a dataset calls `OnlineResource.load`. In turn,
- # this will only download **and** preprocess if the file is not present. In other words, if we already place
- # the file in `root` before the resource is loaded, we are effectively skipping the preprocessing.
- # To avoid that, we first place the mock data in a temporary directory and patch the download logic to move it to
- # `root` only when it is requested.
- tmp_mock_data_folder = root / "__mock__"
- tmp_mock_data_folder.mkdir(parents=True)
- mock_info = self._parse_mock_info(self.mock_data_fn(tmp_mock_data_folder, config))
- def patched_download(resource, root, **kwargs):
- src = tmp_mock_data_folder / resource.file_name
- if not src.exists():
- raise pytest.UsageError(
- f"Dataset '{self.name}' requires the file {resource.file_name} for {config}"
- f" but it was not created by the mock data function."
- )
- dst = root / resource.file_name
- shutil.move(str(src), str(root))
- return dst
- with unittest.mock.patch(
- "torchvision.prototype.datasets.utils._resource.OnlineResource.download", new=patched_download
- ):
- dataset = datasets.load(self.name, **config)
- extra_files = list(tmp_mock_data_folder.glob("**/*"))
- if extra_files:
- raise pytest.UsageError(
- (
- f"Dataset '{self.name}' created the following files for {config} in the mock data function, "
- f"but they were not loaded:\n\n"
- )
- + "\n".join(str(file.relative_to(tmp_mock_data_folder)) for file in extra_files)
- )
- tmp_mock_data_folder.rmdir()
- return dataset, mock_info
- def config_id(name, config):
- parts = [name]
- for name, value in config.items():
- if isinstance(value, bool):
- part = ("" if value else "no_") + name
- else:
- part = str(value)
- parts.append(part)
- return "-".join(parts)
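- # For illustration: config_id("gtsrb", dict(split="train")) evaluates to "gtsrb-train", and a boolean config value
- # such as dict(download=False) would contribute a "no_download" part (`download` is just an example key).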
- def parametrize_dataset_mocks(*dataset_mocks, marks=None):
- mocks = {}
- for mock in dataset_mocks:
- if isinstance(mock, DatasetMock):
- mocks[mock.name] = mock
- elif isinstance(mock, collections.abc.Mapping):
- mocks.update(mock)
- else:
- raise pytest.UsageError(
- f"The positional arguments passed to `parametrize_dataset_mocks` can either be a `DatasetMock`, "
- f"a sequence of `DatasetMock`'s, or a mapping of names to `DatasetMock`'s, "
- f"but got {mock} instead."
- )
- dataset_mocks = mocks
- if marks is None:
- marks = {}
- elif not isinstance(marks, collections.abc.Mapping):
- raise pytest.UsageError(
- f"The 'marks' keyword argument must be a mapping of dataset names to pytest marks, but got {marks} instead."
- )
- return pytest.mark.parametrize(
- ("dataset_mock", "config"),
- [
- pytest.param(dataset_mock, config, id=config_id(name, config), marks=marks.get(name, ()))
- for name, dataset_mock in dataset_mocks.items()
- for config in dataset_mock.configs
- ],
- )
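- # Usage sketch (hypothetical test, not part of this module): the decorator parametrizes a test over every given
- # mock and all of its configs, e.g.
- #
- #   @parametrize_dataset_mocks(DATASET_MOCKS, marks={"imagenet": pytest.mark.xfail})
- #   def test_smoke(dataset_mock, config):
- #       dataset, mock_info = dataset_mock.load(config)
- #       assert len(list(dataset)) == mock_info["num_samples"]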
- DATASET_MOCKS = {}
- def register_mock(name=None, *, configs):
- def wrapper(mock_data_fn):
- nonlocal name
- if name is None:
- name = mock_data_fn.__name__
- DATASET_MOCKS[name] = DatasetMock(name, mock_data_fn=mock_data_fn, configs=configs)
- return mock_data_fn
- return wrapper
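- # Usage sketch (hypothetical dataset name, for illustration only): a mock data function receives the folder to
- # populate and the current config, creates the files the dataset expects there, and returns the number of samples
- # (or a dict with at least a "num_samples" entry). Unless `name=` is passed, the function name is used as the
- # dataset name.
- #
- #   @register_mock(configs=combinations_grid(split=("train", "test")))
- #   def my_dataset(root, config):
- #       # create archives / files for `config` under `root` here
- #       return 3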
- class MNISTMockData:
- _DTYPES_ID = {
- torch.uint8: 8,
- torch.int8: 9,
- torch.int16: 11,
- torch.int32: 12,
- torch.float32: 13,
- torch.float64: 14,
- }
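- # The helpers below write files in the IDX format used by MNIST-style datasets: a big-endian int32 magic number
- # encoding the dtype id and the number of dimensions (including the sample dimension, hence the `+ 1` in `_magic`),
- # followed by one big-endian int32 per dimension size and finally the raw data in big-endian byte order.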
- @classmethod
- def _magic(cls, dtype, ndim):
- return cls._DTYPES_ID[dtype] * 256 + ndim + 1
- @staticmethod
- def _encode(t):
- return torch.tensor(t, dtype=torch.int32).numpy().tobytes()[::-1]
- @staticmethod
- def _big_endian_dtype(dtype):
- np_dtype = getattr(np, str(dtype).replace("torch.", ""))().dtype
- return np.dtype(f">{np_dtype.kind}{np_dtype.itemsize}")
- @classmethod
- def _create_binary_file(cls, root, filename, *, num_samples, shape, dtype, compressor, low=0, high):
- with compressor(root / filename, "wb") as fh:
- for meta in (cls._magic(dtype, len(shape)), num_samples, *shape):
- fh.write(cls._encode(meta))
- data = make_tensor((num_samples, *shape), dtype=dtype, low=low, high=high)
- fh.write(data.numpy().astype(cls._big_endian_dtype(dtype)).tobytes())
- @classmethod
- def generate(
- cls,
- root,
- *,
- num_categories,
- num_samples=None,
- images_file,
- labels_file,
- image_size=(28, 28),
- image_dtype=torch.uint8,
- label_size=(),
- label_dtype=torch.uint8,
- compressor=None,
- ):
- if num_samples is None:
- num_samples = num_categories
- if compressor is None:
- compressor = gzip.open
- cls._create_binary_file(
- root,
- images_file,
- num_samples=num_samples,
- shape=image_size,
- dtype=image_dtype,
- compressor=compressor,
- high=float("inf"),
- )
- cls._create_binary_file(
- root,
- labels_file,
- num_samples=num_samples,
- shape=label_size,
- dtype=label_dtype,
- compressor=compressor,
- high=num_categories,
- )
- return num_samples
- def mnist(root, config):
- prefix = "train" if config["split"] == "train" else "t10k"
- return MNISTMockData.generate(
- root,
- num_categories=10,
- images_file=f"{prefix}-images-idx3-ubyte.gz",
- labels_file=f"{prefix}-labels-idx1-ubyte.gz",
- )
- DATASET_MOCKS.update(
- {
- name: DatasetMock(name, mock_data_fn=mnist, configs=combinations_grid(split=("train", "test")))
- for name in ["mnist", "fashionmnist", "kmnist"]
- }
- )
- @register_mock(
- configs=combinations_grid(
- split=("train", "test"),
- image_set=("Balanced", "By_Merge", "By_Class", "Letters", "Digits", "MNIST"),
- )
- )
- def emnist(root, config):
- num_samples_map = {}
- file_names = set()
- for split, image_set in itertools.product(
- ("train", "test"),
- ("Balanced", "By_Merge", "By_Class", "Letters", "Digits", "MNIST"),
- ):
- prefix = f"emnist-{image_set.replace('_', '').lower()}-{split}"
- images_file = f"{prefix}-images-idx3-ubyte.gz"
- labels_file = f"{prefix}-labels-idx1-ubyte.gz"
- file_names.update({images_file, labels_file})
- num_samples_map[(split, image_set)] = MNISTMockData.generate(
- root,
- # The image sets that merge some lowercase letters into their respective uppercase variants still use dense
- # labels in the data files. Thus, num_categories != len(categories) there.
- num_categories=47 if config["image_set"] in ("Balanced", "By_Merge") else 62,
- images_file=images_file,
- labels_file=labels_file,
- )
- make_zip(root, "emnist-gzip.zip", *file_names)
- return num_samples_map[(config["split"], config["image_set"])]
- @register_mock(configs=combinations_grid(split=("train", "test", "test10k", "test50k", "nist")))
- def qmnist(root, config):
- num_categories = 10
- if config["split"] == "train":
- num_samples = num_samples_gen = num_categories + 2
- prefix = "qmnist-train"
- suffix = ".gz"
- compressor = gzip.open
- elif config["split"].startswith("test"):
- # The split 'test50k' is defined as the last 50k images beginning at index 10000. Thus, we need to create
- # more than 10000 images for the dataset to not be empty.
- num_samples_gen = 10001
- num_samples = {
- "test": num_samples_gen,
- "test10k": min(num_samples_gen, 10_000),
- "test50k": num_samples_gen - 10_000,
- }[config["split"]]
- prefix = "qmnist-test"
- suffix = ".gz"
- compressor = gzip.open
- else: # config["split"] == "nist"
- num_samples = num_samples_gen = num_categories + 3
- prefix = "xnist"
- suffix = ".xz"
- compressor = lzma.open
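- # QMNIST label files are N x 8 int32 matrices (the class label plus NIST metadata columns), hence the
- # `-labels-idx2-int` file name and the `label_size=(8,)` / `label_dtype=torch.int32` arguments below.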
- MNISTMockData.generate(
- root,
- num_categories=num_categories,
- num_samples=num_samples_gen,
- images_file=f"{prefix}-images-idx3-ubyte{suffix}",
- labels_file=f"{prefix}-labels-idx2-int{suffix}",
- label_size=(8,),
- label_dtype=torch.int32,
- compressor=compressor,
- )
- return num_samples
- class CIFARMockData:
- NUM_PIXELS = 32 * 32 * 3
- @classmethod
- def _create_batch_file(cls, root, name, *, num_categories, labels_key, num_samples=1):
- content = {
- "data": make_tensor((num_samples, cls.NUM_PIXELS), dtype=torch.uint8).numpy(),
- labels_key: torch.randint(0, num_categories, size=(num_samples,)).tolist(),
- }
- with open(pathlib.Path(root) / name, "wb") as fh:
- pickle.dump(content, fh)
- @classmethod
- def generate(
- cls,
- root,
- name,
- *,
- folder,
- train_files,
- test_files,
- num_categories,
- labels_key,
- ):
- folder = root / folder
- folder.mkdir()
- files = (*train_files, *test_files)
- for file in files:
- cls._create_batch_file(
- folder,
- file,
- num_categories=num_categories,
- labels_key=labels_key,
- )
- make_tar(root, name, folder, compression="gz")
- @register_mock(configs=combinations_grid(split=("train", "test")))
- def cifar10(root, config):
- train_files = [f"data_batch_{idx}" for idx in range(1, 6)]
- test_files = ["test_batch"]
- CIFARMockData.generate(
- root=root,
- name="cifar-10-python.tar.gz",
- folder=pathlib.Path("cifar-10-batches-py"),
- train_files=train_files,
- test_files=test_files,
- num_categories=10,
- labels_key="labels",
- )
- return len(train_files if config["split"] == "train" else test_files)
- @register_mock(configs=combinations_grid(split=("train", "test")))
- def cifar100(root, config):
- train_files = ["train"]
- test_files = ["test"]
- CIFARMockData.generate(
- root=root,
- name="cifar-100-python.tar.gz",
- folder=pathlib.Path("cifar-100-python"),
- train_files=train_files,
- test_files=test_files,
- num_categories=100,
- labels_key="fine_labels",
- )
- return len(train_files if config["split"] == "train" else test_files)
- @register_mock(configs=[dict()])
- def caltech101(root, config):
- def create_ann_file(root, name):
- import scipy.io
- box_coord = make_tensor((1, 4), dtype=torch.int32, low=0).numpy().astype(np.uint16)
- obj_contour = make_tensor((2, int(torch.randint(3, 6, size=()))), dtype=torch.float64, low=0).numpy()
- scipy.io.savemat(str(pathlib.Path(root) / name), dict(box_coord=box_coord, obj_contour=obj_contour))
- def create_ann_folder(root, name, file_name_fn, num_examples):
- root = pathlib.Path(root) / name
- root.mkdir(parents=True)
- for idx in range(num_examples):
- create_ann_file(root, file_name_fn(idx))
- images_root = root / "101_ObjectCategories"
- anns_root = root / "Annotations"
- image_category_map = {
- "Faces": "Faces_2",
- "Faces_easy": "Faces_3",
- "Motorbikes": "Motorbikes_16",
- "airplanes": "Airplanes_Side_2",
- }
- categories = ["Faces", "Faces_easy", "Motorbikes", "airplanes", "yin_yang"]
- num_images_per_category = 2
- for category in categories:
- create_image_folder(
- root=images_root,
- name=category,
- file_name_fn=lambda idx: f"image_{idx + 1:04d}.jpg",
- num_examples=num_images_per_category,
- )
- create_ann_folder(
- root=anns_root,
- name=image_category_map.get(category, category),
- file_name_fn=lambda idx: f"annotation_{idx + 1:04d}.mat",
- num_examples=num_images_per_category,
- )
- (images_root / "BACKGROUND_Google").mkdir()
- make_tar(root, f"{images_root.name}.tar.gz", images_root, compression="gz")
- make_tar(root, f"{anns_root.name}.tar", anns_root)
- return num_images_per_category * len(categories)
- @register_mock(configs=[dict()])
- def caltech256(root, config):
- dir = root / "256_ObjectCategories"
- num_images_per_category = 2
- categories = [
- (1, "ak47"),
- (127, "laptop-101"),
- (198, "spider"),
- (257, "clutter"),
- ]
- for category_idx, category in categories:
- files = create_image_folder(
- dir,
- name=f"{category_idx:03d}.{category}",
- file_name_fn=lambda image_idx: f"{category_idx:03d}_{image_idx + 1:04d}.jpg",
- num_examples=num_images_per_category,
- )
- if category == "spider":
- open(files[0].parent / "RENAME2", "w").close()
- make_tar(root, f"{dir.name}.tar", dir)
- return num_images_per_category * len(categories)
- @register_mock(configs=combinations_grid(split=("train", "val", "test")))
- def imagenet(root, config):
- from scipy.io import savemat
- info = datasets.info("imagenet")
- if config["split"] == "train":
- num_samples = len(info["wnids"])
- archive_name = "ILSVRC2012_img_train.tar"
- files = []
- for wnid in info["wnids"]:
- create_image_folder(
- root=root,
- name=wnid,
- file_name_fn=lambda image_idx: f"{wnid}_{image_idx:04d}.JPEG",
- num_examples=1,
- )
- files.append(make_tar(root, f"{wnid}.tar"))
- elif config["split"] == "val":
- num_samples = 3
- archive_name = "ILSVRC2012_img_val.tar"
- files = [create_image_file(root, f"ILSVRC2012_val_{idx + 1:08d}.JPEG") for idx in range(num_samples)]
- devkit_root = root / "ILSVRC2012_devkit_t12"
- data_root = devkit_root / "data"
- data_root.mkdir(parents=True)
- with open(data_root / "ILSVRC2012_validation_ground_truth.txt", "w") as file:
- for label in torch.randint(0, len(info["wnids"]), (num_samples,)).tolist():
- file.write(f"{label}\n")
- num_children = 0
- synsets = [
- (idx, wnid, category, "", num_children, [], 0, 0)
- for idx, (category, wnid) in enumerate(zip(info["categories"], info["wnids"]), 1)
- ]
- num_children = 1
- synsets.extend((0, "", "", "", num_children, [], 0, 0) for _ in range(5))
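- # The five extra entries with num_children=1 stand in for the non-leaf synsets of the real devkit meta.mat; only
- # the leaf entries (num_children == 0) correspond to the 1000 ILSVRC classes.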
- synsets = np.array(
- synsets,
- dtype=np.dtype(
- [
- ("ILSVRC2012_ID", "O"),
- ("WNID", "O"),
- ("words", "O"),
- ("gloss", "O"),
- ("num_children", "O"),
- ("children", "O"),
- ("wordnet_height", "O"),
- ("num_train_images", "O"),
- ]
- ),
- )
- savemat(data_root / "meta.mat", dict(synsets=synsets))
- make_tar(root, devkit_root.with_suffix(".tar.gz").name, compression="gz")
- else: # config["split"] == "test"
- num_samples = 5
- archive_name = "ILSVRC2012_img_test_v10102019.tar"
- files = [create_image_file(root, f"ILSVRC2012_test_{idx + 1:08d}.JPEG") for idx in range(num_samples)]
- make_tar(root, archive_name, *files)
- return num_samples
- class CocoMockData:
- @classmethod
- def _make_annotations_json(
- cls,
- root,
- name,
- *,
- images_meta,
- fn,
- ):
- num_anns_per_image = torch.randint(1, 5, (len(images_meta),))
- num_anns_total = int(num_anns_per_image.sum())
- ann_ids_iter = iter(torch.arange(num_anns_total)[torch.randperm(num_anns_total)])
- anns_meta = []
- for image_meta, num_anns in zip(images_meta, num_anns_per_image):
- for _ in range(num_anns):
- ann_id = int(next(ann_ids_iter))
- anns_meta.append(dict(fn(ann_id, image_meta), id=ann_id, image_id=image_meta["id"]))
- anns_meta.sort(key=lambda ann: ann["id"])
- with open(root / name, "w") as file:
- json.dump(dict(images=images_meta, annotations=anns_meta), file)
- return num_anns_per_image
- @staticmethod
- def _make_instances_data(ann_id, image_meta):
- def make_rle_segmentation():
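- # Build an uncompressed COCO-style RLE mask: `counts` holds alternating run lengths that must sum exactly to
- # height * width; the last run is clipped below so that the sums match.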
- height, width = image_meta["height"], image_meta["width"]
- numel = height * width
- counts = []
- while sum(counts) <= numel:
- counts.append(int(torch.randint(5, 8, ())))
- if sum(counts) > numel:
- counts[-1] -= sum(counts) - numel
- return dict(counts=counts, size=[height, width])
- return dict(
- segmentation=make_rle_segmentation(),
- bbox=make_tensor((4,), dtype=torch.float32, low=0).tolist(),
- iscrowd=True,
- area=float(make_scalar(dtype=torch.float32)),
- category_id=int(make_scalar(dtype=torch.int64)),
- )
- @staticmethod
- def _make_captions_data(ann_id, image_meta):
- return dict(caption=f"Caption {ann_id} describing image {image_meta['id']}.")
- @classmethod
- def _make_annotations(cls, root, name, *, images_meta):
- num_anns_per_image = torch.zeros((len(images_meta),), dtype=torch.int64)
- for annotations, fn in (
- ("instances", cls._make_instances_data),
- ("captions", cls._make_captions_data),
- ):
- num_anns_per_image += cls._make_annotations_json(
- root, f"{annotations}_{name}.json", images_meta=images_meta, fn=fn
- )
- return int(num_anns_per_image.sum())
- @classmethod
- def generate(
- cls,
- root,
- *,
- split,
- year,
- num_samples,
- ):
- annotations_dir = root / "annotations"
- annotations_dir.mkdir()
- for split_ in ("train", "val"):
- config_name = f"{split_}{year}"
- images_meta = [
- dict(
- file_name=f"{idx:012d}.jpg",
- id=idx,
- width=width,
- height=height,
- )
- for idx, (height, width) in enumerate(
- torch.randint(3, 11, size=(num_samples, 2), dtype=torch.int).tolist()
- )
- ]
- if split_ == split:
- create_image_folder(
- root,
- config_name,
- file_name_fn=lambda idx: images_meta[idx]["file_name"],
- num_examples=num_samples,
- size=lambda idx: (3, images_meta[idx]["height"], images_meta[idx]["width"]),
- )
- make_zip(root, f"{config_name}.zip")
- cls._make_annotations(
- annotations_dir,
- config_name,
- images_meta=images_meta,
- )
- make_zip(root, f"annotations_trainval{year}.zip", annotations_dir)
- return num_samples
- @register_mock(
- configs=combinations_grid(
- split=("train", "val"),
- year=("2017", "2014"),
- annotations=("instances", "captions", None),
- )
- )
- def coco(root, config):
- return CocoMockData.generate(root, split=config["split"], year=config["year"], num_samples=5)
- class SBDMockData:
- _NUM_CATEGORIES = 20
- @classmethod
- def _make_split_files(cls, root_map, *, split):
- splits_and_idcs = [
- ("train", [0, 1, 2]),
- ("val", [3]),
- ]
- if split == "train_noval":
- splits_and_idcs.append(("train_noval", [0, 2]))
- ids_map = {split: [f"2008_{idx:06d}" for idx in idcs] for split, idcs in splits_and_idcs}
- for split, ids in ids_map.items():
- with open(root_map[split] / f"{split}.txt", "w") as fh:
- fh.writelines(f"{id}\n" for id in ids)
- return sorted(set(itertools.chain(*ids_map.values()))), {split: len(ids) for split, ids in ids_map.items()}
- @classmethod
- def _make_anns_folder(cls, root, name, ids):
- from scipy.io import savemat
- anns_folder = root / name
- anns_folder.mkdir()
- sizes = torch.randint(1, 9, size=(len(ids), 2)).tolist()
- for id, size in zip(ids, sizes):
- savemat(
- anns_folder / f"{id}.mat",
- {
- "GTcls": {
- "Boundaries": cls._make_boundaries(size),
- "Segmentation": cls._make_segmentation(size),
- }
- },
- )
- return sizes
- @classmethod
- def _make_boundaries(cls, size):
- from scipy.sparse import csc_matrix
- return [
- [csc_matrix(torch.randint(0, 2, size=size, dtype=torch.uint8).numpy())] for _ in range(cls._NUM_CATEGORIES)
- ]
- @classmethod
- def _make_segmentation(cls, size):
- return torch.randint(0, cls._NUM_CATEGORIES + 1, size=size, dtype=torch.uint8).numpy()
- @classmethod
- def generate(cls, root, *, split):
- archive_folder = root / "benchmark_RELEASE"
- dataset_folder = archive_folder / "dataset"
- dataset_folder.mkdir(parents=True, exist_ok=True)
- ids, num_samples_map = cls._make_split_files(
- defaultdict(lambda: dataset_folder, {"train_noval": root}), split=split
- )
- sizes = cls._make_anns_folder(dataset_folder, "cls", ids)
- create_image_folder(
- dataset_folder, "img", lambda idx: f"{ids[idx]}.jpg", num_examples=len(ids), size=lambda idx: sizes[idx]
- )
- make_tar(root, "benchmark.tgz", archive_folder, compression="gz")
- return num_samples_map[split]
- @register_mock(configs=combinations_grid(split=("train", "val", "train_noval")))
- def sbd(root, config):
- return SBDMockData.generate(root, split=config["split"])
- @register_mock(configs=[dict()])
- def semeion(root, config):
- num_samples = 3
- num_categories = 10
- images = torch.rand(num_samples, 256)
- labels = one_hot(torch.randint(num_categories, size=(num_samples,)), num_classes=num_categories)
- with open(root / "semeion.data", "w") as fh:
- for image, one_hot_label in zip(images, labels):
- image_columns = " ".join([f"{pixel.item():.4f}" for pixel in image])
- labels_columns = " ".join([str(label.item()) for label in one_hot_label])
- fh.write(f"{image_columns} {labels_columns} \n")
- return num_samples
- class VOCMockData:
- _TRAIN_VAL_FILE_NAMES = {
- "2007": "VOCtrainval_06-Nov-2007.tar",
- "2008": "VOCtrainval_14-Jul-2008.tar",
- "2009": "VOCtrainval_11-May-2009.tar",
- "2010": "VOCtrainval_03-May-2010.tar",
- "2011": "VOCtrainval_25-May-2011.tar",
- "2012": "VOCtrainval_11-May-2012.tar",
- }
- _TEST_FILE_NAMES = {
- "2007": "VOCtest_06-Nov-2007.tar",
- }
- @classmethod
- def _make_split_files(cls, root, *, year, trainval):
- split_folder = root / "ImageSets"
- if trainval:
- idcs_map = {
- "train": [0, 1, 2],
- "val": [3, 4],
- }
- idcs_map["trainval"] = [*idcs_map["train"], *idcs_map["val"]]
- else:
- idcs_map = {
- "test": [5],
- }
- ids_map = {split: [f"{year}_{idx:06d}" for idx in idcs] for split, idcs in idcs_map.items()}
- for task_sub_folder in ("Main", "Segmentation"):
- task_folder = split_folder / task_sub_folder
- task_folder.mkdir(parents=True, exist_ok=True)
- for split, ids in ids_map.items():
- with open(task_folder / f"{split}.txt", "w") as fh:
- fh.writelines(f"{id}\n" for id in ids)
- return sorted(set(itertools.chain(*ids_map.values()))), {split: len(ids) for split, ids in ids_map.items()}
- @classmethod
- def _make_detection_anns_folder(cls, root, name, *, file_name_fn, num_examples):
- folder = root / name
- folder.mkdir(parents=True, exist_ok=True)
- for idx in range(num_examples):
- cls._make_detection_ann_file(folder, file_name_fn(idx))
- @classmethod
- def _make_detection_ann_file(cls, root, name):
- def add_child(parent, name, text=None):
- child = ET.SubElement(parent, name)
- child.text = str(text)
- return child
- def add_name(obj, name="dog"):
- add_child(obj, "name", name)
- def add_size(obj):
- obj = add_child(obj, "size")
- size = {"width": 0, "height": 0, "depth": 3}
- for name, text in size.items():
- add_child(obj, name, text)
- def add_bndbox(obj):
- obj = add_child(obj, "bndbox")
- bndbox = {"xmin": 1, "xmax": 2, "ymin": 3, "ymax": 4}
- for name, text in bndbox.items():
- add_child(obj, name, text)
- annotation = ET.Element("annotation")
- add_size(annotation)
- obj = add_child(annotation, "object")
- add_name(obj)
- add_bndbox(obj)
- with open(root / name, "wb") as fh:
- fh.write(ET.tostring(annotation))
- @classmethod
- def generate(cls, root, *, year, trainval):
- archive_folder = root
- if year == "2011":
- archive_folder = root / "TrainVal"
- data_folder = archive_folder / "VOCdevkit"
- else:
- archive_folder = data_folder = root / "VOCdevkit"
- data_folder = data_folder / f"VOC{year}"
- data_folder.mkdir(parents=True, exist_ok=True)
- ids, num_samples_map = cls._make_split_files(data_folder, year=year, trainval=trainval)
- for make_folder_fn, name, suffix in [
- (create_image_folder, "JPEGImages", ".jpg"),
- (create_image_folder, "SegmentationClass", ".png"),
- (cls._make_detection_anns_folder, "Annotations", ".xml"),
- ]:
- make_folder_fn(data_folder, name, file_name_fn=lambda idx: ids[idx] + suffix, num_examples=len(ids))
- make_tar(root, (cls._TRAIN_VAL_FILE_NAMES if trainval else cls._TEST_FILE_NAMES)[year], archive_folder)
- return num_samples_map
- @register_mock(
- configs=[
- *combinations_grid(
- split=("train", "val", "trainval"),
- year=("2007", "2008", "2009", "2010", "2011", "2012"),
- task=("detection", "segmentation"),
- ),
- *combinations_grid(
- split=("test",),
- year=("2007",),
- task=("detection", "segmentation"),
- ),
- ],
- )
- def voc(root, config):
- trainval = config["split"] != "test"
- return VOCMockData.generate(root, year=config["year"], trainval=trainval)[config["split"]]
- class CelebAMockData:
- @classmethod
- def _make_ann_file(cls, root, name, data, *, field_names=None):
- with open(root / name, "w") as file:
- if field_names:
- file.write(f"{len(data)}\r\n")
- file.write(" ".join(field_names) + "\r\n")
- file.writelines(" ".join(str(item) for item in row) + "\r\n" for row in data)
- _SPLIT_TO_IDX = {
- "train": 0,
- "val": 1,
- "test": 2,
- }
- @classmethod
- def _make_split_file(cls, root):
- num_samples_map = {"train": 4, "val": 3, "test": 2}
- data = [
- (f"{idx:06d}.jpg", cls._SPLIT_TO_IDX[split])
- for split, num_samples in num_samples_map.items()
- for idx in range(num_samples)
- ]
- cls._make_ann_file(root, "list_eval_partition.txt", data)
- image_file_names, _ = zip(*data)
- return image_file_names, num_samples_map
- @classmethod
- def _make_identity_file(cls, root, image_file_names):
- cls._make_ann_file(
- root, "identity_CelebA.txt", [(name, int(make_scalar(low=1, dtype=torch.int))) for name in image_file_names]
- )
- @classmethod
- def _make_attributes_file(cls, root, image_file_names):
- field_names = ("5_o_Clock_Shadow", "Young")
- data = [
- [name, *[" 1" if attr else "-1" for attr in make_tensor((len(field_names),), dtype=torch.bool)]]
- for name in image_file_names
- ]
- cls._make_ann_file(root, "list_attr_celeba.txt", data, field_names=(*field_names, ""))
- @classmethod
- def _make_bounding_boxes_file(cls, root, image_file_names):
- field_names = ("image_id", "x_1", "y_1", "width", "height")
- data = [
- [f"{name} ", *[f"{coord:3d}" for coord in make_tensor((4,), low=0, dtype=torch.int).tolist()]]
- for name in image_file_names
- ]
- cls._make_ann_file(root, "list_bbox_celeba.txt", data, field_names=field_names)
- @classmethod
- def _make_landmarks_file(cls, root, image_file_names):
- field_names = ("lefteye_x", "lefteye_y", "rightmouth_x", "rightmouth_y")
- data = [
- [
- name,
- *[
- f"{coord:4d}" if idx else coord
- for idx, coord in enumerate(make_tensor((len(field_names),), low=0, dtype=torch.int).tolist())
- ],
- ]
- for name in image_file_names
- ]
- cls._make_ann_file(root, "list_landmarks_align_celeba.txt", data, field_names=field_names)
- @classmethod
- def generate(cls, root):
- image_file_names, num_samples_map = cls._make_split_file(root)
- image_files = create_image_folder(
- root, "img_align_celeba", file_name_fn=lambda idx: image_file_names[idx], num_examples=len(image_file_names)
- )
- make_zip(root, image_files[0].parent.with_suffix(".zip").name)
- for make_ann_file_fn in (
- cls._make_identity_file,
- cls._make_attributes_file,
- cls._make_bounding_boxes_file,
- cls._make_landmarks_file,
- ):
- make_ann_file_fn(root, image_file_names)
- return num_samples_map
- @register_mock(configs=combinations_grid(split=("train", "val", "test")))
- def celeba(root, config):
- return CelebAMockData.generate(root)[config["split"]]
- @register_mock(configs=combinations_grid(split=("train", "val", "test")))
- def country211(root, config):
- split_folder = pathlib.Path(root, "country211", "valid" if config["split"] == "val" else config["split"])
- split_folder.mkdir(parents=True, exist_ok=True)
- num_examples = {
- "train": 3,
- "val": 4,
- "test": 5,
- }[config["split"]]
- classes = ("AD", "BS", "GR")
- for cls in classes:
- create_image_folder(
- split_folder,
- name=cls,
- file_name_fn=lambda idx: f"{idx}.jpg",
- num_examples=num_examples,
- )
- make_tar(root, f"{split_folder.parent.name}.tgz", split_folder.parent, compression="gz")
- return num_examples * len(classes)
- @register_mock(configs=combinations_grid(split=("train", "test")))
- def food101(root, config):
- data_folder = root / "food-101"
- num_images_per_class = 3
- image_folder = data_folder / "images"
- categories = ["apple_pie", "baby_back_ribs", "waffles"]
- image_ids = []
- for category in categories:
- image_files = create_image_folder(
- image_folder,
- category,
- file_name_fn=lambda idx: f"{idx:04d}.jpg",
- num_examples=num_images_per_class,
- )
- image_ids.extend(path.relative_to(path.parents[1]).with_suffix("").as_posix() for path in image_files)
- meta_folder = data_folder / "meta"
- meta_folder.mkdir()
- with open(meta_folder / "classes.txt", "w") as file:
- for category in categories:
- file.write(f"{category}\n")
- splits = ["train", "test"]
- num_samples_map = {}
- for offset, split in enumerate(splits):
- image_ids_in_split = image_ids[offset :: len(splits)]
- num_samples_map[split] = len(image_ids_in_split)
- with open(meta_folder / f"{split}.txt", "w") as file:
- for image_id in image_ids_in_split:
- file.write(f"{image_id}\n")
- make_tar(root, f"{data_folder.name}.tar.gz", compression="gz")
- return num_samples_map[config["split"]]
- @register_mock(configs=combinations_grid(split=("train", "val", "test"), fold=(1, 4, 10)))
- def dtd(root, config):
- data_folder = root / "dtd"
- num_images_per_class = 3
- image_folder = data_folder / "images"
- categories = {"banded", "marbled", "zigzagged"}
- image_ids_per_category = {
- category: [
- str(path.relative_to(path.parents[1]).as_posix())
- for path in create_image_folder(
- image_folder,
- category,
- file_name_fn=lambda idx: f"{category}_{idx:04d}.jpg",
- num_examples=num_images_per_class,
- )
- ]
- for category in categories
- }
- meta_folder = data_folder / "labels"
- meta_folder.mkdir()
- with open(meta_folder / "labels_joint_anno.txt", "w") as file:
- for cls, image_ids in image_ids_per_category.items():
- for image_id in image_ids:
- joint_categories = random.choices(
- list(categories - {cls}), k=int(torch.randint(len(categories) - 1, ()))
- )
- file.write(" ".join([image_id, *sorted([cls, *joint_categories])]) + "\n")
- image_ids = list(itertools.chain(*image_ids_per_category.values()))
- splits = ("train", "val", "test")
- num_samples_map = {}
- for fold in range(1, 11):
- random.shuffle(image_ids)
- for offset, split in enumerate(splits):
- image_ids_in_config = image_ids[offset :: len(splits)]
- with open(meta_folder / f"{split}{fold}.txt", "w") as file:
- file.write("\n".join(image_ids_in_config) + "\n")
- num_samples_map[(split, fold)] = len(image_ids_in_config)
- make_tar(root, "dtd-r1.0.1.tar.gz", data_folder, compression="gz")
- return num_samples_map[config["split"], config["fold"]]
- @register_mock(configs=combinations_grid(split=("train", "test")))
- def fer2013(root, config):
- split = config["split"]
- num_samples = 5 if split == "train" else 3
- path = root / f"{split}.csv"
- with open(path, "w", newline="") as file:
- field_names = ["emotion"] if split == "train" else []
- field_names.append("pixels")
- file.write(",".join(field_names) + "\n")
- writer = csv.DictWriter(file, fieldnames=field_names, quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
- for _ in range(num_samples):
- rowdict = {
- "pixels": " ".join([str(int(pixel)) for pixel in torch.randint(256, (48 * 48,), dtype=torch.uint8)])
- }
- if split == "train":
- rowdict["emotion"] = int(torch.randint(7, ()))
- writer.writerow(rowdict)
- make_zip(root, f"{path.name}.zip", path)
- return num_samples
- @register_mock(configs=combinations_grid(split=("train", "test")))
- def gtsrb(root, config):
- num_examples_per_class = 5 if config["split"] == "train" else 3
- classes = ("00000", "00042", "00012")
- num_examples = num_examples_per_class * len(classes)
- csv_columns = ["Filename", "Width", "Height", "Roi.X1", "Roi.Y1", "Roi.X2", "Roi.Y2", "ClassId"]
- def _make_ann_file(path, num_examples, class_idx):
- if class_idx == "random":
- class_idx = torch.randint(1, len(classes) + 1, size=(1,)).item()
- with open(path, "w") as csv_file:
- writer = csv.DictWriter(csv_file, fieldnames=csv_columns, delimiter=";")
- writer.writeheader()
- for image_idx in range(num_examples):
- writer.writerow(
- {
- "Filename": f"{image_idx:05d}.ppm",
- "Width": torch.randint(1, 100, size=()).item(),
- "Height": torch.randint(1, 100, size=()).item(),
- "Roi.X1": torch.randint(1, 100, size=()).item(),
- "Roi.Y1": torch.randint(1, 100, size=()).item(),
- "Roi.X2": torch.randint(1, 100, size=()).item(),
- "Roi.Y2": torch.randint(1, 100, size=()).item(),
- "ClassId": class_idx,
- }
- )
- archive_folder = root / "GTSRB"
- if config["split"] == "train":
- train_folder = archive_folder / "Training"
- train_folder.mkdir(parents=True)
- for class_idx in classes:
- create_image_folder(
- train_folder,
- name=class_idx,
- file_name_fn=lambda image_idx: f"{class_idx}_{image_idx:05d}.ppm",
- num_examples=num_examples_per_class,
- )
- _make_ann_file(
- path=train_folder / class_idx / f"GT-{class_idx}.csv",
- num_examples=num_examples_per_class,
- class_idx=int(class_idx),
- )
- make_zip(root, "GTSRB-Training_fixed.zip", archive_folder)
- else:
- test_folder = archive_folder / "Final_Test"
- test_folder.mkdir(parents=True)
- create_image_folder(
- test_folder,
- name="Images",
- file_name_fn=lambda image_idx: f"{image_idx:05d}.ppm",
- num_examples=num_examples,
- )
- make_zip(root, "GTSRB_Final_Test_Images.zip", archive_folder)
- _make_ann_file(
- path=root / "GT-final_test.csv",
- num_examples=num_examples,
- class_idx="random",
- )
- make_zip(root, "GTSRB_Final_Test_GT.zip", "GT-final_test.csv")
- return num_examples
- @register_mock(configs=combinations_grid(split=("train", "val", "test")))
- def clevr(root, config):
- data_folder = root / "CLEVR_v1.0"
- num_samples_map = {
- "train": 3,
- "val": 2,
- "test": 1,
- }
- images_folder = data_folder / "images"
- image_files = {
- split: create_image_folder(
- images_folder,
- split,
- file_name_fn=lambda idx: f"CLEVR_{split}_{idx:06d}.jpg",
- num_examples=num_samples,
- )
- for split, num_samples in num_samples_map.items()
- }
- scenes_folder = data_folder / "scenes"
- scenes_folder.mkdir()
- for split in ["train", "val"]:
- with open(scenes_folder / f"CLEVR_{split}_scenes.json", "w") as file:
- json.dump(
- {
- "scenes": [
- {
- "image_filename": image_file.name,
- # We currently only return the number of objects in a scene.
- # Thus, it is sufficient for now to only mock the number of elements.
- "objects": [None] * int(torch.randint(1, 5, ())),
- }
- for image_file in image_files[split]
- ]
- },
- file,
- )
- make_zip(root, f"{data_folder.name}.zip", data_folder)
- return num_samples_map[config["split"]]
- class OxfordIIITPetMockData:
- @classmethod
- def _meta_to_split_and_classification_ann(cls, meta, idx):
- image_id = "_".join(
- [
- *[(str.title if meta["species"] == "cat" else str.lower)(part) for part in meta["cls"].split()],
- str(idx),
- ]
- )
- class_id = str(meta["label"] + 1)
- species = "1" if meta["species"] == "cat" else "2"
- breed_id = "-1"
- return (image_id, class_id, species, breed_id)
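- # For illustration: meta=dict(cls="Abyssinian", label=0, species="cat") with idx=1 maps to the row
- # ("Abyssinian_1", "1", "1", "-1"), i.e. the `<image_id> <class_id> <species> <breed_id>` columns of the real
- # annotation files.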
- @classmethod
- def generate(cls, root):
- classification_anns_meta = (
- dict(cls="Abyssinian", label=0, species="cat"),
- dict(cls="Keeshond", label=18, species="dog"),
- dict(cls="Yorkshire Terrier", label=36, species="dog"),
- )
- split_and_classification_anns = [
- cls._meta_to_split_and_classification_ann(meta, idx)
- for meta, idx in itertools.product(classification_anns_meta, (1, 2, 10))
- ]
- image_ids, *_ = zip(*split_and_classification_anns)
- image_files = create_image_folder(
- root, "images", file_name_fn=lambda idx: f"{image_ids[idx]}.jpg", num_examples=len(image_ids)
- )
- anns_folder = root / "annotations"
- anns_folder.mkdir()
- random.shuffle(split_and_classification_anns)
- splits = ("trainval", "test")
- num_samples_map = {}
- for offset, split in enumerate(splits):
- split_and_classification_anns_in_split = split_and_classification_anns[offset :: len(splits)]
- with open(anns_folder / f"{split}.txt", "w") as file:
- writer = csv.writer(file, delimiter=" ")
- for split_and_classification_ann in split_and_classification_anns_in_split:
- writer.writerow(split_and_classification_ann)
- num_samples_map[split] = len(split_and_classification_anns_in_split)
- segmentation_files = create_image_folder(
- anns_folder, "trimaps", file_name_fn=lambda idx: f"{image_ids[idx]}.png", num_examples=len(image_ids)
- )
- # The dataset has some rogue files
- for path in image_files[:3]:
- path.with_suffix(".mat").touch()
- for path in segmentation_files:
- path.with_name(f".{path.name}").touch()
- make_tar(root, "images.tar.gz", compression="gz")
- make_tar(root, anns_folder.with_suffix(".tar.gz").name, compression="gz")
- return num_samples_map
- @register_mock(name="oxford-iiit-pet", configs=combinations_grid(split=("trainval", "test")))
- def oxford_iiit_pet(root, config):
- return OxfordIIITPetMockData.generate(root)[config["split"]]
- class _CUB200MockData:
- @classmethod
- def _category_folder(cls, category, idx):
- return f"{idx:03d}.{category}"
- @classmethod
- def _file_stem(cls, category, idx):
- return f"{category}_{idx:04d}"
- @classmethod
- def _make_images(cls, images_folder):
- image_files = []
- for category_idx, category in [
- (1, "Black_footed_Albatross"),
- (100, "Brown_Pelican"),
- (200, "Common_Yellowthroat"),
- ]:
- image_files.extend(
- create_image_folder(
- images_folder,
- cls._category_folder(category, category_idx),
- lambda image_idx: f"{cls._file_stem(category, image_idx)}.jpg",
- num_examples=5,
- )
- )
- return image_files
- class CUB2002011MockData(_CUB200MockData):
- @classmethod
- def _make_archive(cls, root):
- archive_folder = root / "CUB_200_2011"
- images_folder = archive_folder / "images"
- image_files = cls._make_images(images_folder)
- image_ids = list(range(1, len(image_files) + 1))
- with open(archive_folder / "images.txt", "w") as file:
- file.write(
- "\n".join(
- f"{id} {path.relative_to(images_folder).as_posix()}" for id, path in zip(image_ids, image_files)
- )
- )
- split_ids = torch.randint(2, (len(image_ids),)).tolist()
- counts = Counter(split_ids)
- num_samples_map = {"train": counts[1], "test": counts[0]}
- with open(archive_folder / "train_test_split.txt", "w") as file:
- file.write("\n".join(f"{image_id} {split_id}" for image_id, split_id in zip(image_ids, split_ids)))
- with open(archive_folder / "bounding_boxes.txt", "w") as file:
- file.write(
- "\n".join(
- " ".join(
- str(item)
- for item in [image_id, *make_tensor((4,), dtype=torch.int, low=0).to(torch.float).tolist()]
- )
- for image_id in image_ids
- )
- )
- make_tar(root, archive_folder.with_suffix(".tgz").name, compression="gz")
- return image_files, num_samples_map
- @classmethod
- def _make_segmentations(cls, root, image_files):
- segmentations_folder = root / "segmentations"
- for image_file in image_files:
- folder = segmentations_folder.joinpath(image_file.relative_to(image_file.parents[1]))
- folder.mkdir(exist_ok=True, parents=True)
- create_image_file(
- folder,
- image_file.with_suffix(".png").name,
- size=[1, *make_tensor((2,), low=3, dtype=torch.int).tolist()],
- )
- make_tar(root, segmentations_folder.with_suffix(".tgz").name, compression="gz")
- @classmethod
- def generate(cls, root):
- image_files, num_samples_map = cls._make_archive(root)
- cls._make_segmentations(root, image_files)
- return num_samples_map
- class CUB2002010MockData(_CUB200MockData):
- @classmethod
- def _make_hidden_rogue_file(cls, *files):
- for file in files:
- (file.parent / f"._{file.name}").touch()
- @classmethod
- def _make_splits(cls, root, image_files):
- split_folder = root / "lists"
- split_folder.mkdir()
- random.shuffle(image_files)
- splits = ("train", "test")
- num_samples_map = {}
- for offset, split in enumerate(splits):
- image_files_in_split = image_files[offset :: len(splits)]
- split_file = split_folder / f"{split}.txt"
- with open(split_file, "w") as file:
- file.write(
- "\n".join(
- sorted(
- str(image_file.relative_to(image_file.parents[1]).as_posix())
- for image_file in image_files_in_split
- )
- )
- )
- cls._make_hidden_rogue_file(split_file)
- num_samples_map[split] = len(image_files_in_split)
- make_tar(root, split_folder.with_suffix(".tgz").name, compression="gz")
- return num_samples_map
- @classmethod
- def _make_anns(cls, root, image_files):
- from scipy.io import savemat
- anns_folder = root / "annotations-mat"
- for image_file in image_files:
- ann_file = anns_folder / image_file.with_suffix(".mat").relative_to(image_file.parents[1])
- ann_file.parent.mkdir(parents=True, exist_ok=True)
- savemat(
- ann_file,
- {
- "seg": torch.randint(
- 256, make_tensor((2,), low=3, dtype=torch.int).tolist(), dtype=torch.uint8
- ).numpy(),
- "bbox": dict(
- zip(("left", "top", "right", "bottom"), make_tensor((4,), dtype=torch.uint8).tolist())
- ),
- },
- )
- readme_file = anns_folder / "README.txt"
- readme_file.touch()
- cls._make_hidden_rogue_file(readme_file)
- make_tar(root, "annotations.tgz", anns_folder, compression="gz")
- @classmethod
- def generate(cls, root):
- images_folder = root / "images"
- image_files = cls._make_images(images_folder)
- cls._make_hidden_rogue_file(*image_files)
- make_tar(root, images_folder.with_suffix(".tgz").name, compression="gz")
- num_samples_map = cls._make_splits(root, image_files)
- cls._make_anns(root, image_files)
- return num_samples_map
- @register_mock(configs=combinations_grid(split=("train", "test"), year=("2010", "2011")))
- def cub200(root, config):
- num_samples_map = (CUB2002011MockData if config["year"] == "2011" else CUB2002010MockData).generate(root)
- return num_samples_map[config["split"]]
- @register_mock(configs=[dict()])
- def eurosat(root, config):
- data_folder = root / "2750"
- data_folder.mkdir(parents=True)
- num_examples_per_class = 3
- categories = ["AnnualCrop", "Forest"]
- for category in categories:
- create_image_folder(
- root=data_folder,
- name=category,
- file_name_fn=lambda idx: f"{category}_{idx + 1}.jpg",
- num_examples=num_examples_per_class,
- )
- make_zip(root, "EuroSAT.zip", data_folder)
- return len(categories) * num_examples_per_class
- @register_mock(configs=combinations_grid(split=("train", "test", "extra")))
- def svhn(root, config):
- import scipy.io as sio
- num_samples = {
- "train": 2,
- "test": 3,
- "extra": 4,
- }[config["split"]]
- sio.savemat(
- root / f"{config['split']}_32x32.mat",
- {
- "X": np.random.randint(256, size=(32, 32, 3, num_samples), dtype=np.uint8),
- "y": np.random.randint(10, size=(num_samples,), dtype=np.uint8),
- },
- )
- return num_samples
- @register_mock(configs=combinations_grid(split=("train", "val", "test")))
- def pcam(root, config):
- import h5py
- num_images = {"train": 2, "test": 3, "val": 4}[config["split"]]
- split = "valid" if config["split"] == "val" else config["split"]
- images_io = io.BytesIO()
- with h5py.File(images_io, "w") as f:
- f["x"] = np.random.randint(0, 256, size=(num_images, 10, 10, 3), dtype=np.uint8)
- targets_io = io.BytesIO()
- with h5py.File(targets_io, "w") as f:
- f["y"] = np.random.randint(0, 2, size=(num_images, 1, 1, 1), dtype=np.uint8)
- # Create .gz compressed files
- images_file = root / f"camelyonpatch_level_2_split_{split}_x.h5.gz"
- targets_file = root / f"camelyonpatch_level_2_split_{split}_y.h5.gz"
- for compressed_file_name, uncompressed_file_io in ((images_file, images_io), (targets_file, targets_io)):
- compressed_data = gzip.compress(uncompressed_file_io.getbuffer())
- with open(compressed_file_name, "wb") as compressed_file:
- compressed_file.write(compressed_data)
- return num_images
- @register_mock(name="stanford-cars", configs=combinations_grid(split=("train", "test")))
- def stanford_cars(root, config):
- import scipy.io as io
- from numpy.core.records import fromarrays
- split = config["split"]
- num_samples = {"train": 5, "test": 7}[split]
- num_categories = 3
- if split == "train":
- images_folder_name = "cars_train"
- devkit = root / "devkit"
- devkit.mkdir()
- annotations_mat_path = devkit / "cars_train_annos.mat"
- else:
- images_folder_name = "cars_test"
- annotations_mat_path = root / "cars_test_annos_withlabels.mat"
- create_image_folder(
- root=root,
- name=images_folder_name,
- file_name_fn=lambda image_index: f"{image_index:5d}.jpg",
- num_examples=num_samples,
- )
- make_tar(root, f"cars_{split}.tgz", images_folder_name)
- bbox = np.random.randint(1, 200, num_samples, dtype=np.uint8)
- classes = np.random.randint(1, num_categories + 1, num_samples, dtype=np.uint8)
- fnames = [f"{i:5d}.jpg" for i in range(num_samples)]
- rec_array = fromarrays(
- [bbox, bbox, bbox, bbox, classes, fnames],
- names=["bbox_x1", "bbox_y1", "bbox_x2", "bbox_y2", "class", "fname"],
- )
- io.savemat(annotations_mat_path, {"annotations": rec_array})
- if split == "train":
- make_tar(root, "car_devkit.tgz", devkit, compression="gz")
- return num_samples
- @register_mock(configs=combinations_grid(split=("train", "test")))
- def usps(root, config):
- num_samples = {"train": 15, "test": 7}[config["split"]]
- with bz2.open(root / f"usps{'.t' if config['split'] != 'train' else ''}.bz2", "wb") as fh:
- lines = []
- for _ in range(num_samples):
- label = make_tensor(1, low=1, high=11, dtype=torch.int)
- values = make_tensor(256, low=-1, high=1, dtype=torch.float)
- lines.append(
- " ".join([f"{int(label)}", *(f"{idx}:{float(value):.6f}" for idx, value in enumerate(values, 1))])
- )
- fh.write("\n".join(lines).encode())
- return num_samples