mnist.py 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548
  1. import codecs
  2. import os
  3. import os.path
  4. import shutil
  5. import string
  6. import sys
  7. import warnings
  8. from typing import Any, Callable, Dict, List, Optional, Tuple
  9. from urllib.error import URLError
  10. import numpy as np
  11. import torch
  12. from PIL import Image
  13. from .utils import _flip_byte_order, check_integrity, download_and_extract_archive, extract_archive, verify_str_arg
  14. from .vision import VisionDataset
  15. class MNIST(VisionDataset):
  16. """`MNIST <http://yann.lecun.com/exdb/mnist/>`_ Dataset.
  17. Args:
  18. root (string): Root directory of dataset where ``MNIST/raw/train-images-idx3-ubyte``
  19. and ``MNIST/raw/t10k-images-idx3-ubyte`` exist.
  20. train (bool, optional): If True, creates dataset from ``train-images-idx3-ubyte``,
  21. otherwise from ``t10k-images-idx3-ubyte``.
  22. download (bool, optional): If True, downloads the dataset from the internet and
  23. puts it in root directory. If dataset is already downloaded, it is not
  24. downloaded again.
  25. transform (callable, optional): A function/transform that takes in an PIL image
  26. and returns a transformed version. E.g, ``transforms.RandomCrop``
  27. target_transform (callable, optional): A function/transform that takes in the
  28. target and transforms it.
  29. """
  30. mirrors = [
  31. "http://yann.lecun.com/exdb/mnist/",
  32. "https://ossci-datasets.s3.amazonaws.com/mnist/",
  33. ]
  34. resources = [
  35. ("train-images-idx3-ubyte.gz", "f68b3c2dcbeaaa9fbdd348bbdeb94873"),
  36. ("train-labels-idx1-ubyte.gz", "d53e105ee54ea40749a09fcbcd1e9432"),
  37. ("t10k-images-idx3-ubyte.gz", "9fb629c4189551a2d022fa330f9573f3"),
  38. ("t10k-labels-idx1-ubyte.gz", "ec29112dd5afa0611ce80d1b7f02629c"),
  39. ]
  40. training_file = "training.pt"
  41. test_file = "test.pt"
  42. classes = [
  43. "0 - zero",
  44. "1 - one",
  45. "2 - two",
  46. "3 - three",
  47. "4 - four",
  48. "5 - five",
  49. "6 - six",
  50. "7 - seven",
  51. "8 - eight",
  52. "9 - nine",
  53. ]
  54. @property
  55. def train_labels(self):
  56. warnings.warn("train_labels has been renamed targets")
  57. return self.targets
  58. @property
  59. def test_labels(self):
  60. warnings.warn("test_labels has been renamed targets")
  61. return self.targets
  62. @property
  63. def train_data(self):
  64. warnings.warn("train_data has been renamed data")
  65. return self.data
  66. @property
  67. def test_data(self):
  68. warnings.warn("test_data has been renamed data")
  69. return self.data
  70. def __init__(
  71. self,
  72. root: str,
  73. train: bool = True,
  74. transform: Optional[Callable] = None,
  75. target_transform: Optional[Callable] = None,
  76. download: bool = False,
  77. ) -> None:
  78. super().__init__(root, transform=transform, target_transform=target_transform)
  79. self.train = train # training set or test set
  80. if self._check_legacy_exist():
  81. self.data, self.targets = self._load_legacy_data()
  82. return
  83. if download:
  84. self.download()
  85. if not self._check_exists():
  86. raise RuntimeError("Dataset not found. You can use download=True to download it")
  87. self.data, self.targets = self._load_data()
  88. def _check_legacy_exist(self):
  89. processed_folder_exists = os.path.exists(self.processed_folder)
  90. if not processed_folder_exists:
  91. return False
  92. return all(
  93. check_integrity(os.path.join(self.processed_folder, file)) for file in (self.training_file, self.test_file)
  94. )
  95. def _load_legacy_data(self):
  96. # This is for BC only. We no longer cache the data in a custom binary, but simply read from the raw data
  97. # directly.
  98. data_file = self.training_file if self.train else self.test_file
  99. return torch.load(os.path.join(self.processed_folder, data_file))
  100. def _load_data(self):
  101. image_file = f"{'train' if self.train else 't10k'}-images-idx3-ubyte"
  102. data = read_image_file(os.path.join(self.raw_folder, image_file))
  103. label_file = f"{'train' if self.train else 't10k'}-labels-idx1-ubyte"
  104. targets = read_label_file(os.path.join(self.raw_folder, label_file))
  105. return data, targets
  106. def __getitem__(self, index: int) -> Tuple[Any, Any]:
  107. """
  108. Args:
  109. index (int): Index
  110. Returns:
  111. tuple: (image, target) where target is index of the target class.
  112. """
  113. img, target = self.data[index], int(self.targets[index])
  114. # doing this so that it is consistent with all other datasets
  115. # to return a PIL Image
  116. img = Image.fromarray(img.numpy(), mode="L")
  117. if self.transform is not None:
  118. img = self.transform(img)
  119. if self.target_transform is not None:
  120. target = self.target_transform(target)
  121. return img, target
  122. def __len__(self) -> int:
  123. return len(self.data)
  124. @property
  125. def raw_folder(self) -> str:
  126. return os.path.join(self.root, self.__class__.__name__, "raw")
  127. @property
  128. def processed_folder(self) -> str:
  129. return os.path.join(self.root, self.__class__.__name__, "processed")
  130. @property
  131. def class_to_idx(self) -> Dict[str, int]:
  132. return {_class: i for i, _class in enumerate(self.classes)}
  133. def _check_exists(self) -> bool:
  134. return all(
  135. check_integrity(os.path.join(self.raw_folder, os.path.splitext(os.path.basename(url))[0]))
  136. for url, _ in self.resources
  137. )
  138. def download(self) -> None:
  139. """Download the MNIST data if it doesn't exist already."""
  140. if self._check_exists():
  141. return
  142. os.makedirs(self.raw_folder, exist_ok=True)
  143. # download files
  144. for filename, md5 in self.resources:
  145. for mirror in self.mirrors:
  146. url = f"{mirror}{filename}"
  147. try:
  148. print(f"Downloading {url}")
  149. download_and_extract_archive(url, download_root=self.raw_folder, filename=filename, md5=md5)
  150. except URLError as error:
  151. print(f"Failed to download (trying next):\n{error}")
  152. continue
  153. finally:
  154. print()
  155. break
  156. else:
  157. raise RuntimeError(f"Error downloading {filename}")
  158. def extra_repr(self) -> str:
  159. split = "Train" if self.train is True else "Test"
  160. return f"Split: {split}"
  161. class FashionMNIST(MNIST):
  162. """`Fashion-MNIST <https://github.com/zalandoresearch/fashion-mnist>`_ Dataset.
  163. Args:
  164. root (string): Root directory of dataset where ``FashionMNIST/raw/train-images-idx3-ubyte``
  165. and ``FashionMNIST/raw/t10k-images-idx3-ubyte`` exist.
  166. train (bool, optional): If True, creates dataset from ``train-images-idx3-ubyte``,
  167. otherwise from ``t10k-images-idx3-ubyte``.
  168. download (bool, optional): If True, downloads the dataset from the internet and
  169. puts it in root directory. If dataset is already downloaded, it is not
  170. downloaded again.
  171. transform (callable, optional): A function/transform that takes in an PIL image
  172. and returns a transformed version. E.g, ``transforms.RandomCrop``
  173. target_transform (callable, optional): A function/transform that takes in the
  174. target and transforms it.
  175. """
  176. mirrors = ["http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/"]
  177. resources = [
  178. ("train-images-idx3-ubyte.gz", "8d4fb7e6c68d591d4c3dfef9ec88bf0d"),
  179. ("train-labels-idx1-ubyte.gz", "25c81989df183df01b3e8a0aad5dffbe"),
  180. ("t10k-images-idx3-ubyte.gz", "bef4ecab320f06d8554ea6380940ec79"),
  181. ("t10k-labels-idx1-ubyte.gz", "bb300cfdad3c16e7a12a480ee83cd310"),
  182. ]
  183. classes = ["T-shirt/top", "Trouser", "Pullover", "Dress", "Coat", "Sandal", "Shirt", "Sneaker", "Bag", "Ankle boot"]
  184. class KMNIST(MNIST):
  185. """`Kuzushiji-MNIST <https://github.com/rois-codh/kmnist>`_ Dataset.
  186. Args:
  187. root (string): Root directory of dataset where ``KMNIST/raw/train-images-idx3-ubyte``
  188. and ``KMNIST/raw/t10k-images-idx3-ubyte`` exist.
  189. train (bool, optional): If True, creates dataset from ``train-images-idx3-ubyte``,
  190. otherwise from ``t10k-images-idx3-ubyte``.
  191. download (bool, optional): If True, downloads the dataset from the internet and
  192. puts it in root directory. If dataset is already downloaded, it is not
  193. downloaded again.
  194. transform (callable, optional): A function/transform that takes in an PIL image
  195. and returns a transformed version. E.g, ``transforms.RandomCrop``
  196. target_transform (callable, optional): A function/transform that takes in the
  197. target and transforms it.
  198. """
  199. mirrors = ["http://codh.rois.ac.jp/kmnist/dataset/kmnist/"]
  200. resources = [
  201. ("train-images-idx3-ubyte.gz", "bdb82020997e1d708af4cf47b453dcf7"),
  202. ("train-labels-idx1-ubyte.gz", "e144d726b3acfaa3e44228e80efcd344"),
  203. ("t10k-images-idx3-ubyte.gz", "5c965bf0a639b31b8f53240b1b52f4d7"),
  204. ("t10k-labels-idx1-ubyte.gz", "7320c461ea6c1c855c0b718fb2a4b134"),
  205. ]
  206. classes = ["o", "ki", "su", "tsu", "na", "ha", "ma", "ya", "re", "wo"]
  207. class EMNIST(MNIST):
  208. """`EMNIST <https://www.westernsydney.edu.au/bens/home/reproducible_research/emnist>`_ Dataset.
  209. Args:
  210. root (string): Root directory of dataset where ``EMNIST/raw/train-images-idx3-ubyte``
  211. and ``EMNIST/raw/t10k-images-idx3-ubyte`` exist.
  212. split (string): The dataset has 6 different splits: ``byclass``, ``bymerge``,
  213. ``balanced``, ``letters``, ``digits`` and ``mnist``. This argument specifies
  214. which one to use.
  215. train (bool, optional): If True, creates dataset from ``training.pt``,
  216. otherwise from ``test.pt``.
  217. download (bool, optional): If True, downloads the dataset from the internet and
  218. puts it in root directory. If dataset is already downloaded, it is not
  219. downloaded again.
  220. transform (callable, optional): A function/transform that takes in an PIL image
  221. and returns a transformed version. E.g, ``transforms.RandomCrop``
  222. target_transform (callable, optional): A function/transform that takes in the
  223. target and transforms it.
  224. """
  225. url = "https://www.itl.nist.gov/iaui/vip/cs_links/EMNIST/gzip.zip"
  226. md5 = "58c8d27c78d21e728a6bc7b3cc06412e"
  227. splits = ("byclass", "bymerge", "balanced", "letters", "digits", "mnist")
  228. # Merged Classes assumes Same structure for both uppercase and lowercase version
  229. _merged_classes = {"c", "i", "j", "k", "l", "m", "o", "p", "s", "u", "v", "w", "x", "y", "z"}
  230. _all_classes = set(string.digits + string.ascii_letters)
  231. classes_split_dict = {
  232. "byclass": sorted(list(_all_classes)),
  233. "bymerge": sorted(list(_all_classes - _merged_classes)),
  234. "balanced": sorted(list(_all_classes - _merged_classes)),
  235. "letters": ["N/A"] + list(string.ascii_lowercase),
  236. "digits": list(string.digits),
  237. "mnist": list(string.digits),
  238. }
  239. def __init__(self, root: str, split: str, **kwargs: Any) -> None:
  240. self.split = verify_str_arg(split, "split", self.splits)
  241. self.training_file = self._training_file(split)
  242. self.test_file = self._test_file(split)
  243. super().__init__(root, **kwargs)
  244. self.classes = self.classes_split_dict[self.split]
  245. @staticmethod
  246. def _training_file(split) -> str:
  247. return f"training_{split}.pt"
  248. @staticmethod
  249. def _test_file(split) -> str:
  250. return f"test_{split}.pt"
  251. @property
  252. def _file_prefix(self) -> str:
  253. return f"emnist-{self.split}-{'train' if self.train else 'test'}"
  254. @property
  255. def images_file(self) -> str:
  256. return os.path.join(self.raw_folder, f"{self._file_prefix}-images-idx3-ubyte")
  257. @property
  258. def labels_file(self) -> str:
  259. return os.path.join(self.raw_folder, f"{self._file_prefix}-labels-idx1-ubyte")
  260. def _load_data(self):
  261. return read_image_file(self.images_file), read_label_file(self.labels_file)
  262. def _check_exists(self) -> bool:
  263. return all(check_integrity(file) for file in (self.images_file, self.labels_file))
  264. def download(self) -> None:
  265. """Download the EMNIST data if it doesn't exist already."""
  266. if self._check_exists():
  267. return
  268. os.makedirs(self.raw_folder, exist_ok=True)
  269. download_and_extract_archive(self.url, download_root=self.raw_folder, md5=self.md5)
  270. gzip_folder = os.path.join(self.raw_folder, "gzip")
  271. for gzip_file in os.listdir(gzip_folder):
  272. if gzip_file.endswith(".gz"):
  273. extract_archive(os.path.join(gzip_folder, gzip_file), self.raw_folder)
  274. shutil.rmtree(gzip_folder)
  275. class QMNIST(MNIST):
  276. """`QMNIST <https://github.com/facebookresearch/qmnist>`_ Dataset.
  277. Args:
  278. root (string): Root directory of dataset whose ``raw``
  279. subdir contains binary files of the datasets.
  280. what (string,optional): Can be 'train', 'test', 'test10k',
  281. 'test50k', or 'nist' for respectively the mnist compatible
  282. training set, the 60k qmnist testing set, the 10k qmnist
  283. examples that match the mnist testing set, the 50k
  284. remaining qmnist testing examples, or all the nist
  285. digits. The default is to select 'train' or 'test'
  286. according to the compatibility argument 'train'.
  287. compat (bool,optional): A boolean that says whether the target
  288. for each example is class number (for compatibility with
  289. the MNIST dataloader) or a torch vector containing the
  290. full qmnist information. Default=True.
  291. download (bool, optional): If True, downloads the dataset from
  292. the internet and puts it in root directory. If dataset is
  293. already downloaded, it is not downloaded again.
  294. transform (callable, optional): A function/transform that
  295. takes in an PIL image and returns a transformed
  296. version. E.g, ``transforms.RandomCrop``
  297. target_transform (callable, optional): A function/transform
  298. that takes in the target and transforms it.
  299. train (bool,optional,compatibility): When argument 'what' is
  300. not specified, this boolean decides whether to load the
  301. training set or the testing set. Default: True.
  302. """
  303. subsets = {"train": "train", "test": "test", "test10k": "test", "test50k": "test", "nist": "nist"}
  304. resources: Dict[str, List[Tuple[str, str]]] = { # type: ignore[assignment]
  305. "train": [
  306. (
  307. "https://raw.githubusercontent.com/facebookresearch/qmnist/master/qmnist-train-images-idx3-ubyte.gz",
  308. "ed72d4157d28c017586c42bc6afe6370",
  309. ),
  310. (
  311. "https://raw.githubusercontent.com/facebookresearch/qmnist/master/qmnist-train-labels-idx2-int.gz",
  312. "0058f8dd561b90ffdd0f734c6a30e5e4",
  313. ),
  314. ],
  315. "test": [
  316. (
  317. "https://raw.githubusercontent.com/facebookresearch/qmnist/master/qmnist-test-images-idx3-ubyte.gz",
  318. "1394631089c404de565df7b7aeaf9412",
  319. ),
  320. (
  321. "https://raw.githubusercontent.com/facebookresearch/qmnist/master/qmnist-test-labels-idx2-int.gz",
  322. "5b5b05890a5e13444e108efe57b788aa",
  323. ),
  324. ],
  325. "nist": [
  326. (
  327. "https://raw.githubusercontent.com/facebookresearch/qmnist/master/xnist-images-idx3-ubyte.xz",
  328. "7f124b3b8ab81486c9d8c2749c17f834",
  329. ),
  330. (
  331. "https://raw.githubusercontent.com/facebookresearch/qmnist/master/xnist-labels-idx2-int.xz",
  332. "5ed0e788978e45d4a8bd4b7caec3d79d",
  333. ),
  334. ],
  335. }
  336. classes = [
  337. "0 - zero",
  338. "1 - one",
  339. "2 - two",
  340. "3 - three",
  341. "4 - four",
  342. "5 - five",
  343. "6 - six",
  344. "7 - seven",
  345. "8 - eight",
  346. "9 - nine",
  347. ]
  348. def __init__(
  349. self, root: str, what: Optional[str] = None, compat: bool = True, train: bool = True, **kwargs: Any
  350. ) -> None:
  351. if what is None:
  352. what = "train" if train else "test"
  353. self.what = verify_str_arg(what, "what", tuple(self.subsets.keys()))
  354. self.compat = compat
  355. self.data_file = what + ".pt"
  356. self.training_file = self.data_file
  357. self.test_file = self.data_file
  358. super().__init__(root, train, **kwargs)
  359. @property
  360. def images_file(self) -> str:
  361. (url, _), _ = self.resources[self.subsets[self.what]]
  362. return os.path.join(self.raw_folder, os.path.splitext(os.path.basename(url))[0])
  363. @property
  364. def labels_file(self) -> str:
  365. _, (url, _) = self.resources[self.subsets[self.what]]
  366. return os.path.join(self.raw_folder, os.path.splitext(os.path.basename(url))[0])
  367. def _check_exists(self) -> bool:
  368. return all(check_integrity(file) for file in (self.images_file, self.labels_file))
  369. def _load_data(self):
  370. data = read_sn3_pascalvincent_tensor(self.images_file)
  371. if data.dtype != torch.uint8:
  372. raise TypeError(f"data should be of dtype torch.uint8 instead of {data.dtype}")
  373. if data.ndimension() != 3:
  374. raise ValueError("data should have 3 dimensions instead of {data.ndimension()}")
  375. targets = read_sn3_pascalvincent_tensor(self.labels_file).long()
  376. if targets.ndimension() != 2:
  377. raise ValueError(f"targets should have 2 dimensions instead of {targets.ndimension()}")
  378. if self.what == "test10k":
  379. data = data[0:10000, :, :].clone()
  380. targets = targets[0:10000, :].clone()
  381. elif self.what == "test50k":
  382. data = data[10000:, :, :].clone()
  383. targets = targets[10000:, :].clone()
  384. return data, targets
  385. def download(self) -> None:
  386. """Download the QMNIST data if it doesn't exist already.
  387. Note that we only download what has been asked for (argument 'what').
  388. """
  389. if self._check_exists():
  390. return
  391. os.makedirs(self.raw_folder, exist_ok=True)
  392. split = self.resources[self.subsets[self.what]]
  393. for url, md5 in split:
  394. download_and_extract_archive(url, self.raw_folder, md5=md5)
  395. def __getitem__(self, index: int) -> Tuple[Any, Any]:
  396. # redefined to handle the compat flag
  397. img, target = self.data[index], self.targets[index]
  398. img = Image.fromarray(img.numpy(), mode="L")
  399. if self.transform is not None:
  400. img = self.transform(img)
  401. if self.compat:
  402. target = int(target[0])
  403. if self.target_transform is not None:
  404. target = self.target_transform(target)
  405. return img, target
  406. def extra_repr(self) -> str:
  407. return f"Split: {self.what}"
  408. def get_int(b: bytes) -> int:
  409. return int(codecs.encode(b, "hex"), 16)
  410. SN3_PASCALVINCENT_TYPEMAP = {
  411. 8: torch.uint8,
  412. 9: torch.int8,
  413. 11: torch.int16,
  414. 12: torch.int32,
  415. 13: torch.float32,
  416. 14: torch.float64,
  417. }
  418. def read_sn3_pascalvincent_tensor(path: str, strict: bool = True) -> torch.Tensor:
  419. """Read a SN3 file in "Pascal Vincent" format (Lush file 'libidx/idx-io.lsh').
  420. Argument may be a filename, compressed filename, or file object.
  421. """
  422. # read
  423. with open(path, "rb") as f:
  424. data = f.read()
  425. # parse
  426. magic = get_int(data[0:4])
  427. nd = magic % 256
  428. ty = magic // 256
  429. assert 1 <= nd <= 3
  430. assert 8 <= ty <= 14
  431. torch_type = SN3_PASCALVINCENT_TYPEMAP[ty]
  432. s = [get_int(data[4 * (i + 1) : 4 * (i + 2)]) for i in range(nd)]
  433. parsed = torch.frombuffer(bytearray(data), dtype=torch_type, offset=(4 * (nd + 1)))
  434. # The MNIST format uses the big endian byte order, while `torch.frombuffer` uses whatever the system uses. In case
  435. # that is little endian and the dtype has more than one byte, we need to flip them.
  436. if sys.byteorder == "little" and parsed.element_size() > 1:
  437. parsed = _flip_byte_order(parsed)
  438. assert parsed.shape[0] == np.prod(s) or not strict
  439. return parsed.view(*s)
  440. def read_label_file(path: str) -> torch.Tensor:
  441. x = read_sn3_pascalvincent_tensor(path, strict=False)
  442. if x.dtype != torch.uint8:
  443. raise TypeError(f"x should be of dtype torch.uint8 instead of {x.dtype}")
  444. if x.ndimension() != 1:
  445. raise ValueError(f"x should have 1 dimension instead of {x.ndimension()}")
  446. return x.long()
  447. def read_image_file(path: str) -> torch.Tensor:
  448. x = read_sn3_pascalvincent_tensor(path, strict=False)
  449. if x.dtype != torch.uint8:
  450. raise TypeError(f"x should be of dtype torch.uint8 instead of {x.dtype}")
  451. if x.ndimension() != 3:
  452. raise ValueError(f"x should have 3 dimension instead of {x.ndimension()}")
  453. return x