memmap.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337
  1. from contextlib import nullcontext
  2. import numpy as np
  3. from .numeric import uint8, ndarray, dtype
  4. from numpy.compat import os_fspath, is_pathlib_path
  5. from numpy.core.overrides import set_module
  6. __all__ = ['memmap']
  7. dtypedescr = dtype
  8. valid_filemodes = ["r", "c", "r+", "w+"]
  9. writeable_filemodes = ["r+", "w+"]
  10. mode_equivalents = {
  11. "readonly":"r",
  12. "copyonwrite":"c",
  13. "readwrite":"r+",
  14. "write":"w+"
  15. }
  16. @set_module('numpy')
  17. class memmap(ndarray):
  18. """Create a memory-map to an array stored in a *binary* file on disk.
  19. Memory-mapped files are used for accessing small segments of large files
  20. on disk, without reading the entire file into memory. NumPy's
  21. memmap's are array-like objects. This differs from Python's ``mmap``
  22. module, which uses file-like objects.
  23. This subclass of ndarray has some unpleasant interactions with
  24. some operations, because it doesn't quite fit properly as a subclass.
  25. An alternative to using this subclass is to create the ``mmap``
  26. object yourself, then create an ndarray with ndarray.__new__ directly,
  27. passing the object created in its 'buffer=' parameter.
  28. This class may at some point be turned into a factory function
  29. which returns a view into an mmap buffer.
  30. Flush the memmap instance to write the changes to the file. Currently there
  31. is no API to close the underlying ``mmap``. It is tricky to ensure the
  32. resource is actually closed, since it may be shared between different
  33. memmap instances.
  34. Parameters
  35. ----------
  36. filename : str, file-like object, or pathlib.Path instance
  37. The file name or file object to be used as the array data buffer.
  38. dtype : data-type, optional
  39. The data-type used to interpret the file contents.
  40. Default is `uint8`.
  41. mode : {'r+', 'r', 'w+', 'c'}, optional
  42. The file is opened in this mode:
  43. +------+-------------------------------------------------------------+
  44. | 'r' | Open existing file for reading only. |
  45. +------+-------------------------------------------------------------+
  46. | 'r+' | Open existing file for reading and writing. |
  47. +------+-------------------------------------------------------------+
  48. | 'w+' | Create or overwrite existing file for reading and writing. |
  49. +------+-------------------------------------------------------------+
  50. | 'c' | Copy-on-write: assignments affect data in memory, but |
  51. | | changes are not saved to disk. The file on disk is |
  52. | | read-only. |
  53. +------+-------------------------------------------------------------+
  54. Default is 'r+'.
  55. offset : int, optional
  56. In the file, array data starts at this offset. Since `offset` is
  57. measured in bytes, it should normally be a multiple of the byte-size
  58. of `dtype`. When ``mode != 'r'``, even positive offsets beyond end of
  59. file are valid; The file will be extended to accommodate the
  60. additional data. By default, ``memmap`` will start at the beginning of
  61. the file, even if ``filename`` is a file pointer ``fp`` and
  62. ``fp.tell() != 0``.
  63. shape : tuple, optional
  64. The desired shape of the array. If ``mode == 'r'`` and the number
  65. of remaining bytes after `offset` is not a multiple of the byte-size
  66. of `dtype`, you must specify `shape`. By default, the returned array
  67. will be 1-D with the number of elements determined by file size
  68. and data-type.
  69. order : {'C', 'F'}, optional
  70. Specify the order of the ndarray memory layout:
  71. :term:`row-major`, C-style or :term:`column-major`,
  72. Fortran-style. This only has an effect if the shape is
  73. greater than 1-D. The default order is 'C'.
  74. Attributes
  75. ----------
  76. filename : str or pathlib.Path instance
  77. Path to the mapped file.
  78. offset : int
  79. Offset position in the file.
  80. mode : str
  81. File mode.
  82. Methods
  83. -------
  84. flush
  85. Flush any changes in memory to file on disk.
  86. When you delete a memmap object, flush is called first to write
  87. changes to disk.
  88. See also
  89. --------
  90. lib.format.open_memmap : Create or load a memory-mapped ``.npy`` file.
  91. Notes
  92. -----
  93. The memmap object can be used anywhere an ndarray is accepted.
  94. Given a memmap ``fp``, ``isinstance(fp, numpy.ndarray)`` returns
  95. ``True``.
  96. Memory-mapped files cannot be larger than 2GB on 32-bit systems.
  97. When a memmap causes a file to be created or extended beyond its
  98. current size in the filesystem, the contents of the new part are
  99. unspecified. On systems with POSIX filesystem semantics, the extended
  100. part will be filled with zero bytes.
  101. Examples
  102. --------
  103. >>> data = np.arange(12, dtype='float32')
  104. >>> data.resize((3,4))
  105. This example uses a temporary file so that doctest doesn't write
  106. files to your directory. You would use a 'normal' filename.
  107. >>> from tempfile import mkdtemp
  108. >>> import os.path as path
  109. >>> filename = path.join(mkdtemp(), 'newfile.dat')
  110. Create a memmap with dtype and shape that matches our data:
  111. >>> fp = np.memmap(filename, dtype='float32', mode='w+', shape=(3,4))
  112. >>> fp
  113. memmap([[0., 0., 0., 0.],
  114. [0., 0., 0., 0.],
  115. [0., 0., 0., 0.]], dtype=float32)
  116. Write data to memmap array:
  117. >>> fp[:] = data[:]
  118. >>> fp
  119. memmap([[ 0., 1., 2., 3.],
  120. [ 4., 5., 6., 7.],
  121. [ 8., 9., 10., 11.]], dtype=float32)
  122. >>> fp.filename == path.abspath(filename)
  123. True
  124. Flushes memory changes to disk in order to read them back
  125. >>> fp.flush()
  126. Load the memmap and verify data was stored:
  127. >>> newfp = np.memmap(filename, dtype='float32', mode='r', shape=(3,4))
  128. >>> newfp
  129. memmap([[ 0., 1., 2., 3.],
  130. [ 4., 5., 6., 7.],
  131. [ 8., 9., 10., 11.]], dtype=float32)
  132. Read-only memmap:
  133. >>> fpr = np.memmap(filename, dtype='float32', mode='r', shape=(3,4))
  134. >>> fpr.flags.writeable
  135. False
  136. Copy-on-write memmap:
  137. >>> fpc = np.memmap(filename, dtype='float32', mode='c', shape=(3,4))
  138. >>> fpc.flags.writeable
  139. True
  140. It's possible to assign to copy-on-write array, but values are only
  141. written into the memory copy of the array, and not written to disk:
  142. >>> fpc
  143. memmap([[ 0., 1., 2., 3.],
  144. [ 4., 5., 6., 7.],
  145. [ 8., 9., 10., 11.]], dtype=float32)
  146. >>> fpc[0,:] = 0
  147. >>> fpc
  148. memmap([[ 0., 0., 0., 0.],
  149. [ 4., 5., 6., 7.],
  150. [ 8., 9., 10., 11.]], dtype=float32)
  151. File on disk is unchanged:
  152. >>> fpr
  153. memmap([[ 0., 1., 2., 3.],
  154. [ 4., 5., 6., 7.],
  155. [ 8., 9., 10., 11.]], dtype=float32)
  156. Offset into a memmap:
  157. >>> fpo = np.memmap(filename, dtype='float32', mode='r', offset=16)
  158. >>> fpo
  159. memmap([ 4., 5., 6., 7., 8., 9., 10., 11.], dtype=float32)
  160. """
  161. __array_priority__ = -100.0
  162. def __new__(subtype, filename, dtype=uint8, mode='r+', offset=0,
  163. shape=None, order='C'):
  164. # Import here to minimize 'import numpy' overhead
  165. import mmap
  166. import os.path
  167. try:
  168. mode = mode_equivalents[mode]
  169. except KeyError as e:
  170. if mode not in valid_filemodes:
  171. raise ValueError(
  172. "mode must be one of {!r} (got {!r})"
  173. .format(valid_filemodes + list(mode_equivalents.keys()), mode)
  174. ) from None
  175. if mode == 'w+' and shape is None:
  176. raise ValueError("shape must be given")
  177. if hasattr(filename, 'read'):
  178. f_ctx = nullcontext(filename)
  179. else:
  180. f_ctx = open(os_fspath(filename), ('r' if mode == 'c' else mode)+'b')
  181. with f_ctx as fid:
  182. fid.seek(0, 2)
  183. flen = fid.tell()
  184. descr = dtypedescr(dtype)
  185. _dbytes = descr.itemsize
  186. if shape is None:
  187. bytes = flen - offset
  188. if bytes % _dbytes:
  189. raise ValueError("Size of available data is not a "
  190. "multiple of the data-type size.")
  191. size = bytes // _dbytes
  192. shape = (size,)
  193. else:
  194. if not isinstance(shape, tuple):
  195. shape = (shape,)
  196. size = np.intp(1) # avoid default choice of np.int_, which might overflow
  197. for k in shape:
  198. size *= k
  199. bytes = int(offset + size*_dbytes)
  200. if mode in ('w+', 'r+') and flen < bytes:
  201. fid.seek(bytes - 1, 0)
  202. fid.write(b'\0')
  203. fid.flush()
  204. if mode == 'c':
  205. acc = mmap.ACCESS_COPY
  206. elif mode == 'r':
  207. acc = mmap.ACCESS_READ
  208. else:
  209. acc = mmap.ACCESS_WRITE
  210. start = offset - offset % mmap.ALLOCATIONGRANULARITY
  211. bytes -= start
  212. array_offset = offset - start
  213. mm = mmap.mmap(fid.fileno(), bytes, access=acc, offset=start)
  214. self = ndarray.__new__(subtype, shape, dtype=descr, buffer=mm,
  215. offset=array_offset, order=order)
  216. self._mmap = mm
  217. self.offset = offset
  218. self.mode = mode
  219. if is_pathlib_path(filename):
  220. # special case - if we were constructed with a pathlib.path,
  221. # then filename is a path object, not a string
  222. self.filename = filename.resolve()
  223. elif hasattr(fid, "name") and isinstance(fid.name, str):
  224. # py3 returns int for TemporaryFile().name
  225. self.filename = os.path.abspath(fid.name)
  226. # same as memmap copies (e.g. memmap + 1)
  227. else:
  228. self.filename = None
  229. return self
  230. def __array_finalize__(self, obj):
  231. if hasattr(obj, '_mmap') and np.may_share_memory(self, obj):
  232. self._mmap = obj._mmap
  233. self.filename = obj.filename
  234. self.offset = obj.offset
  235. self.mode = obj.mode
  236. else:
  237. self._mmap = None
  238. self.filename = None
  239. self.offset = None
  240. self.mode = None
  241. def flush(self):
  242. """
  243. Write any changes in the array to the file on disk.
  244. For further information, see `memmap`.
  245. Parameters
  246. ----------
  247. None
  248. See Also
  249. --------
  250. memmap
  251. """
  252. if self.base is not None and hasattr(self.base, 'flush'):
  253. self.base.flush()
  254. def __array_wrap__(self, arr, context=None):
  255. arr = super().__array_wrap__(arr, context)
  256. # Return a memmap if a memmap was given as the output of the
  257. # ufunc. Leave the arr class unchanged if self is not a memmap
  258. # to keep original memmap subclasses behavior
  259. if self is arr or type(self) is not memmap:
  260. return arr
  261. # Return scalar instead of 0d memmap, e.g. for np.sum with
  262. # axis=None
  263. if arr.shape == ():
  264. return arr[()]
  265. # Return ndarray otherwise
  266. return arr.view(np.ndarray)
  267. def __getitem__(self, index):
  268. res = super().__getitem__(index)
  269. if type(res) is memmap and res._mmap is None:
  270. return res.view(type=ndarray)
  271. return res