123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337 |
- from contextlib import nullcontext
- import numpy as np
- from .numeric import uint8, ndarray, dtype
- from numpy.compat import os_fspath, is_pathlib_path
- from numpy.core.overrides import set_module
- __all__ = ['memmap']
# ``dtype`` is shadowed by the ``dtype`` parameter of ``memmap.__new__``;
# this alias keeps the constructor reachable from inside that method.
dtypedescr = dtype

# Canonical (short) mode strings understood by ``memmap``.
valid_filemodes = ["r", "c", "r+", "w+"]
# Subset of ``valid_filemodes`` that writes through to the file on disk.
writeable_filemodes = ["r+", "w+"]

# Long-form spellings accepted for ``mode`` and the canonical mode each
# one is translated to.
mode_equivalents = {
    "readonly": "r",
    "copyonwrite": "c",
    "readwrite": "r+",
    "write": "w+",
}


@set_module('numpy')
class memmap(ndarray):
    """Create a memory-map to an array stored in a *binary* file on disk.

    Memory-mapped files are used for accessing small segments of large files
    on disk, without reading the entire file into memory.  NumPy's
    memmap's are array-like objects.  This differs from Python's ``mmap``
    module, which uses file-like objects.

    This subclass of ndarray has some unpleasant interactions with
    some operations, because it doesn't quite fit properly as a subclass.
    An alternative to using this subclass is to create the ``mmap``
    object yourself, then create an ndarray with ndarray.__new__ directly,
    passing the object created in its 'buffer=' parameter.

    This class may at some point be turned into a factory function
    which returns a view into an mmap buffer.

    Flush the memmap instance to write the changes to the file. Currently there
    is no API to close the underlying ``mmap``. It is tricky to ensure the
    resource is actually closed, since it may be shared between different
    memmap instances.

    Parameters
    ----------
    filename : str, file-like object, or pathlib.Path instance
        The file name or file object to be used as the array data buffer.
    dtype : data-type, optional
        The data-type used to interpret the file contents.
        Default is `uint8`.
    mode : {'r+', 'r', 'w+', 'c'}, optional
        The file is opened in this mode:

        +------+-------------------------------------------------------------+
        | 'r'  | Open existing file for reading only.                        |
        +------+-------------------------------------------------------------+
        | 'r+' | Open existing file for reading and writing.                 |
        +------+-------------------------------------------------------------+
        | 'w+' | Create or overwrite existing file for reading and writing.  |
        +------+-------------------------------------------------------------+
        | 'c'  | Copy-on-write: assignments affect data in memory, but       |
        |      | changes are not saved to disk. The file on disk is          |
        |      | read-only.                                                  |
        +------+-------------------------------------------------------------+

        Default is 'r+'.  The long spellings ``'readonly'``,
        ``'readwrite'``, ``'write'`` and ``'copyonwrite'`` are accepted as
        synonyms for ``'r'``, ``'r+'``, ``'w+'`` and ``'c'``.
    offset : int, optional
        In the file, array data starts at this offset. Since `offset` is
        measured in bytes, it should normally be a multiple of the byte-size
        of `dtype`. When ``mode != 'r'``, even positive offsets beyond end of
        file are valid; The file will be extended to accommodate the
        additional data. By default, ``memmap`` will start at the beginning of
        the file, even if ``filename`` is a file pointer ``fp`` and
        ``fp.tell() != 0``.
    shape : int or sequence of ints, optional
        The desired shape of the array. If ``mode == 'r'`` and the number
        of remaining bytes after `offset` is not a multiple of the byte-size
        of `dtype`, you must specify `shape`. By default, the returned array
        will be 1-D with the number of elements determined by file size
        and data-type.
    order : {'C', 'F'}, optional
        Specify the order of the ndarray memory layout:
        :term:`row-major`, C-style or :term:`column-major`,
        Fortran-style.  This only has an effect if the shape is
        greater than 1-D.  The default order is 'C'.

    Attributes
    ----------
    filename : str or pathlib.Path instance
        Path to the mapped file.
    offset : int
        Offset position in the file.
    mode : str
        File mode.

    Methods
    -------
    flush
        Flush any changes in memory to file on disk.
        When you delete a memmap object, flush is called first to write
        changes to disk.

    See Also
    --------
    lib.format.open_memmap : Create or load a memory-mapped ``.npy`` file.

    Notes
    -----
    The memmap object can be used anywhere an ndarray is accepted.
    Given a memmap ``fp``, ``isinstance(fp, numpy.ndarray)`` returns
    ``True``.

    Memory-mapped files cannot be larger than 2GB on 32-bit systems.

    When a memmap causes a file to be created or extended beyond its
    current size in the filesystem, the contents of the new part are
    unspecified. On systems with POSIX filesystem semantics, the extended
    part will be filled with zero bytes.

    Because an empty file cannot be memory-mapped, a zero-size memmap
    opened with ``mode`` ``'w+'`` or ``'r+'`` keeps at least one byte in
    the underlying file.

    Examples
    --------
    >>> data = np.arange(12, dtype='float32')
    >>> data.resize((3,4))

    This example uses a temporary file so that doctest doesn't write
    files to your directory. You would use a 'normal' filename.

    >>> from tempfile import mkdtemp
    >>> import os.path as path
    >>> filename = path.join(mkdtemp(), 'newfile.dat')

    Create a memmap with dtype and shape that matches our data:

    >>> fp = np.memmap(filename, dtype='float32', mode='w+', shape=(3,4))
    >>> fp
    memmap([[0., 0., 0., 0.],
            [0., 0., 0., 0.],
            [0., 0., 0., 0.]], dtype=float32)

    Write data to memmap array:

    >>> fp[:] = data[:]
    >>> fp
    memmap([[  0.,   1.,   2.,   3.],
            [  4.,   5.,   6.,   7.],
            [  8.,   9.,  10.,  11.]], dtype=float32)

    >>> fp.filename == path.abspath(filename)
    True

    Flushes memory changes to disk in order to read them back

    >>> fp.flush()

    Load the memmap and verify data was stored:

    >>> newfp = np.memmap(filename, dtype='float32', mode='r', shape=(3,4))
    >>> newfp
    memmap([[  0.,   1.,   2.,   3.],
            [  4.,   5.,   6.,   7.],
            [  8.,   9.,  10.,  11.]], dtype=float32)

    Read-only memmap:

    >>> fpr = np.memmap(filename, dtype='float32', mode='r', shape=(3,4))
    >>> fpr.flags.writeable
    False

    Copy-on-write memmap:

    >>> fpc = np.memmap(filename, dtype='float32', mode='c', shape=(3,4))
    >>> fpc.flags.writeable
    True

    It's possible to assign to copy-on-write array, but values are only
    written into the memory copy of the array, and not written to disk:

    >>> fpc
    memmap([[  0.,   1.,   2.,   3.],
            [  4.,   5.,   6.,   7.],
            [  8.,   9.,  10.,  11.]], dtype=float32)
    >>> fpc[0,:] = 0
    >>> fpc
    memmap([[  0.,   0.,   0.,   0.],
            [  4.,   5.,   6.,   7.],
            [  8.,   9.,  10.,  11.]], dtype=float32)

    File on disk is unchanged:

    >>> fpr
    memmap([[  0.,   1.,   2.,   3.],
            [  4.,   5.,   6.,   7.],
            [  8.,   9.,  10.,  11.]], dtype=float32)

    Offset into a memmap:

    >>> fpo = np.memmap(filename, dtype='float32', mode='r', offset=16)
    >>> fpo
    memmap([  4.,   5.,   6.,   7.,   8.,   9.,  10.,  11.], dtype=float32)

    """

    # Below ndarray's default priority of 0.0.
    __array_priority__ = -100.0

    def __new__(subtype, filename, dtype=uint8, mode='r+', offset=0,
                shape=None, order='C'):
        """Open (or create) `filename` and return an array mapped onto it.

        Raises
        ------
        ValueError
            If `mode` is not a recognised mode string, if ``mode == 'w+'``
            and no `shape` is given, or if `shape` is omitted and the bytes
            available after `offset` are not a multiple of the item size.
        """
        # Import here to minimize 'import numpy' overhead
        import mmap
        import operator
        import os.path
        from pathlib import Path

        # Translate long-form mode names; anything that is neither a long
        # name nor a canonical short mode is rejected.
        try:
            mode = mode_equivalents[mode]
        except KeyError:
            if mode not in valid_filemodes:
                raise ValueError(
                    "mode must be one of {!r} (got {!r})"
                    .format(valid_filemodes + list(mode_equivalents.keys()),
                            mode)
                ) from None

        if mode == 'w+' and shape is None:
            raise ValueError("shape must be given")

        if hasattr(filename, 'read'):
            # An already-open file object: use it, but leave closing it to
            # whoever opened it.
            f_ctx = nullcontext(filename)
        else:
            # Copy-on-write never writes to the file, so open it read-only.
            f_ctx = open(os.fspath(filename),
                         ('r' if mode == 'c' else mode) + 'b')

        with f_ctx as fid:
            # Seek to the end to learn the current file length.
            fid.seek(0, 2)
            flen = fid.tell()
            descr = dtypedescr(dtype)
            itemsize = descr.itemsize

            if shape is None:
                # Infer a 1-D shape from whatever follows `offset`.
                nbytes = flen - offset
                if nbytes % itemsize:
                    raise ValueError("Size of available data is not a "
                                     "multiple of the data-type size.")
                size = nbytes // itemsize
                shape = (size,)
            else:
                # Accept a bare integer (including NumPy integers) as well
                # as any sequence of integers, not only tuples.
                if type(shape) not in (tuple, list):
                    try:
                        shape = [operator.index(shape)]
                    except TypeError:
                        pass
                shape = tuple(shape)
                # avoid default choice of np.int_, which might overflow
                size = np.intp(1)
                for k in shape:
                    size *= k

            nbytes = int(offset + size * itemsize)

            if mode in ('w+', 'r+'):
                # An empty file cannot be mapped, so a zero-size array
                # still needs one byte of backing storage.
                nbytes = max(nbytes, 1)
                if flen < nbytes:
                    # Grow the file by writing its new last byte.
                    fid.seek(nbytes - 1, 0)
                    fid.write(b'\0')
                    fid.flush()

            if mode == 'c':
                acc = mmap.ACCESS_COPY
            elif mode == 'r':
                acc = mmap.ACCESS_READ
            else:
                acc = mmap.ACCESS_WRITE

            # The mapping must start on an allocation-granularity boundary;
            # the remainder of `offset` becomes an offset into the buffer.
            start = offset - offset % mmap.ALLOCATIONGRANULARITY
            nbytes -= start
            array_offset = offset - start
            mm = mmap.mmap(fid.fileno(), nbytes, access=acc, offset=start)

            self = ndarray.__new__(subtype, shape, dtype=descr, buffer=mm,
                                   offset=array_offset, order=order)
            self._mmap = mm
            self.offset = offset
            self.mode = mode

            if isinstance(filename, Path):
                # special case - if we were constructed with a pathlib.path,
                # then filename is a path object, not a string
                self.filename = filename.resolve()
            elif hasattr(fid, "name") and isinstance(fid.name, str):
                # py3 returns int for TemporaryFile().name
                self.filename = os.path.abspath(fid.name)
            # same as memmap copies (e.g. memmap + 1)
            else:
                self.filename = None

        return self

    def __array_finalize__(self, obj):
        """Inherit the mapping metadata from `obj` when memory is shared.

        Arrays that do not share memory with a memmap source get ``None``
        for ``_mmap``, ``filename``, ``offset`` and ``mode``.
        """
        if hasattr(obj, '_mmap') and np.may_share_memory(self, obj):
            self._mmap = obj._mmap
            self.filename = obj.filename
            self.offset = obj.offset
            self.mode = obj.mode
        else:
            self._mmap = None
            self.filename = None
            self.offset = None
            self.mode = None

    def flush(self):
        """
        Write any changes in the array to the file on disk.

        For further information, see `memmap`.

        Parameters
        ----------
        None

        See Also
        --------
        memmap

        """
        # Delegates to whatever object this array is based on, provided
        # that object exposes a ``flush`` method.
        if self.base is not None and hasattr(self.base, 'flush'):
            self.base.flush()

    def __array_wrap__(self, arr, context=None, return_scalar=False):
        """Wrap a ufunc result, demoting it to a scalar or plain ndarray.

        `return_scalar` is accepted so the method can be called with the
        three-argument form; 0-d results are always unwrapped to scalars
        here, so its value does not change the outcome.
        """
        arr = super().__array_wrap__(arr, context)

        # Return a memmap if a memmap was given as the output of the
        # ufunc. Leave the arr class unchanged if self is not a memmap
        # to keep original memmap subclasses behavior
        if self is arr or type(self) is not memmap:
            return arr

        # Return scalar instead of 0d memmap, e.g. for np.sum with
        # axis=None
        if arr.shape == ():
            return arr[()]

        # Return ndarray otherwise
        return arr.view(np.ndarray)

    def __getitem__(self, index):
        """Index the array, returning a plain ndarray for unmapped results."""
        res = super().__getitem__(index)
        # A memmap result without a mapping attached no longer refers to the
        # file, so present it as an ordinary ndarray.
        if type(res) is memmap and res._mmap is None:
            return res.view(type=ndarray)
        return res
|