_miobase.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429
  1. # Authors: Travis Oliphant, Matthew Brett
  2. """
  3. Base classes for MATLAB file stream reading.
  4. MATLAB is a registered trademark of the Mathworks inc.
  5. """
  6. import operator
  7. import functools
  8. import numpy as np
  9. from scipy._lib import doccer
  10. from . import _byteordercodes as boc
  11. __all__ = [
  12. 'MatFileReader', 'MatReadError', 'MatReadWarning',
  13. 'MatVarReader', 'MatWriteError', 'arr_dtype_number',
  14. 'arr_to_chars', 'convert_dtypes', 'doc_dict',
  15. 'docfiller', 'get_matfile_version',
  16. 'matdims', 'read_dtype'
  17. ]
  18. class MatReadError(Exception):
  19. """Exception indicating a read issue."""
  20. class MatWriteError(Exception):
  21. """Exception indicating a write issue."""
  22. class MatReadWarning(UserWarning):
  23. """Warning class for read issues."""
  24. doc_dict = \
  25. {'file_arg':
  26. '''file_name : str
  27. Name of the mat file (do not need .mat extension if
  28. appendmat==True) Can also pass open file-like object.''',
  29. 'append_arg':
  30. '''appendmat : bool, optional
  31. True to append the .mat extension to the end of the given
  32. filename, if not already present. Default is True.''',
  33. 'load_args':
  34. '''byte_order : str or None, optional
  35. None by default, implying byte order guessed from mat
  36. file. Otherwise can be one of ('native', '=', 'little', '<',
  37. 'BIG', '>').
  38. mat_dtype : bool, optional
  39. If True, return arrays in same dtype as would be loaded into
  40. MATLAB (instead of the dtype with which they are saved).
  41. squeeze_me : bool, optional
  42. Whether to squeeze unit matrix dimensions or not.
  43. chars_as_strings : bool, optional
  44. Whether to convert char arrays to string arrays.
  45. matlab_compatible : bool, optional
  46. Returns matrices as would be loaded by MATLAB (implies
  47. squeeze_me=False, chars_as_strings=False, mat_dtype=True,
  48. struct_as_record=True).''',
  49. 'struct_arg':
  50. '''struct_as_record : bool, optional
  51. Whether to load MATLAB structs as NumPy record arrays, or as
  52. old-style NumPy arrays with dtype=object. Setting this flag to
  53. False replicates the behavior of SciPy version 0.7.x (returning
  54. numpy object arrays). The default setting is True, because it
  55. allows easier round-trip load and save of MATLAB files.''',
  56. 'matstream_arg':
  57. '''mat_stream : file-like
  58. Object with file API, open for reading.''',
  59. 'long_fields':
  60. '''long_field_names : bool, optional
  61. * False - maximum field name length in a structure is 31 characters
  62. which is the documented maximum length. This is the default.
  63. * True - maximum field name length in a structure is 63 characters
  64. which works for MATLAB 7.6''',
  65. 'do_compression':
  66. '''do_compression : bool, optional
  67. Whether to compress matrices on write. Default is False.''',
  68. 'oned_as':
  69. '''oned_as : {'row', 'column'}, optional
  70. If 'column', write 1-D NumPy arrays as column vectors.
  71. If 'row', write 1D NumPy arrays as row vectors.''',
  72. 'unicode_strings':
  73. '''unicode_strings : bool, optional
  74. If True, write strings as Unicode, else MATLAB usual encoding.'''}
  75. docfiller = doccer.filldoc(doc_dict)
  76. '''
  77. Note on architecture
  78. ======================
  79. There are three sets of parameters relevant for reading files. The
  80. first are *file read parameters* - containing options that are common
  81. for reading the whole file, and therefore every variable within that
  82. file. At the moment these are:
  83. * mat_stream
  84. * dtypes (derived from byte code)
  85. * byte_order
  86. * chars_as_strings
  87. * squeeze_me
  88. * struct_as_record (MATLAB 5 files)
  89. * class_dtypes (derived from order code, MATLAB 5 files)
  90. * codecs (MATLAB 5 files)
  91. * uint16_codec (MATLAB 5 files)
  92. Another set of parameters are those that apply only to the current
  93. variable being read - the *header*:
  94. * header related variables (different for v4 and v5 mat files)
  95. * is_complex
  96. * mclass
  97. * var_stream
  98. With the header, we need ``next_position`` to tell us where the next
  99. variable in the stream is.
  100. Then, for each element in a matrix, there can be *element read
  101. parameters*. An element is, for example, one element in a MATLAB cell
  102. array. At the moment, these are:
  103. * mat_dtype
The file-reading object contains the *file read parameters*. The
*header* is passed around as a data object, or may be read and discarded
in a single function. The *element read parameters* - the mat_dtype in
this instance - are passed into a general post-processing function - see
``mio_utils`` for details.
  109. '''
  110. def convert_dtypes(dtype_template, order_code):
  111. ''' Convert dtypes in mapping to given order
  112. Parameters
  113. ----------
  114. dtype_template : mapping
  115. mapping with values returning numpy dtype from ``np.dtype(val)``
  116. order_code : str
  117. an order code suitable for using in ``dtype.newbyteorder()``
  118. Returns
  119. -------
  120. dtypes : mapping
  121. mapping where values have been replaced by
  122. ``np.dtype(val).newbyteorder(order_code)``
  123. '''
  124. dtypes = dtype_template.copy()
  125. for k in dtypes:
  126. dtypes[k] = np.dtype(dtypes[k]).newbyteorder(order_code)
  127. return dtypes
  128. def read_dtype(mat_stream, a_dtype):
  129. """
  130. Generic get of byte stream data of known type
  131. Parameters
  132. ----------
  133. mat_stream : file_like object
  134. MATLAB (tm) mat file stream
  135. a_dtype : dtype
  136. dtype of array to read. `a_dtype` is assumed to be correct
  137. endianness.
  138. Returns
  139. -------
  140. arr : ndarray
  141. Array of dtype `a_dtype` read from stream.
  142. """
  143. num_bytes = a_dtype.itemsize
  144. arr = np.ndarray(shape=(),
  145. dtype=a_dtype,
  146. buffer=mat_stream.read(num_bytes),
  147. order='F')
  148. return arr
  149. def matfile_version(file_name, *, appendmat=True):
  150. """
  151. Return major, minor tuple depending on apparent mat file type
  152. Where:
  153. #. 0,x -> version 4 format mat files
  154. #. 1,x -> version 5 format mat files
  155. #. 2,x -> version 7.3 format mat files (HDF format)
  156. Parameters
  157. ----------
  158. file_name : str
  159. Name of the mat file (do not need .mat extension if
  160. appendmat==True). Can also pass open file-like object.
  161. appendmat : bool, optional
  162. True to append the .mat extension to the end of the given
  163. filename, if not already present. Default is True.
  164. Returns
  165. -------
  166. major_version : {0, 1, 2}
  167. major MATLAB File format version
  168. minor_version : int
  169. minor MATLAB file format version
  170. Raises
  171. ------
  172. MatReadError
  173. If the file is empty.
  174. ValueError
  175. The matfile version is unknown.
  176. Notes
  177. -----
  178. Has the side effect of setting the file read pointer to 0
  179. """
  180. from ._mio import _open_file_context
  181. with _open_file_context(file_name, appendmat=appendmat) as fileobj:
  182. return _get_matfile_version(fileobj)
  183. get_matfile_version = matfile_version
  184. def _get_matfile_version(fileobj):
  185. # Mat4 files have a zero somewhere in first 4 bytes
  186. fileobj.seek(0)
  187. mopt_bytes = fileobj.read(4)
  188. if len(mopt_bytes) == 0:
  189. raise MatReadError("Mat file appears to be empty")
  190. mopt_ints = np.ndarray(shape=(4,), dtype=np.uint8, buffer=mopt_bytes)
  191. if 0 in mopt_ints:
  192. fileobj.seek(0)
  193. return (0,0)
  194. # For 5 format or 7.3 format we need to read an integer in the
  195. # header. Bytes 124 through 128 contain a version integer and an
  196. # endian test string
  197. fileobj.seek(124)
  198. tst_str = fileobj.read(4)
  199. fileobj.seek(0)
  200. maj_ind = int(tst_str[2] == b'I'[0])
  201. maj_val = int(tst_str[maj_ind])
  202. min_val = int(tst_str[1 - maj_ind])
  203. ret = (maj_val, min_val)
  204. if maj_val in (1, 2):
  205. return ret
  206. raise ValueError('Unknown mat file type, version %s, %s' % ret)
  207. def matdims(arr, oned_as='column'):
  208. """
  209. Determine equivalent MATLAB dimensions for given array
  210. Parameters
  211. ----------
  212. arr : ndarray
  213. Input array
  214. oned_as : {'column', 'row'}, optional
  215. Whether 1-D arrays are returned as MATLAB row or column matrices.
  216. Default is 'column'.
  217. Returns
  218. -------
  219. dims : tuple
  220. Shape tuple, in the form MATLAB expects it.
  221. Notes
  222. -----
  223. We had to decide what shape a 1 dimensional array would be by
  224. default. ``np.atleast_2d`` thinks it is a row vector. The
  225. default for a vector in MATLAB (e.g., ``>> 1:12``) is a row vector.
  226. Versions of scipy up to and including 0.11 resulted (accidentally)
  227. in 1-D arrays being read as column vectors. For the moment, we
  228. maintain the same tradition here.
  229. Examples
  230. --------
  231. >>> matdims(np.array(1)) # NumPy scalar
  232. (1, 1)
  233. >>> matdims(np.array([1])) # 1-D array, 1 element
  234. (1, 1)
  235. >>> matdims(np.array([1,2])) # 1-D array, 2 elements
  236. (2, 1)
  237. >>> matdims(np.array([[2],[3]])) # 2-D array, column vector
  238. (2, 1)
  239. >>> matdims(np.array([[2,3]])) # 2-D array, row vector
  240. (1, 2)
  241. >>> matdims(np.array([[[2,3]]])) # 3-D array, rowish vector
  242. (1, 1, 2)
  243. >>> matdims(np.array([])) # empty 1-D array
  244. (0, 0)
  245. >>> matdims(np.array([[]])) # empty 2-D array
  246. (0, 0)
  247. >>> matdims(np.array([[[]]])) # empty 3-D array
  248. (0, 0, 0)
  249. Optional argument flips 1-D shape behavior.
  250. >>> matdims(np.array([1,2]), 'row') # 1-D array, 2 elements
  251. (1, 2)
  252. The argument has to make sense though
  253. >>> matdims(np.array([1,2]), 'bizarre')
  254. Traceback (most recent call last):
  255. ...
  256. ValueError: 1-D option "bizarre" is strange
  257. """
  258. shape = arr.shape
  259. if shape == (): # scalar
  260. return (1, 1)
  261. if len(shape) == 1: # 1D
  262. if shape[0] == 0:
  263. return (0, 0)
  264. elif oned_as == 'column':
  265. return shape + (1,)
  266. elif oned_as == 'row':
  267. return (1,) + shape
  268. else:
  269. raise ValueError('1-D option "%s" is strange'
  270. % oned_as)
  271. return shape
  272. class MatVarReader:
  273. ''' Abstract class defining required interface for var readers'''
  274. def __init__(self, file_reader):
  275. pass
  276. def read_header(self):
  277. ''' Returns header '''
  278. pass
  279. def array_from_header(self, header):
  280. ''' Reads array given header '''
  281. pass
  282. class MatFileReader:
  283. """ Base object for reading mat files
  284. To make this class functional, you will need to override the
  285. following methods:
  286. matrix_getter_factory - gives object to fetch next matrix from stream
  287. guess_byte_order - guesses file byte order from file
  288. """
  289. @docfiller
  290. def __init__(self, mat_stream,
  291. byte_order=None,
  292. mat_dtype=False,
  293. squeeze_me=False,
  294. chars_as_strings=True,
  295. matlab_compatible=False,
  296. struct_as_record=True,
  297. verify_compressed_data_integrity=True,
  298. simplify_cells=False):
  299. '''
  300. Initializer for mat file reader
  301. mat_stream : file-like
  302. object with file API, open for reading
  303. %(load_args)s
  304. '''
  305. # Initialize stream
  306. self.mat_stream = mat_stream
  307. self.dtypes = {}
  308. if not byte_order:
  309. byte_order = self.guess_byte_order()
  310. else:
  311. byte_order = boc.to_numpy_code(byte_order)
  312. self.byte_order = byte_order
  313. self.struct_as_record = struct_as_record
  314. if matlab_compatible:
  315. self.set_matlab_compatible()
  316. else:
  317. self.squeeze_me = squeeze_me
  318. self.chars_as_strings = chars_as_strings
  319. self.mat_dtype = mat_dtype
  320. self.verify_compressed_data_integrity = verify_compressed_data_integrity
  321. self.simplify_cells = simplify_cells
  322. if simplify_cells:
  323. self.squeeze_me = True
  324. self.struct_as_record = False
  325. def set_matlab_compatible(self):
  326. ''' Sets options to return arrays as MATLAB loads them '''
  327. self.mat_dtype = True
  328. self.squeeze_me = False
  329. self.chars_as_strings = False
  330. def guess_byte_order(self):
  331. ''' As we do not know what file type we have, assume native '''
  332. return boc.native_code
  333. def end_of_stream(self):
  334. b = self.mat_stream.read(1)
  335. curpos = self.mat_stream.tell()
  336. self.mat_stream.seek(curpos-1)
  337. return len(b) == 0
  338. def arr_dtype_number(arr, num):
  339. ''' Return dtype for given number of items per element'''
  340. return np.dtype(arr.dtype.str[:2] + str(num))
  341. def arr_to_chars(arr):
  342. ''' Convert string array to char array '''
  343. dims = list(arr.shape)
  344. if not dims:
  345. dims = [1]
  346. dims.append(int(arr.dtype.str[2:]))
  347. arr = np.ndarray(shape=dims,
  348. dtype=arr_dtype_number(arr, 1),
  349. buffer=arr)
  350. empties = [arr == np.array('', dtype=arr.dtype)]
  351. if not np.any(empties):
  352. return arr
  353. arr = arr.copy()
  354. arr[tuple(empties)] = ' '
  355. return arr