_mio4.py 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623
  1. ''' Classes for read / write of matlab (TM) 4 files
  2. '''
  3. import sys
  4. import warnings
  5. import numpy as np
  6. import scipy.sparse
  7. from ._miobase import (MatFileReader, docfiller, matdims, read_dtype,
  8. convert_dtypes, arr_to_chars, arr_dtype_number)
  9. from ._mio_utils import squeeze_element, chars_to_strings
  10. from functools import reduce
  11. __all__ = [
  12. 'MatFile4Reader', 'MatFile4Writer', 'SYS_LITTLE_ENDIAN',
  13. 'VarHeader4', 'VarReader4', 'VarWriter4', 'arr_to_2d', 'mclass_info',
  14. 'mdtypes_template', 'miDOUBLE', 'miINT16', 'miINT32', 'miSINGLE',
  15. 'miUINT16', 'miUINT8', 'mxCHAR_CLASS', 'mxFULL_CLASS', 'mxSPARSE_CLASS',
  16. 'np_to_mtypes', 'order_codes'
  17. ]
  18. SYS_LITTLE_ENDIAN = sys.byteorder == 'little'
  19. miDOUBLE = 0
  20. miSINGLE = 1
  21. miINT32 = 2
  22. miINT16 = 3
  23. miUINT16 = 4
  24. miUINT8 = 5
  25. mdtypes_template = {
  26. miDOUBLE: 'f8',
  27. miSINGLE: 'f4',
  28. miINT32: 'i4',
  29. miINT16: 'i2',
  30. miUINT16: 'u2',
  31. miUINT8: 'u1',
  32. 'header': [('mopt', 'i4'),
  33. ('mrows', 'i4'),
  34. ('ncols', 'i4'),
  35. ('imagf', 'i4'),
  36. ('namlen', 'i4')],
  37. 'U1': 'U1',
  38. }
  39. np_to_mtypes = {
  40. 'f8': miDOUBLE,
  41. 'c32': miDOUBLE,
  42. 'c24': miDOUBLE,
  43. 'c16': miDOUBLE,
  44. 'f4': miSINGLE,
  45. 'c8': miSINGLE,
  46. 'i4': miINT32,
  47. 'i2': miINT16,
  48. 'u2': miUINT16,
  49. 'u1': miUINT8,
  50. 'S1': miUINT8,
  51. }
  52. # matrix classes
  53. mxFULL_CLASS = 0
  54. mxCHAR_CLASS = 1
  55. mxSPARSE_CLASS = 2
  56. order_codes = {
  57. 0: '<',
  58. 1: '>',
  59. 2: 'VAX D-float', # !
  60. 3: 'VAX G-float',
  61. 4: 'Cray', # !!
  62. }
  63. mclass_info = {
  64. mxFULL_CLASS: 'double',
  65. mxCHAR_CLASS: 'char',
  66. mxSPARSE_CLASS: 'sparse',
  67. }
  68. class VarHeader4:
  69. # Mat4 variables never logical or global
  70. is_logical = False
  71. is_global = False
  72. def __init__(self,
  73. name,
  74. dtype,
  75. mclass,
  76. dims,
  77. is_complex):
  78. self.name = name
  79. self.dtype = dtype
  80. self.mclass = mclass
  81. self.dims = dims
  82. self.is_complex = is_complex
  83. class VarReader4:
  84. ''' Class to read matlab 4 variables '''
  85. def __init__(self, file_reader):
  86. self.file_reader = file_reader
  87. self.mat_stream = file_reader.mat_stream
  88. self.dtypes = file_reader.dtypes
  89. self.chars_as_strings = file_reader.chars_as_strings
  90. self.squeeze_me = file_reader.squeeze_me
  91. def read_header(self):
  92. ''' Read and return header for variable '''
  93. data = read_dtype(self.mat_stream, self.dtypes['header'])
  94. name = self.mat_stream.read(int(data['namlen'])).strip(b'\x00')
  95. if data['mopt'] < 0 or data['mopt'] > 5000:
  96. raise ValueError('Mat 4 mopt wrong format, byteswapping problem?')
  97. M, rest = divmod(data['mopt'], 1000) # order code
  98. if M not in (0, 1):
  99. warnings.warn("We do not support byte ordering '%s'; returned "
  100. "data may be corrupt" % order_codes[M],
  101. UserWarning)
  102. O, rest = divmod(rest, 100) # unused, should be 0
  103. if O != 0:
  104. raise ValueError('O in MOPT integer should be 0, wrong format?')
  105. P, rest = divmod(rest, 10) # data type code e.g miDOUBLE (see above)
  106. T = rest # matrix type code e.g., mxFULL_CLASS (see above)
  107. dims = (data['mrows'], data['ncols'])
  108. is_complex = data['imagf'] == 1
  109. dtype = self.dtypes[P]
  110. return VarHeader4(
  111. name,
  112. dtype,
  113. T,
  114. dims,
  115. is_complex)
  116. def array_from_header(self, hdr, process=True):
  117. mclass = hdr.mclass
  118. if mclass == mxFULL_CLASS:
  119. arr = self.read_full_array(hdr)
  120. elif mclass == mxCHAR_CLASS:
  121. arr = self.read_char_array(hdr)
  122. if process and self.chars_as_strings:
  123. arr = chars_to_strings(arr)
  124. elif mclass == mxSPARSE_CLASS:
  125. # no current processing (below) makes sense for sparse
  126. return self.read_sparse_array(hdr)
  127. else:
  128. raise TypeError('No reader for class code %s' % mclass)
  129. if process and self.squeeze_me:
  130. return squeeze_element(arr)
  131. return arr
  132. def read_sub_array(self, hdr, copy=True):
  133. ''' Mat4 read using header `hdr` dtype and dims
  134. Parameters
  135. ----------
  136. hdr : object
  137. object with attributes ``dtype``, ``dims``. dtype is assumed to be
  138. the correct endianness
  139. copy : bool, optional
  140. copies array before return if True (default True)
  141. (buffer is usually read only)
  142. Returns
  143. -------
  144. arr : ndarray
  145. of dtype given by `hdr` ``dtype`` and shape given by `hdr` ``dims``
  146. '''
  147. dt = hdr.dtype
  148. dims = hdr.dims
  149. num_bytes = dt.itemsize
  150. for d in dims:
  151. num_bytes *= d
  152. buffer = self.mat_stream.read(int(num_bytes))
  153. if len(buffer) != num_bytes:
  154. raise ValueError("Not enough bytes to read matrix '%s'; is this "
  155. "a badly-formed file? Consider listing matrices "
  156. "with `whosmat` and loading named matrices with "
  157. "`variable_names` kwarg to `loadmat`" % hdr.name)
  158. arr = np.ndarray(shape=dims,
  159. dtype=dt,
  160. buffer=buffer,
  161. order='F')
  162. if copy:
  163. arr = arr.copy()
  164. return arr
  165. def read_full_array(self, hdr):
  166. ''' Full (rather than sparse) matrix getter
  167. Read matrix (array) can be real or complex
  168. Parameters
  169. ----------
  170. hdr : ``VarHeader4`` instance
  171. Returns
  172. -------
  173. arr : ndarray
  174. complex array if ``hdr.is_complex`` is True, otherwise a real
  175. numeric array
  176. '''
  177. if hdr.is_complex:
  178. # avoid array copy to save memory
  179. res = self.read_sub_array(hdr, copy=False)
  180. res_j = self.read_sub_array(hdr, copy=False)
  181. return res + (res_j * 1j)
  182. return self.read_sub_array(hdr)
  183. def read_char_array(self, hdr):
  184. ''' latin-1 text matrix (char matrix) reader
  185. Parameters
  186. ----------
  187. hdr : ``VarHeader4`` instance
  188. Returns
  189. -------
  190. arr : ndarray
  191. with dtype 'U1', shape given by `hdr` ``dims``
  192. '''
  193. arr = self.read_sub_array(hdr).astype(np.uint8)
  194. S = arr.tobytes().decode('latin-1')
  195. return np.ndarray(shape=hdr.dims,
  196. dtype=np.dtype('U1'),
  197. buffer=np.array(S)).copy()
  198. def read_sparse_array(self, hdr):
  199. ''' Read and return sparse matrix type
  200. Parameters
  201. ----------
  202. hdr : ``VarHeader4`` instance
  203. Returns
  204. -------
  205. arr : ``scipy.sparse.coo_matrix``
  206. with dtype ``float`` and shape read from the sparse matrix data
  207. Notes
  208. -----
  209. MATLAB 4 real sparse arrays are saved in a N+1 by 3 array format, where
  210. N is the number of non-zero values. Column 1 values [0:N] are the
  211. (1-based) row indices of the each non-zero value, column 2 [0:N] are the
  212. column indices, column 3 [0:N] are the (real) values. The last values
  213. [-1,0:2] of the rows, column indices are shape[0] and shape[1]
  214. respectively of the output matrix. The last value for the values column
  215. is a padding 0. mrows and ncols values from the header give the shape of
  216. the stored matrix, here [N+1, 3]. Complex data are saved as a 4 column
  217. matrix, where the fourth column contains the imaginary component; the
  218. last value is again 0. Complex sparse data do *not* have the header
  219. ``imagf`` field set to True; the fact that the data are complex is only
  220. detectable because there are 4 storage columns.
  221. '''
  222. res = self.read_sub_array(hdr)
  223. tmp = res[:-1,:]
  224. # All numbers are float64 in Matlab, but SciPy sparse expects int shape
  225. dims = (int(res[-1,0]), int(res[-1,1]))
  226. I = np.ascontiguousarray(tmp[:,0],dtype='intc') # fixes byte order also
  227. J = np.ascontiguousarray(tmp[:,1],dtype='intc')
  228. I -= 1 # for 1-based indexing
  229. J -= 1
  230. if res.shape[1] == 3:
  231. V = np.ascontiguousarray(tmp[:,2],dtype='float')
  232. else:
  233. V = np.ascontiguousarray(tmp[:,2],dtype='complex')
  234. V.imag = tmp[:,3]
  235. return scipy.sparse.coo_matrix((V,(I,J)), dims)
  236. def shape_from_header(self, hdr):
  237. '''Read the shape of the array described by the header.
  238. The file position after this call is unspecified.
  239. '''
  240. mclass = hdr.mclass
  241. if mclass == mxFULL_CLASS:
  242. shape = tuple(map(int, hdr.dims))
  243. elif mclass == mxCHAR_CLASS:
  244. shape = tuple(map(int, hdr.dims))
  245. if self.chars_as_strings:
  246. shape = shape[:-1]
  247. elif mclass == mxSPARSE_CLASS:
  248. dt = hdr.dtype
  249. dims = hdr.dims
  250. if not (len(dims) == 2 and dims[0] >= 1 and dims[1] >= 1):
  251. return ()
  252. # Read only the row and column counts
  253. self.mat_stream.seek(dt.itemsize * (dims[0] - 1), 1)
  254. rows = np.ndarray(shape=(), dtype=dt,
  255. buffer=self.mat_stream.read(dt.itemsize))
  256. self.mat_stream.seek(dt.itemsize * (dims[0] - 1), 1)
  257. cols = np.ndarray(shape=(), dtype=dt,
  258. buffer=self.mat_stream.read(dt.itemsize))
  259. shape = (int(rows), int(cols))
  260. else:
  261. raise TypeError('No reader for class code %s' % mclass)
  262. if self.squeeze_me:
  263. shape = tuple([x for x in shape if x != 1])
  264. return shape
  265. class MatFile4Reader(MatFileReader):
  266. ''' Reader for Mat4 files '''
  267. @docfiller
  268. def __init__(self, mat_stream, *args, **kwargs):
  269. ''' Initialize matlab 4 file reader
  270. %(matstream_arg)s
  271. %(load_args)s
  272. '''
  273. super().__init__(mat_stream, *args, **kwargs)
  274. self._matrix_reader = None
  275. def guess_byte_order(self):
  276. self.mat_stream.seek(0)
  277. mopt = read_dtype(self.mat_stream, np.dtype('i4'))
  278. self.mat_stream.seek(0)
  279. if mopt == 0:
  280. return '<'
  281. if mopt < 0 or mopt > 5000:
  282. # Number must have been byteswapped
  283. return SYS_LITTLE_ENDIAN and '>' or '<'
  284. # Not byteswapped
  285. return SYS_LITTLE_ENDIAN and '<' or '>'
  286. def initialize_read(self):
  287. ''' Run when beginning read of variables
  288. Sets up readers from parameters in `self`
  289. '''
  290. self.dtypes = convert_dtypes(mdtypes_template, self.byte_order)
  291. self._matrix_reader = VarReader4(self)
  292. def read_var_header(self):
  293. ''' Read and return header, next position
  294. Parameters
  295. ----------
  296. None
  297. Returns
  298. -------
  299. header : object
  300. object that can be passed to self.read_var_array, and that
  301. has attributes ``name`` and ``is_global``
  302. next_position : int
  303. position in stream of next variable
  304. '''
  305. hdr = self._matrix_reader.read_header()
  306. n = reduce(lambda x, y: x*y, hdr.dims, 1) # fast product
  307. remaining_bytes = hdr.dtype.itemsize * n
  308. if hdr.is_complex and not hdr.mclass == mxSPARSE_CLASS:
  309. remaining_bytes *= 2
  310. next_position = self.mat_stream.tell() + remaining_bytes
  311. return hdr, next_position
  312. def read_var_array(self, header, process=True):
  313. ''' Read array, given `header`
  314. Parameters
  315. ----------
  316. header : header object
  317. object with fields defining variable header
  318. process : {True, False}, optional
  319. If True, apply recursive post-processing during loading of array.
  320. Returns
  321. -------
  322. arr : array
  323. array with post-processing applied or not according to
  324. `process`.
  325. '''
  326. return self._matrix_reader.array_from_header(header, process)
  327. def get_variables(self, variable_names=None):
  328. ''' get variables from stream as dictionary
  329. Parameters
  330. ----------
  331. variable_names : None or str or sequence of str, optional
  332. variable name, or sequence of variable names to get from Mat file /
  333. file stream. If None, then get all variables in file.
  334. '''
  335. if isinstance(variable_names, str):
  336. variable_names = [variable_names]
  337. elif variable_names is not None:
  338. variable_names = list(variable_names)
  339. self.mat_stream.seek(0)
  340. # set up variable reader
  341. self.initialize_read()
  342. mdict = {}
  343. while not self.end_of_stream():
  344. hdr, next_position = self.read_var_header()
  345. name = 'None' if hdr.name is None else hdr.name.decode('latin1')
  346. if variable_names is not None and name not in variable_names:
  347. self.mat_stream.seek(next_position)
  348. continue
  349. mdict[name] = self.read_var_array(hdr)
  350. self.mat_stream.seek(next_position)
  351. if variable_names is not None:
  352. variable_names.remove(name)
  353. if len(variable_names) == 0:
  354. break
  355. return mdict
  356. def list_variables(self):
  357. ''' list variables from stream '''
  358. self.mat_stream.seek(0)
  359. # set up variable reader
  360. self.initialize_read()
  361. vars = []
  362. while not self.end_of_stream():
  363. hdr, next_position = self.read_var_header()
  364. name = 'None' if hdr.name is None else hdr.name.decode('latin1')
  365. shape = self._matrix_reader.shape_from_header(hdr)
  366. info = mclass_info.get(hdr.mclass, 'unknown')
  367. vars.append((name, shape, info))
  368. self.mat_stream.seek(next_position)
  369. return vars
  370. def arr_to_2d(arr, oned_as='row'):
  371. ''' Make ``arr`` exactly two dimensional
  372. If `arr` has more than 2 dimensions, raise a ValueError
  373. Parameters
  374. ----------
  375. arr : array
  376. oned_as : {'row', 'column'}, optional
  377. Whether to reshape 1-D vectors as row vectors or column vectors.
  378. See documentation for ``matdims`` for more detail
  379. Returns
  380. -------
  381. arr2d : array
  382. 2-D version of the array
  383. '''
  384. dims = matdims(arr, oned_as)
  385. if len(dims) > 2:
  386. raise ValueError('Matlab 4 files cannot save arrays with more than '
  387. '2 dimensions')
  388. return arr.reshape(dims)
  389. class VarWriter4:
  390. def __init__(self, file_writer):
  391. self.file_stream = file_writer.file_stream
  392. self.oned_as = file_writer.oned_as
  393. def write_bytes(self, arr):
  394. self.file_stream.write(arr.tobytes(order='F'))
  395. def write_string(self, s):
  396. self.file_stream.write(s)
  397. def write_header(self, name, shape, P=miDOUBLE, T=mxFULL_CLASS, imagf=0):
  398. ''' Write header for given data options
  399. Parameters
  400. ----------
  401. name : str
  402. name of variable
  403. shape : sequence
  404. Shape of array as it will be read in matlab
  405. P : int, optional
  406. code for mat4 data type, one of ``miDOUBLE, miSINGLE, miINT32,
  407. miINT16, miUINT16, miUINT8``
  408. T : int, optional
  409. code for mat4 matrix class, one of ``mxFULL_CLASS, mxCHAR_CLASS,
  410. mxSPARSE_CLASS``
  411. imagf : int, optional
  412. flag indicating complex
  413. '''
  414. header = np.empty((), mdtypes_template['header'])
  415. M = not SYS_LITTLE_ENDIAN
  416. O = 0
  417. header['mopt'] = (M * 1000 +
  418. O * 100 +
  419. P * 10 +
  420. T)
  421. header['mrows'] = shape[0]
  422. header['ncols'] = shape[1]
  423. header['imagf'] = imagf
  424. header['namlen'] = len(name) + 1
  425. self.write_bytes(header)
  426. data = name + '\0'
  427. self.write_string(data.encode('latin1'))
  428. def write(self, arr, name):
  429. ''' Write matrix `arr`, with name `name`
  430. Parameters
  431. ----------
  432. arr : array_like
  433. array to write
  434. name : str
  435. name in matlab workspace
  436. '''
  437. # we need to catch sparse first, because np.asarray returns an
  438. # an object array for scipy.sparse
  439. if scipy.sparse.issparse(arr):
  440. self.write_sparse(arr, name)
  441. return
  442. arr = np.asarray(arr)
  443. dt = arr.dtype
  444. if not dt.isnative:
  445. arr = arr.astype(dt.newbyteorder('='))
  446. dtt = dt.type
  447. if dtt is np.object_:
  448. raise TypeError('Cannot save object arrays in Mat4')
  449. elif dtt is np.void:
  450. raise TypeError('Cannot save void type arrays')
  451. elif dtt in (np.unicode_, np.string_):
  452. self.write_char(arr, name)
  453. return
  454. self.write_numeric(arr, name)
  455. def write_numeric(self, arr, name):
  456. arr = arr_to_2d(arr, self.oned_as)
  457. imagf = arr.dtype.kind == 'c'
  458. try:
  459. P = np_to_mtypes[arr.dtype.str[1:]]
  460. except KeyError:
  461. if imagf:
  462. arr = arr.astype('c128')
  463. else:
  464. arr = arr.astype('f8')
  465. P = miDOUBLE
  466. self.write_header(name,
  467. arr.shape,
  468. P=P,
  469. T=mxFULL_CLASS,
  470. imagf=imagf)
  471. if imagf:
  472. self.write_bytes(arr.real)
  473. self.write_bytes(arr.imag)
  474. else:
  475. self.write_bytes(arr)
  476. def write_char(self, arr, name):
  477. arr = arr_to_chars(arr)
  478. arr = arr_to_2d(arr, self.oned_as)
  479. dims = arr.shape
  480. self.write_header(
  481. name,
  482. dims,
  483. P=miUINT8,
  484. T=mxCHAR_CLASS)
  485. if arr.dtype.kind == 'U':
  486. # Recode unicode to latin1
  487. n_chars = np.prod(dims)
  488. st_arr = np.ndarray(shape=(),
  489. dtype=arr_dtype_number(arr, n_chars),
  490. buffer=arr)
  491. st = st_arr.item().encode('latin-1')
  492. arr = np.ndarray(shape=dims, dtype='S1', buffer=st)
  493. self.write_bytes(arr)
  494. def write_sparse(self, arr, name):
  495. ''' Sparse matrices are 2-D
  496. See docstring for VarReader4.read_sparse_array
  497. '''
  498. A = arr.tocoo() # convert to sparse COO format (ijv)
  499. imagf = A.dtype.kind == 'c'
  500. ijv = np.zeros((A.nnz + 1, 3+imagf), dtype='f8')
  501. ijv[:-1,0] = A.row
  502. ijv[:-1,1] = A.col
  503. ijv[:-1,0:2] += 1 # 1 based indexing
  504. if imagf:
  505. ijv[:-1,2] = A.data.real
  506. ijv[:-1,3] = A.data.imag
  507. else:
  508. ijv[:-1,2] = A.data
  509. ijv[-1,0:2] = A.shape
  510. self.write_header(
  511. name,
  512. ijv.shape,
  513. P=miDOUBLE,
  514. T=mxSPARSE_CLASS)
  515. self.write_bytes(ijv)
  516. class MatFile4Writer:
  517. ''' Class for writing matlab 4 format files '''
  518. def __init__(self, file_stream, oned_as=None):
  519. self.file_stream = file_stream
  520. if oned_as is None:
  521. oned_as = 'row'
  522. self.oned_as = oned_as
  523. self._matrix_writer = None
  524. def put_variables(self, mdict, write_header=None):
  525. ''' Write variables in `mdict` to stream
  526. Parameters
  527. ----------
  528. mdict : mapping
  529. mapping with method ``items`` return name, contents pairs
  530. where ``name`` which will appeak in the matlab workspace in
  531. file load, and ``contents`` is something writeable to a
  532. matlab file, such as a NumPy array.
  533. write_header : {None, True, False}
  534. If True, then write the matlab file header before writing the
  535. variables. If None (the default) then write the file header
  536. if we are at position 0 in the stream. By setting False
  537. here, and setting the stream position to the end of the file,
  538. you can append variables to a matlab file
  539. '''
  540. # there is no header for a matlab 4 mat file, so we ignore the
  541. # ``write_header`` input argument. It's there for compatibility
  542. # with the matlab 5 version of this method
  543. self._matrix_writer = VarWriter4(self)
  544. for name, var in mdict.items():
  545. self._matrix_writer.write(var, name)