_netcdf.py 38 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088
  1. """
  2. NetCDF reader/writer module.
  3. This module is used to read and create NetCDF files. NetCDF files are
  4. accessed through the `netcdf_file` object. Data written to and from NetCDF
  5. files are contained in `netcdf_variable` objects. Attributes are given
  6. as member variables of the `netcdf_file` and `netcdf_variable` objects.
  7. This module implements the Scientific.IO.NetCDF API to read and create
  8. NetCDF files. The same API is also used in the PyNIO and pynetcdf
  9. modules, allowing these modules to be used interchangeably when working
  10. with NetCDF files.
  11. Only NetCDF3 is supported here; for NetCDF4 see
  12. `netCDF4-python <http://unidata.github.io/netcdf4-python/>`__,
  13. which has a similar API.
  14. """
  15. # TODO:
  16. # * properly implement ``_FillValue``.
  17. # * fix character variables.
  18. # * implement PAGESIZE for Python 2.6?
  19. # The Scientific.IO.NetCDF API allows attributes to be added directly to
  20. # instances of ``netcdf_file`` and ``netcdf_variable``. To differentiate
  21. # between user-set attributes and instance attributes, user-set attributes
  22. # are automatically stored in the ``_attributes`` attribute by overloading
  23. #``__setattr__``. This is the reason why the code sometimes uses
  24. #``obj.__dict__['key'] = value``, instead of simply ``obj.key = value``;
  25. # otherwise the key would be inserted into userspace attributes.
  26. __all__ = ['netcdf_file', 'netcdf_variable']
  27. import warnings
  28. import weakref
  29. from operator import mul
  30. from platform import python_implementation
  31. import mmap as mm
  32. import numpy as np
  33. from numpy import frombuffer, dtype, empty, array, asarray
  34. from numpy import little_endian as LITTLE_ENDIAN
  35. from functools import reduce
  36. IS_PYPY = python_implementation() == 'PyPy'
  37. ABSENT = b'\x00\x00\x00\x00\x00\x00\x00\x00'
  38. ZERO = b'\x00\x00\x00\x00'
  39. NC_BYTE = b'\x00\x00\x00\x01'
  40. NC_CHAR = b'\x00\x00\x00\x02'
  41. NC_SHORT = b'\x00\x00\x00\x03'
  42. NC_INT = b'\x00\x00\x00\x04'
  43. NC_FLOAT = b'\x00\x00\x00\x05'
  44. NC_DOUBLE = b'\x00\x00\x00\x06'
  45. NC_DIMENSION = b'\x00\x00\x00\n'
  46. NC_VARIABLE = b'\x00\x00\x00\x0b'
  47. NC_ATTRIBUTE = b'\x00\x00\x00\x0c'
  48. FILL_BYTE = b'\x81'
  49. FILL_CHAR = b'\x00'
  50. FILL_SHORT = b'\x80\x01'
  51. FILL_INT = b'\x80\x00\x00\x01'
  52. FILL_FLOAT = b'\x7C\xF0\x00\x00'
  53. FILL_DOUBLE = b'\x47\x9E\x00\x00\x00\x00\x00\x00'
  54. TYPEMAP = {NC_BYTE: ('b', 1),
  55. NC_CHAR: ('c', 1),
  56. NC_SHORT: ('h', 2),
  57. NC_INT: ('i', 4),
  58. NC_FLOAT: ('f', 4),
  59. NC_DOUBLE: ('d', 8)}
  60. FILLMAP = {NC_BYTE: FILL_BYTE,
  61. NC_CHAR: FILL_CHAR,
  62. NC_SHORT: FILL_SHORT,
  63. NC_INT: FILL_INT,
  64. NC_FLOAT: FILL_FLOAT,
  65. NC_DOUBLE: FILL_DOUBLE}
  66. REVERSE = {('b', 1): NC_BYTE,
  67. ('B', 1): NC_CHAR,
  68. ('c', 1): NC_CHAR,
  69. ('h', 2): NC_SHORT,
  70. ('i', 4): NC_INT,
  71. ('f', 4): NC_FLOAT,
  72. ('d', 8): NC_DOUBLE,
  73. # these come from asarray(1).dtype.char and asarray('foo').dtype.char,
  74. # used when getting the types from generic attributes.
  75. ('l', 4): NC_INT,
  76. ('S', 1): NC_CHAR}
  77. class netcdf_file:
  78. """
  79. A file object for NetCDF data.
  80. A `netcdf_file` object has two standard attributes: `dimensions` and
  81. `variables`. The values of both are dictionaries, mapping dimension
  82. names to their associated lengths and variable names to variables,
  83. respectively. Application programs should never modify these
  84. dictionaries.
  85. All other attributes correspond to global attributes defined in the
  86. NetCDF file. Global file attributes are created by assigning to an
  87. attribute of the `netcdf_file` object.
  88. Parameters
  89. ----------
  90. filename : string or file-like
  91. string -> filename
  92. mode : {'r', 'w', 'a'}, optional
  93. read-write-append mode, default is 'r'
  94. mmap : None or bool, optional
  95. Whether to mmap `filename` when reading. Default is True
  96. when `filename` is a file name, False when `filename` is a
  97. file-like object. Note that when mmap is in use, data arrays
  98. returned refer directly to the mmapped data on disk, and the
  99. file cannot be closed as long as references to it exist.
  100. version : {1, 2}, optional
  101. version of netcdf to read / write, where 1 means *Classic
  102. format* and 2 means *64-bit offset format*. Default is 1. See
  103. `here <https://www.unidata.ucar.edu/software/netcdf/docs/netcdf_introduction.html#select_format>`__
  104. for more info.
  105. maskandscale : bool, optional
  106. Whether to automatically scale and/or mask data based on attributes.
  107. Default is False.
  108. Notes
  109. -----
  110. The major advantage of this module over other modules is that it doesn't
  111. require the code to be linked to the NetCDF libraries. This module is
  112. derived from `pupynere <https://bitbucket.org/robertodealmeida/pupynere/>`_.
  113. NetCDF files are a self-describing binary data format. The file contains
  114. metadata that describes the dimensions and variables in the file. More
  115. details about NetCDF files can be found `here
  116. <https://www.unidata.ucar.edu/software/netcdf/guide_toc.html>`__. There
  117. are three main sections to a NetCDF data structure:
  118. 1. Dimensions
  119. 2. Variables
  120. 3. Attributes
  121. The dimensions section records the name and length of each dimension used
  122. by the variables. The variables would then indicate which dimensions it
  123. uses and any attributes such as data units, along with containing the data
  124. values for the variable. It is good practice to include a
  125. variable that is the same name as a dimension to provide the values for
  126. that axes. Lastly, the attributes section would contain additional
  127. information such as the name of the file creator or the instrument used to
  128. collect the data.
  129. When writing data to a NetCDF file, there is often the need to indicate the
  130. 'record dimension'. A record dimension is the unbounded dimension for a
  131. variable. For example, a temperature variable may have dimensions of
  132. latitude, longitude and time. If one wants to add more temperature data to
  133. the NetCDF file as time progresses, then the temperature variable should
  134. have the time dimension flagged as the record dimension.
  135. In addition, the NetCDF file header contains the position of the data in
  136. the file, so access can be done in an efficient manner without loading
  137. unnecessary data into memory. It uses the ``mmap`` module to create
  138. Numpy arrays mapped to the data on disk, for the same purpose.
  139. Note that when `netcdf_file` is used to open a file with mmap=True
  140. (default for read-only), arrays returned by it refer to data
  141. directly on the disk. The file should not be closed, and cannot be cleanly
  142. closed when asked, if such arrays are alive. You may want to copy data arrays
  143. obtained from mmapped Netcdf file if they are to be processed after the file
  144. is closed, see the example below.
  145. Examples
  146. --------
  147. To create a NetCDF file:
  148. >>> from scipy.io import netcdf_file
  149. >>> import numpy as np
  150. >>> f = netcdf_file('simple.nc', 'w')
  151. >>> f.history = 'Created for a test'
  152. >>> f.createDimension('time', 10)
  153. >>> time = f.createVariable('time', 'i', ('time',))
  154. >>> time[:] = np.arange(10)
  155. >>> time.units = 'days since 2008-01-01'
  156. >>> f.close()
  157. Note the assignment of ``arange(10)`` to ``time[:]``. Exposing the slice
  158. of the time variable allows for the data to be set in the object, rather
  159. than letting ``arange(10)`` overwrite the ``time`` variable.
  160. To read the NetCDF file we just created:
  161. >>> from scipy.io import netcdf_file
  162. >>> f = netcdf_file('simple.nc', 'r')
  163. >>> print(f.history)
  164. b'Created for a test'
  165. >>> time = f.variables['time']
  166. >>> print(time.units)
  167. b'days since 2008-01-01'
  168. >>> print(time.shape)
  169. (10,)
  170. >>> print(time[-1])
  171. 9
  172. NetCDF files, when opened read-only, return arrays that refer
  173. directly to memory-mapped data on disk:
  174. >>> data = time[:]
  175. If the data is to be processed after the file is closed, it needs
  176. to be copied to main memory:
  177. >>> data = time[:].copy()
  178. >>> f.close()
  179. >>> data.mean()
  180. 4.5
  181. A NetCDF file can also be used as context manager:
  182. >>> from scipy.io import netcdf_file
  183. >>> with netcdf_file('simple.nc', 'r') as f:
  184. ... print(f.history)
  185. b'Created for a test'
  186. """
  187. def __init__(self, filename, mode='r', mmap=None, version=1,
  188. maskandscale=False):
  189. """Initialize netcdf_file from fileobj (str or file-like)."""
  190. if mode not in 'rwa':
  191. raise ValueError("Mode must be either 'r', 'w' or 'a'.")
  192. if hasattr(filename, 'seek'): # file-like
  193. self.fp = filename
  194. self.filename = 'None'
  195. if mmap is None:
  196. mmap = False
  197. elif mmap and not hasattr(filename, 'fileno'):
  198. raise ValueError('Cannot use file object for mmap')
  199. else: # maybe it's a string
  200. self.filename = filename
  201. omode = 'r+' if mode == 'a' else mode
  202. self.fp = open(self.filename, '%sb' % omode)
  203. if mmap is None:
  204. # Mmapped files on PyPy cannot be usually closed
  205. # before the GC runs, so it's better to use mmap=False
  206. # as the default.
  207. mmap = (not IS_PYPY)
  208. if mode != 'r':
  209. # Cannot read write-only files
  210. mmap = False
  211. self.use_mmap = mmap
  212. self.mode = mode
  213. self.version_byte = version
  214. self.maskandscale = maskandscale
  215. self.dimensions = {}
  216. self.variables = {}
  217. self._dims = []
  218. self._recs = 0
  219. self._recsize = 0
  220. self._mm = None
  221. self._mm_buf = None
  222. if self.use_mmap:
  223. self._mm = mm.mmap(self.fp.fileno(), 0, access=mm.ACCESS_READ)
  224. self._mm_buf = np.frombuffer(self._mm, dtype=np.int8)
  225. self._attributes = {}
  226. if mode in 'ra':
  227. self._read()
  228. def __setattr__(self, attr, value):
  229. # Store user defined attributes in a separate dict,
  230. # so we can save them to file later.
  231. try:
  232. self._attributes[attr] = value
  233. except AttributeError:
  234. pass
  235. self.__dict__[attr] = value
  236. def close(self):
  237. """Closes the NetCDF file."""
  238. if hasattr(self, 'fp') and not self.fp.closed:
  239. try:
  240. self.flush()
  241. finally:
  242. self.variables = {}
  243. if self._mm_buf is not None:
  244. ref = weakref.ref(self._mm_buf)
  245. self._mm_buf = None
  246. if ref() is None:
  247. # self._mm_buf is gc'd, and we can close the mmap
  248. self._mm.close()
  249. else:
  250. # we cannot close self._mm, since self._mm_buf is
  251. # alive and there may still be arrays referring to it
  252. warnings.warn((
  253. "Cannot close a netcdf_file opened with mmap=True, when "
  254. "netcdf_variables or arrays referring to its data still exist. "
  255. "All data arrays obtained from such files refer directly to "
  256. "data on disk, and must be copied before the file can be cleanly "
  257. "closed. (See netcdf_file docstring for more information on mmap.)"
  258. ), category=RuntimeWarning)
  259. self._mm = None
  260. self.fp.close()
  261. __del__ = close
  262. def __enter__(self):
  263. return self
  264. def __exit__(self, type, value, traceback):
  265. self.close()
  266. def createDimension(self, name, length):
  267. """
  268. Adds a dimension to the Dimension section of the NetCDF data structure.
  269. Note that this function merely adds a new dimension that the variables can
  270. reference. The values for the dimension, if desired, should be added as
  271. a variable using `createVariable`, referring to this dimension.
  272. Parameters
  273. ----------
  274. name : str
  275. Name of the dimension (Eg, 'lat' or 'time').
  276. length : int
  277. Length of the dimension.
  278. See Also
  279. --------
  280. createVariable
  281. """
  282. if length is None and self._dims:
  283. raise ValueError("Only first dimension may be unlimited!")
  284. self.dimensions[name] = length
  285. self._dims.append(name)
  286. def createVariable(self, name, type, dimensions):
  287. """
  288. Create an empty variable for the `netcdf_file` object, specifying its data
  289. type and the dimensions it uses.
  290. Parameters
  291. ----------
  292. name : str
  293. Name of the new variable.
  294. type : dtype or str
  295. Data type of the variable.
  296. dimensions : sequence of str
  297. List of the dimension names used by the variable, in the desired order.
  298. Returns
  299. -------
  300. variable : netcdf_variable
  301. The newly created ``netcdf_variable`` object.
  302. This object has also been added to the `netcdf_file` object as well.
  303. See Also
  304. --------
  305. createDimension
  306. Notes
  307. -----
  308. Any dimensions to be used by the variable should already exist in the
  309. NetCDF data structure or should be created by `createDimension` prior to
  310. creating the NetCDF variable.
  311. """
  312. shape = tuple([self.dimensions[dim] for dim in dimensions])
  313. shape_ = tuple([dim or 0 for dim in shape]) # replace None with 0 for NumPy
  314. type = dtype(type)
  315. typecode, size = type.char, type.itemsize
  316. if (typecode, size) not in REVERSE:
  317. raise ValueError("NetCDF 3 does not support type %s" % type)
  318. data = empty(shape_, dtype=type.newbyteorder("B")) # convert to big endian always for NetCDF 3
  319. self.variables[name] = netcdf_variable(
  320. data, typecode, size, shape, dimensions,
  321. maskandscale=self.maskandscale)
  322. return self.variables[name]
  323. def flush(self):
  324. """
  325. Perform a sync-to-disk flush if the `netcdf_file` object is in write mode.
  326. See Also
  327. --------
  328. sync : Identical function
  329. """
  330. if hasattr(self, 'mode') and self.mode in 'wa':
  331. self._write()
  332. sync = flush
  333. def _write(self):
  334. self.fp.seek(0)
  335. self.fp.write(b'CDF')
  336. self.fp.write(array(self.version_byte, '>b').tobytes())
  337. # Write headers and data.
  338. self._write_numrecs()
  339. self._write_dim_array()
  340. self._write_gatt_array()
  341. self._write_var_array()
  342. def _write_numrecs(self):
  343. # Get highest record count from all record variables.
  344. for var in self.variables.values():
  345. if var.isrec and len(var.data) > self._recs:
  346. self.__dict__['_recs'] = len(var.data)
  347. self._pack_int(self._recs)
  348. def _write_dim_array(self):
  349. if self.dimensions:
  350. self.fp.write(NC_DIMENSION)
  351. self._pack_int(len(self.dimensions))
  352. for name in self._dims:
  353. self._pack_string(name)
  354. length = self.dimensions[name]
  355. self._pack_int(length or 0) # replace None with 0 for record dimension
  356. else:
  357. self.fp.write(ABSENT)
  358. def _write_gatt_array(self):
  359. self._write_att_array(self._attributes)
  360. def _write_att_array(self, attributes):
  361. if attributes:
  362. self.fp.write(NC_ATTRIBUTE)
  363. self._pack_int(len(attributes))
  364. for name, values in attributes.items():
  365. self._pack_string(name)
  366. self._write_att_values(values)
  367. else:
  368. self.fp.write(ABSENT)
  369. def _write_var_array(self):
  370. if self.variables:
  371. self.fp.write(NC_VARIABLE)
  372. self._pack_int(len(self.variables))
  373. # Sort variable names non-recs first, then recs.
  374. def sortkey(n):
  375. v = self.variables[n]
  376. if v.isrec:
  377. return (-1,)
  378. return v._shape
  379. variables = sorted(self.variables, key=sortkey, reverse=True)
  380. # Set the metadata for all variables.
  381. for name in variables:
  382. self._write_var_metadata(name)
  383. # Now that we have the metadata, we know the vsize of
  384. # each record variable, so we can calculate recsize.
  385. self.__dict__['_recsize'] = sum([
  386. var._vsize for var in self.variables.values()
  387. if var.isrec])
  388. # Set the data for all variables.
  389. for name in variables:
  390. self._write_var_data(name)
  391. else:
  392. self.fp.write(ABSENT)
  393. def _write_var_metadata(self, name):
  394. var = self.variables[name]
  395. self._pack_string(name)
  396. self._pack_int(len(var.dimensions))
  397. for dimname in var.dimensions:
  398. dimid = self._dims.index(dimname)
  399. self._pack_int(dimid)
  400. self._write_att_array(var._attributes)
  401. nc_type = REVERSE[var.typecode(), var.itemsize()]
  402. self.fp.write(nc_type)
  403. if not var.isrec:
  404. vsize = var.data.size * var.data.itemsize
  405. vsize += -vsize % 4
  406. else: # record variable
  407. try:
  408. vsize = var.data[0].size * var.data.itemsize
  409. except IndexError:
  410. vsize = 0
  411. rec_vars = len([v for v in self.variables.values()
  412. if v.isrec])
  413. if rec_vars > 1:
  414. vsize += -vsize % 4
  415. self.variables[name].__dict__['_vsize'] = vsize
  416. self._pack_int(vsize)
  417. # Pack a bogus begin, and set the real value later.
  418. self.variables[name].__dict__['_begin'] = self.fp.tell()
  419. self._pack_begin(0)
  420. def _write_var_data(self, name):
  421. var = self.variables[name]
  422. # Set begin in file header.
  423. the_beguine = self.fp.tell()
  424. self.fp.seek(var._begin)
  425. self._pack_begin(the_beguine)
  426. self.fp.seek(the_beguine)
  427. # Write data.
  428. if not var.isrec:
  429. self.fp.write(var.data.tobytes())
  430. count = var.data.size * var.data.itemsize
  431. self._write_var_padding(var, var._vsize - count)
  432. else: # record variable
  433. # Handle rec vars with shape[0] < nrecs.
  434. if self._recs > len(var.data):
  435. shape = (self._recs,) + var.data.shape[1:]
  436. # Resize in-place does not always work since
  437. # the array might not be single-segment
  438. try:
  439. var.data.resize(shape)
  440. except ValueError:
  441. var.__dict__['data'] = np.resize(var.data, shape).astype(var.data.dtype)
  442. pos0 = pos = self.fp.tell()
  443. for rec in var.data:
  444. # Apparently scalars cannot be converted to big endian. If we
  445. # try to convert a ``=i4`` scalar to, say, '>i4' the dtype
  446. # will remain as ``=i4``.
  447. if not rec.shape and (rec.dtype.byteorder == '<' or
  448. (rec.dtype.byteorder == '=' and LITTLE_ENDIAN)):
  449. rec = rec.byteswap()
  450. self.fp.write(rec.tobytes())
  451. # Padding
  452. count = rec.size * rec.itemsize
  453. self._write_var_padding(var, var._vsize - count)
  454. pos += self._recsize
  455. self.fp.seek(pos)
  456. self.fp.seek(pos0 + var._vsize)
  457. def _write_var_padding(self, var, size):
  458. encoded_fill_value = var._get_encoded_fill_value()
  459. num_fills = size // len(encoded_fill_value)
  460. self.fp.write(encoded_fill_value * num_fills)
  461. def _write_att_values(self, values):
  462. if hasattr(values, 'dtype'):
  463. nc_type = REVERSE[values.dtype.char, values.dtype.itemsize]
  464. else:
  465. types = [(int, NC_INT), (float, NC_FLOAT), (str, NC_CHAR)]
  466. # bytes index into scalars in py3k. Check for "string" types
  467. if isinstance(values, (str, bytes)):
  468. sample = values
  469. else:
  470. try:
  471. sample = values[0] # subscriptable?
  472. except TypeError:
  473. sample = values # scalar
  474. for class_, nc_type in types:
  475. if isinstance(sample, class_):
  476. break
  477. typecode, size = TYPEMAP[nc_type]
  478. dtype_ = '>%s' % typecode
  479. # asarray() dies with bytes and '>c' in py3k. Change to 'S'
  480. dtype_ = 'S' if dtype_ == '>c' else dtype_
  481. values = asarray(values, dtype=dtype_)
  482. self.fp.write(nc_type)
  483. if values.dtype.char == 'S':
  484. nelems = values.itemsize
  485. else:
  486. nelems = values.size
  487. self._pack_int(nelems)
  488. if not values.shape and (values.dtype.byteorder == '<' or
  489. (values.dtype.byteorder == '=' and LITTLE_ENDIAN)):
  490. values = values.byteswap()
  491. self.fp.write(values.tobytes())
  492. count = values.size * values.itemsize
  493. self.fp.write(b'\x00' * (-count % 4)) # pad
  494. def _read(self):
  495. # Check magic bytes and version
  496. magic = self.fp.read(3)
  497. if not magic == b'CDF':
  498. raise TypeError("Error: %s is not a valid NetCDF 3 file" %
  499. self.filename)
  500. self.__dict__['version_byte'] = frombuffer(self.fp.read(1), '>b')[0]
  501. # Read file headers and set data.
  502. self._read_numrecs()
  503. self._read_dim_array()
  504. self._read_gatt_array()
  505. self._read_var_array()
  506. def _read_numrecs(self):
  507. self.__dict__['_recs'] = self._unpack_int()
  508. def _read_dim_array(self):
  509. header = self.fp.read(4)
  510. if header not in [ZERO, NC_DIMENSION]:
  511. raise ValueError("Unexpected header.")
  512. count = self._unpack_int()
  513. for dim in range(count):
  514. name = self._unpack_string().decode('latin1')
  515. length = self._unpack_int() or None # None for record dimension
  516. self.dimensions[name] = length
  517. self._dims.append(name) # preserve order
  518. def _read_gatt_array(self):
  519. for k, v in self._read_att_array().items():
  520. self.__setattr__(k, v)
  521. def _read_att_array(self):
  522. header = self.fp.read(4)
  523. if header not in [ZERO, NC_ATTRIBUTE]:
  524. raise ValueError("Unexpected header.")
  525. count = self._unpack_int()
  526. attributes = {}
  527. for attr in range(count):
  528. name = self._unpack_string().decode('latin1')
  529. attributes[name] = self._read_att_values()
  530. return attributes
  531. def _read_var_array(self):
  532. header = self.fp.read(4)
  533. if header not in [ZERO, NC_VARIABLE]:
  534. raise ValueError("Unexpected header.")
  535. begin = 0
  536. dtypes = {'names': [], 'formats': []}
  537. rec_vars = []
  538. count = self._unpack_int()
  539. for var in range(count):
  540. (name, dimensions, shape, attributes,
  541. typecode, size, dtype_, begin_, vsize) = self._read_var()
  542. # https://www.unidata.ucar.edu/software/netcdf/guide_toc.html
  543. # Note that vsize is the product of the dimension lengths
  544. # (omitting the record dimension) and the number of bytes
  545. # per value (determined from the type), increased to the
  546. # next multiple of 4, for each variable. If a record
  547. # variable, this is the amount of space per record. The
  548. # netCDF "record size" is calculated as the sum of the
  549. # vsize's of all the record variables.
  550. #
  551. # The vsize field is actually redundant, because its value
  552. # may be computed from other information in the header. The
  553. # 32-bit vsize field is not large enough to contain the size
  554. # of variables that require more than 2^32 - 4 bytes, so
  555. # 2^32 - 1 is used in the vsize field for such variables.
  556. if shape and shape[0] is None: # record variable
  557. rec_vars.append(name)
  558. # The netCDF "record size" is calculated as the sum of
  559. # the vsize's of all the record variables.
  560. self.__dict__['_recsize'] += vsize
  561. if begin == 0:
  562. begin = begin_
  563. dtypes['names'].append(name)
  564. dtypes['formats'].append(str(shape[1:]) + dtype_)
  565. # Handle padding with a virtual variable.
  566. if typecode in 'bch':
  567. actual_size = reduce(mul, (1,) + shape[1:]) * size
  568. padding = -actual_size % 4
  569. if padding:
  570. dtypes['names'].append('_padding_%d' % var)
  571. dtypes['formats'].append('(%d,)>b' % padding)
  572. # Data will be set later.
  573. data = None
  574. else: # not a record variable
  575. # Calculate size to avoid problems with vsize (above)
  576. a_size = reduce(mul, shape, 1) * size
  577. if self.use_mmap:
  578. data = self._mm_buf[begin_:begin_+a_size].view(dtype=dtype_)
  579. data.shape = shape
  580. else:
  581. pos = self.fp.tell()
  582. self.fp.seek(begin_)
  583. data = frombuffer(self.fp.read(a_size), dtype=dtype_
  584. ).copy()
  585. data.shape = shape
  586. self.fp.seek(pos)
  587. # Add variable.
  588. self.variables[name] = netcdf_variable(
  589. data, typecode, size, shape, dimensions, attributes,
  590. maskandscale=self.maskandscale)
  591. if rec_vars:
  592. # Remove padding when only one record variable.
  593. if len(rec_vars) == 1:
  594. dtypes['names'] = dtypes['names'][:1]
  595. dtypes['formats'] = dtypes['formats'][:1]
  596. # Build rec array.
  597. if self.use_mmap:
  598. rec_array = self._mm_buf[begin:begin+self._recs*self._recsize].view(dtype=dtypes)
  599. rec_array.shape = (self._recs,)
  600. else:
  601. pos = self.fp.tell()
  602. self.fp.seek(begin)
  603. rec_array = frombuffer(self.fp.read(self._recs*self._recsize),
  604. dtype=dtypes).copy()
  605. rec_array.shape = (self._recs,)
  606. self.fp.seek(pos)
  607. for var in rec_vars:
  608. self.variables[var].__dict__['data'] = rec_array[var]
  609. def _read_var(self):
  610. name = self._unpack_string().decode('latin1')
  611. dimensions = []
  612. shape = []
  613. dims = self._unpack_int()
  614. for i in range(dims):
  615. dimid = self._unpack_int()
  616. dimname = self._dims[dimid]
  617. dimensions.append(dimname)
  618. dim = self.dimensions[dimname]
  619. shape.append(dim)
  620. dimensions = tuple(dimensions)
  621. shape = tuple(shape)
  622. attributes = self._read_att_array()
  623. nc_type = self.fp.read(4)
  624. vsize = self._unpack_int()
  625. begin = [self._unpack_int, self._unpack_int64][self.version_byte-1]()
  626. typecode, size = TYPEMAP[nc_type]
  627. dtype_ = '>%s' % typecode
  628. return name, dimensions, shape, attributes, typecode, size, dtype_, begin, vsize
  629. def _read_att_values(self):
  630. nc_type = self.fp.read(4)
  631. n = self._unpack_int()
  632. typecode, size = TYPEMAP[nc_type]
  633. count = n*size
  634. values = self.fp.read(int(count))
  635. self.fp.read(-count % 4) # read padding
  636. if typecode != 'c':
  637. values = frombuffer(values, dtype='>%s' % typecode).copy()
  638. if values.shape == (1,):
  639. values = values[0]
  640. else:
  641. values = values.rstrip(b'\x00')
  642. return values
  643. def _pack_begin(self, begin):
  644. if self.version_byte == 1:
  645. self._pack_int(begin)
  646. elif self.version_byte == 2:
  647. self._pack_int64(begin)
  648. def _pack_int(self, value):
  649. self.fp.write(array(value, '>i').tobytes())
  650. _pack_int32 = _pack_int
  651. def _unpack_int(self):
  652. return int(frombuffer(self.fp.read(4), '>i')[0])
  653. _unpack_int32 = _unpack_int
  654. def _pack_int64(self, value):
  655. self.fp.write(array(value, '>q').tobytes())
  656. def _unpack_int64(self):
  657. return frombuffer(self.fp.read(8), '>q')[0]
  658. def _pack_string(self, s):
  659. count = len(s)
  660. self._pack_int(count)
  661. self.fp.write(s.encode('latin1'))
  662. self.fp.write(b'\x00' * (-count % 4)) # pad
  663. def _unpack_string(self):
  664. count = self._unpack_int()
  665. s = self.fp.read(count).rstrip(b'\x00')
  666. self.fp.read(-count % 4) # read padding
  667. return s
  668. class netcdf_variable:
  669. """
  670. A data object for netcdf files.
  671. `netcdf_variable` objects are constructed by calling the method
  672. `netcdf_file.createVariable` on the `netcdf_file` object. `netcdf_variable`
  673. objects behave much like array objects defined in numpy, except that their
  674. data resides in a file. Data is read by indexing and written by assigning
  675. to an indexed subset; the entire array can be accessed by the index ``[:]``
  676. or (for scalars) by using the methods `getValue` and `assignValue`.
  677. `netcdf_variable` objects also have attribute `shape` with the same meaning
  678. as for arrays, but the shape cannot be modified. There is another read-only
  679. attribute `dimensions`, whose value is the tuple of dimension names.
  680. All other attributes correspond to variable attributes defined in
  681. the NetCDF file. Variable attributes are created by assigning to an
  682. attribute of the `netcdf_variable` object.
  683. Parameters
  684. ----------
  685. data : array_like
  686. The data array that holds the values for the variable.
  687. Typically, this is initialized as empty, but with the proper shape.
  688. typecode : dtype character code
  689. Desired data-type for the data array.
  690. size : int
  691. Desired element size for the data array.
  692. shape : sequence of ints
  693. The shape of the array. This should match the lengths of the
  694. variable's dimensions.
  695. dimensions : sequence of strings
  696. The names of the dimensions used by the variable. Must be in the
  697. same order of the dimension lengths given by `shape`.
  698. attributes : dict, optional
  699. Attribute values (any type) keyed by string names. These attributes
  700. become attributes for the netcdf_variable object.
  701. maskandscale : bool, optional
  702. Whether to automatically scale and/or mask data based on attributes.
  703. Default is False.
  704. Attributes
  705. ----------
  706. dimensions : list of str
  707. List of names of dimensions used by the variable object.
  708. isrec, shape
  709. Properties
  710. See also
  711. --------
  712. isrec, shape
  713. """
  714. def __init__(self, data, typecode, size, shape, dimensions,
  715. attributes=None,
  716. maskandscale=False):
  717. self.data = data
  718. self._typecode = typecode
  719. self._size = size
  720. self._shape = shape
  721. self.dimensions = dimensions
  722. self.maskandscale = maskandscale
  723. self._attributes = attributes or {}
  724. for k, v in self._attributes.items():
  725. self.__dict__[k] = v
  726. def __setattr__(self, attr, value):
  727. # Store user defined attributes in a separate dict,
  728. # so we can save them to file later.
  729. try:
  730. self._attributes[attr] = value
  731. except AttributeError:
  732. pass
  733. self.__dict__[attr] = value
  734. def isrec(self):
  735. """Returns whether the variable has a record dimension or not.
  736. A record dimension is a dimension along which additional data could be
  737. easily appended in the netcdf data structure without much rewriting of
  738. the data file. This attribute is a read-only property of the
  739. `netcdf_variable`.
  740. """
  741. return bool(self.data.shape) and not self._shape[0]
  742. isrec = property(isrec)
  743. def shape(self):
  744. """Returns the shape tuple of the data variable.
  745. This is a read-only attribute and can not be modified in the
  746. same manner of other numpy arrays.
  747. """
  748. return self.data.shape
  749. shape = property(shape)
  750. def getValue(self):
  751. """
  752. Retrieve a scalar value from a `netcdf_variable` of length one.
  753. Raises
  754. ------
  755. ValueError
  756. If the netcdf variable is an array of length greater than one,
  757. this exception will be raised.
  758. """
  759. return self.data.item()
  760. def assignValue(self, value):
  761. """
  762. Assign a scalar value to a `netcdf_variable` of length one.
  763. Parameters
  764. ----------
  765. value : scalar
  766. Scalar value (of compatible type) to assign to a length-one netcdf
  767. variable. This value will be written to file.
  768. Raises
  769. ------
  770. ValueError
  771. If the input is not a scalar, or if the destination is not a length-one
  772. netcdf variable.
  773. """
  774. if not self.data.flags.writeable:
  775. # Work-around for a bug in NumPy. Calling itemset() on a read-only
  776. # memory-mapped array causes a seg. fault.
  777. # See NumPy ticket #1622, and SciPy ticket #1202.
  778. # This check for `writeable` can be removed when the oldest version
  779. # of NumPy still supported by scipy contains the fix for #1622.
  780. raise RuntimeError("variable is not writeable")
  781. self.data.itemset(value)
  782. def typecode(self):
  783. """
  784. Return the typecode of the variable.
  785. Returns
  786. -------
  787. typecode : char
  788. The character typecode of the variable (e.g., 'i' for int).
  789. """
  790. return self._typecode
  791. def itemsize(self):
  792. """
  793. Return the itemsize of the variable.
  794. Returns
  795. -------
  796. itemsize : int
  797. The element size of the variable (e.g., 8 for float64).
  798. """
  799. return self._size
  800. def __getitem__(self, index):
  801. if not self.maskandscale:
  802. return self.data[index]
  803. data = self.data[index].copy()
  804. missing_value = self._get_missing_value()
  805. data = self._apply_missing_value(data, missing_value)
  806. scale_factor = self._attributes.get('scale_factor')
  807. add_offset = self._attributes.get('add_offset')
  808. if add_offset is not None or scale_factor is not None:
  809. data = data.astype(np.float64)
  810. if scale_factor is not None:
  811. data = data * scale_factor
  812. if add_offset is not None:
  813. data += add_offset
  814. return data
  815. def __setitem__(self, index, data):
  816. if self.maskandscale:
  817. missing_value = (
  818. self._get_missing_value() or
  819. getattr(data, 'fill_value', 999999))
  820. self._attributes.setdefault('missing_value', missing_value)
  821. self._attributes.setdefault('_FillValue', missing_value)
  822. data = ((data - self._attributes.get('add_offset', 0.0)) /
  823. self._attributes.get('scale_factor', 1.0))
  824. data = np.ma.asarray(data).filled(missing_value)
  825. if self._typecode not in 'fd' and data.dtype.kind == 'f':
  826. data = np.round(data)
  827. # Expand data for record vars?
  828. if self.isrec:
  829. if isinstance(index, tuple):
  830. rec_index = index[0]
  831. else:
  832. rec_index = index
  833. if isinstance(rec_index, slice):
  834. recs = (rec_index.start or 0) + len(data)
  835. else:
  836. recs = rec_index + 1
  837. if recs > len(self.data):
  838. shape = (recs,) + self._shape[1:]
  839. # Resize in-place does not always work since
  840. # the array might not be single-segment
  841. try:
  842. self.data.resize(shape)
  843. except ValueError:
  844. self.__dict__['data'] = np.resize(self.data, shape).astype(self.data.dtype)
  845. self.data[index] = data
  846. def _default_encoded_fill_value(self):
  847. """
  848. The default encoded fill-value for this Variable's data type.
  849. """
  850. nc_type = REVERSE[self.typecode(), self.itemsize()]
  851. return FILLMAP[nc_type]
  852. def _get_encoded_fill_value(self):
  853. """
  854. Returns the encoded fill value for this variable as bytes.
  855. This is taken from either the _FillValue attribute, or the default fill
  856. value for this variable's data type.
  857. """
  858. if '_FillValue' in self._attributes:
  859. fill_value = np.array(self._attributes['_FillValue'],
  860. dtype=self.data.dtype).tobytes()
  861. if len(fill_value) == self.itemsize():
  862. return fill_value
  863. else:
  864. return self._default_encoded_fill_value()
  865. else:
  866. return self._default_encoded_fill_value()
  867. def _get_missing_value(self):
  868. """
  869. Returns the value denoting "no data" for this variable.
  870. If this variable does not have a missing/fill value, returns None.
  871. If both _FillValue and missing_value are given, give precedence to
  872. _FillValue. The netCDF standard gives special meaning to _FillValue;
  873. missing_value is just used for compatibility with old datasets.
  874. """
  875. if '_FillValue' in self._attributes:
  876. missing_value = self._attributes['_FillValue']
  877. elif 'missing_value' in self._attributes:
  878. missing_value = self._attributes['missing_value']
  879. else:
  880. missing_value = None
  881. return missing_value
  882. @staticmethod
  883. def _apply_missing_value(data, missing_value):
  884. """
  885. Applies the given missing value to the data array.
  886. Returns a numpy.ma array, with any value equal to missing_value masked
  887. out (unless missing_value is None, in which case the original array is
  888. returned).
  889. """
  890. if missing_value is None:
  891. newdata = data
  892. else:
  893. try:
  894. missing_value_isnan = np.isnan(missing_value)
  895. except (TypeError, NotImplementedError):
  896. # some data types (e.g., characters) cannot be tested for NaN
  897. missing_value_isnan = False
  898. if missing_value_isnan:
  899. mymask = np.isnan(data)
  900. else:
  901. mymask = (data == missing_value)
  902. newdata = np.ma.masked_where(mymask, data)
  903. return newdata
  904. NetCDFFile = netcdf_file
  905. NetCDFVariable = netcdf_variable