# recfunctions.py
  1. """
  2. Collection of utilities to manipulate structured arrays.
  3. Most of these functions were initially implemented by John Hunter for
  4. matplotlib. They have been rewritten and extended for convenience.
  5. """
  6. import itertools
  7. import numpy as np
  8. import numpy.ma as ma
  9. from numpy import ndarray, recarray
  10. from numpy.ma import MaskedArray
  11. from numpy.ma.mrecords import MaskedRecords
  12. from numpy.core.overrides import array_function_dispatch
  13. from numpy.lib._iotools import _is_string_like
# Local alias for the fill-value validation helper living in numpy.ma.core.
_check_fill_value = np.ma.core._check_fill_value

# Public API of ``numpy.lib.recfunctions`` (alphabetical).
__all__ = [
    'append_fields', 'apply_along_fields', 'assign_fields_by_name',
    'drop_fields', 'find_duplicates', 'flatten_descr',
    'get_fieldstructure', 'get_names', 'get_names_flat',
    'join_by', 'merge_arrays', 'rec_append_fields',
    'rec_drop_fields', 'rec_join', 'recursive_fill_fields',
    'rename_fields', 'repack_fields', 'require_fields',
    'stack_arrays', 'structured_to_unstructured', 'unstructured_to_structured',
    ]
  24. def _recursive_fill_fields_dispatcher(input, output):
  25. return (input, output)
  26. @array_function_dispatch(_recursive_fill_fields_dispatcher)
  27. def recursive_fill_fields(input, output):
  28. """
  29. Fills fields from output with fields from input,
  30. with support for nested structures.
  31. Parameters
  32. ----------
  33. input : ndarray
  34. Input array.
  35. output : ndarray
  36. Output array.
  37. Notes
  38. -----
  39. * `output` should be at least the same size as `input`
  40. Examples
  41. --------
  42. >>> from numpy.lib import recfunctions as rfn
  43. >>> a = np.array([(1, 10.), (2, 20.)], dtype=[('A', np.int64), ('B', np.float64)])
  44. >>> b = np.zeros((3,), dtype=a.dtype)
  45. >>> rfn.recursive_fill_fields(a, b)
  46. array([(1, 10.), (2, 20.), (0, 0.)], dtype=[('A', '<i8'), ('B', '<f8')])
  47. """
  48. newdtype = output.dtype
  49. for field in newdtype.names:
  50. try:
  51. current = input[field]
  52. except ValueError:
  53. continue
  54. if current.dtype.names is not None:
  55. recursive_fill_fields(current, output[field])
  56. else:
  57. output[field][:len(current)] = current
  58. return output
  59. def _get_fieldspec(dtype):
  60. """
  61. Produce a list of name/dtype pairs corresponding to the dtype fields
  62. Similar to dtype.descr, but the second item of each tuple is a dtype, not a
  63. string. As a result, this handles subarray dtypes
  64. Can be passed to the dtype constructor to reconstruct the dtype, noting that
  65. this (deliberately) discards field offsets.
  66. Examples
  67. --------
  68. >>> dt = np.dtype([(('a', 'A'), np.int64), ('b', np.double, 3)])
  69. >>> dt.descr
  70. [(('a', 'A'), '<i8'), ('b', '<f8', (3,))]
  71. >>> _get_fieldspec(dt)
  72. [(('a', 'A'), dtype('int64')), ('b', dtype(('<f8', (3,))))]
  73. """
  74. if dtype.names is None:
  75. # .descr returns a nameless field, so we should too
  76. return [('', dtype)]
  77. else:
  78. fields = ((name, dtype.fields[name]) for name in dtype.names)
  79. # keep any titles, if present
  80. return [
  81. (name if len(f) == 2 else (f[2], name), f[0])
  82. for name, f in fields
  83. ]
  84. def get_names(adtype):
  85. """
  86. Returns the field names of the input datatype as a tuple. Input datatype
  87. must have fields otherwise error is raised.
  88. Parameters
  89. ----------
  90. adtype : dtype
  91. Input datatype
  92. Examples
  93. --------
  94. >>> from numpy.lib import recfunctions as rfn
  95. >>> rfn.get_names(np.empty((1,), dtype=[('A', int)]).dtype)
  96. ('A',)
  97. >>> rfn.get_names(np.empty((1,), dtype=[('A',int), ('B', float)]).dtype)
  98. ('A', 'B')
  99. >>> adtype = np.dtype([('a', int), ('b', [('ba', int), ('bb', int)])])
  100. >>> rfn.get_names(adtype)
  101. ('a', ('b', ('ba', 'bb')))
  102. """
  103. listnames = []
  104. names = adtype.names
  105. for name in names:
  106. current = adtype[name]
  107. if current.names is not None:
  108. listnames.append((name, tuple(get_names(current))))
  109. else:
  110. listnames.append(name)
  111. return tuple(listnames)
  112. def get_names_flat(adtype):
  113. """
  114. Returns the field names of the input datatype as a tuple. Input datatype
  115. must have fields otherwise error is raised.
  116. Nested structure are flattened beforehand.
  117. Parameters
  118. ----------
  119. adtype : dtype
  120. Input datatype
  121. Examples
  122. --------
  123. >>> from numpy.lib import recfunctions as rfn
  124. >>> rfn.get_names_flat(np.empty((1,), dtype=[('A', int)]).dtype) is None
  125. False
  126. >>> rfn.get_names_flat(np.empty((1,), dtype=[('A',int), ('B', str)]).dtype)
  127. ('A', 'B')
  128. >>> adtype = np.dtype([('a', int), ('b', [('ba', int), ('bb', int)])])
  129. >>> rfn.get_names_flat(adtype)
  130. ('a', 'b', 'ba', 'bb')
  131. """
  132. listnames = []
  133. names = adtype.names
  134. for name in names:
  135. listnames.append(name)
  136. current = adtype[name]
  137. if current.names is not None:
  138. listnames.extend(get_names_flat(current))
  139. return tuple(listnames)
  140. def flatten_descr(ndtype):
  141. """
  142. Flatten a structured data-type description.
  143. Examples
  144. --------
  145. >>> from numpy.lib import recfunctions as rfn
  146. >>> ndtype = np.dtype([('a', '<i4'), ('b', [('ba', '<f8'), ('bb', '<i4')])])
  147. >>> rfn.flatten_descr(ndtype)
  148. (('a', dtype('int32')), ('ba', dtype('float64')), ('bb', dtype('int32')))
  149. """
  150. names = ndtype.names
  151. if names is None:
  152. return (('', ndtype),)
  153. else:
  154. descr = []
  155. for field in names:
  156. (typ, _) = ndtype.fields[field]
  157. if typ.names is not None:
  158. descr.extend(flatten_descr(typ))
  159. else:
  160. descr.append((field, typ))
  161. return tuple(descr)
  162. def _zip_dtype(seqarrays, flatten=False):
  163. newdtype = []
  164. if flatten:
  165. for a in seqarrays:
  166. newdtype.extend(flatten_descr(a.dtype))
  167. else:
  168. for a in seqarrays:
  169. current = a.dtype
  170. if current.names is not None and len(current.names) == 1:
  171. # special case - dtypes of 1 field are flattened
  172. newdtype.extend(_get_fieldspec(current))
  173. else:
  174. newdtype.append(('', current))
  175. return np.dtype(newdtype)
  176. def _zip_descr(seqarrays, flatten=False):
  177. """
  178. Combine the dtype description of a series of arrays.
  179. Parameters
  180. ----------
  181. seqarrays : sequence of arrays
  182. Sequence of arrays
  183. flatten : {boolean}, optional
  184. Whether to collapse nested descriptions.
  185. """
  186. return _zip_dtype(seqarrays, flatten=flatten).descr
def get_fieldstructure(adtype, lastname=None, parents=None,):
    """
    Returns a dictionary with fields indexing lists of their parent fields.

    This function is used to simplify access to fields nested in other fields.

    Parameters
    ----------
    adtype : np.dtype
        Input datatype
    lastname : optional
        Last processed field name (used internally during recursion).
    parents : dictionary
        Dictionary of parent fields (used internally during recursion).

    Examples
    --------
    >>> from numpy.lib import recfunctions as rfn
    >>> ndtype = np.dtype([('A', int),
    ...                    ('B', [('BA', int),
    ...                           ('BB', [('BBA', int), ('BBB', int)])])])
    >>> rfn.get_fieldstructure(ndtype)
    ... # XXX: possible regression, order of BBA and BBB is swapped
    {'A': [], 'B': [], 'BA': ['B'], 'BB': ['B'], 'BBA': ['B', 'BB'], 'BBB': ['B', 'BB']}
    """
    if parents is None:
        parents = {}
    names = adtype.names
    for name in names:
        current = adtype[name]
        if current.names is not None:
            # `name` is itself structured: record its own parent (if any),
            # then recurse so its children get registered with `name` as
            # their immediate ancestor.
            if lastname:
                parents[name] = [lastname, ]
            else:
                parents[name] = []
            parents.update(get_fieldstructure(current, name, parents))
        else:
            # Leaf field: its ancestry is the enclosing field's ancestry
            # plus the enclosing field itself. Copy so the shared list in
            # `parents` is not mutated.
            lastparent = [_ for _ in (parents.get(lastname, []) or [])]
            if lastparent:
                lastparent.append(lastname)
            elif lastname:
                lastparent = [lastname, ]
            parents[name] = lastparent or []
    return parents
  228. def _izip_fields_flat(iterable):
  229. """
  230. Returns an iterator of concatenated fields from a sequence of arrays,
  231. collapsing any nested structure.
  232. """
  233. for element in iterable:
  234. if isinstance(element, np.void):
  235. yield from _izip_fields_flat(tuple(element))
  236. else:
  237. yield element
def _izip_fields(iterable):
    """
    Returns an iterator of concatenated fields from a sequence of arrays,
    recursing into nested iterables (unlike `_izip_fields_flat`, strings
    are passed through whole).
    """
    for element in iterable:
        if (hasattr(element, '__iter__') and
                not isinstance(element, str)):
            # Generic non-string iterable: expand it in place.
            yield from _izip_fields(element)
        elif isinstance(element, np.void) and len(tuple(element)) == 1:
            # this statement is the same from the previous expression
            # NOTE(review): this branch only triggers for a non-iterable
            # 1-field np.void; recursing on `element` itself (not its
            # fields) assumes iterating it yields the lone field value --
            # confirm against the numpy version in use.
            yield from _izip_fields(element)
        else:
            yield element
  251. def _izip_records(seqarrays, fill_value=None, flatten=True):
  252. """
  253. Returns an iterator of concatenated items from a sequence of arrays.
  254. Parameters
  255. ----------
  256. seqarrays : sequence of arrays
  257. Sequence of arrays.
  258. fill_value : {None, integer}
  259. Value used to pad shorter iterables.
  260. flatten : {True, False},
  261. Whether to
  262. """
  263. # Should we flatten the items, or just use a nested approach
  264. if flatten:
  265. zipfunc = _izip_fields_flat
  266. else:
  267. zipfunc = _izip_fields
  268. for tup in itertools.zip_longest(*seqarrays, fillvalue=fill_value):
  269. yield tuple(zipfunc(tup))
  270. def _fix_output(output, usemask=True, asrecarray=False):
  271. """
  272. Private function: return a recarray, a ndarray, a MaskedArray
  273. or a MaskedRecords depending on the input parameters
  274. """
  275. if not isinstance(output, MaskedArray):
  276. usemask = False
  277. if usemask:
  278. if asrecarray:
  279. output = output.view(MaskedRecords)
  280. else:
  281. output = ma.filled(output)
  282. if asrecarray:
  283. output = output.view(recarray)
  284. return output
  285. def _fix_defaults(output, defaults=None):
  286. """
  287. Update the fill_value and masked data of `output`
  288. from the default given in a dictionary defaults.
  289. """
  290. names = output.dtype.names
  291. (data, mask, fill_value) = (output.data, output.mask, output.fill_value)
  292. for (k, v) in (defaults or {}).items():
  293. if k in names:
  294. fill_value[k] = v
  295. data[k][mask[k]] = v
  296. return output
def _merge_arrays_dispatcher(seqarrays, fill_value=None, flatten=None,
                             usemask=None, asrecarray=None):
    return seqarrays


@array_function_dispatch(_merge_arrays_dispatcher)
def merge_arrays(seqarrays, fill_value=-1, flatten=False,
                 usemask=False, asrecarray=False):
    """
    Merge arrays field by field.

    Parameters
    ----------
    seqarrays : sequence of ndarrays
        Sequence of arrays
    fill_value : {float}, optional
        Filling value used to pad missing data on the shorter arrays.
    flatten : {False, True}, optional
        Whether to collapse nested fields.
    usemask : {False, True}, optional
        Whether to return a masked array or not.
    asrecarray : {False, True}, optional
        Whether to return a recarray (MaskedRecords) or not.

    Examples
    --------
    >>> from numpy.lib import recfunctions as rfn
    >>> rfn.merge_arrays((np.array([1, 2]), np.array([10., 20., 30.])))
    array([( 1, 10.), ( 2, 20.), (-1, 30.)],
          dtype=[('f0', '<i8'), ('f1', '<f8')])

    >>> rfn.merge_arrays((np.array([1, 2], dtype=np.int64),
    ...         np.array([10., 20., 30.])), usemask=False)
    array([(1, 10.0), (2, 20.0), (-1, 30.0)],
          dtype=[('f0', '<i8'), ('f1', '<f8')])
    >>> rfn.merge_arrays((np.array([1, 2]).view([('a', np.int64)]),
    ...               np.array([10., 20., 30.])),
    ...              usemask=False, asrecarray=True)
    rec.array([( 1, 10.), ( 2, 20.), (-1, 30.)],
              dtype=[('a', '<i8'), ('f1', '<f8')])

    Notes
    -----
    * Without a mask, the missing value will be filled with something,
      depending on what its corresponding type:

      * ``-1``      for integers
      * ``-1.0``    for floating point numbers
      * ``'-'``     for characters
      * ``'-1'``    for strings
      * ``True``    for boolean values
    * XXX: I just obtained these values empirically
    """
    # Only one item in the input sequence ?
    if (len(seqarrays) == 1):
        seqarrays = np.asanyarray(seqarrays[0])
    # Do we have a single ndarray as input ?
    if isinstance(seqarrays, (ndarray, np.void)):
        seqdtype = seqarrays.dtype
        # Make sure we have named fields
        if seqdtype.names is None:
            seqdtype = np.dtype([('', seqdtype)])
        if not flatten or _zip_dtype((seqarrays,), flatten=True) == seqdtype:
            # Minimal processing needed: just make sure everything's a-ok
            seqarrays = seqarrays.ravel()
            # Find what type of array we must return
            if usemask:
                if asrecarray:
                    seqtype = MaskedRecords
                else:
                    seqtype = MaskedArray
            elif asrecarray:
                seqtype = recarray
            else:
                seqtype = ndarray
            return seqarrays.view(dtype=seqdtype, type=seqtype)
        else:
            # Flattening is requested and would change the dtype: fall
            # through to the generic path with a 1-element sequence.
            seqarrays = (seqarrays,)
    else:
        # Make sure we have arrays in the input sequence
        seqarrays = [np.asanyarray(_m) for _m in seqarrays]
    # Find the sizes of the inputs and their maximum
    sizes = tuple(a.size for a in seqarrays)
    maxlength = max(sizes)
    # Get the dtype of the output (flattening if needed)
    newdtype = _zip_dtype(seqarrays, flatten=flatten)
    # Initialize the sequences for data and mask
    seqdata = []
    seqmask = []
    # If we expect some kind of MaskedArray, make a special loop.
    if usemask:
        for (a, n) in zip(seqarrays, sizes):
            nbmissing = (maxlength - n)
            # Get the data and mask
            data = a.ravel().__array__()
            mask = ma.getmaskarray(a).ravel()
            # Get the filling value (if needed)
            if nbmissing:
                fval = _check_fill_value(fill_value, a.dtype)
                if isinstance(fval, (ndarray, np.void)):
                    if len(fval.dtype) == 1:
                        # Single-field fill value: unwrap to a scalar.
                        fval = fval.item()[0]
                        fmsk = True
                    else:
                        fval = np.array(fval, dtype=a.dtype, ndmin=1)
                        fmsk = np.ones((1,), dtype=mask.dtype)
            else:
                fval = None
                fmsk = True
            # Store an iterator padding the input to the expected length
            seqdata.append(itertools.chain(data, [fval] * nbmissing))
            seqmask.append(itertools.chain(mask, [fmsk] * nbmissing))
        # Create an iterator for the data
        data = tuple(_izip_records(seqdata, flatten=flatten))
        output = ma.array(np.fromiter(data, dtype=newdtype, count=maxlength),
                          mask=list(_izip_records(seqmask, flatten=flatten)))
        if asrecarray:
            output = output.view(MaskedRecords)
    else:
        # Same as before, without the mask we don't need...
        for (a, n) in zip(seqarrays, sizes):
            nbmissing = (maxlength - n)
            data = a.ravel().__array__()
            if nbmissing:
                fval = _check_fill_value(fill_value, a.dtype)
                if isinstance(fval, (ndarray, np.void)):
                    if len(fval.dtype) == 1:
                        fval = fval.item()[0]
                    else:
                        fval = np.array(fval, dtype=a.dtype, ndmin=1)
            else:
                fval = None
            seqdata.append(itertools.chain(data, [fval] * nbmissing))
        output = np.fromiter(tuple(_izip_records(seqdata, flatten=flatten)),
                             dtype=newdtype, count=maxlength)
        if asrecarray:
            output = output.view(recarray)
    # And we're done...
    return output
  429. def _drop_fields_dispatcher(base, drop_names, usemask=None, asrecarray=None):
  430. return (base,)
  431. @array_function_dispatch(_drop_fields_dispatcher)
  432. def drop_fields(base, drop_names, usemask=True, asrecarray=False):
  433. """
  434. Return a new array with fields in `drop_names` dropped.
  435. Nested fields are supported.
  436. .. versionchanged:: 1.18.0
  437. `drop_fields` returns an array with 0 fields if all fields are dropped,
  438. rather than returning ``None`` as it did previously.
  439. Parameters
  440. ----------
  441. base : array
  442. Input array
  443. drop_names : string or sequence
  444. String or sequence of strings corresponding to the names of the
  445. fields to drop.
  446. usemask : {False, True}, optional
  447. Whether to return a masked array or not.
  448. asrecarray : string or sequence, optional
  449. Whether to return a recarray or a mrecarray (`asrecarray=True`) or
  450. a plain ndarray or masked array with flexible dtype. The default
  451. is False.
  452. Examples
  453. --------
  454. >>> from numpy.lib import recfunctions as rfn
  455. >>> a = np.array([(1, (2, 3.0)), (4, (5, 6.0))],
  456. ... dtype=[('a', np.int64), ('b', [('ba', np.double), ('bb', np.int64)])])
  457. >>> rfn.drop_fields(a, 'a')
  458. array([((2., 3),), ((5., 6),)],
  459. dtype=[('b', [('ba', '<f8'), ('bb', '<i8')])])
  460. >>> rfn.drop_fields(a, 'ba')
  461. array([(1, (3,)), (4, (6,))], dtype=[('a', '<i8'), ('b', [('bb', '<i8')])])
  462. >>> rfn.drop_fields(a, ['ba', 'bb'])
  463. array([(1,), (4,)], dtype=[('a', '<i8')])
  464. """
  465. if _is_string_like(drop_names):
  466. drop_names = [drop_names]
  467. else:
  468. drop_names = set(drop_names)
  469. def _drop_descr(ndtype, drop_names):
  470. names = ndtype.names
  471. newdtype = []
  472. for name in names:
  473. current = ndtype[name]
  474. if name in drop_names:
  475. continue
  476. if current.names is not None:
  477. descr = _drop_descr(current, drop_names)
  478. if descr:
  479. newdtype.append((name, descr))
  480. else:
  481. newdtype.append((name, current))
  482. return newdtype
  483. newdtype = _drop_descr(base.dtype, drop_names)
  484. output = np.empty(base.shape, dtype=newdtype)
  485. output = recursive_fill_fields(base, output)
  486. return _fix_output(output, usemask=usemask, asrecarray=asrecarray)
  487. def _keep_fields(base, keep_names, usemask=True, asrecarray=False):
  488. """
  489. Return a new array keeping only the fields in `keep_names`,
  490. and preserving the order of those fields.
  491. Parameters
  492. ----------
  493. base : array
  494. Input array
  495. keep_names : string or sequence
  496. String or sequence of strings corresponding to the names of the
  497. fields to keep. Order of the names will be preserved.
  498. usemask : {False, True}, optional
  499. Whether to return a masked array or not.
  500. asrecarray : string or sequence, optional
  501. Whether to return a recarray or a mrecarray (`asrecarray=True`) or
  502. a plain ndarray or masked array with flexible dtype. The default
  503. is False.
  504. """
  505. newdtype = [(n, base.dtype[n]) for n in keep_names]
  506. output = np.empty(base.shape, dtype=newdtype)
  507. output = recursive_fill_fields(base, output)
  508. return _fix_output(output, usemask=usemask, asrecarray=asrecarray)
  509. def _rec_drop_fields_dispatcher(base, drop_names):
  510. return (base,)
  511. @array_function_dispatch(_rec_drop_fields_dispatcher)
  512. def rec_drop_fields(base, drop_names):
  513. """
  514. Returns a new numpy.recarray with fields in `drop_names` dropped.
  515. """
  516. return drop_fields(base, drop_names, usemask=False, asrecarray=True)
  517. def _rename_fields_dispatcher(base, namemapper):
  518. return (base,)
  519. @array_function_dispatch(_rename_fields_dispatcher)
  520. def rename_fields(base, namemapper):
  521. """
  522. Rename the fields from a flexible-datatype ndarray or recarray.
  523. Nested fields are supported.
  524. Parameters
  525. ----------
  526. base : ndarray
  527. Input array whose fields must be modified.
  528. namemapper : dictionary
  529. Dictionary mapping old field names to their new version.
  530. Examples
  531. --------
  532. >>> from numpy.lib import recfunctions as rfn
  533. >>> a = np.array([(1, (2, [3.0, 30.])), (4, (5, [6.0, 60.]))],
  534. ... dtype=[('a', int),('b', [('ba', float), ('bb', (float, 2))])])
  535. >>> rfn.rename_fields(a, {'a':'A', 'bb':'BB'})
  536. array([(1, (2., [ 3., 30.])), (4, (5., [ 6., 60.]))],
  537. dtype=[('A', '<i8'), ('b', [('ba', '<f8'), ('BB', '<f8', (2,))])])
  538. """
  539. def _recursive_rename_fields(ndtype, namemapper):
  540. newdtype = []
  541. for name in ndtype.names:
  542. newname = namemapper.get(name, name)
  543. current = ndtype[name]
  544. if current.names is not None:
  545. newdtype.append(
  546. (newname, _recursive_rename_fields(current, namemapper))
  547. )
  548. else:
  549. newdtype.append((newname, current))
  550. return newdtype
  551. newdtype = _recursive_rename_fields(base.dtype, namemapper)
  552. return base.view(newdtype)
def _append_fields_dispatcher(base, names, data, dtypes=None,
                              fill_value=None, usemask=None, asrecarray=None):
    yield base
    yield from data


@array_function_dispatch(_append_fields_dispatcher)
def append_fields(base, names, data, dtypes=None,
                  fill_value=-1, usemask=True, asrecarray=False):
    """
    Add new fields to an existing array.

    The names of the fields are given with the `names` arguments,
    the corresponding values with the `data` arguments.
    If a single field is appended, `names`, `data` and `dtypes` do not have
    to be lists but just values.

    Parameters
    ----------
    base : array
        Input array to extend.
    names : string, sequence
        String or sequence of strings corresponding to the names
        of the new fields.
    data : array or sequence of arrays
        Array or sequence of arrays storing the fields to add to the base.
    dtypes : sequence of datatypes, optional
        Datatype or sequence of datatypes.
        If None, the datatypes are estimated from the `data`.
    fill_value : {float}, optional
        Filling value used to pad missing data on the shorter arrays.
    usemask : {False, True}, optional
        Whether to return a masked array or not.
    asrecarray : {False, True}, optional
        Whether to return a recarray (MaskedRecords) or not.
    """
    # Check the names: a sequence of names must match `data` in length;
    # a single string is promoted to 1-element lists for both.
    if isinstance(names, (tuple, list)):
        if len(names) != len(data):
            msg = "The number of arrays does not match the number of names"
            raise ValueError(msg)
    elif isinstance(names, str):
        names = [names, ]
        data = [data, ]
    # Wrap each data array as a 1-field structured array named after it.
    if dtypes is None:
        data = [np.array(a, copy=False, subok=True) for a in data]
        data = [a.view([(name, a.dtype)]) for (name, a) in zip(names, data)]
    else:
        if not isinstance(dtypes, (tuple, list)):
            dtypes = [dtypes, ]
        if len(data) != len(dtypes):
            # A single dtype is broadcast to every data array.
            if len(dtypes) == 1:
                dtypes = dtypes * len(data)
            else:
                msg = "The dtypes argument must be None, a dtype, or a list."
                raise ValueError(msg)
        data = [np.array(a, copy=False, subok=True, dtype=d).view([(n, d)])
                for (a, n, d) in zip(data, names, dtypes)]
    # Normalize `base`, and merge the new fields into a single array.
    base = merge_arrays(base, usemask=usemask, fill_value=fill_value)
    if len(data) > 1:
        data = merge_arrays(data, flatten=True, usemask=usemask,
                            fill_value=fill_value)
    else:
        data = data.pop()
    # Allocate a fully-masked output wide enough for both field sets,
    # then fill it from `base` and `data` in turn.
    output = ma.masked_all(
        max(len(base), len(data)),
        dtype=_get_fieldspec(base.dtype) + _get_fieldspec(data.dtype))
    output = recursive_fill_fields(base, output)
    output = recursive_fill_fields(data, output)
    # Cast to the requested output flavor.
    return _fix_output(output, usemask=usemask, asrecarray=asrecarray)
  623. def _rec_append_fields_dispatcher(base, names, data, dtypes=None):
  624. yield base
  625. yield from data
  626. @array_function_dispatch(_rec_append_fields_dispatcher)
  627. def rec_append_fields(base, names, data, dtypes=None):
  628. """
  629. Add new fields to an existing array.
  630. The names of the fields are given with the `names` arguments,
  631. the corresponding values with the `data` arguments.
  632. If a single field is appended, `names`, `data` and `dtypes` do not have
  633. to be lists but just values.
  634. Parameters
  635. ----------
  636. base : array
  637. Input array to extend.
  638. names : string, sequence
  639. String or sequence of strings corresponding to the names
  640. of the new fields.
  641. data : array or sequence of arrays
  642. Array or sequence of arrays storing the fields to add to the base.
  643. dtypes : sequence of datatypes, optional
  644. Datatype or sequence of datatypes.
  645. If None, the datatypes are estimated from the `data`.
  646. See Also
  647. --------
  648. append_fields
  649. Returns
  650. -------
  651. appended_array : np.recarray
  652. """
  653. return append_fields(base, names, data=data, dtypes=dtypes,
  654. asrecarray=True, usemask=False)
  655. def _repack_fields_dispatcher(a, align=None, recurse=None):
  656. return (a,)
  657. @array_function_dispatch(_repack_fields_dispatcher)
  658. def repack_fields(a, align=False, recurse=False):
  659. """
  660. Re-pack the fields of a structured array or dtype in memory.
  661. The memory layout of structured datatypes allows fields at arbitrary
  662. byte offsets. This means the fields can be separated by padding bytes,
  663. their offsets can be non-monotonically increasing, and they can overlap.
  664. This method removes any overlaps and reorders the fields in memory so they
  665. have increasing byte offsets, and adds or removes padding bytes depending
  666. on the `align` option, which behaves like the `align` option to
  667. `numpy.dtype`.
  668. If `align=False`, this method produces a "packed" memory layout in which
  669. each field starts at the byte the previous field ended, and any padding
  670. bytes are removed.
  671. If `align=True`, this methods produces an "aligned" memory layout in which
  672. each field's offset is a multiple of its alignment, and the total itemsize
  673. is a multiple of the largest alignment, by adding padding bytes as needed.
  674. Parameters
  675. ----------
  676. a : ndarray or dtype
  677. array or dtype for which to repack the fields.
  678. align : boolean
  679. If true, use an "aligned" memory layout, otherwise use a "packed" layout.
  680. recurse : boolean
  681. If True, also repack nested structures.
  682. Returns
  683. -------
  684. repacked : ndarray or dtype
  685. Copy of `a` with fields repacked, or `a` itself if no repacking was
  686. needed.
  687. Examples
  688. --------
  689. >>> from numpy.lib import recfunctions as rfn
  690. >>> def print_offsets(d):
  691. ... print("offsets:", [d.fields[name][1] for name in d.names])
  692. ... print("itemsize:", d.itemsize)
  693. ...
  694. >>> dt = np.dtype('u1, <i8, <f8', align=True)
  695. >>> dt
  696. dtype({'names': ['f0', 'f1', 'f2'], 'formats': ['u1', '<i8', '<f8'], \
  697. 'offsets': [0, 8, 16], 'itemsize': 24}, align=True)
  698. >>> print_offsets(dt)
  699. offsets: [0, 8, 16]
  700. itemsize: 24
  701. >>> packed_dt = rfn.repack_fields(dt)
  702. >>> packed_dt
  703. dtype([('f0', 'u1'), ('f1', '<i8'), ('f2', '<f8')])
  704. >>> print_offsets(packed_dt)
  705. offsets: [0, 1, 9]
  706. itemsize: 17
  707. """
  708. if not isinstance(a, np.dtype):
  709. dt = repack_fields(a.dtype, align=align, recurse=recurse)
  710. return a.astype(dt, copy=False)
  711. if a.names is None:
  712. return a
  713. fieldinfo = []
  714. for name in a.names:
  715. tup = a.fields[name]
  716. if recurse:
  717. fmt = repack_fields(tup[0], align=align, recurse=True)
  718. else:
  719. fmt = tup[0]
  720. if len(tup) == 3:
  721. name = (tup[2], name)
  722. fieldinfo.append((name, fmt))
  723. dt = np.dtype(fieldinfo, align=align)
  724. return np.dtype((a.type, dt))
  725. def _get_fields_and_offsets(dt, offset=0):
  726. """
  727. Returns a flat list of (dtype, count, offset) tuples of all the
  728. scalar fields in the dtype "dt", including nested fields, in left
  729. to right order.
  730. """
  731. # counts up elements in subarrays, including nested subarrays, and returns
  732. # base dtype and count
  733. def count_elem(dt):
  734. count = 1
  735. while dt.shape != ():
  736. for size in dt.shape:
  737. count *= size
  738. dt = dt.base
  739. return dt, count
  740. fields = []
  741. for name in dt.names:
  742. field = dt.fields[name]
  743. f_dt, f_offset = field[0], field[1]
  744. f_dt, n = count_elem(f_dt)
  745. if f_dt.names is None:
  746. fields.append((np.dtype((f_dt, (n,))), n, f_offset + offset))
  747. else:
  748. subfields = _get_fields_and_offsets(f_dt, f_offset + offset)
  749. size = f_dt.itemsize
  750. for i in range(n):
  751. if i == 0:
  752. # optimization: avoid list comprehension if no subarray
  753. fields.extend(subfields)
  754. else:
  755. fields.extend([(d, c, o + i*size) for d, c, o in subfields])
  756. return fields
def _structured_to_unstructured_dispatcher(arr, dtype=None, copy=None,
                                           casting=None):
    # __array_function__ dispatcher: only `arr` takes part in dispatch.
    return (arr,)
@array_function_dispatch(_structured_to_unstructured_dispatcher)
def structured_to_unstructured(arr, dtype=None, copy=False, casting='unsafe'):
    """
    Converts an n-D structured array into an (n+1)-D unstructured array.

    The new array will have a new last dimension equal in size to the
    number of field-elements of the input array. If not supplied, the output
    datatype is determined from the numpy type promotion rules applied to all
    the field datatypes.

    Nested fields, as well as each element of any subarray fields, all count
    as a single field-elements.

    Parameters
    ----------
    arr : ndarray
        Structured array or dtype to convert. Cannot contain object datatype.
    dtype : dtype, optional
        The dtype of the output unstructured array.
    copy : bool, optional
        See copy argument to `numpy.ndarray.astype`. If true, always return a
        copy. If false, and `dtype` requirements are satisfied, a view is
        returned.
    casting : {'no', 'equiv', 'safe', 'same_kind', 'unsafe'}, optional
        See casting argument of `numpy.ndarray.astype`. Controls what kind of
        data casting may occur.

    Returns
    -------
    unstructured : ndarray
        Unstructured array with one more dimension.

    Examples
    --------
    >>> from numpy.lib import recfunctions as rfn
    >>> a = np.zeros(4, dtype=[('a', 'i4'), ('b', 'f4,u2'), ('c', 'f4', 2)])
    >>> a
    array([(0, (0., 0), [0., 0.]), (0, (0., 0), [0., 0.]),
           (0, (0., 0), [0., 0.]), (0, (0., 0), [0., 0.])],
          dtype=[('a', '<i4'), ('b', [('f0', '<f4'), ('f1', '<u2')]), ('c', '<f4', (2,))])
    >>> rfn.structured_to_unstructured(a)
    array([[0., 0., 0., 0., 0.],
           [0., 0., 0., 0., 0.],
           [0., 0., 0., 0., 0.],
           [0., 0., 0., 0., 0.]])

    >>> b = np.array([(1, 2, 5), (4, 5, 7), (7, 8 ,11), (10, 11, 12)],
    ...              dtype=[('x', 'i4'), ('y', 'f4'), ('z', 'f8')])
    >>> np.mean(rfn.structured_to_unstructured(b[['x', 'z']]), axis=-1)
    array([ 3. ,  5.5,  9. , 11. ])
    """
    if arr.dtype.names is None:
        raise ValueError('arr must be a structured array')

    fields = _get_fields_and_offsets(arr.dtype)
    n_fields = len(fields)
    if n_fields == 0 and dtype is None:
        raise ValueError("arr has no fields. Unable to guess dtype")
    elif n_fields == 0:
        # too many bugs elsewhere for this to work now
        raise NotImplementedError("arr with no fields is not supported")

    dts, counts, offsets = zip(*fields)
    names = ['f{}'.format(n) for n in range(n_fields)]

    if dtype is None:
        # Promote across all scalar field dtypes (dt.base strips subarray
        # shapes so promotion sees the scalar types).
        out_dtype = np.result_type(*[dt.base for dt in dts])
    else:
        out_dtype = dtype

    # Use a series of views and casts to convert to an unstructured array:

    # first view using flattened fields (doesn't work for object arrays)
    # Note: dts may include a shape for subarrays
    flattened_fields = np.dtype({'names': names,
                                 'formats': dts,
                                 'offsets': offsets,
                                 'itemsize': arr.dtype.itemsize})
    arr = arr.view(flattened_fields)

    # next cast to a packed format with all fields converted to new dtype
    packed_fields = np.dtype({'names': names,
                              'formats': [(out_dtype, dt.shape) for dt in dts]})
    arr = arr.astype(packed_fields, copy=copy, casting=casting)

    # finally is it safe to view the packed fields as the unstructured type
    return arr.view((out_dtype, (sum(counts),)))
def _unstructured_to_structured_dispatcher(arr, dtype=None, names=None,
                                           align=None, copy=None, casting=None):
    # __array_function__ dispatcher: only `arr` takes part in dispatch.
    return (arr,)
@array_function_dispatch(_unstructured_to_structured_dispatcher)
def unstructured_to_structured(arr, dtype=None, names=None, align=False,
                               copy=False, casting='unsafe'):
    """
    Converts an n-D unstructured array into an (n-1)-D structured array.

    The last dimension of the input array is converted into a structure, with
    number of field-elements equal to the size of the last dimension of the
    input array. By default all output fields have the input array's dtype, but
    an output structured dtype with an equal number of fields-elements can be
    supplied instead.

    Nested fields, as well as each element of any subarray fields, all count
    towards the number of field-elements.

    Parameters
    ----------
    arr : ndarray
        Unstructured array or dtype to convert.
    dtype : dtype, optional
        The structured dtype of the output array
    names : list of strings, optional
        If dtype is not supplied, this specifies the field names for the output
        dtype, in order. The field dtypes will be the same as the input array.
    align : boolean, optional
        Whether to create an aligned memory layout.
    copy : bool, optional
        See copy argument to `numpy.ndarray.astype`. If true, always return a
        copy. If false, and `dtype` requirements are satisfied, a view is
        returned.
    casting : {'no', 'equiv', 'safe', 'same_kind', 'unsafe'}, optional
        See casting argument of `numpy.ndarray.astype`. Controls what kind of
        data casting may occur.

    Returns
    -------
    structured : ndarray
        Structured array with fewer dimensions.

    Examples
    --------
    >>> from numpy.lib import recfunctions as rfn
    >>> dt = np.dtype([('a', 'i4'), ('b', 'f4,u2'), ('c', 'f4', 2)])
    >>> a = np.arange(20).reshape((4,5))
    >>> a
    array([[ 0,  1,  2,  3,  4],
           [ 5,  6,  7,  8,  9],
           [10, 11, 12, 13, 14],
           [15, 16, 17, 18, 19]])
    >>> rfn.unstructured_to_structured(a, dt)
    array([( 0, ( 1.,  2), [ 3.,  4.]), ( 5, ( 6.,  7), [ 8.,  9.]),
           (10, (11., 12), [13., 14.]), (15, (16., 17), [18., 19.])],
          dtype=[('a', '<i4'), ('b', [('f0', '<f4'), ('f1', '<u2')]), ('c', '<f4', (2,))])
    """
    if arr.shape == ():
        raise ValueError('arr must have at least one dimension')

    n_elem = arr.shape[-1]
    if n_elem == 0:
        # too many bugs elsewhere for this to work now
        raise NotImplementedError("last axis with size 0 is not supported")

    if dtype is None:
        if names is None:
            names = ['f{}'.format(n) for n in range(n_elem)]
        out_dtype = np.dtype([(n, arr.dtype) for n in names], align=align)
        fields = _get_fields_and_offsets(out_dtype)
        dts, counts, offsets = zip(*fields)
    else:
        if names is not None:
            raise ValueError("don't supply both dtype and names")
        # if dtype is the args of np.dtype, construct it
        dtype = np.dtype(dtype)
        # sanity check of the input dtype
        fields = _get_fields_and_offsets(dtype)
        if len(fields) == 0:
            dts, counts, offsets = [], [], []
        else:
            dts, counts, offsets = zip(*fields)

        if n_elem != sum(counts):
            raise ValueError('The length of the last dimension of arr must '
                             'be equal to the number of fields in dtype')
        out_dtype = dtype
        if align and not out_dtype.isalignedstruct:
            raise ValueError("align was True but dtype is not aligned")

    names = ['f{}'.format(n) for n in range(len(fields))]

    # Use a series of views and casts to convert to a structured array:

    # first view as a packed structured array of one dtype
    packed_fields = np.dtype({'names': names,
                              'formats': [(arr.dtype, dt.shape) for dt in dts]})
    arr = np.ascontiguousarray(arr).view(packed_fields)

    # next cast to an unpacked but flattened format with varied dtypes
    flattened_fields = np.dtype({'names': names,
                                 'formats': dts,
                                 'offsets': offsets,
                                 'itemsize': out_dtype.itemsize})
    arr = arr.astype(flattened_fields, copy=copy, casting=casting)

    # finally view as the final nested dtype and remove the last axis
    return arr.view(out_dtype)[..., 0]
def _apply_along_fields_dispatcher(func, arr):
    # __array_function__ dispatcher: only `arr` takes part in dispatch.
    return (arr,)
  931. @array_function_dispatch(_apply_along_fields_dispatcher)
  932. def apply_along_fields(func, arr):
  933. """
  934. Apply function 'func' as a reduction across fields of a structured array.
  935. This is similar to `apply_along_axis`, but treats the fields of a
  936. structured array as an extra axis. The fields are all first cast to a
  937. common type following the type-promotion rules from `numpy.result_type`
  938. applied to the field's dtypes.
  939. Parameters
  940. ----------
  941. func : function
  942. Function to apply on the "field" dimension. This function must
  943. support an `axis` argument, like np.mean, np.sum, etc.
  944. arr : ndarray
  945. Structured array for which to apply func.
  946. Returns
  947. -------
  948. out : ndarray
  949. Result of the recution operation
  950. Examples
  951. --------
  952. >>> from numpy.lib import recfunctions as rfn
  953. >>> b = np.array([(1, 2, 5), (4, 5, 7), (7, 8 ,11), (10, 11, 12)],
  954. ... dtype=[('x', 'i4'), ('y', 'f4'), ('z', 'f8')])
  955. >>> rfn.apply_along_fields(np.mean, b)
  956. array([ 2.66666667, 5.33333333, 8.66666667, 11. ])
  957. >>> rfn.apply_along_fields(np.mean, b[['x', 'z']])
  958. array([ 3. , 5.5, 9. , 11. ])
  959. """
  960. if arr.dtype.names is None:
  961. raise ValueError('arr must be a structured array')
  962. uarr = structured_to_unstructured(arr)
  963. return func(uarr, axis=-1)
  964. # works and avoids axis requirement, but very, very slow:
  965. #return np.apply_along_axis(func, -1, uarr)
def _assign_fields_by_name_dispatcher(dst, src, zero_unassigned=None):
    # __array_function__ dispatcher: both arrays take part in dispatch.
    return dst, src
  968. @array_function_dispatch(_assign_fields_by_name_dispatcher)
  969. def assign_fields_by_name(dst, src, zero_unassigned=True):
  970. """
  971. Assigns values from one structured array to another by field name.
  972. Normally in numpy >= 1.14, assignment of one structured array to another
  973. copies fields "by position", meaning that the first field from the src is
  974. copied to the first field of the dst, and so on, regardless of field name.
  975. This function instead copies "by field name", such that fields in the dst
  976. are assigned from the identically named field in the src. This applies
  977. recursively for nested structures. This is how structure assignment worked
  978. in numpy >= 1.6 to <= 1.13.
  979. Parameters
  980. ----------
  981. dst : ndarray
  982. src : ndarray
  983. The source and destination arrays during assignment.
  984. zero_unassigned : bool, optional
  985. If True, fields in the dst for which there was no matching
  986. field in the src are filled with the value 0 (zero). This
  987. was the behavior of numpy <= 1.13. If False, those fields
  988. are not modified.
  989. """
  990. if dst.dtype.names is None:
  991. dst[...] = src
  992. return
  993. for name in dst.dtype.names:
  994. if name not in src.dtype.names:
  995. if zero_unassigned:
  996. dst[name] = 0
  997. else:
  998. assign_fields_by_name(dst[name], src[name],
  999. zero_unassigned)
def _require_fields_dispatcher(array, required_dtype):
    # __array_function__ dispatcher: only `array` takes part in dispatch.
    return (array,)
  1002. @array_function_dispatch(_require_fields_dispatcher)
  1003. def require_fields(array, required_dtype):
  1004. """
  1005. Casts a structured array to a new dtype using assignment by field-name.
  1006. This function assigns from the old to the new array by name, so the
  1007. value of a field in the output array is the value of the field with the
  1008. same name in the source array. This has the effect of creating a new
  1009. ndarray containing only the fields "required" by the required_dtype.
  1010. If a field name in the required_dtype does not exist in the
  1011. input array, that field is created and set to 0 in the output array.
  1012. Parameters
  1013. ----------
  1014. a : ndarray
  1015. array to cast
  1016. required_dtype : dtype
  1017. datatype for output array
  1018. Returns
  1019. -------
  1020. out : ndarray
  1021. array with the new dtype, with field values copied from the fields in
  1022. the input array with the same name
  1023. Examples
  1024. --------
  1025. >>> from numpy.lib import recfunctions as rfn
  1026. >>> a = np.ones(4, dtype=[('a', 'i4'), ('b', 'f8'), ('c', 'u1')])
  1027. >>> rfn.require_fields(a, [('b', 'f4'), ('c', 'u1')])
  1028. array([(1., 1), (1., 1), (1., 1), (1., 1)],
  1029. dtype=[('b', '<f4'), ('c', 'u1')])
  1030. >>> rfn.require_fields(a, [('b', 'f4'), ('newf', 'u1')])
  1031. array([(1., 0), (1., 0), (1., 0), (1., 0)],
  1032. dtype=[('b', '<f4'), ('newf', 'u1')])
  1033. """
  1034. out = np.empty(array.shape, dtype=required_dtype)
  1035. assign_fields_by_name(out, array)
  1036. return out
def _stack_arrays_dispatcher(arrays, defaults=None, usemask=None,
                             asrecarray=None, autoconvert=None):
    # __array_function__ dispatcher: the sequence of input arrays itself
    # is the set of dispatch candidates.
    return arrays
@array_function_dispatch(_stack_arrays_dispatcher)
def stack_arrays(arrays, defaults=None, usemask=True, asrecarray=False,
                 autoconvert=False):
    """
    Superposes arrays fields by fields

    Parameters
    ----------
    arrays : array or sequence
        Sequence of input arrays.
    defaults : dictionary, optional
        Dictionary mapping field names to the corresponding default values.
    usemask : {True, False}, optional
        Whether to return a MaskedArray (or MaskedRecords is
        `asrecarray==True`) or a ndarray.
    asrecarray : {False, True}, optional
        Whether to return a recarray (or MaskedRecords if `usemask==True`)
        or just a flexible-type ndarray.
    autoconvert : {False, True}, optional
        Whether automatically cast the type of the field to the maximum.

    Examples
    --------
    >>> from numpy.lib import recfunctions as rfn
    >>> x = np.array([1, 2,])
    >>> rfn.stack_arrays(x) is x
    True
    >>> z = np.array([('A', 1), ('B', 2)], dtype=[('A', '|S3'), ('B', float)])
    >>> zz = np.array([('a', 10., 100.), ('b', 20., 200.), ('c', 30., 300.)],
    ...               dtype=[('A', '|S3'), ('B', np.double), ('C', np.double)])
    >>> test = rfn.stack_arrays((z,zz))
    >>> test
    masked_array(data=[(b'A', 1.0, --), (b'B', 2.0, --), (b'a', 10.0, 100.0),
                       (b'b', 20.0, 200.0), (b'c', 30.0, 300.0)],
                 mask=[(False, False,  True), (False, False,  True),
                       (False, False, False), (False, False, False),
                       (False, False, False)],
           fill_value=(b'N/A', 1.e+20, 1.e+20),
                dtype=[('A', 'S3'), ('B', '<f8'), ('C', '<f8')])
    """
    # A bare ndarray or a 1-element sequence needs no stacking.
    if isinstance(arrays, ndarray):
        return arrays
    elif len(arrays) == 1:
        return arrays[0]
    seqarrays = [np.asanyarray(a).ravel() for a in arrays]
    nrecords = [len(a) for a in seqarrays]
    ndtype = [a.dtype for a in seqarrays]
    fldnames = [d.names for d in ndtype]
    # Build the merged field description, starting from the first array's
    # fields and appending/unifying fields from the remaining arrays.
    dtype_l = ndtype[0]
    newdescr = _get_fieldspec(dtype_l)
    names = [n for n, d in newdescr]
    for dtype_n in ndtype[1:]:
        for fname, fdtype in _get_fieldspec(dtype_n):
            if fname not in names:
                newdescr.append((fname, fdtype))
                names.append(fname)
            else:
                nameidx = names.index(fname)
                _, cdtype = newdescr[nameidx]
                if autoconvert:
                    # Widen to the larger of the two dtypes.
                    newdescr[nameidx] = (fname, max(fdtype, cdtype))
                elif fdtype != cdtype:
                    raise TypeError("Incompatible type '%s' <> '%s'" %
                                    (cdtype, fdtype))
    # Only one field: use concatenate
    if len(newdescr) == 1:
        output = ma.concatenate(seqarrays)
    else:
        # Allocate a fully-masked output, then copy each input's fields
        # into its row range; untouched cells remain masked.
        output = ma.masked_all((np.sum(nrecords),), newdescr)
        offset = np.cumsum(np.r_[0, nrecords])
        seen = []
        for (a, n, i, j) in zip(seqarrays, fldnames, offset[:-1], offset[1:]):
            names = a.dtype.names
            if names is None:
                # Unstructured input: goes into the next auto-named field.
                output['f%i' % len(seen)][i:j] = a
            else:
                for name in n:
                    output[name][i:j] = a[name]
                    if name not in seen:
                        seen.append(name)
    # Apply per-field defaults and the requested output flavor.
    return _fix_output(_fix_defaults(output, defaults),
                       usemask=usemask, asrecarray=asrecarray)
def _find_duplicates_dispatcher(
        a, key=None, ignoremask=None, return_index=None):
    # __array_function__ dispatcher: only `a` takes part in dispatch.
    return (a,)
@array_function_dispatch(_find_duplicates_dispatcher)
def find_duplicates(a, key=None, ignoremask=True, return_index=False):
    """
    Find the duplicates in a structured array along a given key

    Parameters
    ----------
    a : array-like
        Input array
    key : {string, None}, optional
        Name of the fields along which to check the duplicates.
        If None, the search is performed by records
    ignoremask : {True, False}, optional
        Whether masked data should be discarded or considered as duplicates.
    return_index : {False, True}, optional
        Whether to return the indices of the duplicated values.

    Examples
    --------
    >>> from numpy.lib import recfunctions as rfn
    >>> ndtype = [('a', int)]
    >>> a = np.ma.array([1, 1, 1, 2, 2, 3, 3],
    ...         mask=[0, 0, 1, 0, 0, 0, 1]).view(ndtype)
    >>> rfn.find_duplicates(a, ignoremask=True, return_index=True)
    (masked_array(data=[(1,), (1,), (2,), (2,)],
                 mask=[(False,), (False,), (False,), (False,)],
           fill_value=(999999,),
                dtype=[('a', '<i8')]), array([0, 1, 3, 4]))
    """
    a = np.asanyarray(a).ravel()
    # Get a dictionary of fields
    fields = get_fieldstructure(a.dtype)
    # Get the sorting data (by selecting the corresponding field)
    base = a
    if key:
        # Descend through the parents of `key` (for nested fields) before
        # selecting the key field itself.
        for f in fields[key]:
            base = base[f]
        base = base[key]
    # Get the sorting indices and the sorted data
    sortidx = base.argsort()
    sortedbase = base[sortidx]
    sorteddata = sortedbase.filled()
    # Compare the sorting data
    flag = (sorteddata[:-1] == sorteddata[1:])
    # If masked data must be ignored, set the flag to false where needed
    if ignoremask:
        sortedmask = sortedbase.recordmask
        flag[sortedmask[1:]] = False
    flag = np.concatenate(([False], flag))
    # We need to take the point on the left as well (else we're missing it)
    flag[:-1] = flag[:-1] + flag[1:]
    duplicates = a[sortidx][flag]
    if return_index:
        return (duplicates, sortidx[flag])
    else:
        return duplicates
def _join_by_dispatcher(
        key, r1, r2, jointype=None, r1postfix=None, r2postfix=None,
        defaults=None, usemask=None, asrecarray=None):
    # __array_function__ dispatcher: the two joined arrays take part.
    return (r1, r2)
@array_function_dispatch(_join_by_dispatcher)
def join_by(key, r1, r2, jointype='inner', r1postfix='1', r2postfix='2',
            defaults=None, usemask=True, asrecarray=False):
    """
    Join arrays `r1` and `r2` on key `key`.

    The key should be either a string or a sequence of string corresponding
    to the fields used to join the array.  An exception is raised if the
    `key` field cannot be found in the two input arrays.  Neither `r1` nor
    `r2` should have any duplicates along `key`: the presence of duplicates
    will make the output quite unreliable. Note that duplicates are not
    looked for by the algorithm.

    Parameters
    ----------
    key : {string, sequence}
        A string or a sequence of strings corresponding to the fields used
        for comparison.
    r1, r2 : arrays
        Structured arrays.
    jointype : {'inner', 'outer', 'leftouter'}, optional
        If 'inner', returns the elements common to both r1 and r2.
        If 'outer', returns the common elements as well as the elements of
        r1 not in r2 and the elements of not in r2.
        If 'leftouter', returns the common elements and the elements of r1
        not in r2.
    r1postfix : string, optional
        String appended to the names of the fields of r1 that are present
        in r2 but absent of the key.
    r2postfix : string, optional
        String appended to the names of the fields of r2 that are present
        in r1 but absent of the key.
    defaults : {dictionary}, optional
        Dictionary mapping field names to the corresponding default values.
    usemask : {True, False}, optional
        Whether to return a MaskedArray (or MaskedRecords is
        `asrecarray==True`) or a ndarray.
    asrecarray : {False, True}, optional
        Whether to return a recarray (or MaskedRecords if `usemask==True`)
        or just a flexible-type ndarray.

    Notes
    -----
    * The output is sorted along the key.
    * A temporary array is formed by dropping the fields not in the key for
      the two arrays and concatenating the result. This array is then
      sorted, and the common entries selected. The output is constructed by
      filling the fields with the selected entries. Matching is not
      preserved if there are some duplicates...
    """
    # Check jointype
    if jointype not in ('inner', 'outer', 'leftouter'):
        raise ValueError(
                "The 'jointype' argument should be in 'inner', "
                "'outer' or 'leftouter' (got '%s' instead)" % jointype
                )
    # If we have a single key, put it in a tuple
    if isinstance(key, str):
        key = (key,)

    # Check the keys
    if len(set(key)) != len(key):
        dup = next(x for n,x in enumerate(key) if x in key[n+1:])
        raise ValueError("duplicate join key %r" % dup)
    for name in key:
        if name not in r1.dtype.names:
            raise ValueError('r1 does not have key field %r' % name)
        if name not in r2.dtype.names:
            raise ValueError('r2 does not have key field %r' % name)

    # Make sure we work with ravelled arrays
    r1 = r1.ravel()
    r2 = r2.ravel()
    # Fixme: nb2 below is never used. Commenting out for pyflakes.
    # (nb1, nb2) = (len(r1), len(r2))
    nb1 = len(r1)
    (r1names, r2names) = (r1.dtype.names, r2.dtype.names)

    # Check the names for collision
    collisions = (set(r1names) & set(r2names)) - set(key)
    if collisions and not (r1postfix or r2postfix):
        msg = "r1 and r2 contain common names, r1postfix and r2postfix "
        msg += "can't both be empty"
        raise ValueError(msg)

    # Make temporary arrays of just the keys
    #  (use order of keys in `r1` for back-compatibility)
    key1 = [ n for n in r1names if n in key ]
    r1k = _keep_fields(r1, key1)
    r2k = _keep_fields(r2, key1)

    # Concatenate the two arrays for comparison
    aux = ma.concatenate((r1k, r2k))
    idx_sort = aux.argsort(order=key)
    aux = aux[idx_sort]
    #
    # Get the common keys: a key is "common" when equal to its sorted
    # neighbor; spread the flag to the left element of each equal pair.
    flag_in = ma.concatenate(([False], aux[1:] == aux[:-1]))
    flag_in[:-1] = flag_in[1:] + flag_in[:-1]
    idx_in = idx_sort[flag_in]
    idx_1 = idx_in[(idx_in < nb1)]
    idx_2 = idx_in[(idx_in >= nb1)] - nb1
    (r1cmn, r2cmn) = (len(idx_1), len(idx_2))
    if jointype == 'inner':
        (r1spc, r2spc) = (0, 0)
    elif jointype == 'outer':
        idx_out = idx_sort[~flag_in]
        idx_1 = np.concatenate((idx_1, idx_out[(idx_out < nb1)]))
        idx_2 = np.concatenate((idx_2, idx_out[(idx_out >= nb1)] - nb1))
        (r1spc, r2spc) = (len(idx_1) - r1cmn, len(idx_2) - r2cmn)
    elif jointype == 'leftouter':
        idx_out = idx_sort[~flag_in]
        idx_1 = np.concatenate((idx_1, idx_out[(idx_out < nb1)]))
        (r1spc, r2spc) = (len(idx_1) - r1cmn, 0)
    # Select the entries from each input
    (s1, s2) = (r1[idx_1], r2[idx_2])
    #
    # Build the new description of the output array .......
    # Start with the key fields
    ndtype = _get_fieldspec(r1k.dtype)

    # Add the fields from r1
    for fname, fdtype in _get_fieldspec(r1.dtype):
        if fname not in key:
            ndtype.append((fname, fdtype))

    # Add the fields from r2
    for fname, fdtype in _get_fieldspec(r2.dtype):
        # Have we seen the current name already ?
        # we need to rebuild this list every time
        names = list(name for name, dtype in ndtype)
        try:
            nameidx = names.index(fname)
        except ValueError:
            #... we haven't: just add the description to the current list
            ndtype.append((fname, fdtype))
        else:
            # collision
            _, cdtype = ndtype[nameidx]
            if fname in key:
                # The current field is part of the key: take the largest dtype
                ndtype[nameidx] = (fname, max(fdtype, cdtype))
            else:
                # The current field is not part of the key: add the suffixes,
                # and place the new field adjacent to the old one
                ndtype[nameidx:nameidx + 1] = [
                    (fname + r1postfix, cdtype),
                    (fname + r2postfix, fdtype)
                ]
    # Rebuild a dtype from the new fields
    ndtype = np.dtype(ndtype)
    # Find the largest nb of common fields :
    # r1cmn and r2cmn should be equal, but...
    cmn = max(r1cmn, r2cmn)
    # Construct an empty array
    output = ma.masked_all((cmn + r1spc + r2spc,), dtype=ndtype)
    names = output.dtype.names
    for f in r1names:
        selected = s1[f]
        if f not in names or (f in r2names and not r2postfix and f not in key):
            f += r1postfix
        current = output[f]
        current[:r1cmn] = selected[:r1cmn]
        if jointype in ('outer', 'leftouter'):
            current[cmn:cmn + r1spc] = selected[r1cmn:]
    for f in r2names:
        selected = s2[f]
        if f not in names or (f in r1names and not r1postfix and f not in key):
            f += r2postfix
        current = output[f]
        current[:r2cmn] = selected[:r2cmn]
        if (jointype == 'outer') and r2spc:
            current[-r2spc:] = selected[r2cmn:]
    # Sort and finalize the output
    output.sort(order=key)
    kwargs = dict(usemask=usemask, asrecarray=asrecarray)
    return _fix_output(_fix_defaults(output, defaults), **kwargs)
def _rec_join_dispatcher(
        key, r1, r2, jointype=None, r1postfix=None, r2postfix=None,
        defaults=None):
    # __array_function__ dispatcher: the two joined arrays take part.
    return (r1, r2)
  1355. @array_function_dispatch(_rec_join_dispatcher)
  1356. def rec_join(key, r1, r2, jointype='inner', r1postfix='1', r2postfix='2',
  1357. defaults=None):
  1358. """
  1359. Join arrays `r1` and `r2` on keys.
  1360. Alternative to join_by, that always returns a np.recarray.
  1361. See Also
  1362. --------
  1363. join_by : equivalent function
  1364. """
  1365. kwargs = dict(jointype=jointype, r1postfix=r1postfix, r2postfix=r2postfix,
  1366. defaults=defaults, usemask=False, asrecarray=True)
  1367. return join_by(key, r1, r2, **kwargs)