groupby.pyx 55 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884
  1. cimport cython
  2. from cython cimport (
  3. Py_ssize_t,
  4. floating,
  5. )
  6. from libc.stdlib cimport (
  7. free,
  8. malloc,
  9. )
  10. import numpy as np
  11. cimport numpy as cnp
  12. from numpy cimport (
  13. complex64_t,
  14. complex128_t,
  15. float32_t,
  16. float64_t,
  17. int8_t,
  18. int64_t,
  19. intp_t,
  20. ndarray,
  21. uint8_t,
  22. uint64_t,
  23. )
  24. from numpy.math cimport NAN
  25. cnp.import_array()
  26. from pandas._libs cimport util
  27. from pandas._libs.algos cimport (
  28. get_rank_nan_fill_val,
  29. kth_smallest_c,
  30. )
  31. from pandas._libs.algos import (
  32. groupsort_indexer,
  33. rank_1d,
  34. take_2d_axis1_bool_bool,
  35. take_2d_axis1_float64_float64,
  36. )
  37. from pandas._libs.dtypes cimport (
  38. numeric_object_t,
  39. numeric_t,
  40. )
  41. from pandas._libs.missing cimport checknull
  42. cdef int64_t NPY_NAT = util.get_nat()
  43. _int64_max = np.iinfo(np.int64).max
  44. cdef float64_t NaN = <float64_t>np.NaN
  45. cdef enum InterpolationEnumType:
  46. INTERPOLATION_LINEAR,
  47. INTERPOLATION_LOWER,
  48. INTERPOLATION_HIGHER,
  49. INTERPOLATION_NEAREST,
  50. INTERPOLATION_MIDPOINT
  51. cdef float64_t median_linear_mask(float64_t* a, int n, uint8_t* mask) nogil:
  52. cdef:
  53. int i, j, na_count = 0
  54. float64_t* tmp
  55. float64_t result
  56. if n == 0:
  57. return NaN
  58. # count NAs
  59. for i in range(n):
  60. if mask[i]:
  61. na_count += 1
  62. if na_count:
  63. if na_count == n:
  64. return NaN
  65. tmp = <float64_t*>malloc((n - na_count) * sizeof(float64_t))
  66. j = 0
  67. for i in range(n):
  68. if not mask[i]:
  69. tmp[j] = a[i]
  70. j += 1
  71. a = tmp
  72. n -= na_count
  73. result = calc_median_linear(a, n, na_count)
  74. if na_count:
  75. free(a)
  76. return result
  77. cdef float64_t median_linear(float64_t* a, int n) nogil:
  78. cdef:
  79. int i, j, na_count = 0
  80. float64_t* tmp
  81. float64_t result
  82. if n == 0:
  83. return NaN
  84. # count NAs
  85. for i in range(n):
  86. if a[i] != a[i]:
  87. na_count += 1
  88. if na_count:
  89. if na_count == n:
  90. return NaN
  91. tmp = <float64_t*>malloc((n - na_count) * sizeof(float64_t))
  92. j = 0
  93. for i in range(n):
  94. if a[i] == a[i]:
  95. tmp[j] = a[i]
  96. j += 1
  97. a = tmp
  98. n -= na_count
  99. result = calc_median_linear(a, n, na_count)
  100. if na_count:
  101. free(a)
  102. return result
  103. cdef float64_t calc_median_linear(float64_t* a, int n, int na_count) nogil:
  104. cdef:
  105. float64_t result
  106. if n % 2:
  107. result = kth_smallest_c(a, n // 2, n)
  108. else:
  109. result = (kth_smallest_c(a, n // 2, n) +
  110. kth_smallest_c(a, n // 2 - 1, n)) / 2
  111. return result
  112. ctypedef fused int64float_t:
  113. int64_t
  114. uint64_t
  115. float32_t
  116. float64_t
  117. @cython.boundscheck(False)
  118. @cython.wraparound(False)
  119. def group_median_float64(
  120. ndarray[float64_t, ndim=2] out,
  121. ndarray[int64_t] counts,
  122. ndarray[float64_t, ndim=2] values,
  123. ndarray[intp_t] labels,
  124. Py_ssize_t min_count=-1,
  125. const uint8_t[:, :] mask=None,
  126. uint8_t[:, ::1] result_mask=None,
  127. ) -> None:
  128. """
  129. Only aggregates on axis=0
  130. """
  131. cdef:
  132. Py_ssize_t i, j, N, K, ngroups, size
  133. ndarray[intp_t] _counts
  134. ndarray[float64_t, ndim=2] data
  135. ndarray[uint8_t, ndim=2] data_mask
  136. ndarray[intp_t] indexer
  137. float64_t* ptr
  138. uint8_t* ptr_mask
  139. float64_t result
  140. bint uses_mask = mask is not None
  141. assert min_count == -1, "'min_count' only used in sum and prod"
  142. ngroups = len(counts)
  143. N, K = (<object>values).shape
  144. indexer, _counts = groupsort_indexer(labels, ngroups)
  145. counts[:] = _counts[1:]
  146. data = np.empty((K, N), dtype=np.float64)
  147. ptr = <float64_t*>cnp.PyArray_DATA(data)
  148. take_2d_axis1_float64_float64(values.T, indexer, out=data)
  149. if uses_mask:
  150. data_mask = np.empty((K, N), dtype=np.uint8)
  151. ptr_mask = <uint8_t *>cnp.PyArray_DATA(data_mask)
  152. take_2d_axis1_bool_bool(mask.T, indexer, out=data_mask, fill_value=1)
  153. with nogil:
  154. for i in range(K):
  155. # exclude NA group
  156. ptr += _counts[0]
  157. ptr_mask += _counts[0]
  158. for j in range(ngroups):
  159. size = _counts[j + 1]
  160. result = median_linear_mask(ptr, size, ptr_mask)
  161. out[j, i] = result
  162. if result != result:
  163. result_mask[j, i] = 1
  164. ptr += size
  165. ptr_mask += size
  166. else:
  167. with nogil:
  168. for i in range(K):
  169. # exclude NA group
  170. ptr += _counts[0]
  171. for j in range(ngroups):
  172. size = _counts[j + 1]
  173. out[j, i] = median_linear(ptr, size)
  174. ptr += size
  175. @cython.boundscheck(False)
  176. @cython.wraparound(False)
  177. def group_cumprod(
  178. int64float_t[:, ::1] out,
  179. ndarray[int64float_t, ndim=2] values,
  180. const intp_t[::1] labels,
  181. int ngroups,
  182. bint is_datetimelike,
  183. bint skipna=True,
  184. const uint8_t[:, :] mask=None,
  185. uint8_t[:, ::1] result_mask=None,
  186. ) -> None:
  187. """
  188. Cumulative product of columns of `values`, in row groups `labels`.
  189. Parameters
  190. ----------
  191. out : np.ndarray[np.float64, ndim=2]
  192. Array to store cumprod in.
  193. values : np.ndarray[np.float64, ndim=2]
  194. Values to take cumprod of.
  195. labels : np.ndarray[np.intp]
  196. Labels to group by.
  197. ngroups : int
  198. Number of groups, larger than all entries of `labels`.
  199. is_datetimelike : bool
  200. Always false, `values` is never datetime-like.
  201. skipna : bool
  202. If true, ignore nans in `values`.
  203. mask: np.ndarray[uint8], optional
  204. Mask of values
  205. result_mask: np.ndarray[int8], optional
  206. Mask of out array
  207. Notes
  208. -----
  209. This method modifies the `out` parameter, rather than returning an object.
  210. """
  211. cdef:
  212. Py_ssize_t i, j, N, K
  213. int64float_t val, na_val
  214. int64float_t[:, ::1] accum
  215. intp_t lab
  216. uint8_t[:, ::1] accum_mask
  217. bint isna_entry, isna_prev = False
  218. bint uses_mask = mask is not None
  219. N, K = (<object>values).shape
  220. accum = np.ones((ngroups, K), dtype=(<object>values).dtype)
  221. na_val = _get_na_val(<int64float_t>0, is_datetimelike)
  222. accum_mask = np.zeros((ngroups, K), dtype="uint8")
  223. with nogil:
  224. for i in range(N):
  225. lab = labels[i]
  226. if lab < 0:
  227. continue
  228. for j in range(K):
  229. val = values[i, j]
  230. if uses_mask:
  231. isna_entry = mask[i, j]
  232. else:
  233. isna_entry = _treat_as_na(val, False)
  234. if not isna_entry:
  235. isna_prev = accum_mask[lab, j]
  236. if isna_prev:
  237. out[i, j] = na_val
  238. if uses_mask:
  239. result_mask[i, j] = True
  240. else:
  241. accum[lab, j] *= val
  242. out[i, j] = accum[lab, j]
  243. else:
  244. if uses_mask:
  245. result_mask[i, j] = True
  246. out[i, j] = 0
  247. else:
  248. out[i, j] = na_val
  249. if not skipna:
  250. accum[lab, j] = na_val
  251. accum_mask[lab, j] = True
  252. @cython.boundscheck(False)
  253. @cython.wraparound(False)
  254. def group_cumsum(
  255. int64float_t[:, ::1] out,
  256. ndarray[int64float_t, ndim=2] values,
  257. const intp_t[::1] labels,
  258. int ngroups,
  259. bint is_datetimelike,
  260. bint skipna=True,
  261. const uint8_t[:, :] mask=None,
  262. uint8_t[:, ::1] result_mask=None,
  263. ) -> None:
  264. """
  265. Cumulative sum of columns of `values`, in row groups `labels`.
  266. Parameters
  267. ----------
  268. out : np.ndarray[ndim=2]
  269. Array to store cumsum in.
  270. values : np.ndarray[ndim=2]
  271. Values to take cumsum of.
  272. labels : np.ndarray[np.intp]
  273. Labels to group by.
  274. ngroups : int
  275. Number of groups, larger than all entries of `labels`.
  276. is_datetimelike : bool
  277. True if `values` contains datetime-like entries.
  278. skipna : bool
  279. If true, ignore nans in `values`.
  280. mask: np.ndarray[uint8], optional
  281. Mask of values
  282. result_mask: np.ndarray[int8], optional
  283. Mask of out array
  284. Notes
  285. -----
  286. This method modifies the `out` parameter, rather than returning an object.
  287. """
  288. cdef:
  289. Py_ssize_t i, j, N, K
  290. int64float_t val, y, t, na_val
  291. int64float_t[:, ::1] accum, compensation
  292. uint8_t[:, ::1] accum_mask
  293. intp_t lab
  294. bint isna_entry, isna_prev = False
  295. bint uses_mask = mask is not None
  296. N, K = (<object>values).shape
  297. if uses_mask:
  298. accum_mask = np.zeros((ngroups, K), dtype="uint8")
  299. accum = np.zeros((ngroups, K), dtype=np.asarray(values).dtype)
  300. compensation = np.zeros((ngroups, K), dtype=np.asarray(values).dtype)
  301. na_val = _get_na_val(<int64float_t>0, is_datetimelike)
  302. with nogil:
  303. for i in range(N):
  304. lab = labels[i]
  305. if lab < 0:
  306. continue
  307. for j in range(K):
  308. val = values[i, j]
  309. if uses_mask:
  310. isna_entry = mask[i, j]
  311. else:
  312. isna_entry = _treat_as_na(val, is_datetimelike)
  313. if not skipna:
  314. if uses_mask:
  315. isna_prev = accum_mask[lab, j]
  316. else:
  317. isna_prev = _treat_as_na(accum[lab, j], is_datetimelike)
  318. if isna_prev:
  319. if uses_mask:
  320. result_mask[i, j] = True
  321. # Be deterministic, out was initialized as empty
  322. out[i, j] = 0
  323. else:
  324. out[i, j] = na_val
  325. continue
  326. if isna_entry:
  327. if uses_mask:
  328. result_mask[i, j] = True
  329. # Be deterministic, out was initialized as empty
  330. out[i, j] = 0
  331. else:
  332. out[i, j] = na_val
  333. if not skipna:
  334. if uses_mask:
  335. accum_mask[lab, j] = True
  336. else:
  337. accum[lab, j] = na_val
  338. else:
  339. # For floats, use Kahan summation to reduce floating-point
  340. # error (https://en.wikipedia.org/wiki/Kahan_summation_algorithm)
  341. if int64float_t == float32_t or int64float_t == float64_t:
  342. y = val - compensation[lab, j]
  343. t = accum[lab, j] + y
  344. compensation[lab, j] = t - accum[lab, j] - y
  345. else:
  346. t = val + accum[lab, j]
  347. accum[lab, j] = t
  348. out[i, j] = t
  349. @cython.boundscheck(False)
  350. @cython.wraparound(False)
  351. def group_shift_indexer(
  352. int64_t[::1] out,
  353. const intp_t[::1] labels,
  354. int ngroups,
  355. int periods,
  356. ) -> None:
  357. cdef:
  358. Py_ssize_t N, i, ii, lab
  359. int offset = 0, sign
  360. int64_t idxer, idxer_slot
  361. int64_t[::1] label_seen = np.zeros(ngroups, dtype=np.int64)
  362. int64_t[:, ::1] label_indexer
  363. N, = (<object>labels).shape
  364. if periods < 0:
  365. periods = -periods
  366. offset = N - 1
  367. sign = -1
  368. elif periods > 0:
  369. offset = 0
  370. sign = 1
  371. if periods == 0:
  372. with nogil:
  373. for i in range(N):
  374. out[i] = i
  375. else:
  376. # array of each previous indexer seen
  377. label_indexer = np.zeros((ngroups, periods), dtype=np.int64)
  378. with nogil:
  379. for i in range(N):
  380. # reverse iterator if shifting backwards
  381. ii = offset + sign * i
  382. lab = labels[ii]
  383. # Skip null keys
  384. if lab == -1:
  385. out[ii] = -1
  386. continue
  387. label_seen[lab] += 1
  388. idxer_slot = label_seen[lab] % periods
  389. idxer = label_indexer[lab, idxer_slot]
  390. if label_seen[lab] > periods:
  391. out[ii] = idxer
  392. else:
  393. out[ii] = -1
  394. label_indexer[lab, idxer_slot] = ii
  395. @cython.wraparound(False)
  396. @cython.boundscheck(False)
  397. def group_fillna_indexer(
  398. ndarray[intp_t] out,
  399. ndarray[intp_t] labels,
  400. ndarray[intp_t] sorted_labels,
  401. ndarray[uint8_t] mask,
  402. str direction,
  403. int64_t limit,
  404. bint dropna,
  405. ) -> None:
  406. """
  407. Indexes how to fill values forwards or backwards within a group.
  408. Parameters
  409. ----------
  410. out : np.ndarray[np.intp]
  411. Values into which this method will write its results.
  412. labels : np.ndarray[np.intp]
  413. Array containing unique label for each group, with its ordering
  414. matching up to the corresponding record in `values`.
  415. sorted_labels : np.ndarray[np.intp]
  416. obtained by `np.argsort(labels, kind="mergesort")`; reversed if
  417. direction == "bfill"
  418. values : np.ndarray[np.uint8]
  419. Containing the truth value of each element.
  420. mask : np.ndarray[np.uint8]
  421. Indicating whether a value is na or not.
  422. direction : {'ffill', 'bfill'}
  423. Direction for fill to be applied (forwards or backwards, respectively)
  424. limit : Consecutive values to fill before stopping, or -1 for no limit
  425. dropna : Flag to indicate if NaN groups should return all NaN values
  426. Notes
  427. -----
  428. This method modifies the `out` parameter rather than returning an object
  429. """
  430. cdef:
  431. Py_ssize_t i, N, idx
  432. intp_t curr_fill_idx=-1
  433. int64_t filled_vals = 0
  434. N = len(out)
  435. # Make sure all arrays are the same size
  436. assert N == len(labels) == len(mask)
  437. with nogil:
  438. for i in range(N):
  439. idx = sorted_labels[i]
  440. if dropna and labels[idx] == -1: # nan-group gets nan-values
  441. curr_fill_idx = -1
  442. elif mask[idx] == 1: # is missing
  443. # Stop filling once we've hit the limit
  444. if filled_vals >= limit and limit != -1:
  445. curr_fill_idx = -1
  446. filled_vals += 1
  447. else: # reset items when not missing
  448. filled_vals = 0
  449. curr_fill_idx = idx
  450. out[idx] = curr_fill_idx
  451. # If we move to the next group, reset
  452. # the fill_idx and counter
  453. if i == N - 1 or labels[idx] != labels[sorted_labels[i + 1]]:
  454. curr_fill_idx = -1
  455. filled_vals = 0
  456. @cython.boundscheck(False)
  457. @cython.wraparound(False)
  458. def group_any_all(
  459. int8_t[:, ::1] out,
  460. const int8_t[:, :] values,
  461. const intp_t[::1] labels,
  462. const uint8_t[:, :] mask,
  463. str val_test,
  464. bint skipna,
  465. bint nullable,
  466. ) -> None:
  467. """
  468. Aggregated boolean values to show truthfulness of group elements. If the
  469. input is a nullable type (nullable=True), the result will be computed
  470. using Kleene logic.
  471. Parameters
  472. ----------
  473. out : np.ndarray[np.int8]
  474. Values into which this method will write its results.
  475. labels : np.ndarray[np.intp]
  476. Array containing unique label for each group, with its
  477. ordering matching up to the corresponding record in `values`
  478. values : np.ndarray[np.int8]
  479. Containing the truth value of each element.
  480. mask : np.ndarray[np.uint8]
  481. Indicating whether a value is na or not.
  482. val_test : {'any', 'all'}
  483. String object dictating whether to use any or all truth testing
  484. skipna : bool
  485. Flag to ignore nan values during truth testing
  486. nullable : bool
  487. Whether or not the input is a nullable type. If True, the
  488. result will be computed using Kleene logic
  489. Notes
  490. -----
  491. This method modifies the `out` parameter rather than returning an object.
  492. The returned values will either be 0, 1 (False or True, respectively), or
  493. -1 to signify a masked position in the case of a nullable input.
  494. """
  495. cdef:
  496. Py_ssize_t i, j, N = len(labels), K = out.shape[1]
  497. intp_t lab
  498. int8_t flag_val, val
  499. if val_test == "all":
  500. # Because the 'all' value of an empty iterable in Python is True we can
  501. # start with an array full of ones and set to zero when a False value
  502. # is encountered
  503. flag_val = 0
  504. elif val_test == "any":
  505. # Because the 'any' value of an empty iterable in Python is False we
  506. # can start with an array full of zeros and set to one only if any
  507. # value encountered is True
  508. flag_val = 1
  509. else:
  510. raise ValueError("'bool_func' must be either 'any' or 'all'!")
  511. out[:] = 1 - flag_val
  512. with nogil:
  513. for i in range(N):
  514. lab = labels[i]
  515. if lab < 0:
  516. continue
  517. for j in range(K):
  518. if skipna and mask[i, j]:
  519. continue
  520. if nullable and mask[i, j]:
  521. # Set the position as masked if `out[lab] != flag_val`, which
  522. # would indicate True/False has not yet been seen for any/all,
  523. # so by Kleene logic the result is currently unknown
  524. if out[lab, j] != flag_val:
  525. out[lab, j] = -1
  526. continue
  527. val = values[i, j]
  528. # If True and 'any' or False and 'all', the result is
  529. # already determined
  530. if val == flag_val:
  531. out[lab, j] = flag_val
  532. # ----------------------------------------------------------------------
  533. # group_sum, group_prod, group_var, group_mean, group_ohlc
  534. # ----------------------------------------------------------------------
  535. ctypedef fused mean_t:
  536. float64_t
  537. float32_t
  538. complex64_t
  539. complex128_t
  540. ctypedef fused sum_t:
  541. mean_t
  542. int64_t
  543. uint64_t
  544. object
  545. @cython.wraparound(False)
  546. @cython.boundscheck(False)
  547. def group_sum(
  548. sum_t[:, ::1] out,
  549. int64_t[::1] counts,
  550. ndarray[sum_t, ndim=2] values,
  551. const intp_t[::1] labels,
  552. const uint8_t[:, :] mask,
  553. uint8_t[:, ::1] result_mask=None,
  554. Py_ssize_t min_count=0,
  555. bint is_datetimelike=False,
  556. ) -> None:
  557. """
  558. Only aggregates on axis=0 using Kahan summation
  559. """
  560. cdef:
  561. Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
  562. sum_t val, t, y
  563. sum_t[:, ::1] sumx, compensation
  564. int64_t[:, ::1] nobs
  565. Py_ssize_t len_values = len(values), len_labels = len(labels)
  566. bint uses_mask = mask is not None
  567. bint isna_entry
  568. if len_values != len_labels:
  569. raise ValueError("len(index) != len(labels)")
  570. nobs = np.zeros((<object>out).shape, dtype=np.int64)
  571. # the below is equivalent to `np.zeros_like(out)` but faster
  572. sumx = np.zeros((<object>out).shape, dtype=(<object>out).base.dtype)
  573. compensation = np.zeros((<object>out).shape, dtype=(<object>out).base.dtype)
  574. N, K = (<object>values).shape
  575. if sum_t is object:
  576. # NB: this does not use 'compensation' like the non-object track does.
  577. for i in range(N):
  578. lab = labels[i]
  579. if lab < 0:
  580. continue
  581. counts[lab] += 1
  582. for j in range(K):
  583. val = values[i, j]
  584. # not nan
  585. if not checknull(val):
  586. nobs[lab, j] += 1
  587. if nobs[lab, j] == 1:
  588. # i.e. we haven't added anything yet; avoid TypeError
  589. # if e.g. val is a str and sumx[lab, j] is 0
  590. t = val
  591. else:
  592. t = sumx[lab, j] + val
  593. sumx[lab, j] = t
  594. for i in range(ncounts):
  595. for j in range(K):
  596. if nobs[i, j] < min_count:
  597. out[i, j] = None
  598. else:
  599. out[i, j] = sumx[i, j]
  600. else:
  601. with nogil:
  602. for i in range(N):
  603. lab = labels[i]
  604. if lab < 0:
  605. continue
  606. counts[lab] += 1
  607. for j in range(K):
  608. val = values[i, j]
  609. if uses_mask:
  610. isna_entry = mask[i, j]
  611. else:
  612. isna_entry = _treat_as_na(val, is_datetimelike)
  613. if not isna_entry:
  614. nobs[lab, j] += 1
  615. y = val - compensation[lab, j]
  616. t = sumx[lab, j] + y
  617. compensation[lab, j] = t - sumx[lab, j] - y
  618. sumx[lab, j] = t
  619. _check_below_mincount(
  620. out, uses_mask, result_mask, ncounts, K, nobs, min_count, sumx
  621. )
  622. @cython.wraparound(False)
  623. @cython.boundscheck(False)
  624. def group_prod(
  625. int64float_t[:, ::1] out,
  626. int64_t[::1] counts,
  627. ndarray[int64float_t, ndim=2] values,
  628. const intp_t[::1] labels,
  629. const uint8_t[:, ::1] mask,
  630. uint8_t[:, ::1] result_mask=None,
  631. Py_ssize_t min_count=0,
  632. ) -> None:
  633. """
  634. Only aggregates on axis=0
  635. """
  636. cdef:
  637. Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
  638. int64float_t val
  639. int64float_t[:, ::1] prodx
  640. int64_t[:, ::1] nobs
  641. Py_ssize_t len_values = len(values), len_labels = len(labels)
  642. bint isna_entry, uses_mask = mask is not None
  643. if len_values != len_labels:
  644. raise ValueError("len(index) != len(labels)")
  645. nobs = np.zeros((<object>out).shape, dtype=np.int64)
  646. prodx = np.ones((<object>out).shape, dtype=(<object>out).base.dtype)
  647. N, K = (<object>values).shape
  648. with nogil:
  649. for i in range(N):
  650. lab = labels[i]
  651. if lab < 0:
  652. continue
  653. counts[lab] += 1
  654. for j in range(K):
  655. val = values[i, j]
  656. if uses_mask:
  657. isna_entry = mask[i, j]
  658. else:
  659. isna_entry = _treat_as_na(val, False)
  660. if not isna_entry:
  661. nobs[lab, j] += 1
  662. prodx[lab, j] *= val
  663. _check_below_mincount(
  664. out, uses_mask, result_mask, ncounts, K, nobs, min_count, prodx
  665. )
  666. @cython.wraparound(False)
  667. @cython.boundscheck(False)
  668. @cython.cdivision(True)
  669. def group_var(
  670. floating[:, ::1] out,
  671. int64_t[::1] counts,
  672. ndarray[floating, ndim=2] values,
  673. const intp_t[::1] labels,
  674. Py_ssize_t min_count=-1,
  675. int64_t ddof=1,
  676. const uint8_t[:, ::1] mask=None,
  677. uint8_t[:, ::1] result_mask=None,
  678. bint is_datetimelike=False,
  679. ) -> None:
  680. cdef:
  681. Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
  682. floating val, ct, oldmean
  683. floating[:, ::1] mean
  684. int64_t[:, ::1] nobs
  685. Py_ssize_t len_values = len(values), len_labels = len(labels)
  686. bint isna_entry, uses_mask = mask is not None
  687. assert min_count == -1, "'min_count' only used in sum and prod"
  688. if len_values != len_labels:
  689. raise ValueError("len(index) != len(labels)")
  690. nobs = np.zeros((<object>out).shape, dtype=np.int64)
  691. mean = np.zeros((<object>out).shape, dtype=(<object>out).base.dtype)
  692. N, K = (<object>values).shape
  693. out[:, :] = 0.0
  694. with nogil:
  695. for i in range(N):
  696. lab = labels[i]
  697. if lab < 0:
  698. continue
  699. counts[lab] += 1
  700. for j in range(K):
  701. val = values[i, j]
  702. if uses_mask:
  703. isna_entry = mask[i, j]
  704. elif is_datetimelike:
  705. # With group_var, we cannot just use _treat_as_na bc
  706. # datetimelike dtypes get cast to float64 instead of
  707. # to int64.
  708. isna_entry = val == NPY_NAT
  709. else:
  710. isna_entry = _treat_as_na(val, is_datetimelike)
  711. if not isna_entry:
  712. nobs[lab, j] += 1
  713. oldmean = mean[lab, j]
  714. mean[lab, j] += (val - oldmean) / nobs[lab, j]
  715. out[lab, j] += (val - mean[lab, j]) * (val - oldmean)
  716. for i in range(ncounts):
  717. for j in range(K):
  718. ct = nobs[i, j]
  719. if ct <= ddof:
  720. if uses_mask:
  721. result_mask[i, j] = True
  722. else:
  723. out[i, j] = NAN
  724. else:
  725. out[i, j] /= (ct - ddof)
  726. @cython.wraparound(False)
  727. @cython.boundscheck(False)
  728. def group_mean(
  729. mean_t[:, ::1] out,
  730. int64_t[::1] counts,
  731. ndarray[mean_t, ndim=2] values,
  732. const intp_t[::1] labels,
  733. Py_ssize_t min_count=-1,
  734. bint is_datetimelike=False,
  735. const uint8_t[:, ::1] mask=None,
  736. uint8_t[:, ::1] result_mask=None,
  737. ) -> None:
  738. """
  739. Compute the mean per label given a label assignment for each value.
  740. NaN values are ignored.
  741. Parameters
  742. ----------
  743. out : np.ndarray[floating]
  744. Values into which this method will write its results.
  745. counts : np.ndarray[int64]
  746. A zeroed array of the same shape as labels,
  747. populated by group sizes during algorithm.
  748. values : np.ndarray[floating]
  749. 2-d array of the values to find the mean of.
  750. labels : np.ndarray[np.intp]
  751. Array containing unique label for each group, with its
  752. ordering matching up to the corresponding record in `values`.
  753. min_count : Py_ssize_t
  754. Only used in sum and prod. Always -1.
  755. is_datetimelike : bool
  756. True if `values` contains datetime-like entries.
  757. mask : ndarray[bool, ndim=2], optional
  758. Mask of the input values.
  759. result_mask : ndarray[bool, ndim=2], optional
  760. Mask of the out array
  761. Notes
  762. -----
  763. This method modifies the `out` parameter rather than returning an object.
  764. `counts` is modified to hold group sizes
  765. """
  766. cdef:
  767. Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
  768. mean_t val, count, y, t, nan_val
  769. mean_t[:, ::1] sumx, compensation
  770. int64_t[:, ::1] nobs
  771. Py_ssize_t len_values = len(values), len_labels = len(labels)
  772. bint isna_entry, uses_mask = mask is not None
  773. assert min_count == -1, "'min_count' only used in sum and prod"
  774. if len_values != len_labels:
  775. raise ValueError("len(index) != len(labels)")
  776. # the below is equivalent to `np.zeros_like(out)` but faster
  777. nobs = np.zeros((<object>out).shape, dtype=np.int64)
  778. sumx = np.zeros((<object>out).shape, dtype=(<object>out).base.dtype)
  779. compensation = np.zeros((<object>out).shape, dtype=(<object>out).base.dtype)
  780. N, K = (<object>values).shape
  781. if uses_mask:
  782. nan_val = 0
  783. elif is_datetimelike:
  784. nan_val = NPY_NAT
  785. else:
  786. nan_val = NAN
  787. with nogil:
  788. for i in range(N):
  789. lab = labels[i]
  790. if lab < 0:
  791. continue
  792. counts[lab] += 1
  793. for j in range(K):
  794. val = values[i, j]
  795. if uses_mask:
  796. isna_entry = mask[i, j]
  797. elif is_datetimelike:
  798. # With group_mean, we cannot just use _treat_as_na bc
  799. # datetimelike dtypes get cast to float64 instead of
  800. # to int64.
  801. isna_entry = val == NPY_NAT
  802. else:
  803. isna_entry = _treat_as_na(val, is_datetimelike)
  804. if not isna_entry:
  805. nobs[lab, j] += 1
  806. y = val - compensation[lab, j]
  807. t = sumx[lab, j] + y
  808. compensation[lab, j] = t - sumx[lab, j] - y
  809. sumx[lab, j] = t
  810. for i in range(ncounts):
  811. for j in range(K):
  812. count = nobs[i, j]
  813. if nobs[i, j] == 0:
  814. if uses_mask:
  815. result_mask[i, j] = True
  816. else:
  817. out[i, j] = nan_val
  818. else:
  819. out[i, j] = sumx[i, j] / count
  820. @cython.wraparound(False)
  821. @cython.boundscheck(False)
  822. def group_ohlc(
  823. int64float_t[:, ::1] out,
  824. int64_t[::1] counts,
  825. ndarray[int64float_t, ndim=2] values,
  826. const intp_t[::1] labels,
  827. Py_ssize_t min_count=-1,
  828. const uint8_t[:, ::1] mask=None,
  829. uint8_t[:, ::1] result_mask=None,
  830. ) -> None:
  831. """
  832. Only aggregates on axis=0
  833. """
  834. cdef:
  835. Py_ssize_t i, N, K, lab
  836. int64float_t val
  837. uint8_t[::1] first_element_set
  838. bint isna_entry, uses_mask = mask is not None
  839. assert min_count == -1, "'min_count' only used in sum and prod"
  840. if len(labels) == 0:
  841. return
  842. N, K = (<object>values).shape
  843. if out.shape[1] != 4:
  844. raise ValueError("Output array must have 4 columns")
  845. if K > 1:
  846. raise NotImplementedError("Argument 'values' must have only one dimension")
  847. if int64float_t is float32_t or int64float_t is float64_t:
  848. out[:] = np.nan
  849. else:
  850. out[:] = 0
  851. first_element_set = np.zeros((<object>counts).shape, dtype=np.uint8)
  852. if uses_mask:
  853. result_mask[:] = True
  854. with nogil:
  855. for i in range(N):
  856. lab = labels[i]
  857. if lab == -1:
  858. continue
  859. counts[lab] += 1
  860. val = values[i, 0]
  861. if uses_mask:
  862. isna_entry = mask[i, 0]
  863. else:
  864. isna_entry = _treat_as_na(val, False)
  865. if isna_entry:
  866. continue
  867. if not first_element_set[lab]:
  868. out[lab, 0] = out[lab, 1] = out[lab, 2] = out[lab, 3] = val
  869. first_element_set[lab] = True
  870. if uses_mask:
  871. result_mask[lab] = False
  872. else:
  873. out[lab, 1] = max(out[lab, 1], val)
  874. out[lab, 2] = min(out[lab, 2], val)
  875. out[lab, 3] = val
  876. @cython.boundscheck(False)
  877. @cython.wraparound(False)
  878. def group_quantile(
  879. ndarray[float64_t, ndim=2] out,
  880. ndarray[numeric_t, ndim=1] values,
  881. ndarray[intp_t] labels,
  882. ndarray[uint8_t] mask,
  883. const intp_t[:] sort_indexer,
  884. const float64_t[:] qs,
  885. str interpolation,
  886. uint8_t[:, ::1] result_mask=None,
  887. ) -> None:
  888. """
  889. Calculate the quantile per group.
  890. Parameters
  891. ----------
  892. out : np.ndarray[np.float64, ndim=2]
  893. Array of aggregated values that will be written to.
  894. values : np.ndarray
  895. Array containing the values to apply the function against.
  896. labels : ndarray[np.intp]
  897. Array containing the unique group labels.
  898. sort_indexer : ndarray[np.intp]
  899. Indices describing sort order by values and labels.
  900. qs : ndarray[float64_t]
  901. The quantile values to search for.
  902. interpolation : {'linear', 'lower', 'highest', 'nearest', 'midpoint'}
  903. Notes
  904. -----
  905. Rather than explicitly returning a value, this function modifies the
  906. provided `out` parameter.
  907. """
  908. cdef:
  909. Py_ssize_t i, N=len(labels), ngroups, grp_sz, non_na_sz, k, nqs
  910. Py_ssize_t grp_start=0, idx=0
  911. intp_t lab
  912. InterpolationEnumType interp
  913. float64_t q_val, q_idx, frac, val, next_val
  914. int64_t[::1] counts, non_na_counts
  915. bint uses_result_mask = result_mask is not None
  916. assert values.shape[0] == N
  917. if any(not (0 <= q <= 1) for q in qs):
  918. wrong = [x for x in qs if not (0 <= x <= 1)][0]
  919. raise ValueError(
  920. f"Each 'q' must be between 0 and 1. Got '{wrong}' instead"
  921. )
  922. inter_methods = {
  923. "linear": INTERPOLATION_LINEAR,
  924. "lower": INTERPOLATION_LOWER,
  925. "higher": INTERPOLATION_HIGHER,
  926. "nearest": INTERPOLATION_NEAREST,
  927. "midpoint": INTERPOLATION_MIDPOINT,
  928. }
  929. interp = inter_methods[interpolation]
  930. nqs = len(qs)
  931. ngroups = len(out)
  932. counts = np.zeros(ngroups, dtype=np.int64)
  933. non_na_counts = np.zeros(ngroups, dtype=np.int64)
  934. # First figure out the size of every group
  935. with nogil:
  936. for i in range(N):
  937. lab = labels[i]
  938. if lab == -1: # NA group label
  939. continue
  940. counts[lab] += 1
  941. if not mask[i]:
  942. non_na_counts[lab] += 1
  943. with nogil:
  944. for i in range(ngroups):
  945. # Figure out how many group elements there are
  946. grp_sz = counts[i]
  947. non_na_sz = non_na_counts[i]
  948. if non_na_sz == 0:
  949. for k in range(nqs):
  950. if uses_result_mask:
  951. result_mask[i, k] = 1
  952. else:
  953. out[i, k] = NaN
  954. else:
  955. for k in range(nqs):
  956. q_val = qs[k]
  957. # Calculate where to retrieve the desired value
  958. # Casting to int will intentionally truncate result
  959. idx = grp_start + <int64_t>(q_val * <float64_t>(non_na_sz - 1))
  960. val = values[sort_indexer[idx]]
  961. # If requested quantile falls evenly on a particular index
  962. # then write that index's value out. Otherwise interpolate
  963. q_idx = q_val * (non_na_sz - 1)
  964. frac = q_idx % 1
  965. if frac == 0.0 or interp == INTERPOLATION_LOWER:
  966. out[i, k] = val
  967. else:
  968. next_val = values[sort_indexer[idx + 1]]
  969. if interp == INTERPOLATION_LINEAR:
  970. out[i, k] = val + (next_val - val) * frac
  971. elif interp == INTERPOLATION_HIGHER:
  972. out[i, k] = next_val
  973. elif interp == INTERPOLATION_MIDPOINT:
  974. out[i, k] = (val + next_val) / 2.0
  975. elif interp == INTERPOLATION_NEAREST:
  976. if frac > .5 or (frac == .5 and q_val > .5): # Always OK?
  977. out[i, k] = next_val
  978. else:
  979. out[i, k] = val
  980. # Increment the index reference in sorted_arr for the next group
  981. grp_start += grp_sz
  982. # ----------------------------------------------------------------------
  983. # group_nth, group_last, group_rank
  984. # ----------------------------------------------------------------------
  985. ctypedef fused numeric_object_complex_t:
  986. numeric_object_t
  987. complex64_t
  988. complex128_t
  989. cdef bint _treat_as_na(numeric_object_complex_t val, bint is_datetimelike) nogil:
  990. if numeric_object_complex_t is object:
  991. # Should never be used, but we need to avoid the `val != val` below
  992. # or else cython will raise about gil acquisition.
  993. raise NotImplementedError
  994. elif numeric_object_complex_t is int64_t:
  995. return is_datetimelike and val == NPY_NAT
  996. elif (
  997. numeric_object_complex_t is float32_t
  998. or numeric_object_complex_t is float64_t
  999. or numeric_object_complex_t is complex64_t
  1000. or numeric_object_complex_t is complex128_t
  1001. ):
  1002. return val != val
  1003. else:
  1004. # non-datetimelike integer
  1005. return False
  1006. cdef numeric_object_t _get_min_or_max(
  1007. numeric_object_t val,
  1008. bint compute_max,
  1009. bint is_datetimelike,
  1010. ):
  1011. """
  1012. Find either the min or the max supported by numeric_object_t; 'val' is a
  1013. placeholder to effectively make numeric_object_t an argument.
  1014. """
  1015. return get_rank_nan_fill_val(
  1016. not compute_max,
  1017. val=val,
  1018. is_datetimelike=is_datetimelike,
  1019. )
  1020. cdef numeric_t _get_na_val(numeric_t val, bint is_datetimelike):
  1021. cdef:
  1022. numeric_t na_val
  1023. if numeric_t == float32_t or numeric_t == float64_t:
  1024. na_val = NaN
  1025. elif numeric_t is int64_t and is_datetimelike:
  1026. na_val = NPY_NAT
  1027. else:
  1028. # Used in case of masks
  1029. na_val = 0
  1030. return na_val
  1031. ctypedef fused mincount_t:
  1032. numeric_t
  1033. complex64_t
  1034. complex128_t
  1035. @cython.wraparound(False)
  1036. @cython.boundscheck(False)
  1037. cdef inline void _check_below_mincount(
  1038. mincount_t[:, ::1] out,
  1039. bint uses_mask,
  1040. uint8_t[:, ::1] result_mask,
  1041. Py_ssize_t ncounts,
  1042. Py_ssize_t K,
  1043. int64_t[:, ::1] nobs,
  1044. int64_t min_count,
  1045. mincount_t[:, ::1] resx,
  1046. ) nogil:
  1047. """
  1048. Check if the number of observations for a group is below min_count,
  1049. and if so set the result for that group to the appropriate NA-like value.
  1050. """
  1051. cdef:
  1052. Py_ssize_t i, j
  1053. for i in range(ncounts):
  1054. for j in range(K):
  1055. if nobs[i, j] < min_count:
  1056. # if we are integer dtype, not is_datetimelike, and
  1057. # not uses_mask, then getting here implies that
  1058. # counts[i] < min_count, which means we will
  1059. # be cast to float64 and masked at the end
  1060. # of WrappedCythonOp._call_cython_op. So we can safely
  1061. # set a placeholder value in out[i, j].
  1062. if uses_mask:
  1063. result_mask[i, j] = True
  1064. # set out[i, j] to 0 to be deterministic, as
  1065. # it was initialized with np.empty. Also ensures
  1066. # we can downcast out if appropriate.
  1067. out[i, j] = 0
  1068. elif (
  1069. mincount_t is float32_t
  1070. or mincount_t is float64_t
  1071. or mincount_t is complex64_t
  1072. or mincount_t is complex128_t
  1073. ):
  1074. out[i, j] = NAN
  1075. elif mincount_t is int64_t:
  1076. # Per above, this is a placeholder in
  1077. # non-is_datetimelike cases.
  1078. out[i, j] = NPY_NAT
  1079. else:
  1080. # placeholder, see above
  1081. out[i, j] = 0
  1082. else:
  1083. out[i, j] = resx[i, j]
  1084. # TODO(cython3): GH#31710 use memorviews once cython 0.30 is released so we can
  1085. # use `const numeric_object_t[:, :] values`
  1086. @cython.wraparound(False)
  1087. @cython.boundscheck(False)
  1088. def group_last(
  1089. numeric_object_t[:, ::1] out,
  1090. int64_t[::1] counts,
  1091. ndarray[numeric_object_t, ndim=2] values,
  1092. const intp_t[::1] labels,
  1093. const uint8_t[:, :] mask,
  1094. uint8_t[:, ::1] result_mask=None,
  1095. Py_ssize_t min_count=-1,
  1096. bint is_datetimelike=False,
  1097. ) -> None:
  1098. """
  1099. Only aggregates on axis=0
  1100. """
  1101. cdef:
  1102. Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
  1103. numeric_object_t val
  1104. numeric_object_t[:, ::1] resx
  1105. int64_t[:, ::1] nobs
  1106. bint uses_mask = mask is not None
  1107. bint isna_entry
  1108. # TODO(cython3):
  1109. # Instead of `labels.shape[0]` use `len(labels)`
  1110. if not len(values) == labels.shape[0]:
  1111. raise AssertionError("len(index) != len(labels)")
  1112. min_count = max(min_count, 1)
  1113. nobs = np.zeros((<object>out).shape, dtype=np.int64)
  1114. if numeric_object_t is object:
  1115. resx = np.empty((<object>out).shape, dtype=object)
  1116. else:
  1117. resx = np.empty_like(out)
  1118. N, K = (<object>values).shape
  1119. if numeric_object_t is object:
  1120. # TODO(cython3): De-duplicate once conditional-nogil is available
  1121. for i in range(N):
  1122. lab = labels[i]
  1123. if lab < 0:
  1124. continue
  1125. counts[lab] += 1
  1126. for j in range(K):
  1127. val = values[i, j]
  1128. if uses_mask:
  1129. isna_entry = mask[i, j]
  1130. else:
  1131. isna_entry = checknull(val)
  1132. if not isna_entry:
  1133. # TODO(cython3): use _treat_as_na here once
  1134. # conditional-nogil is available.
  1135. nobs[lab, j] += 1
  1136. resx[lab, j] = val
  1137. for i in range(ncounts):
  1138. for j in range(K):
  1139. if nobs[i, j] < min_count:
  1140. out[i, j] = None
  1141. else:
  1142. out[i, j] = resx[i, j]
  1143. else:
  1144. with nogil:
  1145. for i in range(N):
  1146. lab = labels[i]
  1147. if lab < 0:
  1148. continue
  1149. counts[lab] += 1
  1150. for j in range(K):
  1151. val = values[i, j]
  1152. if uses_mask:
  1153. isna_entry = mask[i, j]
  1154. else:
  1155. isna_entry = _treat_as_na(val, is_datetimelike)
  1156. if not isna_entry:
  1157. nobs[lab, j] += 1
  1158. resx[lab, j] = val
  1159. _check_below_mincount(
  1160. out, uses_mask, result_mask, ncounts, K, nobs, min_count, resx
  1161. )
  1162. # TODO(cython3): GH#31710 use memorviews once cython 0.30 is released so we can
  1163. # use `const numeric_object_t[:, :] values`
  1164. @cython.wraparound(False)
  1165. @cython.boundscheck(False)
  1166. def group_nth(
  1167. numeric_object_t[:, ::1] out,
  1168. int64_t[::1] counts,
  1169. ndarray[numeric_object_t, ndim=2] values,
  1170. const intp_t[::1] labels,
  1171. const uint8_t[:, :] mask,
  1172. uint8_t[:, ::1] result_mask=None,
  1173. int64_t min_count=-1,
  1174. int64_t rank=1,
  1175. bint is_datetimelike=False,
  1176. ) -> None:
  1177. """
  1178. Only aggregates on axis=0
  1179. """
  1180. cdef:
  1181. Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
  1182. numeric_object_t val
  1183. numeric_object_t[:, ::1] resx
  1184. int64_t[:, ::1] nobs
  1185. bint uses_mask = mask is not None
  1186. bint isna_entry
  1187. # TODO(cython3):
  1188. # Instead of `labels.shape[0]` use `len(labels)`
  1189. if not len(values) == labels.shape[0]:
  1190. raise AssertionError("len(index) != len(labels)")
  1191. min_count = max(min_count, 1)
  1192. nobs = np.zeros((<object>out).shape, dtype=np.int64)
  1193. if numeric_object_t is object:
  1194. resx = np.empty((<object>out).shape, dtype=object)
  1195. else:
  1196. resx = np.empty_like(out)
  1197. N, K = (<object>values).shape
  1198. if numeric_object_t is object:
  1199. # TODO(cython3): De-duplicate once conditional-nogil is available
  1200. for i in range(N):
  1201. lab = labels[i]
  1202. if lab < 0:
  1203. continue
  1204. counts[lab] += 1
  1205. for j in range(K):
  1206. val = values[i, j]
  1207. if uses_mask:
  1208. isna_entry = mask[i, j]
  1209. else:
  1210. isna_entry = checknull(val)
  1211. if not isna_entry:
  1212. # TODO(cython3): use _treat_as_na here once
  1213. # conditional-nogil is available.
  1214. nobs[lab, j] += 1
  1215. if nobs[lab, j] == rank:
  1216. resx[lab, j] = val
  1217. for i in range(ncounts):
  1218. for j in range(K):
  1219. if nobs[i, j] < min_count:
  1220. out[i, j] = None
  1221. else:
  1222. out[i, j] = resx[i, j]
  1223. else:
  1224. with nogil:
  1225. for i in range(N):
  1226. lab = labels[i]
  1227. if lab < 0:
  1228. continue
  1229. counts[lab] += 1
  1230. for j in range(K):
  1231. val = values[i, j]
  1232. if uses_mask:
  1233. isna_entry = mask[i, j]
  1234. else:
  1235. isna_entry = _treat_as_na(val, is_datetimelike)
  1236. if not isna_entry:
  1237. nobs[lab, j] += 1
  1238. if nobs[lab, j] == rank:
  1239. resx[lab, j] = val
  1240. _check_below_mincount(
  1241. out, uses_mask, result_mask, ncounts, K, nobs, min_count, resx
  1242. )
  1243. @cython.boundscheck(False)
  1244. @cython.wraparound(False)
  1245. def group_rank(
  1246. float64_t[:, ::1] out,
  1247. ndarray[numeric_object_t, ndim=2] values,
  1248. const intp_t[::1] labels,
  1249. int ngroups,
  1250. bint is_datetimelike,
  1251. str ties_method="average",
  1252. bint ascending=True,
  1253. bint pct=False,
  1254. str na_option="keep",
  1255. const uint8_t[:, :] mask=None,
  1256. ) -> None:
  1257. """
  1258. Provides the rank of values within each group.
  1259. Parameters
  1260. ----------
  1261. out : np.ndarray[np.float64, ndim=2]
  1262. Values to which this method will write its results.
  1263. values : np.ndarray of numeric_object_t values to be ranked
  1264. labels : np.ndarray[np.intp]
  1265. Array containing unique label for each group, with its ordering
  1266. matching up to the corresponding record in `values`
  1267. ngroups : int
  1268. This parameter is not used, is needed to match signatures of other
  1269. groupby functions.
  1270. is_datetimelike : bool
  1271. True if `values` contains datetime-like entries.
  1272. ties_method : {'average', 'min', 'max', 'first', 'dense'}, default 'average'
  1273. * average: average rank of group
  1274. * min: lowest rank in group
  1275. * max: highest rank in group
  1276. * first: ranks assigned in order they appear in the array
  1277. * dense: like 'min', but rank always increases by 1 between groups
  1278. ascending : bool, default True
  1279. False for ranks by high (1) to low (N)
  1280. na_option : {'keep', 'top', 'bottom'}, default 'keep'
  1281. pct : bool, default False
  1282. Compute percentage rank of data within each group
  1283. na_option : {'keep', 'top', 'bottom'}, default 'keep'
  1284. * keep: leave NA values where they are
  1285. * top: smallest rank if ascending
  1286. * bottom: smallest rank if descending
  1287. mask : np.ndarray[bool] or None, default None
  1288. Notes
  1289. -----
  1290. This method modifies the `out` parameter rather than returning an object
  1291. """
  1292. cdef:
  1293. Py_ssize_t i, k, N
  1294. ndarray[float64_t, ndim=1] result
  1295. const uint8_t[:] sub_mask
  1296. N = values.shape[1]
  1297. for k in range(N):
  1298. if mask is None:
  1299. sub_mask = None
  1300. else:
  1301. sub_mask = mask[:, k]
  1302. result = rank_1d(
  1303. values=values[:, k],
  1304. labels=labels,
  1305. is_datetimelike=is_datetimelike,
  1306. ties_method=ties_method,
  1307. ascending=ascending,
  1308. pct=pct,
  1309. na_option=na_option,
  1310. mask=sub_mask,
  1311. )
  1312. for i in range(len(result)):
  1313. if labels[i] >= 0:
  1314. out[i, k] = result[i]
  1315. # ----------------------------------------------------------------------
  1316. # group_min, group_max
  1317. # ----------------------------------------------------------------------
  1318. @cython.wraparound(False)
  1319. @cython.boundscheck(False)
  1320. cdef group_min_max(
  1321. numeric_t[:, ::1] out,
  1322. int64_t[::1] counts,
  1323. ndarray[numeric_t, ndim=2] values,
  1324. const intp_t[::1] labels,
  1325. Py_ssize_t min_count=-1,
  1326. bint is_datetimelike=False,
  1327. bint compute_max=True,
  1328. const uint8_t[:, ::1] mask=None,
  1329. uint8_t[:, ::1] result_mask=None,
  1330. ):
  1331. """
  1332. Compute minimum/maximum of columns of `values`, in row groups `labels`.
  1333. Parameters
  1334. ----------
  1335. out : np.ndarray[numeric_t, ndim=2]
  1336. Array to store result in.
  1337. counts : np.ndarray[int64]
  1338. Input as a zeroed array, populated by group sizes during algorithm
  1339. values : array
  1340. Values to find column-wise min/max of.
  1341. labels : np.ndarray[np.intp]
  1342. Labels to group by.
  1343. min_count : Py_ssize_t, default -1
  1344. The minimum number of non-NA group elements, NA result if threshold
  1345. is not met
  1346. is_datetimelike : bool
  1347. True if `values` contains datetime-like entries.
  1348. compute_max : bint, default True
  1349. True to compute group-wise max, False to compute min
  1350. mask : ndarray[bool, ndim=2], optional
  1351. If not None, indices represent missing values,
  1352. otherwise the mask will not be used
  1353. result_mask : ndarray[bool, ndim=2], optional
  1354. If not None, these specify locations in the output that are NA.
  1355. Modified in-place.
  1356. Notes
  1357. -----
  1358. This method modifies the `out` parameter, rather than returning an object.
  1359. `counts` is modified to hold group sizes
  1360. """
  1361. cdef:
  1362. Py_ssize_t i, j, N, K, lab, ngroups = len(counts)
  1363. numeric_t val
  1364. numeric_t[:, ::1] group_min_or_max
  1365. int64_t[:, ::1] nobs
  1366. bint uses_mask = mask is not None
  1367. bint isna_entry
  1368. # TODO(cython3):
  1369. # Instead of `labels.shape[0]` use `len(labels)`
  1370. if not len(values) == labels.shape[0]:
  1371. raise AssertionError("len(index) != len(labels)")
  1372. min_count = max(min_count, 1)
  1373. nobs = np.zeros((<object>out).shape, dtype=np.int64)
  1374. group_min_or_max = np.empty_like(out)
  1375. group_min_or_max[:] = _get_min_or_max(<numeric_t>0, compute_max, is_datetimelike)
  1376. N, K = (<object>values).shape
  1377. with nogil:
  1378. for i in range(N):
  1379. lab = labels[i]
  1380. if lab < 0:
  1381. continue
  1382. counts[lab] += 1
  1383. for j in range(K):
  1384. val = values[i, j]
  1385. if uses_mask:
  1386. isna_entry = mask[i, j]
  1387. else:
  1388. isna_entry = _treat_as_na(val, is_datetimelike)
  1389. if not isna_entry:
  1390. nobs[lab, j] += 1
  1391. if compute_max:
  1392. if val > group_min_or_max[lab, j]:
  1393. group_min_or_max[lab, j] = val
  1394. else:
  1395. if val < group_min_or_max[lab, j]:
  1396. group_min_or_max[lab, j] = val
  1397. _check_below_mincount(
  1398. out, uses_mask, result_mask, ngroups, K, nobs, min_count, group_min_or_max
  1399. )
  1400. @cython.wraparound(False)
  1401. @cython.boundscheck(False)
  1402. def group_max(
  1403. numeric_t[:, ::1] out,
  1404. int64_t[::1] counts,
  1405. ndarray[numeric_t, ndim=2] values,
  1406. const intp_t[::1] labels,
  1407. Py_ssize_t min_count=-1,
  1408. bint is_datetimelike=False,
  1409. const uint8_t[:, ::1] mask=None,
  1410. uint8_t[:, ::1] result_mask=None,
  1411. ) -> None:
  1412. """See group_min_max.__doc__"""
  1413. group_min_max(
  1414. out,
  1415. counts,
  1416. values,
  1417. labels,
  1418. min_count=min_count,
  1419. is_datetimelike=is_datetimelike,
  1420. compute_max=True,
  1421. mask=mask,
  1422. result_mask=result_mask,
  1423. )
  1424. @cython.wraparound(False)
  1425. @cython.boundscheck(False)
  1426. def group_min(
  1427. numeric_t[:, ::1] out,
  1428. int64_t[::1] counts,
  1429. ndarray[numeric_t, ndim=2] values,
  1430. const intp_t[::1] labels,
  1431. Py_ssize_t min_count=-1,
  1432. bint is_datetimelike=False,
  1433. const uint8_t[:, ::1] mask=None,
  1434. uint8_t[:, ::1] result_mask=None,
  1435. ) -> None:
  1436. """See group_min_max.__doc__"""
  1437. group_min_max(
  1438. out,
  1439. counts,
  1440. values,
  1441. labels,
  1442. min_count=min_count,
  1443. is_datetimelike=is_datetimelike,
  1444. compute_max=False,
  1445. mask=mask,
  1446. result_mask=result_mask,
  1447. )
  1448. @cython.boundscheck(False)
  1449. @cython.wraparound(False)
  1450. cdef group_cummin_max(
  1451. numeric_t[:, ::1] out,
  1452. ndarray[numeric_t, ndim=2] values,
  1453. const uint8_t[:, ::1] mask,
  1454. uint8_t[:, ::1] result_mask,
  1455. const intp_t[::1] labels,
  1456. int ngroups,
  1457. bint is_datetimelike,
  1458. bint skipna,
  1459. bint compute_max,
  1460. ):
  1461. """
  1462. Cumulative minimum/maximum of columns of `values`, in row groups `labels`.
  1463. Parameters
  1464. ----------
  1465. out : np.ndarray[numeric_t, ndim=2]
  1466. Array to store cummin/max in.
  1467. values : np.ndarray[numeric_t, ndim=2]
  1468. Values to take cummin/max of.
  1469. mask : np.ndarray[bool] or None
  1470. If not None, indices represent missing values,
  1471. otherwise the mask will not be used
  1472. result_mask : ndarray[bool, ndim=2], optional
  1473. If not None, these specify locations in the output that are NA.
  1474. Modified in-place.
  1475. labels : np.ndarray[np.intp]
  1476. Labels to group by.
  1477. ngroups : int
  1478. Number of groups, larger than all entries of `labels`.
  1479. is_datetimelike : bool
  1480. True if `values` contains datetime-like entries.
  1481. skipna : bool
  1482. If True, ignore nans in `values`.
  1483. compute_max : bool
  1484. True if cumulative maximum should be computed, False
  1485. if cumulative minimum should be computed
  1486. Notes
  1487. -----
  1488. This method modifies the `out` parameter, rather than returning an object.
  1489. """
  1490. cdef:
  1491. numeric_t[:, ::1] accum
  1492. Py_ssize_t i, j, N, K
  1493. numeric_t val, mval, na_val
  1494. uint8_t[:, ::1] seen_na
  1495. intp_t lab
  1496. bint na_possible
  1497. bint uses_mask = mask is not None
  1498. bint isna_entry
  1499. accum = np.empty((ngroups, (<object>values).shape[1]), dtype=values.dtype)
  1500. accum[:] = _get_min_or_max(<numeric_t>0, compute_max, is_datetimelike)
  1501. na_val = _get_na_val(<numeric_t>0, is_datetimelike)
  1502. if uses_mask:
  1503. na_possible = True
  1504. # Will never be used, just to avoid uninitialized warning
  1505. na_val = 0
  1506. elif numeric_t is float64_t or numeric_t is float32_t:
  1507. na_possible = True
  1508. elif is_datetimelike:
  1509. na_possible = True
  1510. else:
  1511. # Will never be used, just to avoid uninitialized warning
  1512. na_possible = False
  1513. if na_possible:
  1514. seen_na = np.zeros((<object>accum).shape, dtype=np.uint8)
  1515. N, K = (<object>values).shape
  1516. with nogil:
  1517. for i in range(N):
  1518. lab = labels[i]
  1519. if lab < 0:
  1520. continue
  1521. for j in range(K):
  1522. if not skipna and na_possible and seen_na[lab, j]:
  1523. if uses_mask:
  1524. result_mask[i, j] = 1
  1525. # Set to 0 ensures that we are deterministic and can
  1526. # downcast if appropriate
  1527. out[i, j] = 0
  1528. else:
  1529. out[i, j] = na_val
  1530. else:
  1531. val = values[i, j]
  1532. if uses_mask:
  1533. isna_entry = mask[i, j]
  1534. else:
  1535. isna_entry = _treat_as_na(val, is_datetimelike)
  1536. if not isna_entry:
  1537. mval = accum[lab, j]
  1538. if compute_max:
  1539. if val > mval:
  1540. accum[lab, j] = mval = val
  1541. else:
  1542. if val < mval:
  1543. accum[lab, j] = mval = val
  1544. out[i, j] = mval
  1545. else:
  1546. seen_na[lab, j] = 1
  1547. out[i, j] = val
  1548. @cython.boundscheck(False)
  1549. @cython.wraparound(False)
  1550. def group_cummin(
  1551. numeric_t[:, ::1] out,
  1552. ndarray[numeric_t, ndim=2] values,
  1553. const intp_t[::1] labels,
  1554. int ngroups,
  1555. bint is_datetimelike,
  1556. const uint8_t[:, ::1] mask=None,
  1557. uint8_t[:, ::1] result_mask=None,
  1558. bint skipna=True,
  1559. ) -> None:
  1560. """See group_cummin_max.__doc__"""
  1561. group_cummin_max(
  1562. out=out,
  1563. values=values,
  1564. mask=mask,
  1565. result_mask=result_mask,
  1566. labels=labels,
  1567. ngroups=ngroups,
  1568. is_datetimelike=is_datetimelike,
  1569. skipna=skipna,
  1570. compute_max=False,
  1571. )
  1572. @cython.boundscheck(False)
  1573. @cython.wraparound(False)
  1574. def group_cummax(
  1575. numeric_t[:, ::1] out,
  1576. ndarray[numeric_t, ndim=2] values,
  1577. const intp_t[::1] labels,
  1578. int ngroups,
  1579. bint is_datetimelike,
  1580. const uint8_t[:, ::1] mask=None,
  1581. uint8_t[:, ::1] result_mask=None,
  1582. bint skipna=True,
  1583. ) -> None:
  1584. """See group_cummin_max.__doc__"""
  1585. group_cummin_max(
  1586. out=out,
  1587. values=values,
  1588. mask=mask,
  1589. result_mask=result_mask,
  1590. labels=labels,
  1591. ngroups=ngroups,
  1592. is_datetimelike=is_datetimelike,
  1593. skipna=skipna,
  1594. compute_max=True,
  1595. )