algos.pyx 50 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536
  1. cimport cython
  2. from cython cimport Py_ssize_t
  3. from libc.math cimport (
  4. fabs,
  5. sqrt,
  6. )
  7. from libc.stdlib cimport (
  8. free,
  9. malloc,
  10. )
  11. from libc.string cimport memmove
  12. import numpy as np
  13. cimport numpy as cnp
  14. from numpy cimport (
  15. NPY_FLOAT64,
  16. NPY_INT8,
  17. NPY_INT16,
  18. NPY_INT32,
  19. NPY_INT64,
  20. NPY_OBJECT,
  21. NPY_UINT64,
  22. float32_t,
  23. float64_t,
  24. int8_t,
  25. int16_t,
  26. int32_t,
  27. int64_t,
  28. intp_t,
  29. ndarray,
  30. uint8_t,
  31. uint16_t,
  32. uint32_t,
  33. uint64_t,
  34. )
  35. cnp.import_array()
  36. cimport pandas._libs.util as util
  37. from pandas._libs.dtypes cimport (
  38. numeric_object_t,
  39. numeric_t,
  40. )
  41. from pandas._libs.khash cimport (
  42. kh_destroy_int64,
  43. kh_get_int64,
  44. kh_init_int64,
  45. kh_int64_t,
  46. kh_put_int64,
  47. kh_resize_int64,
  48. khiter_t,
  49. )
  50. from pandas._libs.util cimport get_nat
  51. import pandas._libs.missing as missing
  52. cdef:
  53. float64_t FP_ERR = 1e-13
  54. float64_t NaN = <float64_t>np.NaN
  55. int64_t NPY_NAT = get_nat()
  56. tiebreakers = {
  57. "average": TIEBREAK_AVERAGE,
  58. "min": TIEBREAK_MIN,
  59. "max": TIEBREAK_MAX,
  60. "first": TIEBREAK_FIRST,
  61. "dense": TIEBREAK_DENSE,
  62. }
  63. cdef bint are_diff(object left, object right):
  64. try:
  65. return fabs(left - right) > FP_ERR
  66. except TypeError:
  67. return left != right
  68. class Infinity:
  69. """
  70. Provide a positive Infinity comparison method for ranking.
  71. """
  72. def __lt__(self, other):
  73. return False
  74. def __le__(self, other):
  75. return isinstance(other, Infinity)
  76. def __eq__(self, other):
  77. return isinstance(other, Infinity)
  78. def __ne__(self, other):
  79. return not isinstance(other, Infinity)
  80. def __gt__(self, other):
  81. return (not isinstance(other, Infinity) and
  82. not missing.checknull(other))
  83. def __ge__(self, other):
  84. return not missing.checknull(other)
  85. class NegInfinity:
  86. """
  87. Provide a negative Infinity comparison method for ranking.
  88. """
  89. def __lt__(self, other):
  90. return (not isinstance(other, NegInfinity) and
  91. not missing.checknull(other))
  92. def __le__(self, other):
  93. return not missing.checknull(other)
  94. def __eq__(self, other):
  95. return isinstance(other, NegInfinity)
  96. def __ne__(self, other):
  97. return not isinstance(other, NegInfinity)
  98. def __gt__(self, other):
  99. return False
  100. def __ge__(self, other):
  101. return isinstance(other, NegInfinity)
  102. @cython.wraparound(False)
  103. @cython.boundscheck(False)
  104. cpdef ndarray[int64_t, ndim=1] unique_deltas(const int64_t[:] arr):
  105. """
  106. Efficiently find the unique first-differences of the given array.
  107. Parameters
  108. ----------
  109. arr : ndarray[int64_t]
  110. Returns
  111. -------
  112. ndarray[int64_t]
  113. An ordered ndarray[int64_t]
  114. """
  115. cdef:
  116. Py_ssize_t i, n = len(arr)
  117. int64_t val
  118. khiter_t k
  119. kh_int64_t *table
  120. int ret = 0
  121. list uniques = []
  122. ndarray[int64_t, ndim=1] result
  123. table = kh_init_int64()
  124. kh_resize_int64(table, 10)
  125. for i in range(n - 1):
  126. val = arr[i + 1] - arr[i]
  127. k = kh_get_int64(table, val)
  128. if k == table.n_buckets:
  129. kh_put_int64(table, val, &ret)
  130. uniques.append(val)
  131. kh_destroy_int64(table)
  132. result = np.array(uniques, dtype=np.int64)
  133. result.sort()
  134. return result
  135. @cython.wraparound(False)
  136. @cython.boundscheck(False)
  137. def is_lexsorted(list_of_arrays: list) -> bool:
  138. cdef:
  139. Py_ssize_t i
  140. Py_ssize_t n, nlevels
  141. int64_t k, cur, pre
  142. ndarray arr
  143. bint result = True
  144. nlevels = len(list_of_arrays)
  145. n = len(list_of_arrays[0])
  146. cdef int64_t **vecs = <int64_t**>malloc(nlevels * sizeof(int64_t*))
  147. for i in range(nlevels):
  148. arr = list_of_arrays[i]
  149. assert arr.dtype.name == "int64"
  150. vecs[i] = <int64_t*>cnp.PyArray_DATA(arr)
  151. # Assume uniqueness??
  152. with nogil:
  153. for i in range(1, n):
  154. for k in range(nlevels):
  155. cur = vecs[k][i]
  156. pre = vecs[k][i -1]
  157. if cur == pre:
  158. continue
  159. elif cur > pre:
  160. break
  161. else:
  162. result = False
  163. break
  164. if not result:
  165. break
  166. free(vecs)
  167. return result
  168. @cython.boundscheck(False)
  169. @cython.wraparound(False)
  170. def groupsort_indexer(const intp_t[:] index, Py_ssize_t ngroups):
  171. """
  172. Compute a 1-d indexer.
  173. The indexer is an ordering of the passed index,
  174. ordered by the groups.
  175. Parameters
  176. ----------
  177. index: np.ndarray[np.intp]
  178. Mappings from group -> position.
  179. ngroups: int64
  180. Number of groups.
  181. Returns
  182. -------
  183. ndarray[intp_t, ndim=1]
  184. Indexer
  185. ndarray[intp_t, ndim=1]
  186. Group Counts
  187. Notes
  188. -----
  189. This is a reverse of the label factorization process.
  190. """
  191. cdef:
  192. Py_ssize_t i, label, n
  193. intp_t[::1] indexer, where, counts
  194. counts = np.zeros(ngroups + 1, dtype=np.intp)
  195. n = len(index)
  196. indexer = np.zeros(n, dtype=np.intp)
  197. where = np.zeros(ngroups + 1, dtype=np.intp)
  198. with nogil:
  199. # count group sizes, location 0 for NA
  200. for i in range(n):
  201. counts[index[i] + 1] += 1
  202. # mark the start of each contiguous group of like-indexed data
  203. for i in range(1, ngroups + 1):
  204. where[i] = where[i - 1] + counts[i - 1]
  205. # this is our indexer
  206. for i in range(n):
  207. label = index[i] + 1
  208. indexer[where[label]] = i
  209. where[label] += 1
  210. return indexer.base, counts.base
  211. cdef Py_ssize_t swap(numeric_t *a, numeric_t *b) nogil:
  212. cdef:
  213. numeric_t t
  214. # cython doesn't allow pointer dereference so use array syntax
  215. t = a[0]
  216. a[0] = b[0]
  217. b[0] = t
  218. return 0
  219. cdef numeric_t kth_smallest_c(numeric_t* arr, Py_ssize_t k, Py_ssize_t n) nogil:
  220. """
  221. See kth_smallest.__doc__. The additional parameter n specifies the maximum
  222. number of elements considered in arr, needed for compatibility with usage
  223. in groupby.pyx
  224. """
  225. cdef:
  226. Py_ssize_t i, j, left, m
  227. numeric_t x
  228. left = 0
  229. m = n - 1
  230. while left < m:
  231. x = arr[k]
  232. i = left
  233. j = m
  234. while 1:
  235. while arr[i] < x:
  236. i += 1
  237. while x < arr[j]:
  238. j -= 1
  239. if i <= j:
  240. swap(&arr[i], &arr[j])
  241. i += 1
  242. j -= 1
  243. if i > j:
  244. break
  245. if j < k:
  246. left = i
  247. if k < i:
  248. m = j
  249. return arr[k]
  250. @cython.boundscheck(False)
  251. @cython.wraparound(False)
  252. def kth_smallest(numeric_t[::1] arr, Py_ssize_t k) -> numeric_t:
  253. """
  254. Compute the kth smallest value in arr. Note that the input
  255. array will be modified.
  256. Parameters
  257. ----------
  258. arr : numeric[::1]
  259. Array to compute the kth smallest value for, must be
  260. contiguous
  261. k : Py_ssize_t
  262. Returns
  263. -------
  264. numeric
  265. The kth smallest value in arr
  266. """
  267. cdef:
  268. numeric_t result
  269. with nogil:
  270. result = kth_smallest_c(&arr[0], k, arr.shape[0])
  271. return result
  272. # ----------------------------------------------------------------------
  273. # Pairwise correlation/covariance
  274. @cython.boundscheck(False)
  275. @cython.wraparound(False)
  276. @cython.cdivision(True)
  277. def nancorr(const float64_t[:, :] mat, bint cov=False, minp=None):
  278. cdef:
  279. Py_ssize_t i, xi, yi, N, K
  280. bint minpv
  281. float64_t[:, ::1] result
  282. ndarray[uint8_t, ndim=2] mask
  283. int64_t nobs = 0
  284. float64_t vx, vy, dx, dy, meanx, meany, divisor, ssqdmx, ssqdmy, covxy
  285. N, K = (<object>mat).shape
  286. if minp is None:
  287. minpv = 1
  288. else:
  289. minpv = <int>minp
  290. result = np.empty((K, K), dtype=np.float64)
  291. mask = np.isfinite(mat).view(np.uint8)
  292. with nogil:
  293. for xi in range(K):
  294. for yi in range(xi + 1):
  295. # Welford's method for the variance-calculation
  296. # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
  297. nobs = ssqdmx = ssqdmy = covxy = meanx = meany = 0
  298. for i in range(N):
  299. if mask[i, xi] and mask[i, yi]:
  300. vx = mat[i, xi]
  301. vy = mat[i, yi]
  302. nobs += 1
  303. dx = vx - meanx
  304. dy = vy - meany
  305. meanx += 1. / nobs * dx
  306. meany += 1. / nobs * dy
  307. ssqdmx += (vx - meanx) * dx
  308. ssqdmy += (vy - meany) * dy
  309. covxy += (vx - meanx) * dy
  310. if nobs < minpv:
  311. result[xi, yi] = result[yi, xi] = NaN
  312. else:
  313. divisor = (nobs - 1.0) if cov else sqrt(ssqdmx * ssqdmy)
  314. if divisor != 0:
  315. result[xi, yi] = result[yi, xi] = covxy / divisor
  316. else:
  317. result[xi, yi] = result[yi, xi] = NaN
  318. return result.base
  319. # ----------------------------------------------------------------------
  320. # Pairwise Spearman correlation
  321. @cython.boundscheck(False)
  322. @cython.wraparound(False)
  323. def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarray:
  324. cdef:
  325. Py_ssize_t i, xi, yi, N, K
  326. ndarray[float64_t, ndim=2] result
  327. ndarray[float64_t, ndim=2] ranked_mat
  328. ndarray[float64_t, ndim=1] rankedx, rankedy
  329. float64_t[::1] maskedx, maskedy
  330. ndarray[uint8_t, ndim=2] mask
  331. int64_t nobs = 0
  332. bint no_nans
  333. float64_t vx, vy, sumx, sumxx, sumyy, mean, divisor
  334. N, K = (<object>mat).shape
  335. # Handle the edge case where we know all results will be nan
  336. # to keep conditional logic inside loop simpler
  337. if N < minp:
  338. result = np.full((K, K), np.nan, dtype=np.float64)
  339. return result
  340. result = np.empty((K, K), dtype=np.float64)
  341. mask = np.isfinite(mat).view(np.uint8)
  342. no_nans = mask.all()
  343. ranked_mat = np.empty((N, K), dtype=np.float64)
  344. # Note: we index into maskedx, maskedy in loops up to nobs, but using N is safe
  345. # here since N >= nobs and values are stored contiguously
  346. maskedx = np.empty(N, dtype=np.float64)
  347. maskedy = np.empty(N, dtype=np.float64)
  348. for i in range(K):
  349. ranked_mat[:, i] = rank_1d(mat[:, i])
  350. with nogil:
  351. for xi in range(K):
  352. for yi in range(xi + 1):
  353. sumx = sumxx = sumyy = 0
  354. # Fastpath for data with no nans/infs, allows avoiding mask checks
  355. # and array reassignments
  356. if no_nans:
  357. mean = (N + 1) / 2.
  358. # now the cov numerator
  359. for i in range(N):
  360. vx = ranked_mat[i, xi] - mean
  361. vy = ranked_mat[i, yi] - mean
  362. sumx += vx * vy
  363. sumxx += vx * vx
  364. sumyy += vy * vy
  365. else:
  366. nobs = 0
  367. # Keep track of whether we need to recompute ranks
  368. all_ranks = True
  369. for i in range(N):
  370. all_ranks &= not (mask[i, xi] ^ mask[i, yi])
  371. if mask[i, xi] and mask[i, yi]:
  372. maskedx[nobs] = ranked_mat[i, xi]
  373. maskedy[nobs] = ranked_mat[i, yi]
  374. nobs += 1
  375. if nobs < minp:
  376. result[xi, yi] = result[yi, xi] = NaN
  377. continue
  378. else:
  379. if not all_ranks:
  380. with gil:
  381. # We need to slice back to nobs because rank_1d will
  382. # require arrays of nobs length
  383. rankedx = rank_1d(np.asarray(maskedx)[:nobs])
  384. rankedy = rank_1d(np.asarray(maskedy)[:nobs])
  385. for i in range(nobs):
  386. maskedx[i] = rankedx[i]
  387. maskedy[i] = rankedy[i]
  388. mean = (nobs + 1) / 2.
  389. # now the cov numerator
  390. for i in range(nobs):
  391. vx = maskedx[i] - mean
  392. vy = maskedy[i] - mean
  393. sumx += vx * vy
  394. sumxx += vx * vx
  395. sumyy += vy * vy
  396. divisor = sqrt(sumxx * sumyy)
  397. if divisor != 0:
  398. result[xi, yi] = result[yi, xi] = sumx / divisor
  399. else:
  400. result[xi, yi] = result[yi, xi] = NaN
  401. return result
  402. # ----------------------------------------------------------------------
  403. def validate_limit(nobs: int | None, limit=None) -> int:
  404. """
  405. Check that the `limit` argument is a positive integer.
  406. Parameters
  407. ----------
  408. nobs : int
  409. limit : object
  410. Returns
  411. -------
  412. int
  413. The limit.
  414. """
  415. if limit is None:
  416. lim = nobs
  417. else:
  418. if not util.is_integer_object(limit):
  419. raise ValueError("Limit must be an integer")
  420. if limit < 1:
  421. raise ValueError("Limit must be greater than 0")
  422. lim = limit
  423. return lim
  424. @cython.boundscheck(False)
  425. @cython.wraparound(False)
  426. def pad(
  427. ndarray[numeric_object_t] old,
  428. ndarray[numeric_object_t] new,
  429. limit=None
  430. ) -> ndarray:
  431. # -> ndarray[intp_t, ndim=1]
  432. cdef:
  433. Py_ssize_t i, j, nleft, nright
  434. ndarray[intp_t, ndim=1] indexer
  435. numeric_object_t cur, next_val
  436. int lim, fill_count = 0
  437. nleft = len(old)
  438. nright = len(new)
  439. indexer = np.empty(nright, dtype=np.intp)
  440. indexer[:] = -1
  441. lim = validate_limit(nright, limit)
  442. if nleft == 0 or nright == 0 or new[nright - 1] < old[0]:
  443. return indexer
  444. i = j = 0
  445. cur = old[0]
  446. while j <= nright - 1 and new[j] < cur:
  447. j += 1
  448. while True:
  449. if j == nright:
  450. break
  451. if i == nleft - 1:
  452. while j < nright:
  453. if new[j] == cur:
  454. indexer[j] = i
  455. elif new[j] > cur and fill_count < lim:
  456. indexer[j] = i
  457. fill_count += 1
  458. j += 1
  459. break
  460. next_val = old[i + 1]
  461. while j < nright and cur <= new[j] < next_val:
  462. if new[j] == cur:
  463. indexer[j] = i
  464. elif fill_count < lim:
  465. indexer[j] = i
  466. fill_count += 1
  467. j += 1
  468. fill_count = 0
  469. i += 1
  470. cur = next_val
  471. return indexer
  472. @cython.boundscheck(False)
  473. @cython.wraparound(False)
  474. def pad_inplace(numeric_object_t[:] values, uint8_t[:] mask, limit=None):
  475. cdef:
  476. Py_ssize_t i, N
  477. numeric_object_t val
  478. uint8_t prev_mask
  479. int lim, fill_count = 0
  480. N = len(values)
  481. # GH#2778
  482. if N == 0:
  483. return
  484. lim = validate_limit(N, limit)
  485. val = values[0]
  486. prev_mask = mask[0]
  487. for i in range(N):
  488. if mask[i]:
  489. if fill_count >= lim:
  490. continue
  491. fill_count += 1
  492. values[i] = val
  493. mask[i] = prev_mask
  494. else:
  495. fill_count = 0
  496. val = values[i]
  497. prev_mask = mask[i]
  498. @cython.boundscheck(False)
  499. @cython.wraparound(False)
  500. def pad_2d_inplace(numeric_object_t[:, :] values, uint8_t[:, :] mask, limit=None):
  501. cdef:
  502. Py_ssize_t i, j, N, K
  503. numeric_object_t val
  504. int lim, fill_count = 0
  505. K, N = (<object>values).shape
  506. # GH#2778
  507. if N == 0:
  508. return
  509. lim = validate_limit(N, limit)
  510. for j in range(K):
  511. fill_count = 0
  512. val = values[j, 0]
  513. for i in range(N):
  514. if mask[j, i]:
  515. if fill_count >= lim or i == 0:
  516. continue
  517. fill_count += 1
  518. values[j, i] = val
  519. mask[j, i] = False
  520. else:
  521. fill_count = 0
  522. val = values[j, i]
  523. @cython.boundscheck(False)
  524. @cython.wraparound(False)
  525. def backfill(
  526. ndarray[numeric_object_t] old,
  527. ndarray[numeric_object_t] new,
  528. limit=None
  529. ) -> ndarray: # -> ndarray[intp_t, ndim=1]
  530. """
  531. Backfilling logic for generating fill vector
  532. Diagram of what's going on
  533. Old New Fill vector Mask
  534. . 0 1
  535. . 0 1
  536. . 0 1
  537. A A 0 1
  538. . 1 1
  539. . 1 1
  540. . 1 1
  541. . 1 1
  542. . 1 1
  543. B B 1 1
  544. . 2 1
  545. . 2 1
  546. . 2 1
  547. C C 2 1
  548. . 0
  549. . 0
  550. D
  551. """
  552. cdef:
  553. Py_ssize_t i, j, nleft, nright
  554. ndarray[intp_t, ndim=1] indexer
  555. numeric_object_t cur, prev
  556. int lim, fill_count = 0
  557. nleft = len(old)
  558. nright = len(new)
  559. indexer = np.empty(nright, dtype=np.intp)
  560. indexer[:] = -1
  561. lim = validate_limit(nright, limit)
  562. if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]:
  563. return indexer
  564. i = nleft - 1
  565. j = nright - 1
  566. cur = old[nleft - 1]
  567. while j >= 0 and new[j] > cur:
  568. j -= 1
  569. while True:
  570. if j < 0:
  571. break
  572. if i == 0:
  573. while j >= 0:
  574. if new[j] == cur:
  575. indexer[j] = i
  576. elif new[j] < cur and fill_count < lim:
  577. indexer[j] = i
  578. fill_count += 1
  579. j -= 1
  580. break
  581. prev = old[i - 1]
  582. while j >= 0 and prev < new[j] <= cur:
  583. if new[j] == cur:
  584. indexer[j] = i
  585. elif new[j] < cur and fill_count < lim:
  586. indexer[j] = i
  587. fill_count += 1
  588. j -= 1
  589. fill_count = 0
  590. i -= 1
  591. cur = prev
  592. return indexer
  593. def backfill_inplace(numeric_object_t[:] values, uint8_t[:] mask, limit=None):
  594. pad_inplace(values[::-1], mask[::-1], limit=limit)
  595. def backfill_2d_inplace(numeric_object_t[:, :] values,
  596. uint8_t[:, :] mask,
  597. limit=None):
  598. pad_2d_inplace(values[:, ::-1], mask[:, ::-1], limit)
  599. @cython.boundscheck(False)
  600. @cython.wraparound(False)
  601. def is_monotonic(ndarray[numeric_object_t, ndim=1] arr, bint timelike):
  602. """
  603. Returns
  604. -------
  605. tuple
  606. is_monotonic_inc : bool
  607. is_monotonic_dec : bool
  608. is_unique : bool
  609. """
  610. cdef:
  611. Py_ssize_t i, n
  612. numeric_object_t prev, cur
  613. bint is_monotonic_inc = 1
  614. bint is_monotonic_dec = 1
  615. bint is_unique = 1
  616. bint is_strict_monotonic = 1
  617. n = len(arr)
  618. if n == 1:
  619. if arr[0] != arr[0] or (numeric_object_t is int64_t and timelike and
  620. arr[0] == NPY_NAT):
  621. # single value is NaN
  622. return False, False, True
  623. else:
  624. return True, True, True
  625. elif n < 2:
  626. return True, True, True
  627. if timelike and <int64_t>arr[0] == NPY_NAT:
  628. return False, False, True
  629. if numeric_object_t is not object:
  630. with nogil:
  631. prev = arr[0]
  632. for i in range(1, n):
  633. cur = arr[i]
  634. if timelike and <int64_t>cur == NPY_NAT:
  635. is_monotonic_inc = 0
  636. is_monotonic_dec = 0
  637. break
  638. if cur < prev:
  639. is_monotonic_inc = 0
  640. elif cur > prev:
  641. is_monotonic_dec = 0
  642. elif cur == prev:
  643. is_unique = 0
  644. else:
  645. # cur or prev is NaN
  646. is_monotonic_inc = 0
  647. is_monotonic_dec = 0
  648. break
  649. if not is_monotonic_inc and not is_monotonic_dec:
  650. is_monotonic_inc = 0
  651. is_monotonic_dec = 0
  652. break
  653. prev = cur
  654. else:
  655. # object-dtype, identical to above except we cannot use `with nogil`
  656. prev = arr[0]
  657. for i in range(1, n):
  658. cur = arr[i]
  659. if timelike and <int64_t>cur == NPY_NAT:
  660. is_monotonic_inc = 0
  661. is_monotonic_dec = 0
  662. break
  663. if cur < prev:
  664. is_monotonic_inc = 0
  665. elif cur > prev:
  666. is_monotonic_dec = 0
  667. elif cur == prev:
  668. is_unique = 0
  669. else:
  670. # cur or prev is NaN
  671. is_monotonic_inc = 0
  672. is_monotonic_dec = 0
  673. break
  674. if not is_monotonic_inc and not is_monotonic_dec:
  675. is_monotonic_inc = 0
  676. is_monotonic_dec = 0
  677. break
  678. prev = cur
  679. is_strict_monotonic = is_unique and (is_monotonic_inc or is_monotonic_dec)
  680. return is_monotonic_inc, is_monotonic_dec, is_strict_monotonic
  681. # ----------------------------------------------------------------------
  682. # rank_1d, rank_2d
  683. # ----------------------------------------------------------------------
  684. cdef numeric_object_t get_rank_nan_fill_val(
  685. bint rank_nans_highest,
  686. numeric_object_t val,
  687. bint is_datetimelike=False,
  688. ):
  689. """
  690. Return the value we'll use to represent missing values when sorting depending
  691. on if we'd like missing values to end up at the top/bottom. (The second parameter
  692. is unused, but needed for fused type specialization)
  693. """
  694. if numeric_object_t is int64_t and is_datetimelike and not rank_nans_highest:
  695. return NPY_NAT + 1
  696. if rank_nans_highest:
  697. if numeric_object_t is object:
  698. return Infinity()
  699. elif numeric_object_t is int64_t:
  700. return util.INT64_MAX
  701. elif numeric_object_t is int32_t:
  702. return util.INT32_MAX
  703. elif numeric_object_t is int16_t:
  704. return util.INT16_MAX
  705. elif numeric_object_t is int8_t:
  706. return util.INT8_MAX
  707. elif numeric_object_t is uint64_t:
  708. return util.UINT64_MAX
  709. elif numeric_object_t is uint32_t:
  710. return util.UINT32_MAX
  711. elif numeric_object_t is uint16_t:
  712. return util.UINT16_MAX
  713. elif numeric_object_t is uint8_t:
  714. return util.UINT8_MAX
  715. else:
  716. return np.inf
  717. else:
  718. if numeric_object_t is object:
  719. return NegInfinity()
  720. elif numeric_object_t is int64_t:
  721. # Note(jbrockmendel) 2022-03-15 for reasons unknown, using util.INT64_MIN
  722. # instead of NPY_NAT here causes build warnings and failure in
  723. # test_cummax_i8_at_implementation_bound
  724. return NPY_NAT
  725. elif numeric_object_t is int32_t:
  726. return util.INT32_MIN
  727. elif numeric_object_t is int16_t:
  728. return util.INT16_MIN
  729. elif numeric_object_t is int8_t:
  730. return util.INT8_MIN
  731. elif numeric_object_t is uint64_t:
  732. return 0
  733. elif numeric_object_t is uint32_t:
  734. return 0
  735. elif numeric_object_t is uint16_t:
  736. return 0
  737. elif numeric_object_t is uint8_t:
  738. return 0
  739. else:
  740. return -np.inf
  741. @cython.wraparound(False)
  742. @cython.boundscheck(False)
  743. def rank_1d(
  744. ndarray[numeric_object_t, ndim=1] values,
  745. const intp_t[:] labels=None,
  746. bint is_datetimelike=False,
  747. ties_method="average",
  748. bint ascending=True,
  749. bint pct=False,
  750. na_option="keep",
  751. const uint8_t[:] mask=None,
  752. ):
  753. """
  754. Fast NaN-friendly version of ``scipy.stats.rankdata``.
  755. Parameters
  756. ----------
  757. values : array of numeric_object_t values to be ranked
  758. labels : np.ndarray[np.intp] or None
  759. Array containing unique label for each group, with its ordering
  760. matching up to the corresponding record in `values`. If not called
  761. from a groupby operation, will be None.
  762. is_datetimelike : bool, default False
  763. True if `values` contains datetime-like entries.
  764. ties_method : {'average', 'min', 'max', 'first', 'dense'}, default
  765. 'average'
  766. * average: average rank of group
  767. * min: lowest rank in group
  768. * max: highest rank in group
  769. * first: ranks assigned in order they appear in the array
  770. * dense: like 'min', but rank always increases by 1 between groups
  771. ascending : bool, default True
  772. False for ranks by high (1) to low (N)
  773. na_option : {'keep', 'top', 'bottom'}, default 'keep'
  774. pct : bool, default False
  775. Compute percentage rank of data within each group
  776. na_option : {'keep', 'top', 'bottom'}, default 'keep'
  777. * keep: leave NA values where they are
  778. * top: smallest rank if ascending
  779. * bottom: smallest rank if descending
  780. mask : np.ndarray[bool], optional, default None
  781. Specify locations to be treated as NA, for e.g. Categorical.
  782. """
  783. cdef:
  784. TiebreakEnumType tiebreak
  785. Py_ssize_t N
  786. int64_t[::1] grp_sizes
  787. intp_t[:] lexsort_indexer
  788. float64_t[::1] out
  789. ndarray[numeric_object_t, ndim=1] masked_vals
  790. numeric_object_t[:] masked_vals_memview
  791. bint keep_na, nans_rank_highest, check_labels, check_mask
  792. numeric_object_t nan_fill_val
  793. tiebreak = tiebreakers[ties_method]
  794. if tiebreak == TIEBREAK_FIRST:
  795. if not ascending:
  796. tiebreak = TIEBREAK_FIRST_DESCENDING
  797. keep_na = na_option == "keep"
  798. N = len(values)
  799. if labels is not None:
  800. # TODO(cython3): cast won't be necessary (#2992)
  801. assert <Py_ssize_t>len(labels) == N
  802. out = np.empty(N)
  803. grp_sizes = np.ones(N, dtype=np.int64)
  804. # If we don't care about labels, can short-circuit later label
  805. # comparisons
  806. check_labels = labels is not None
  807. # For cases where a mask is not possible, we can avoid mask checks
  808. check_mask = (
  809. numeric_object_t is float32_t
  810. or numeric_object_t is float64_t
  811. or numeric_object_t is object
  812. or (numeric_object_t is int64_t and is_datetimelike)
  813. )
  814. check_mask = check_mask or mask is not None
  815. # Copy values into new array in order to fill missing data
  816. # with mask, without obfuscating location of missing data
  817. # in values array
  818. if numeric_object_t is object and values.dtype != np.object_:
  819. masked_vals = values.astype("O")
  820. else:
  821. masked_vals = values.copy()
  822. if mask is not None:
  823. pass
  824. elif numeric_object_t is object:
  825. mask = missing.isnaobj(masked_vals)
  826. elif numeric_object_t is int64_t and is_datetimelike:
  827. mask = (masked_vals == NPY_NAT).astype(np.uint8)
  828. elif numeric_object_t is float64_t or numeric_object_t is float32_t:
  829. mask = np.isnan(masked_vals).astype(np.uint8)
  830. else:
  831. mask = np.zeros(shape=len(masked_vals), dtype=np.uint8)
  832. # If `na_option == 'top'`, we want to assign the lowest rank
  833. # to NaN regardless of ascending/descending. So if ascending,
  834. # fill with lowest value of type to end up with lowest rank.
  835. # If descending, fill with highest value since descending
  836. # will flip the ordering to still end up with lowest rank.
  837. # Symmetric logic applies to `na_option == 'bottom'`
  838. nans_rank_highest = ascending ^ (na_option == "top")
  839. nan_fill_val = get_rank_nan_fill_val(nans_rank_highest, <numeric_object_t>0)
  840. if nans_rank_highest:
  841. order = [masked_vals, mask]
  842. else:
  843. order = [masked_vals, ~(np.asarray(mask))]
  844. if check_labels:
  845. order.append(labels)
  846. np.putmask(masked_vals, mask, nan_fill_val)
  847. # putmask doesn't accept a memoryview, so we assign as a separate step
  848. masked_vals_memview = masked_vals
  849. # lexsort using labels, then mask, then actual values
  850. # each label corresponds to a different group value,
  851. # the mask helps you differentiate missing values before
  852. # performing sort on the actual values
  853. lexsort_indexer = np.lexsort(order).astype(np.intp, copy=False)
  854. if not ascending:
  855. lexsort_indexer = lexsort_indexer[::-1]
  856. with nogil:
  857. rank_sorted_1d(
  858. out,
  859. grp_sizes,
  860. lexsort_indexer,
  861. masked_vals_memview,
  862. mask,
  863. check_mask=check_mask,
  864. N=N,
  865. tiebreak=tiebreak,
  866. keep_na=keep_na,
  867. pct=pct,
  868. labels=labels,
  869. )
  870. return np.asarray(out)
  871. @cython.wraparound(False)
  872. @cython.boundscheck(False)
  873. cdef void rank_sorted_1d(
  874. float64_t[::1] out,
  875. int64_t[::1] grp_sizes,
  876. const intp_t[:] sort_indexer,
  877. # TODO(cython3): make const (https://github.com/cython/cython/issues/3222)
  878. numeric_object_t[:] masked_vals,
  879. const uint8_t[:] mask,
  880. bint check_mask,
  881. Py_ssize_t N,
  882. TiebreakEnumType tiebreak=TIEBREAK_AVERAGE,
  883. bint keep_na=True,
  884. bint pct=False,
  885. # https://github.com/cython/cython/issues/1630, only trailing arguments can
  886. # currently be omitted for cdef functions, which is why we keep this at the end
  887. const intp_t[:] labels=None,
  888. ) nogil:
  889. """
  890. See rank_1d.__doc__. Handles only actual ranking, so sorting and masking should
  891. be handled in the caller. Note that `out` and `grp_sizes` are modified inplace.
  892. Parameters
  893. ----------
  894. out : float64_t[::1]
  895. Array to store computed ranks
  896. grp_sizes : int64_t[::1]
  897. Array to store group counts, only used if pct=True. Should only be None
  898. if labels is None.
  899. sort_indexer : intp_t[:]
  900. Array of indices which sorts masked_vals
  901. masked_vals : numeric_object_t[:]
  902. The values input to rank_1d, with missing values replaced by fill values
  903. mask : uint8_t[:]
  904. Array where entries are True if the value is missing, False otherwise.
  905. check_mask : bool
  906. If False, assumes the mask is all False to skip mask indexing
  907. N : Py_ssize_t
  908. The number of elements to rank. Note: it is not always true that
  909. N == len(out) or N == len(masked_vals) (see `nancorr_spearman` usage for why)
  910. tiebreak : TiebreakEnumType, default TIEBREAK_AVERAGE
  911. See rank_1d.__doc__ for the different modes
  912. keep_na : bool, default True
  913. Whether or not to keep nulls
  914. pct : bool, default False
  915. Compute percentage rank of data within each group
  916. labels : See rank_1d.__doc__, default None. None implies all labels are the same.
  917. """
  918. cdef:
  919. Py_ssize_t i, j, dups=0, sum_ranks=0,
  920. Py_ssize_t grp_start=0, grp_vals_seen=1, grp_na_count=0
  921. bint at_end, next_val_diff, group_changed, check_labels
  922. int64_t grp_size
  923. check_labels = labels is not None
  924. # Loop over the length of the value array
  925. # each incremental i value can be looked up in the lexsort_indexer
  926. # array that we sorted previously, which gives us the location of
  927. # that sorted value for retrieval back from the original
  928. # values / masked_vals arrays
  929. # TODO(cython3): de-duplicate once cython supports conditional nogil
  930. if numeric_object_t is object:
  931. with gil:
  932. for i in range(N):
  933. at_end = i == N - 1
  934. # dups and sum_ranks will be incremented each loop where
  935. # the value / group remains the same, and should be reset
  936. # when either of those change. Used to calculate tiebreakers
  937. dups += 1
  938. sum_ranks += i - grp_start + 1
  939. next_val_diff = at_end or are_diff(masked_vals[sort_indexer[i]],
  940. masked_vals[sort_indexer[i+1]])
  941. # We'll need this check later anyway to determine group size, so just
  942. # compute it here since shortcircuiting won't help
  943. group_changed = at_end or (check_labels and
  944. (labels[sort_indexer[i]]
  945. != labels[sort_indexer[i+1]]))
  946. # Update out only when there is a transition of values or labels.
  947. # When a new value or group is encountered, go back #dups steps(
  948. # the number of occurrence of current value) and assign the ranks
  949. # based on the starting index of the current group (grp_start)
  950. # and the current index
  951. if (next_val_diff or group_changed or (check_mask and
  952. (mask[sort_indexer[i]]
  953. ^ mask[sort_indexer[i+1]]))):
  954. # If keep_na, check for missing values and assign back
  955. # to the result where appropriate
  956. if keep_na and check_mask and mask[sort_indexer[i]]:
  957. grp_na_count = dups
  958. for j in range(i - dups + 1, i + 1):
  959. out[sort_indexer[j]] = NaN
  960. elif tiebreak == TIEBREAK_AVERAGE:
  961. for j in range(i - dups + 1, i + 1):
  962. out[sort_indexer[j]] = sum_ranks / <float64_t>dups
  963. elif tiebreak == TIEBREAK_MIN:
  964. for j in range(i - dups + 1, i + 1):
  965. out[sort_indexer[j]] = i - grp_start - dups + 2
  966. elif tiebreak == TIEBREAK_MAX:
  967. for j in range(i - dups + 1, i + 1):
  968. out[sort_indexer[j]] = i - grp_start + 1
  969. # With n as the previous rank in the group and m as the number
  970. # of duplicates in this stretch, if TIEBREAK_FIRST and ascending,
  971. # then rankings should be n + 1, n + 2 ... n + m
  972. elif tiebreak == TIEBREAK_FIRST:
  973. for j in range(i - dups + 1, i + 1):
  974. out[sort_indexer[j]] = j + 1 - grp_start
  975. # If TIEBREAK_FIRST and descending, the ranking should be
  976. # n + m, n + (m - 1) ... n + 1. This is equivalent to
  977. # (i - dups + 1) + (i - j + 1) - grp_start
  978. elif tiebreak == TIEBREAK_FIRST_DESCENDING:
  979. for j in range(i - dups + 1, i + 1):
  980. out[sort_indexer[j]] = 2 * i - j - dups + 2 - grp_start
  981. elif tiebreak == TIEBREAK_DENSE:
  982. for j in range(i - dups + 1, i + 1):
  983. out[sort_indexer[j]] = grp_vals_seen
  984. # Look forward to the next value (using the sorting in
  985. # lexsort_indexer). If the value does not equal the current
  986. # value then we need to reset the dups and sum_ranks, knowing
  987. # that a new value is coming up. The conditional also needs
  988. # to handle nan equality and the end of iteration. If group
  989. # changes we do not record seeing a new value in the group
  990. if not group_changed and (next_val_diff or (check_mask and
  991. (mask[sort_indexer[i]]
  992. ^ mask[sort_indexer[i+1]]))):
  993. dups = sum_ranks = 0
  994. grp_vals_seen += 1
  995. # Similar to the previous conditional, check now if we are
  996. # moving to a new group. If so, keep track of the index where
  997. # the new group occurs, so the tiebreaker calculations can
  998. # decrement that from their position. Fill in the size of each
  999. # group encountered (used by pct calculations later). Also be
  1000. # sure to reset any of the items helping to calculate dups
  1001. if group_changed:
  1002. # If not dense tiebreak, group size used to compute
  1003. # percentile will be # of non-null elements in group
  1004. if tiebreak != TIEBREAK_DENSE:
  1005. grp_size = i - grp_start + 1 - grp_na_count
  1006. # Otherwise, it will be the number of distinct values
  1007. # in the group, subtracting 1 if NaNs are present
  1008. # since that is a distinct value we shouldn't count
  1009. else:
  1010. grp_size = grp_vals_seen - (grp_na_count > 0)
  1011. for j in range(grp_start, i + 1):
  1012. grp_sizes[sort_indexer[j]] = grp_size
  1013. dups = sum_ranks = 0
  1014. grp_na_count = 0
  1015. grp_start = i + 1
  1016. grp_vals_seen = 1
  1017. else:
  1018. for i in range(N):
  1019. at_end = i == N - 1
  1020. # dups and sum_ranks will be incremented each loop where
  1021. # the value / group remains the same, and should be reset
  1022. # when either of those change. Used to calculate tiebreakers
  1023. dups += 1
  1024. sum_ranks += i - grp_start + 1
  1025. next_val_diff = at_end or (masked_vals[sort_indexer[i]]
  1026. != masked_vals[sort_indexer[i+1]])
  1027. # We'll need this check later anyway to determine group size, so just
  1028. # compute it here since shortcircuiting won't help
  1029. group_changed = at_end or (check_labels and
  1030. (labels[sort_indexer[i]]
  1031. != labels[sort_indexer[i+1]]))
  1032. # Update out only when there is a transition of values or labels.
  1033. # When a new value or group is encountered, go back #dups steps(
  1034. # the number of occurrence of current value) and assign the ranks
  1035. # based on the starting index of the current group (grp_start)
  1036. # and the current index
  1037. if (next_val_diff or group_changed
  1038. or (check_mask and
  1039. (mask[sort_indexer[i]] ^ mask[sort_indexer[i+1]]))):
  1040. # If keep_na, check for missing values and assign back
  1041. # to the result where appropriate
  1042. if keep_na and check_mask and mask[sort_indexer[i]]:
  1043. grp_na_count = dups
  1044. for j in range(i - dups + 1, i + 1):
  1045. out[sort_indexer[j]] = NaN
  1046. elif tiebreak == TIEBREAK_AVERAGE:
  1047. for j in range(i - dups + 1, i + 1):
  1048. out[sort_indexer[j]] = sum_ranks / <float64_t>dups
  1049. elif tiebreak == TIEBREAK_MIN:
  1050. for j in range(i - dups + 1, i + 1):
  1051. out[sort_indexer[j]] = i - grp_start - dups + 2
  1052. elif tiebreak == TIEBREAK_MAX:
  1053. for j in range(i - dups + 1, i + 1):
  1054. out[sort_indexer[j]] = i - grp_start + 1
  1055. # With n as the previous rank in the group and m as the number
  1056. # of duplicates in this stretch, if TIEBREAK_FIRST and ascending,
  1057. # then rankings should be n + 1, n + 2 ... n + m
  1058. elif tiebreak == TIEBREAK_FIRST:
  1059. for j in range(i - dups + 1, i + 1):
  1060. out[sort_indexer[j]] = j + 1 - grp_start
  1061. # If TIEBREAK_FIRST and descending, the ranking should be
  1062. # n + m, n + (m - 1) ... n + 1. This is equivalent to
  1063. # (i - dups + 1) + (i - j + 1) - grp_start
  1064. elif tiebreak == TIEBREAK_FIRST_DESCENDING:
  1065. for j in range(i - dups + 1, i + 1):
  1066. out[sort_indexer[j]] = 2 * i - j - dups + 2 - grp_start
  1067. elif tiebreak == TIEBREAK_DENSE:
  1068. for j in range(i - dups + 1, i + 1):
  1069. out[sort_indexer[j]] = grp_vals_seen
  1070. # Look forward to the next value (using the sorting in
  1071. # lexsort_indexer). If the value does not equal the current
  1072. # value then we need to reset the dups and sum_ranks, knowing
  1073. # that a new value is coming up. The conditional also needs
  1074. # to handle nan equality and the end of iteration. If group
  1075. # changes we do not record seeing a new value in the group
  1076. if not group_changed and (next_val_diff
  1077. or (check_mask and
  1078. (mask[sort_indexer[i]]
  1079. ^ mask[sort_indexer[i+1]]))):
  1080. dups = sum_ranks = 0
  1081. grp_vals_seen += 1
  1082. # Similar to the previous conditional, check now if we are
  1083. # moving to a new group. If so, keep track of the index where
  1084. # the new group occurs, so the tiebreaker calculations can
  1085. # decrement that from their position. Fill in the size of each
  1086. # group encountered (used by pct calculations later). Also be
  1087. # sure to reset any of the items helping to calculate dups
  1088. if group_changed:
  1089. # If not dense tiebreak, group size used to compute
  1090. # percentile will be # of non-null elements in group
  1091. if tiebreak != TIEBREAK_DENSE:
  1092. grp_size = i - grp_start + 1 - grp_na_count
  1093. # Otherwise, it will be the number of distinct values
  1094. # in the group, subtracting 1 if NaNs are present
  1095. # since that is a distinct value we shouldn't count
  1096. else:
  1097. grp_size = grp_vals_seen - (grp_na_count > 0)
  1098. for j in range(grp_start, i + 1):
  1099. grp_sizes[sort_indexer[j]] = grp_size
  1100. dups = sum_ranks = 0
  1101. grp_na_count = 0
  1102. grp_start = i + 1
  1103. grp_vals_seen = 1
  1104. if pct:
  1105. for i in range(N):
  1106. if grp_sizes[i] != 0:
  1107. out[i] = out[i] / grp_sizes[i]
  1108. def rank_2d(
  1109. ndarray[numeric_object_t, ndim=2] in_arr,
  1110. int axis=0,
  1111. bint is_datetimelike=False,
  1112. ties_method="average",
  1113. bint ascending=True,
  1114. na_option="keep",
  1115. bint pct=False,
  1116. ):
  1117. """
  1118. Fast NaN-friendly version of ``scipy.stats.rankdata``.
  1119. """
  1120. cdef:
  1121. Py_ssize_t k, n, col
  1122. float64_t[::1, :] out # Column-major so columns are contiguous
  1123. int64_t[::1] grp_sizes
  1124. ndarray[numeric_object_t, ndim=2] values
  1125. numeric_object_t[:, :] masked_vals
  1126. intp_t[:, :] sort_indexer
  1127. uint8_t[:, :] mask
  1128. TiebreakEnumType tiebreak
  1129. bint check_mask, keep_na, nans_rank_highest
  1130. numeric_object_t nan_fill_val
  1131. tiebreak = tiebreakers[ties_method]
  1132. if tiebreak == TIEBREAK_FIRST:
  1133. if not ascending:
  1134. tiebreak = TIEBREAK_FIRST_DESCENDING
  1135. keep_na = na_option == "keep"
  1136. # For cases where a mask is not possible, we can avoid mask checks
  1137. check_mask = (
  1138. numeric_object_t is float32_t
  1139. or numeric_object_t is float64_t
  1140. or numeric_object_t is object
  1141. or (numeric_object_t is int64_t and is_datetimelike)
  1142. )
  1143. if axis == 1:
  1144. values = np.asarray(in_arr).T.copy()
  1145. else:
  1146. values = np.asarray(in_arr).copy()
  1147. if numeric_object_t is object:
  1148. if values.dtype != np.object_:
  1149. values = values.astype("O")
  1150. nans_rank_highest = ascending ^ (na_option == "top")
  1151. if check_mask:
  1152. nan_fill_val = get_rank_nan_fill_val(nans_rank_highest, <numeric_object_t>0)
  1153. if numeric_object_t is object:
  1154. mask = missing.isnaobj(values).view(np.uint8)
  1155. elif numeric_object_t is float64_t or numeric_object_t is float32_t:
  1156. mask = np.isnan(values).view(np.uint8)
  1157. else:
  1158. # i.e. int64 and datetimelike
  1159. mask = (values == NPY_NAT).view(np.uint8)
  1160. np.putmask(values, mask, nan_fill_val)
  1161. else:
  1162. mask = np.zeros_like(values, dtype=np.uint8)
  1163. if nans_rank_highest:
  1164. order = (values, mask)
  1165. else:
  1166. order = (values, ~np.asarray(mask))
  1167. n, k = (<object>values).shape
  1168. out = np.empty((n, k), dtype="f8", order="F")
  1169. grp_sizes = np.ones(n, dtype=np.int64)
  1170. # lexsort is slower, so only use if we need to worry about the mask
  1171. if check_mask:
  1172. sort_indexer = np.lexsort(order, axis=0).astype(np.intp, copy=False)
  1173. else:
  1174. kind = "stable" if ties_method == "first" else None
  1175. sort_indexer = values.argsort(axis=0, kind=kind).astype(np.intp, copy=False)
  1176. if not ascending:
  1177. sort_indexer = sort_indexer[::-1, :]
  1178. # putmask doesn't accept a memoryview, so we assign in a separate step
  1179. masked_vals = values
  1180. with nogil:
  1181. for col in range(k):
  1182. rank_sorted_1d(
  1183. out[:, col],
  1184. grp_sizes,
  1185. sort_indexer[:, col],
  1186. masked_vals[:, col],
  1187. mask[:, col],
  1188. check_mask=check_mask,
  1189. N=n,
  1190. tiebreak=tiebreak,
  1191. keep_na=keep_na,
  1192. pct=pct,
  1193. )
  1194. if axis == 1:
  1195. return np.asarray(out.T)
  1196. else:
  1197. return np.asarray(out)
  1198. ctypedef fused diff_t:
  1199. float64_t
  1200. float32_t
  1201. int8_t
  1202. int16_t
  1203. int32_t
  1204. int64_t
  1205. ctypedef fused out_t:
  1206. float32_t
  1207. float64_t
  1208. int64_t
  1209. @cython.boundscheck(False)
  1210. @cython.wraparound(False)
  1211. def diff_2d(
  1212. ndarray[diff_t, ndim=2] arr, # TODO(cython3) update to "const diff_t[:, :] arr"
  1213. ndarray[out_t, ndim=2] out,
  1214. Py_ssize_t periods,
  1215. int axis,
  1216. bint datetimelike=False,
  1217. ):
  1218. cdef:
  1219. Py_ssize_t i, j, sx, sy, start, stop
  1220. bint f_contig = arr.flags.f_contiguous
  1221. # bint f_contig = arr.is_f_contig() # TODO(cython3)
  1222. diff_t left, right
  1223. # Disable for unsupported dtype combinations,
  1224. # see https://github.com/cython/cython/issues/2646
  1225. if (out_t is float32_t
  1226. and not (diff_t is float32_t or diff_t is int8_t or diff_t is int16_t)):
  1227. raise NotImplementedError # pragma: no cover
  1228. elif (out_t is float64_t
  1229. and (diff_t is float32_t or diff_t is int8_t or diff_t is int16_t)):
  1230. raise NotImplementedError # pragma: no cover
  1231. elif out_t is int64_t and diff_t is not int64_t:
  1232. # We only have out_t of int64_t if we have datetimelike
  1233. raise NotImplementedError # pragma: no cover
  1234. else:
  1235. # We put this inside an indented else block to avoid cython build
  1236. # warnings about unreachable code
  1237. sx, sy = (<object>arr).shape
  1238. with nogil:
  1239. if f_contig:
  1240. if axis == 0:
  1241. if periods >= 0:
  1242. start, stop = periods, sx
  1243. else:
  1244. start, stop = 0, sx + periods
  1245. for j in range(sy):
  1246. for i in range(start, stop):
  1247. left = arr[i, j]
  1248. right = arr[i - periods, j]
  1249. if out_t is int64_t and datetimelike:
  1250. if left == NPY_NAT or right == NPY_NAT:
  1251. out[i, j] = NPY_NAT
  1252. else:
  1253. out[i, j] = left - right
  1254. else:
  1255. out[i, j] = left - right
  1256. else:
  1257. if periods >= 0:
  1258. start, stop = periods, sy
  1259. else:
  1260. start, stop = 0, sy + periods
  1261. for j in range(start, stop):
  1262. for i in range(sx):
  1263. left = arr[i, j]
  1264. right = arr[i, j - periods]
  1265. if out_t is int64_t and datetimelike:
  1266. if left == NPY_NAT or right == NPY_NAT:
  1267. out[i, j] = NPY_NAT
  1268. else:
  1269. out[i, j] = left - right
  1270. else:
  1271. out[i, j] = left - right
  1272. else:
  1273. if axis == 0:
  1274. if periods >= 0:
  1275. start, stop = periods, sx
  1276. else:
  1277. start, stop = 0, sx + periods
  1278. for i in range(start, stop):
  1279. for j in range(sy):
  1280. left = arr[i, j]
  1281. right = arr[i - periods, j]
  1282. if out_t is int64_t and datetimelike:
  1283. if left == NPY_NAT or right == NPY_NAT:
  1284. out[i, j] = NPY_NAT
  1285. else:
  1286. out[i, j] = left - right
  1287. else:
  1288. out[i, j] = left - right
  1289. else:
  1290. if periods >= 0:
  1291. start, stop = periods, sy
  1292. else:
  1293. start, stop = 0, sy + periods
  1294. for i in range(sx):
  1295. for j in range(start, stop):
  1296. left = arr[i, j]
  1297. right = arr[i, j - periods]
  1298. if out_t is int64_t and datetimelike:
  1299. if left == NPY_NAT or right == NPY_NAT:
  1300. out[i, j] = NPY_NAT
  1301. else:
  1302. out[i, j] = left - right
  1303. else:
  1304. out[i, j] = left - right
  1305. # generated from template
  1306. include "algos_common_helper.pxi"
  1307. include "algos_take_helper.pxi"