# cython: boundscheck=False, wraparound=False, cdivision=True

from libc.math cimport (
    round,
    signbit,
    sqrt,
)
from libcpp.deque cimport deque

from pandas._libs.algos cimport TiebreakEnumType

import numpy as np

cimport numpy as cnp
from numpy cimport (
    float32_t,
    float64_t,
    int64_t,
    ndarray,
)

cnp.import_array()

import cython

from pandas._libs.algos import is_monotonic


cdef extern from "../src/skiplist.h":
    ctypedef struct node_t:
        node_t **next
        int *width
        double value
        int is_nil
        int levels
        int ref_count

    ctypedef struct skiplist_t:
        node_t *head
        node_t **tmp_chain
        int *tmp_steps
        int size
        int maxlevels

    skiplist_t* skiplist_init(int) nogil
    void skiplist_destroy(skiplist_t*) nogil
    double skiplist_get(skiplist_t*, int, int*) nogil
    int skiplist_insert(skiplist_t*, double) nogil
    int skiplist_remove(skiplist_t*, double) nogil
    int skiplist_rank(skiplist_t*, double) nogil
    int skiplist_min_rank(skiplist_t*, double) nogil

cdef:
    float32_t MINfloat32 = np.NINF
    float64_t MINfloat64 = np.NINF
    float32_t MAXfloat32 = np.inf
    float64_t MAXfloat64 = np.inf

    float64_t NaN = <float64_t>np.NaN


cdef bint is_monotonic_increasing_start_end_bounds(
    ndarray[int64_t, ndim=1] start, ndarray[int64_t, ndim=1] end
):
    return is_monotonic(start, False)[0] and is_monotonic(end, False)[0]

# ----------------------------------------------------------------------
# Rolling sum


cdef float64_t calc_sum(int64_t minp, int64_t nobs, float64_t sum_x,
                        int64_t num_consecutive_same_value, float64_t prev_value
                        ) nogil:
    cdef:
        float64_t result

    if nobs == 0 == minp:
        result = 0
    elif nobs >= minp:
        if num_consecutive_same_value >= nobs:
            result = prev_value * nobs
        else:
            result = sum_x
    else:
        result = NaN

    return result


cdef void add_sum(float64_t val, int64_t *nobs, float64_t *sum_x,
                  float64_t *compensation, int64_t *num_consecutive_same_value,
                  float64_t *prev_value) nogil:
    """ add a value from the sum calc using Kahan summation """

    cdef:
        float64_t y, t

    # Not NaN
    if val == val:
        nobs[0] = nobs[0] + 1
        y = val - compensation[0]
        t = sum_x[0] + y
        compensation[0] = t - sum_x[0] - y
        sum_x[0] = t

        # GH#42064, record num of same values to remove floating point artifacts
        if val == prev_value[0]:
            num_consecutive_same_value[0] += 1
        else:
            # reset to 1 (include current value itself)
            num_consecutive_same_value[0] = 1
        prev_value[0] = val


cdef void remove_sum(float64_t val, int64_t *nobs, float64_t *sum_x,
                     float64_t *compensation) nogil:
    """ remove a value from the sum calc using Kahan summation """

    cdef:
        float64_t y, t

    # Not NaN
    if val == val:
        nobs[0] = nobs[0] - 1
        y = - val - compensation[0]
        t = sum_x[0] + y
        compensation[0] = t - sum_x[0] - y
        sum_x[0] = t


def roll_sum(const float64_t[:] values, ndarray[int64_t] start,
             ndarray[int64_t] end, int64_t minp) -> np.ndarray:
    cdef:
        Py_ssize_t i, j
        float64_t sum_x, compensation_add, compensation_remove, prev_value
        int64_t s, e, num_consecutive_same_value
        int64_t nobs = 0, N = len(start)
        ndarray[float64_t] output
        bint is_monotonic_increasing_bounds

    is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds(
        start, end
    )
    output = np.empty(N, dtype=np.float64)

    with nogil:
        for i in range(0, N):
            s = start[i]
            e = end[i]

            if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]:

                # setup
                prev_value = values[s]
                num_consecutive_same_value = 0
                sum_x = compensation_add = compensation_remove = 0
                nobs = 0
                for j in range(s, e):
                    add_sum(values[j], &nobs, &sum_x, &compensation_add,
                            &num_consecutive_same_value, &prev_value)

            else:

                # calculate deletes
                for j in range(start[i - 1], s):
                    remove_sum(values[j], &nobs, &sum_x, &compensation_remove)

                # calculate adds
                for j in range(end[i - 1], e):
                    add_sum(values[j], &nobs, &sum_x, &compensation_add,
                            &num_consecutive_same_value, &prev_value)

            output[i] = calc_sum(
                minp, nobs, sum_x, num_consecutive_same_value, prev_value
            )

            if not is_monotonic_increasing_bounds:
                nobs = 0
                sum_x = 0.0
                compensation_remove = 0.0

    return output
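
# Editor's note, not in the original source: a minimal pure-Python sketch of
# the Kahan compensation used by add_sum/remove_sum. With plain float64
# addition, 1e16 + 1.0 + 1.0 stays 1e16 (the 1.0s fall below the ulp); the
# compensation term recovers them:
#
#   s = comp = 0.0
#   for v in (1e16, 1.0, 1.0):
#       y = v - comp          # correct the incoming value
#       t = s + y             # low-order bits of y are lost here
#       comp = (t - s) - y    # ...and recovered here
#       s = t
#   # s == 1e16 + 2.0 exactly, whereas naive summation gives 1e16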
# ----------------------------------------------------------------------
# Rolling mean


cdef float64_t calc_mean(int64_t minp, Py_ssize_t nobs, Py_ssize_t neg_ct,
                         float64_t sum_x, int64_t num_consecutive_same_value,
                         float64_t prev_value) nogil:
    cdef:
        float64_t result

    if nobs >= minp and nobs > 0:
        result = sum_x / <float64_t>nobs
        if num_consecutive_same_value >= nobs:
            result = prev_value
        elif neg_ct == 0 and result < 0:
            # all positive
            result = 0
        elif neg_ct == nobs and result > 0:
            # all negative
            result = 0
        else:
            pass
    else:
        result = NaN
    return result


cdef void add_mean(
    float64_t val,
    Py_ssize_t *nobs,
    float64_t *sum_x,
    Py_ssize_t *neg_ct,
    float64_t *compensation,
    int64_t *num_consecutive_same_value,
    float64_t *prev_value
) nogil:
    """ add a value from the mean calc using Kahan summation """
    cdef:
        float64_t y, t

    # Not NaN
    if val == val:
        nobs[0] = nobs[0] + 1
        y = val - compensation[0]
        t = sum_x[0] + y
        compensation[0] = t - sum_x[0] - y
        sum_x[0] = t
        if signbit(val):
            neg_ct[0] = neg_ct[0] + 1

        # GH#42064, record num of same values to remove floating point artifacts
        if val == prev_value[0]:
            num_consecutive_same_value[0] += 1
        else:
            # reset to 1 (include current value itself)
            num_consecutive_same_value[0] = 1
        prev_value[0] = val


cdef void remove_mean(float64_t val, Py_ssize_t *nobs, float64_t *sum_x,
                      Py_ssize_t *neg_ct, float64_t *compensation) nogil:
    """ remove a value from the mean calc using Kahan summation """
    cdef:
        float64_t y, t

    if val == val:
        nobs[0] = nobs[0] - 1
        y = - val - compensation[0]
        t = sum_x[0] + y
        compensation[0] = t - sum_x[0] - y
        sum_x[0] = t
        if signbit(val):
            neg_ct[0] = neg_ct[0] - 1


def roll_mean(const float64_t[:] values, ndarray[int64_t] start,
              ndarray[int64_t] end, int64_t minp) -> np.ndarray:
    cdef:
        float64_t val, compensation_add, compensation_remove, sum_x, prev_value
        int64_t s, e, num_consecutive_same_value
        Py_ssize_t nobs, i, j, neg_ct, N = len(start)
        ndarray[float64_t] output
        bint is_monotonic_increasing_bounds

    is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds(
        start, end
    )
    output = np.empty(N, dtype=np.float64)

    with nogil:
        for i in range(0, N):
            s = start[i]
            e = end[i]

            if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]:

                # setup
                compensation_add = compensation_remove = sum_x = 0
                nobs = neg_ct = 0
                prev_value = values[s]
                num_consecutive_same_value = 0
                for j in range(s, e):
                    val = values[j]
                    add_mean(val, &nobs, &sum_x, &neg_ct, &compensation_add,
                             &num_consecutive_same_value, &prev_value)

            else:

                # calculate deletes
                for j in range(start[i - 1], s):
                    val = values[j]
                    remove_mean(val, &nobs, &sum_x, &neg_ct, &compensation_remove)

                # calculate adds
                for j in range(end[i - 1], e):
                    val = values[j]
                    add_mean(val, &nobs, &sum_x, &neg_ct, &compensation_add,
                             &num_consecutive_same_value, &prev_value)

            output[i] = calc_mean(
                minp, nobs, neg_ct, sum_x, num_consecutive_same_value, prev_value
            )

            if not is_monotonic_increasing_bounds:
                nobs = 0
                neg_ct = 0
                sum_x = 0.0
                compensation_remove = 0.0
    return output
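
# Editor's note (illustrative, not in the original source): assuming
#   values = [1.0, 2.0, NaN, 4.0], start = [0, 0, 1], end = [2, 3, 4], minp = 1
# roll_mean skips the NaN (only real observations count toward nobs) and
# yields [1.5, 1.5, 3.0]. The neg_ct counter lets calc_mean clamp tiny
# negative round-off to 0.0 when every observation in the window was
# non-negative (and symmetrically for all-negative windows).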
# ----------------------------------------------------------------------
# Rolling variance


cdef float64_t calc_var(
    int64_t minp,
    int ddof,
    float64_t nobs,
    float64_t ssqdm_x,
    int64_t num_consecutive_same_value
) nogil:
    cdef:
        float64_t result

    # Variance is unchanged if no observation is added or removed
    if (nobs >= minp) and (nobs > ddof):

        # pathological case & repeatedly same values case
        if nobs == 1 or num_consecutive_same_value >= nobs:
            result = 0
        else:
            result = ssqdm_x / (nobs - <float64_t>ddof)
    else:
        result = NaN

    return result


cdef void add_var(
    float64_t val,
    float64_t *nobs,
    float64_t *mean_x,
    float64_t *ssqdm_x,
    float64_t *compensation,
    int64_t *num_consecutive_same_value,
    float64_t *prev_value,
) nogil:
    """ add a value from the var calc """
    cdef:
        float64_t delta, prev_mean, y, t

    # GH#21813, if msvc 2017 bug is resolved, we should be OK with != instead of `isnan`
    if val != val:
        return

    nobs[0] = nobs[0] + 1

    # GH#42064, record num of same values to remove floating point artifacts
    if val == prev_value[0]:
        num_consecutive_same_value[0] += 1
    else:
        # reset to 1 (include current value itself)
        num_consecutive_same_value[0] = 1
    prev_value[0] = val

    # Welford's method for the online variance-calculation
    # using Kahan summation
    # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
    prev_mean = mean_x[0] - compensation[0]
    y = val - compensation[0]
    t = y - mean_x[0]
    compensation[0] = t + mean_x[0] - y
    delta = t
    if nobs[0]:
        mean_x[0] = mean_x[0] + delta / nobs[0]
    else:
        mean_x[0] = 0
    ssqdm_x[0] = ssqdm_x[0] + (val - prev_mean) * (val - mean_x[0])


cdef void remove_var(
    float64_t val,
    float64_t *nobs,
    float64_t *mean_x,
    float64_t *ssqdm_x,
    float64_t *compensation
) nogil:
    """ remove a value from the var calc """
    cdef:
        float64_t delta, prev_mean, y, t
    if val == val:
        nobs[0] = nobs[0] - 1
        if nobs[0]:
            # Welford's method for the online variance-calculation
            # using Kahan summation
            # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
            prev_mean = mean_x[0] - compensation[0]
            y = val - compensation[0]
            t = y - mean_x[0]
            compensation[0] = t + mean_x[0] - y
            delta = t
            mean_x[0] = mean_x[0] - delta / nobs[0]
            ssqdm_x[0] = ssqdm_x[0] - (val - prev_mean) * (val - mean_x[0])
        else:
            mean_x[0] = 0
            ssqdm_x[0] = 0


def roll_var(const float64_t[:] values, ndarray[int64_t] start,
             ndarray[int64_t] end, int64_t minp, int ddof=1) -> np.ndarray:
    """
    Numerically stable implementation using Welford's method.
    """
    cdef:
        float64_t mean_x, ssqdm_x, nobs, compensation_add
        float64_t compensation_remove, prev_value
        int64_t s, e, num_consecutive_same_value
        Py_ssize_t i, j, N = len(start)
        ndarray[float64_t] output
        bint is_monotonic_increasing_bounds

    minp = max(minp, 1)
    is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds(
        start, end
    )
    output = np.empty(N, dtype=np.float64)

    with nogil:
        for i in range(0, N):
            s = start[i]
            e = end[i]

            # Over the first window, observations can only be added
            # never removed
            if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]:

                prev_value = values[s]
                num_consecutive_same_value = 0

                mean_x = ssqdm_x = nobs = compensation_add = compensation_remove = 0
                for j in range(s, e):
                    add_var(values[j], &nobs, &mean_x, &ssqdm_x, &compensation_add,
                            &num_consecutive_same_value, &prev_value)

            else:

                # After the first window, observations can both be added
                # and removed

                # calculate deletes
                for j in range(start[i - 1], s):
                    remove_var(values[j], &nobs, &mean_x, &ssqdm_x,
                               &compensation_remove)

                # calculate adds
                for j in range(end[i - 1], e):
                    add_var(values[j], &nobs, &mean_x, &ssqdm_x, &compensation_add,
                            &num_consecutive_same_value, &prev_value)

            output[i] = calc_var(minp, ddof, nobs, ssqdm_x, num_consecutive_same_value)

            if not is_monotonic_increasing_bounds:
                nobs = 0.0
                mean_x = 0.0
                ssqdm_x = 0.0
                compensation_remove = 0.0

    return output
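
# Editor's note: a worked micro-example of the Welford update in add_var
# (illustrative; the Kahan compensation term is zero in these first steps):
#   add 2.0 -> nobs=1, delta=2.0, mean = 0 + 2.0/1 = 2.0,
#              ssqdm += (2.0 - 0.0) * (2.0 - 2.0) = 0.0
#   add 4.0 -> nobs=2, delta=2.0, mean = 2.0 + 2.0/2 = 3.0,
#              ssqdm += (4.0 - 2.0) * (4.0 - 3.0) = 2.0
# calc_var then returns ssqdm / (nobs - ddof) = 2.0 / 1 = 2.0, the sample
# variance of [2.0, 4.0].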
# ----------------------------------------------------------------------
# Rolling skewness


cdef float64_t calc_skew(int64_t minp, int64_t nobs,
                         float64_t x, float64_t xx, float64_t xxx,
                         int64_t num_consecutive_same_value
                         ) nogil:
    cdef:
        float64_t result, dnobs
        float64_t A, B, C, R

    if nobs >= minp:
        dnobs = <float64_t>nobs
        A = x / dnobs
        B = xx / dnobs - A * A
        C = xxx / dnobs - A * A * A - 3 * A * B

        if nobs < 3:
            result = NaN
        # GH 42064 46431
        # uniform case, force result to be 0
        elif num_consecutive_same_value >= nobs:
            result = 0.0
        # #18044: with a uniform distribution, floating point issues can
        # leave B != 0 and make the result a very large number.
        #
        # in core/nanops.py nanskew/nankurt call the function
        # _zero_out_fperr(m2) to fix the floating point error.
        # if the variance is less than 1e-14, it can be
        # treated as zero; here we follow the original
        # skew/kurt behaviour and check B <= 1e-14
        elif B <= 1e-14:
            result = NaN
        else:
            R = sqrt(B)
            result = ((sqrt(dnobs * (dnobs - 1.)) * C) /
                      ((dnobs - 2) * R * R * R))
    else:
        result = NaN

    return result


cdef void add_skew(float64_t val, int64_t *nobs,
                   float64_t *x, float64_t *xx,
                   float64_t *xxx,
                   float64_t *compensation_x,
                   float64_t *compensation_xx,
                   float64_t *compensation_xxx,
                   int64_t *num_consecutive_same_value,
                   float64_t *prev_value,
                   ) nogil:
    """ add a value from the skew calc """
    cdef:
        float64_t y, t

    # Not NaN
    if val == val:
        nobs[0] = nobs[0] + 1

        y = val - compensation_x[0]
        t = x[0] + y
        compensation_x[0] = t - x[0] - y
        x[0] = t
        y = val * val - compensation_xx[0]
        t = xx[0] + y
        compensation_xx[0] = t - xx[0] - y
        xx[0] = t
        y = val * val * val - compensation_xxx[0]
        t = xxx[0] + y
        compensation_xxx[0] = t - xxx[0] - y
        xxx[0] = t

        # GH#42064, record num of same values to remove floating point artifacts
        if val == prev_value[0]:
            num_consecutive_same_value[0] += 1
        else:
            # reset to 1 (include current value itself)
            num_consecutive_same_value[0] = 1
        prev_value[0] = val


cdef void remove_skew(float64_t val, int64_t *nobs,
                      float64_t *x, float64_t *xx,
                      float64_t *xxx,
                      float64_t *compensation_x,
                      float64_t *compensation_xx,
                      float64_t *compensation_xxx) nogil:
    """ remove a value from the skew calc """
    cdef:
        float64_t y, t

    # Not NaN
    if val == val:
        nobs[0] = nobs[0] - 1

        y = - val - compensation_x[0]
        t = x[0] + y
        compensation_x[0] = t - x[0] - y
        x[0] = t
        y = - val * val - compensation_xx[0]
        t = xx[0] + y
        compensation_xx[0] = t - xx[0] - y
        xx[0] = t
        y = - val * val * val - compensation_xxx[0]
        t = xxx[0] + y
        compensation_xxx[0] = t - xxx[0] - y
        xxx[0] = t


def roll_skew(ndarray[float64_t] values, ndarray[int64_t] start,
              ndarray[int64_t] end, int64_t minp) -> np.ndarray:
    cdef:
        Py_ssize_t i, j
        float64_t val, min_val, mean_val, sum_val = 0
        float64_t compensation_xxx_add, compensation_xxx_remove
        float64_t compensation_xx_add, compensation_xx_remove
        float64_t compensation_x_add, compensation_x_remove
        float64_t x, xx, xxx
        float64_t prev_value
        int64_t nobs = 0, N = len(start), V = len(values), nobs_mean = 0
        int64_t s, e, num_consecutive_same_value
        ndarray[float64_t] output, values_copy
        bint is_monotonic_increasing_bounds

    minp = max(minp, 3)
    is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds(
        start, end
    )
    output = np.empty(N, dtype=np.float64)
    min_val = np.nanmin(values)
    values_copy = np.copy(values)

    with nogil:
        for i in range(0, V):
            val = values_copy[i]
            if val == val:
                nobs_mean += 1
                sum_val += val
        mean_val = sum_val / nobs_mean
        # Other cases would lead to imprecision for smallest values
        if min_val - mean_val > -1e5:
            mean_val = round(mean_val)
            for i in range(0, V):
                values_copy[i] = values_copy[i] - mean_val

        for i in range(0, N):
            s = start[i]
            e = end[i]

            # Over the first window, observations can only be added
            # never removed
            if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]:

                prev_value = values[s]
                num_consecutive_same_value = 0

                compensation_xxx_add = compensation_xxx_remove = 0
                compensation_xx_add = compensation_xx_remove = 0
                compensation_x_add = compensation_x_remove = 0
                x = xx = xxx = 0
                nobs = 0
                for j in range(s, e):
                    val = values_copy[j]
                    add_skew(val, &nobs, &x, &xx, &xxx, &compensation_x_add,
                             &compensation_xx_add, &compensation_xxx_add,
                             &num_consecutive_same_value, &prev_value)

            else:

                # After the first window, observations can both be added
                # and removed

                # calculate deletes
                for j in range(start[i - 1], s):
                    val = values_copy[j]
                    remove_skew(val, &nobs, &x, &xx, &xxx, &compensation_x_remove,
                                &compensation_xx_remove, &compensation_xxx_remove)

                # calculate adds
                for j in range(end[i - 1], e):
                    val = values_copy[j]
                    add_skew(val, &nobs, &x, &xx, &xxx, &compensation_x_add,
                             &compensation_xx_add, &compensation_xxx_add,
                             &num_consecutive_same_value, &prev_value)

            output[i] = calc_skew(minp, nobs, x, xx, xxx, num_consecutive_same_value)

            if not is_monotonic_increasing_bounds:
                nobs = 0
                x = 0.0
                xx = 0.0
                xxx = 0.0

    return output
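
# Editor's note: x, xx, xxx above are raw power sums sum(v), sum(v**2),
# sum(v**3); calc_skew converts them to central moments (B is the biased
# variance, C the third central moment) and applies the adjusted
# Fisher-Pearson estimator, equivalent to (illustrative form):
#   g1 = C / B ** 1.5
#   skew = sqrt(n * (n - 1)) / (n - 2) * g1
# The up-front centering of values_copy only shifts the data, leaving the
# skewness unchanged while keeping the power sums small and precise.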
# ----------------------------------------------------------------------
# Rolling kurtosis


cdef float64_t calc_kurt(int64_t minp, int64_t nobs,
                         float64_t x, float64_t xx,
                         float64_t xxx, float64_t xxxx,
                         int64_t num_consecutive_same_value,
                         ) nogil:
    cdef:
        float64_t result, dnobs
        float64_t A, B, C, D, R, K

    if nobs >= minp:
        if nobs < 4:
            result = NaN
        # GH 42064 46431
        # uniform case, force result to be -3.
        elif num_consecutive_same_value >= nobs:
            result = -3.
        else:
            dnobs = <float64_t>nobs
            A = x / dnobs
            R = A * A
            B = xx / dnobs - R
            R = R * A
            C = xxx / dnobs - R - 3 * A * B
            R = R * A
            D = xxxx / dnobs - R - 6 * B * A * A - 4 * C * A

            # #18044: with a uniform distribution, floating point issues can
            # leave B != 0 and make the result a very large number.
            #
            # in core/nanops.py nanskew/nankurt call the function
            # _zero_out_fperr(m2) to fix the floating point error.
            # if the variance is less than 1e-14, it can be
            # treated as zero; here we follow the original
            # skew/kurt behaviour and check B <= 1e-14
            if B <= 1e-14:
                result = NaN
            else:
                K = (dnobs * dnobs - 1.) * D / (B * B) - 3 * ((dnobs - 1.) ** 2)
                result = K / ((dnobs - 2.) * (dnobs - 3.))
    else:
        result = NaN

    return result


cdef void add_kurt(float64_t val, int64_t *nobs,
                   float64_t *x, float64_t *xx,
                   float64_t *xxx, float64_t *xxxx,
                   float64_t *compensation_x,
                   float64_t *compensation_xx,
                   float64_t *compensation_xxx,
                   float64_t *compensation_xxxx,
                   int64_t *num_consecutive_same_value,
                   float64_t *prev_value
                   ) nogil:
    """ add a value from the kurtosis calc """
    cdef:
        float64_t y, t

    # Not NaN
    if val == val:
        nobs[0] = nobs[0] + 1

        y = val - compensation_x[0]
        t = x[0] + y
        compensation_x[0] = t - x[0] - y
        x[0] = t
        y = val * val - compensation_xx[0]
        t = xx[0] + y
        compensation_xx[0] = t - xx[0] - y
        xx[0] = t
        y = val * val * val - compensation_xxx[0]
        t = xxx[0] + y
        compensation_xxx[0] = t - xxx[0] - y
        xxx[0] = t
        y = val * val * val * val - compensation_xxxx[0]
        t = xxxx[0] + y
        compensation_xxxx[0] = t - xxxx[0] - y
        xxxx[0] = t

        # GH#42064, record num of same values to remove floating point artifacts
        if val == prev_value[0]:
            num_consecutive_same_value[0] += 1
        else:
            # reset to 1 (include current value itself)
            num_consecutive_same_value[0] = 1
        prev_value[0] = val


cdef void remove_kurt(float64_t val, int64_t *nobs,
                      float64_t *x, float64_t *xx,
                      float64_t *xxx, float64_t *xxxx,
                      float64_t *compensation_x,
                      float64_t *compensation_xx,
                      float64_t *compensation_xxx,
                      float64_t *compensation_xxxx) nogil:
    """ remove a value from the kurtosis calc """
    cdef:
        float64_t y, t

    # Not NaN
    if val == val:
        nobs[0] = nobs[0] - 1

        y = - val - compensation_x[0]
        t = x[0] + y
        compensation_x[0] = t - x[0] - y
        x[0] = t
        y = - val * val - compensation_xx[0]
        t = xx[0] + y
        compensation_xx[0] = t - xx[0] - y
        xx[0] = t
        y = - val * val * val - compensation_xxx[0]
        t = xxx[0] + y
        compensation_xxx[0] = t - xxx[0] - y
        xxx[0] = t
        y = - val * val * val * val - compensation_xxxx[0]
        t = xxxx[0] + y
        compensation_xxxx[0] = t - xxxx[0] - y
        xxxx[0] = t


def roll_kurt(ndarray[float64_t] values, ndarray[int64_t] start,
              ndarray[int64_t] end, int64_t minp) -> np.ndarray:
    cdef:
        Py_ssize_t i, j
        float64_t val, mean_val, min_val, sum_val = 0
        float64_t compensation_xxxx_add, compensation_xxxx_remove
        float64_t compensation_xxx_remove, compensation_xxx_add
        float64_t compensation_xx_remove, compensation_xx_add
        float64_t compensation_x_remove, compensation_x_add
        float64_t x, xx, xxx, xxxx
        float64_t prev_value
        int64_t nobs, s, e, num_consecutive_same_value
        int64_t N = len(start), V = len(values), nobs_mean = 0
        ndarray[float64_t] output, values_copy
        bint is_monotonic_increasing_bounds

    minp = max(minp, 4)
    is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds(
        start, end
    )
    output = np.empty(N, dtype=np.float64)
    values_copy = np.copy(values)
    min_val = np.nanmin(values)

    with nogil:
        for i in range(0, V):
            val = values_copy[i]
            if val == val:
                nobs_mean += 1
                sum_val += val
        mean_val = sum_val / nobs_mean
        # Other cases would lead to imprecision for smallest values
        if min_val - mean_val > -1e4:
            mean_val = round(mean_val)
            for i in range(0, V):
                values_copy[i] = values_copy[i] - mean_val

        for i in range(0, N):
            s = start[i]
            e = end[i]

            # Over the first window, observations can only be added
            # never removed
            if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]:

                prev_value = values[s]
                num_consecutive_same_value = 0

                compensation_xxxx_add = compensation_xxxx_remove = 0
                compensation_xxx_remove = compensation_xxx_add = 0
                compensation_xx_remove = compensation_xx_add = 0
                compensation_x_remove = compensation_x_add = 0
                x = xx = xxx = xxxx = 0
                nobs = 0
                for j in range(s, e):
                    add_kurt(values_copy[j], &nobs, &x, &xx, &xxx, &xxxx,
                             &compensation_x_add, &compensation_xx_add,
                             &compensation_xxx_add, &compensation_xxxx_add,
                             &num_consecutive_same_value, &prev_value)

            else:

                # After the first window, observations can both be added
                # and removed

                # calculate deletes
                for j in range(start[i - 1], s):
                    remove_kurt(values_copy[j], &nobs, &x, &xx, &xxx, &xxxx,
                                &compensation_x_remove, &compensation_xx_remove,
                                &compensation_xxx_remove, &compensation_xxxx_remove)

                # calculate adds
                for j in range(end[i - 1], e):
                    add_kurt(values_copy[j], &nobs, &x, &xx, &xxx, &xxxx,
                             &compensation_x_add, &compensation_xx_add,
                             &compensation_xxx_add, &compensation_xxxx_add,
                             &num_consecutive_same_value, &prev_value)

            output[i] = calc_kurt(minp, nobs, x, xx, xxx, xxxx,
                                  num_consecutive_same_value)

            if not is_monotonic_increasing_bounds:
                nobs = 0
                x = 0.0
                xx = 0.0
                xxx = 0.0
                xxxx = 0.0

    return output
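
# Editor's note: analogous to roll_skew, x..xxxx are raw power sums up to
# the fourth power and D in calc_kurt is the fourth central moment; the
# returned value is the bias-adjusted excess kurtosis (illustrative form):
#   kurt = ((n*n - 1) * D / B**2 - 3 * (n - 1)**2) / ((n - 2) * (n - 3))
# A window of repeated values is pinned to -3.0 (GH 42064/46431) instead of
# letting floating point noise in B and D produce arbitrary output.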
# ----------------------------------------------------------------------
# Rolling median, min, max


def roll_median_c(const float64_t[:] values, ndarray[int64_t] start,
                  ndarray[int64_t] end, int64_t minp) -> np.ndarray:
    cdef:
        Py_ssize_t i, j
        bint err = False, is_monotonic_increasing_bounds
        int midpoint, ret = 0
        int64_t nobs = 0, N = len(start), s, e, win
        float64_t val, res
        skiplist_t *sl
        ndarray[float64_t] output

    is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds(
        start, end
    )

    # we use the Fixed/Variable Indexer here as the
    # actual skiplist ops outweigh any window computation costs
    output = np.empty(N, dtype=np.float64)

    if (end - start).max() == 0:
        output[:] = NaN
        return output
    win = (end - start).max()
    sl = skiplist_init(<int>win)
    if sl == NULL:
        raise MemoryError("skiplist_init failed")

    with nogil:
        for i in range(0, N):
            s = start[i]
            e = end[i]

            if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]:
                if i != 0:
                    skiplist_destroy(sl)
                    sl = skiplist_init(<int>win)
                    nobs = 0
                # setup
                for j in range(s, e):
                    val = values[j]
                    if val == val:
                        nobs += 1
                        err = skiplist_insert(sl, val) == -1
                        if err:
                            break

            else:

                # calculate adds
                for j in range(end[i - 1], e):
                    val = values[j]
                    if val == val:
                        nobs += 1
                        err = skiplist_insert(sl, val) == -1
                        if err:
                            break

                # calculate deletes
                for j in range(start[i - 1], s):
                    val = values[j]
                    if val == val:
                        skiplist_remove(sl, val)
                        nobs -= 1

            if nobs >= minp:
                midpoint = <int>(nobs / 2)
                if nobs % 2:
                    res = skiplist_get(sl, midpoint, &ret)
                else:
                    res = (skiplist_get(sl, midpoint, &ret) +
                           skiplist_get(sl, (midpoint - 1), &ret)) / 2
                if ret == 0:
                    res = NaN
            else:
                res = NaN

            output[i] = res

            if not is_monotonic_increasing_bounds:
                nobs = 0
                skiplist_destroy(sl)
                sl = skiplist_init(<int>win)

    skiplist_destroy(sl)
    if err:
        raise MemoryError("skiplist_insert failed")
    return output
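
# Editor's note on the data structure choice: skip-list insertion, removal
# and rank-based lookup are all O(log window), so the sliding median costs
# O(N log window) overall. skiplist_get(sl, k, &ret) returns the k-th
# smallest value currently in the window; the median is that element for
# odd nobs, or the mean of the two middle elements for even nobs, exactly
# as computed above.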
# ----------------------------------------------------------------------

# Moving maximum / minimum code taken from Bottleneck under the terms
# of its Simplified BSD license
# https://github.com/pydata/bottleneck


cdef float64_t init_mm(float64_t ai, Py_ssize_t *nobs, bint is_max) nogil:

    if ai == ai:
        nobs[0] = nobs[0] + 1
    elif is_max:
        ai = MINfloat64
    else:
        ai = MAXfloat64

    return ai


cdef void remove_mm(float64_t aold, Py_ssize_t *nobs) nogil:
    """ remove a value from the mm calc """
    if aold == aold:
        nobs[0] = nobs[0] - 1


cdef float64_t calc_mm(int64_t minp, Py_ssize_t nobs,
                       float64_t value) nogil:
    cdef:
        float64_t result

    if nobs >= minp:
        result = value
    else:
        result = NaN

    return result


def roll_max(ndarray[float64_t] values, ndarray[int64_t] start,
             ndarray[int64_t] end, int64_t minp) -> np.ndarray:
    """
    Moving max of 1d array of any numeric type along axis=0 ignoring NaNs.

    Parameters
    ----------
    values : np.ndarray[np.float64]
    start : np.ndarray[np.int64]
        window start indices
    end : np.ndarray[np.int64]
        window end indices
    minp : int64
        if the number of observations in a window
        is below this, output a NaN

    Returns
    -------
    np.ndarray[float]
    """
    return _roll_min_max(values, start, end, minp, is_max=1)


def roll_min(ndarray[float64_t] values, ndarray[int64_t] start,
             ndarray[int64_t] end, int64_t minp) -> np.ndarray:
    """
    Moving min of 1d array of any numeric type along axis=0 ignoring NaNs.

    Parameters
    ----------
    values : np.ndarray[np.float64]
    start : np.ndarray[np.int64]
        window start indices
    end : np.ndarray[np.int64]
        window end indices
    minp : int64
        if the number of observations in a window
        is below this, output a NaN

    Returns
    -------
    np.ndarray[float]
    """
    return _roll_min_max(values, start, end, minp, is_max=0)


cdef _roll_min_max(ndarray[float64_t] values,
                   ndarray[int64_t] starti,
                   ndarray[int64_t] endi,
                   int64_t minp,
                   bint is_max):
    cdef:
        float64_t ai
        int64_t curr_win_size, start
        Py_ssize_t i, k, nobs = 0, N = len(starti)
        deque Q[int64_t]  # min/max always the front
        deque W[int64_t]  # track the whole window for nobs compute
        ndarray[float64_t, ndim=1] output

    output = np.empty(N, dtype=np.float64)
    Q = deque[int64_t]()
    W = deque[int64_t]()

    with nogil:

        # This is using a modified version of the C++ code in this
        # SO post: https://stackoverflow.com/a/12239580
        # The original impl didn't deal with variable window sizes
        # So the code was optimized for that

        # first window's size
        curr_win_size = endi[0] - starti[0]
        # GH 32865
        # Anchor output index to values index to provide custom
        # BaseIndexer support
        for i in range(N):
            curr_win_size = endi[i] - starti[i]
            if i == 0:
                start = starti[i]
            else:
                start = endi[i - 1]

            for k in range(start, endi[i]):
                ai = init_mm(values[k], &nobs, is_max)
                # Discard previous entries if we find new min or max
                if is_max:
                    while not Q.empty() and ((ai >= values[Q.back()]) or
                                             values[Q.back()] != values[Q.back()]):
                        Q.pop_back()
                else:
                    while not Q.empty() and ((ai <= values[Q.back()]) or
                                             values[Q.back()] != values[Q.back()]):
                        Q.pop_back()
                Q.push_back(k)
                W.push_back(k)

            # Discard entries outside and left of current window
            while not Q.empty() and Q.front() <= starti[i] - 1:
                Q.pop_front()
            while not W.empty() and W.front() <= starti[i] - 1:
                remove_mm(values[W.front()], &nobs)
                W.pop_front()

            # Save output based on index in input value array
            if not Q.empty() and curr_win_size > 0:
                output[i] = calc_mm(minp, nobs, values[Q.front()])
            else:
                output[i] = NaN

    return output
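
# Editor's note: an illustrative trace of the monotonic deque for a max
# over values = [3, 1, 4, 1, 5] (Q holds indices; window trimming omitted):
#   k=0: Q=[0]      k=1: Q=[0, 1]    k=2: pop 1, 0 -> Q=[2]
#   k=3: Q=[2, 3]   k=4: pop 3, 2 -> Q=[4]
# values[Q.front()] is always the current maximum, and each index is pushed
# and popped at most once, so the cost is amortized O(1) per element.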
cdef enum InterpolationType:
    LINEAR,
    LOWER,
    HIGHER,
    NEAREST,
    MIDPOINT


interpolation_types = {
    "linear": LINEAR,
    "lower": LOWER,
    "higher": HIGHER,
    "nearest": NEAREST,
    "midpoint": MIDPOINT,
}


def roll_quantile(const float64_t[:] values, ndarray[int64_t] start,
                  ndarray[int64_t] end, int64_t minp,
                  float64_t quantile, str interpolation) -> np.ndarray:
    """
    O(N log(window)) implementation using skip list
    """
    cdef:
        Py_ssize_t i, j, s, e, N = len(start), idx
        int ret = 0
        int64_t nobs = 0, win
        float64_t val, idx_with_fraction
        float64_t vlow, vhigh
        skiplist_t *skiplist
        InterpolationType interpolation_type
        ndarray[float64_t] output

    if quantile <= 0.0 or quantile >= 1.0:
        raise ValueError(f"quantile value {quantile} not in [0, 1]")

    try:
        interpolation_type = interpolation_types[interpolation]
    except KeyError:
        raise ValueError(f"Interpolation '{interpolation}' is not supported")

    is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds(
        start, end
    )
    # we use the Fixed/Variable Indexer here as the
    # actual skiplist ops outweigh any window computation costs
    output = np.empty(N, dtype=np.float64)

    win = (end - start).max()
    if win == 0:
        output[:] = NaN
        return output
    skiplist = skiplist_init(<int>win)
    if skiplist == NULL:
        raise MemoryError("skiplist_init failed")

    with nogil:
        for i in range(0, N):
            s = start[i]
            e = end[i]

            if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]:
                if i != 0:
                    nobs = 0
                    skiplist_destroy(skiplist)
                    skiplist = skiplist_init(<int>win)

                # setup
                for j in range(s, e):
                    val = values[j]
                    if val == val:
                        nobs += 1
                        skiplist_insert(skiplist, val)

            else:
                # calculate adds
                for j in range(end[i - 1], e):
                    val = values[j]
                    if val == val:
                        nobs += 1
                        skiplist_insert(skiplist, val)

                # calculate deletes
                for j in range(start[i - 1], s):
                    val = values[j]
                    if val == val:
                        skiplist_remove(skiplist, val)
                        nobs -= 1
            if nobs >= minp:
                if nobs == 1:
                    # Single value in skip list
                    output[i] = skiplist_get(skiplist, 0, &ret)
                else:
                    idx_with_fraction = quantile * (nobs - 1)
                    idx = <int>idx_with_fraction

                    if idx_with_fraction == idx:
                        # no need to interpolate
                        output[i] = skiplist_get(skiplist, idx, &ret)
                        continue

                    if interpolation_type == LINEAR:
                        vlow = skiplist_get(skiplist, idx, &ret)
                        vhigh = skiplist_get(skiplist, idx + 1, &ret)
                        output[i] = ((vlow + (vhigh - vlow) *
                                      (idx_with_fraction - idx)))
                    elif interpolation_type == LOWER:
                        output[i] = skiplist_get(skiplist, idx, &ret)
                    elif interpolation_type == HIGHER:
                        output[i] = skiplist_get(skiplist, idx + 1, &ret)
                    elif interpolation_type == NEAREST:
                        # the same behaviour as round()
                        if idx_with_fraction - idx == 0.5:
                            if idx % 2 == 0:
                                output[i] = skiplist_get(skiplist, idx, &ret)
                            else:
                                output[i] = skiplist_get(
                                    skiplist, idx + 1, &ret)
                        elif idx_with_fraction - idx < 0.5:
                            output[i] = skiplist_get(skiplist, idx, &ret)
                        else:
                            output[i] = skiplist_get(skiplist, idx + 1, &ret)
                    elif interpolation_type == MIDPOINT:
                        vlow = skiplist_get(skiplist, idx, &ret)
                        vhigh = skiplist_get(skiplist, idx + 1, &ret)
                        output[i] = <float64_t>(vlow + vhigh) / 2
                if ret == 0:
                    output[i] = NaN
            else:
                output[i] = NaN

    skiplist_destroy(skiplist)

    return output
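
# Editor's note (illustrative): for a window holding [1.0, 2.0, 10.0] and
# quantile=0.75, idx_with_fraction = 0.75 * (3 - 1) = 1.5, so idx = 1 and
# the interpolation modes give:
#   linear   -> 2.0 + 0.5 * (10.0 - 2.0) = 6.0
#   lower    -> 2.0,  higher -> 10.0,  midpoint -> (2.0 + 10.0) / 2 = 6.0
#   nearest  -> 10.0 (an exact .5 fraction rounds to the even rank; idx = 1
#               is odd, so idx + 1 is taken)
# For quantile=0.5, idx_with_fraction = 1.0 is integral and 2.0 is returned
# without interpolation.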
rolling_rank_tiebreakers = {
    "average": TiebreakEnumType.TIEBREAK_AVERAGE,
    "min": TiebreakEnumType.TIEBREAK_MIN,
    "max": TiebreakEnumType.TIEBREAK_MAX,
}


def roll_rank(const float64_t[:] values, ndarray[int64_t] start,
              ndarray[int64_t] end, int64_t minp, bint percentile,
              str method, bint ascending) -> np.ndarray:
    """
    O(N log(window)) implementation using skip list

    derived from roll_quantile
    """
    cdef:
        Py_ssize_t i, j, s, e, N = len(start)
        float64_t rank_min = 0, rank = 0
        int64_t nobs = 0, win
        float64_t val
        skiplist_t *skiplist
        float64_t[::1] output
        TiebreakEnumType rank_type

    try:
        rank_type = rolling_rank_tiebreakers[method]
    except KeyError:
        raise ValueError(f"Method '{method}' is not supported")

    is_monotonic_increasing_bounds = is_monotonic_increasing_start_end_bounds(
        start, end
    )
    # we use the Fixed/Variable Indexer here as the
    # actual skiplist ops outweigh any window computation costs
    output = np.empty(N, dtype=np.float64)

    win = (end - start).max()
    if win == 0:
        output[:] = NaN
        return np.asarray(output)
    skiplist = skiplist_init(<int>win)
    if skiplist == NULL:
        raise MemoryError("skiplist_init failed")

    with nogil:
        for i in range(N):
            s = start[i]
            e = end[i]

            if i == 0 or not is_monotonic_increasing_bounds or s >= end[i - 1]:
                if i != 0:
                    nobs = 0
                    skiplist_destroy(skiplist)
                    skiplist = skiplist_init(<int>win)

                # setup
                for j in range(s, e):
                    val = values[j] if ascending else -values[j]
                    if val == val:
                        nobs += 1
                        rank = skiplist_insert(skiplist, val)
                        if rank == -1:
                            raise MemoryError("skiplist_insert failed")
                        if rank_type == TiebreakEnumType.TIEBREAK_AVERAGE:
                            # The average rank of `val` is the sum of the ranks of all
                            # instances of `val` in the skip list divided by the number
                            # of instances. The sum of consecutive integers from 1 to N
                            # is N * (N + 1) / 2.
                            # The sum of the ranks is the sum of integers from the
                            # lowest rank to the highest rank, which is the sum of
                            # integers from 1 to the highest rank minus the sum of
                            # integers from 1 to one less than the lowest rank.
                            rank_min = skiplist_min_rank(skiplist, val)
                            rank = (((rank * (rank + 1) / 2)
                                    - ((rank_min - 1) * rank_min / 2))
                                    / (rank - rank_min + 1))
                        elif rank_type == TiebreakEnumType.TIEBREAK_MIN:
                            rank = skiplist_min_rank(skiplist, val)
                    else:
                        rank = NaN

            else:
                # calculate deletes
                for j in range(start[i - 1], s):
                    val = values[j] if ascending else -values[j]
                    if val == val:
                        skiplist_remove(skiplist, val)
                        nobs -= 1

                # calculate adds
                for j in range(end[i - 1], e):
                    val = values[j] if ascending else -values[j]
                    if val == val:
                        nobs += 1
                        rank = skiplist_insert(skiplist, val)
                        if rank == -1:
                            raise MemoryError("skiplist_insert failed")
                        if rank_type == TiebreakEnumType.TIEBREAK_AVERAGE:
                            rank_min = skiplist_min_rank(skiplist, val)
                            rank = (((rank * (rank + 1) / 2)
                                    - ((rank_min - 1) * rank_min / 2))
                                    / (rank - rank_min + 1))
                        elif rank_type == TiebreakEnumType.TIEBREAK_MIN:
                            rank = skiplist_min_rank(skiplist, val)
                    else:
                        rank = NaN
            if nobs >= minp:
                output[i] = rank / nobs if percentile else rank
            else:
                output[i] = NaN

    skiplist_destroy(skiplist)

    return np.asarray(output)
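
# Editor's note, a worked tie-breaking example (illustrative): if the
# window holds [3, 5, 5, 7] and val = 5, skiplist_insert returns the
# highest rank of the tied values (3) while skiplist_min_rank returns the
# lowest (2), so TIEBREAK_AVERAGE computes
#   (3 * 4 / 2 - 1 * 2 / 2) / (3 - 2 + 1) = (6 - 1) / 2 = 2.5
# i.e. the mean of ranks 2 and 3, matching Series.rank(method="average").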
def roll_apply(object obj,
               ndarray[int64_t] start, ndarray[int64_t] end,
               int64_t minp,
               object function, bint raw,
               tuple args, dict kwargs) -> np.ndarray:
    cdef:
        ndarray[float64_t] output, counts
        ndarray[float64_t, cast=True] arr
        Py_ssize_t i, s, e, N = len(start), n = len(obj)

    if n == 0:
        return np.array([], dtype=np.float64)

    arr = np.asarray(obj)

    # ndarray input
    if raw and not arr.flags.c_contiguous:
        arr = arr.copy("C")

    counts = roll_sum(np.isfinite(arr).astype(float), start, end, minp)

    output = np.empty(N, dtype=np.float64)

    for i in range(N):

        s = start[i]
        e = end[i]

        if counts[i] >= minp:
            if raw:
                output[i] = function(arr[s:e], *args, **kwargs)
            else:
                output[i] = function(obj.iloc[s:e], *args, **kwargs)
        else:
            output[i] = NaN

    return output
# ----------------------------------------------------------------------
# Rolling sum and mean for weighted window


def roll_weighted_sum(
    const float64_t[:] values, const float64_t[:] weights, int minp
) -> np.ndarray:
    return _roll_weighted_sum_mean(values, weights, minp, avg=0)


def roll_weighted_mean(
    const float64_t[:] values, const float64_t[:] weights, int minp
) -> np.ndarray:
    return _roll_weighted_sum_mean(values, weights, minp, avg=1)


cdef float64_t[:] _roll_weighted_sum_mean(const float64_t[:] values,
                                          const float64_t[:] weights,
                                          int minp, bint avg):
    """
    Assume len(weights) << len(values)
    """
    cdef:
        float64_t[:] output, tot_wgt, counts
        Py_ssize_t in_i, win_i, win_n, in_n
        float64_t val_in, val_win, c, w

    in_n = len(values)
    win_n = len(weights)

    output = np.zeros(in_n, dtype=np.float64)
    counts = np.zeros(in_n, dtype=np.float64)
    if avg:
        tot_wgt = np.zeros(in_n, dtype=np.float64)

    elif minp > in_n:
        minp = in_n + 1

    minp = max(minp, 1)

    with nogil:
        if avg:
            for win_i in range(win_n):
                val_win = weights[win_i]
                if val_win != val_win:
                    continue

                for in_i in range(in_n - (win_n - win_i) + 1):
                    val_in = values[in_i]
                    if val_in == val_in:
                        output[in_i + (win_n - win_i) - 1] += val_in * val_win
                        counts[in_i + (win_n - win_i) - 1] += 1
                        tot_wgt[in_i + (win_n - win_i) - 1] += val_win

            for in_i in range(in_n):
                c = counts[in_i]
                if c < minp:
                    output[in_i] = NaN
                else:
                    w = tot_wgt[in_i]
                    if w == 0:
                        output[in_i] = NaN
                    else:
                        output[in_i] /= tot_wgt[in_i]

        else:
            for win_i in range(win_n):
                val_win = weights[win_i]
                if val_win != val_win:
                    continue

                for in_i in range(in_n - (win_n - win_i) + 1):
                    val_in = values[in_i]

                    if val_in == val_in:
                        output[in_i + (win_n - win_i) - 1] += val_in * val_win
                        counts[in_i + (win_n - win_i) - 1] += 1

            for in_i in range(in_n):
                c = counts[in_i]
                if c < minp:
                    output[in_i] = NaN

    return output
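
# Editor's note (illustrative, assuming minp equals the window size 2):
# weights[win_i] multiplies the value sitting (win_n - win_i - 1) positions
# before each window's end, so for values = [1, 2, 3], weights = [0.5, 1.0]:
#   output[1] = 0.5 * 1 + 1.0 * 2 = 2.5
#   output[2] = 0.5 * 2 + 1.0 * 3 = 4.0
# and output[0] is NaN because only one observation contributed to it.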
# ----------------------------------------------------------------------
# Rolling var for weighted window


cdef float64_t calc_weighted_var(float64_t t,
                                 float64_t sum_w,
                                 Py_ssize_t win_n,
                                 unsigned int ddof,
                                 float64_t nobs,
                                 int64_t minp) nogil:
    """
    Calculate weighted variance for a window using West's method.

    Paper: https://dl.acm.org/citation.cfm?id=359153

    Parameters
    ----------
    t: float64_t
        sum of weighted squared differences
    sum_w: float64_t
        sum of weights
    win_n: Py_ssize_t
        window size
    ddof: unsigned int
        delta degrees of freedom
    nobs: float64_t
        number of observations
    minp: int64_t
        minimum number of observations

    Returns
    -------
    result : float64_t
        weighted variance of the window
    """

    cdef:
        float64_t result

    # Variance is unchanged if no observation is added or removed
    if (nobs >= minp) and (nobs > ddof):

        # pathological case
        if nobs == 1:
            result = 0
        else:
            result = t * win_n / ((win_n - ddof) * sum_w)
            if result < 0:
                result = 0
    else:
        result = NaN

    return result


cdef void add_weighted_var(float64_t val,
                           float64_t w,
                           float64_t *t,
                           float64_t *sum_w,
                           float64_t *mean,
                           float64_t *nobs) nogil:
    """
    Update weighted mean, sum of weights and sum of weighted squared
    differences to include value and weight pair in weighted variance
    calculation using West's method.

    Paper: https://dl.acm.org/citation.cfm?id=359153

    Parameters
    ----------
    val: float64_t
        window values
    w: float64_t
        window weights
    t: float64_t
        sum of weighted squared differences
    sum_w: float64_t
        sum of weights
    mean: float64_t
        weighted mean
    nobs: float64_t
        number of observations
    """

    cdef:
        float64_t temp, q, r

    if val != val:
        return

    nobs[0] = nobs[0] + 1

    q = val - mean[0]
    temp = sum_w[0] + w
    r = q * w / temp

    mean[0] = mean[0] + r
    t[0] = t[0] + r * sum_w[0] * q
    sum_w[0] = temp


cdef void remove_weighted_var(float64_t val,
                              float64_t w,
                              float64_t *t,
                              float64_t *sum_w,
                              float64_t *mean,
                              float64_t *nobs) nogil:
    """
    Update weighted mean, sum of weights and sum of weighted squared
    differences to remove value and weight pair from weighted variance
    calculation using West's method.

    Paper: https://dl.acm.org/citation.cfm?id=359153

    Parameters
    ----------
    val: float64_t
        window values
    w: float64_t
        window weights
    t: float64_t
        sum of weighted squared differences
    sum_w: float64_t
        sum of weights
    mean: float64_t
        weighted mean
    nobs: float64_t
        number of observations
    """

    cdef:
        float64_t temp, q, r

    if val == val:
        nobs[0] = nobs[0] - 1

        if nobs[0]:
            q = val - mean[0]
            temp = sum_w[0] - w
            r = q * w / temp

            mean[0] = mean[0] - r
            t[0] = t[0] - r * sum_w[0] * q
            sum_w[0] = temp

        else:
            t[0] = 0
            sum_w[0] = 0
            mean[0] = 0


def roll_weighted_var(const float64_t[:] values, const float64_t[:] weights,
                      int64_t minp, unsigned int ddof):
    """
    Calculates weighted rolling variance using West's online algorithm.

    Paper: https://dl.acm.org/citation.cfm?id=359153

    Parameters
    ----------
    values: float64_t[:]
        values to roll window over
    weights: float64_t[:]
        array of weights whose length is window size
    minp: int64_t
        minimum number of observations to calculate
        variance of a window
    ddof: unsigned int
        the divisor used in variance calculations
        is the window size - ddof

    Returns
    -------
    output: float64_t[:]
        weighted variances of windows
    """

    cdef:
        float64_t t = 0, sum_w = 0, mean = 0, nobs = 0
        float64_t val, pre_val, w, pre_w
        Py_ssize_t i, n, win_n
        float64_t[:] output

    n = len(values)
    win_n = len(weights)

    output = np.empty(n, dtype=np.float64)

    with nogil:

        for i in range(min(win_n, n)):
            add_weighted_var(values[i], weights[i], &t,
                             &sum_w, &mean, &nobs)

            output[i] = calc_weighted_var(t, sum_w, win_n,
                                          ddof, nobs, minp)

        for i in range(win_n, n):
            val = values[i]
            pre_val = values[i - win_n]

            w = weights[i % win_n]
            pre_w = weights[(i - win_n) % win_n]

            if val == val:
                if pre_val == pre_val:
                    remove_weighted_var(pre_val, pre_w, &t,
                                        &sum_w, &mean, &nobs)

                add_weighted_var(val, w, &t, &sum_w, &mean, &nobs)

            elif pre_val == pre_val:
                remove_weighted_var(pre_val, pre_w, &t,
                                    &sum_w, &mean, &nobs)

            output[i] = calc_weighted_var(t, sum_w, win_n,
                                          ddof, nobs, minp)

    return output
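
# Editor's note: West's incremental update keeps (mean, sum_w, t), where t
# is the running sum of weighted squared deviations. For a new (val, w)
# pair the algebra above is (illustrative form):
#   q = val - mean;  sum_w' = sum_w + w;  r = q * w / sum_w'
#   mean' = mean + r;  t' = t + r * sum_w * q
# remove_weighted_var runs the same algebra in reverse, which is what lets
# the window slide in O(1) per element instead of recomputing each window.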
# ----------------------------------------------------------------------
# Exponentially weighted moving


@cython.cpow(True)
def ewm(const float64_t[:] vals, const int64_t[:] start, const int64_t[:] end,
        int minp, float64_t com, bint adjust, bint ignore_na,
        const float64_t[:] deltas=None, bint normalize=True) -> np.ndarray:
    """
    Compute exponentially-weighted moving average or sum using center-of-mass.

    Parameters
    ----------
    vals : ndarray (float64 type)
    start: ndarray (int64 type)
    end: ndarray (int64 type)
    minp : int
    com : float64
    adjust : bool
    ignore_na : bool
    deltas : ndarray (float64 type), optional. If None, implicitly assumes equally
        spaced points (used when `times` is not passed)
    normalize : bool, optional.
        If True, calculate the mean. If False, calculate the sum.

    Returns
    -------
    np.ndarray[float64_t]
    """
    cdef:
        Py_ssize_t i, j, s, e, nobs, win_size, N = len(vals), M = len(start)
        const float64_t[:] sub_vals
        const float64_t[:] sub_deltas=None
        ndarray[float64_t] sub_output, output = np.empty(N, dtype=np.float64)
        float64_t alpha, old_wt_factor, new_wt, weighted, old_wt, cur
        bint is_observation, use_deltas

    if N == 0:
        return output

    use_deltas = deltas is not None

    alpha = 1. / (1. + com)
    old_wt_factor = 1. - alpha
    new_wt = 1. if adjust else alpha

    for j in range(M):
        s = start[j]
        e = end[j]
        sub_vals = vals[s:e]
        # note that len(deltas) = len(vals) - 1 and deltas[i] is to be used in
        # conjunction with vals[i+1]
        if use_deltas:
            sub_deltas = deltas[s:e - 1]
        win_size = len(sub_vals)
        sub_output = np.empty(win_size, dtype=np.float64)

        weighted = sub_vals[0]
        is_observation = weighted == weighted
        nobs = int(is_observation)
        sub_output[0] = weighted if nobs >= minp else NaN
        old_wt = 1.

        with nogil:
            for i in range(1, win_size):
                cur = sub_vals[i]
                is_observation = cur == cur
                nobs += is_observation
                if weighted == weighted:

                    if is_observation or not ignore_na:
                        if normalize:
                            if use_deltas:
                                old_wt *= old_wt_factor ** sub_deltas[i - 1]
                            else:
                                old_wt *= old_wt_factor
                        else:
                            weighted = old_wt_factor * weighted
                        if is_observation:
                            if normalize:
                                # avoid numerical errors on constant series
                                if weighted != cur:
                                    weighted = old_wt * weighted + new_wt * cur
                                    weighted /= (old_wt + new_wt)
                                if adjust:
                                    old_wt += new_wt
                                else:
                                    old_wt = 1.
                            else:
                                weighted += cur
                elif is_observation:
                    weighted = cur

                sub_output[i] = weighted if nobs >= minp else NaN

        output[s:e] = sub_output

    return output
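
# Editor's note, a worked step (illustrative): com=1.0 gives alpha=0.5, so
# with adjust=True and vals = [1.0, 2.0] the i=1 iteration does
#   old_wt *= (1 - alpha)                              # 1.0 -> 0.5
#   weighted = (0.5 * 1.0 + 1.0 * 2.0) / (0.5 + 1.0)   # = 1.666...
# matching the usual adjusted EWM mean, sum(w_k * x_k) / sum(w_k) with
# w_k = (1 - alpha) ** (n - 1 - k). With normalize=False the same
# recurrence accumulates an exponentially weighted sum instead.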
def ewmcov(const float64_t[:] input_x, const int64_t[:] start, const int64_t[:] end,
           int minp, const float64_t[:] input_y, float64_t com, bint adjust,
           bint ignore_na, bint bias) -> np.ndarray:
    """
    Compute exponentially-weighted moving variance using center-of-mass.

    Parameters
    ----------
    input_x : ndarray (float64 type)
    start: ndarray (int64 type)
    end: ndarray (int64 type)
    minp : int
    input_y : ndarray (float64 type)
    com : float64
    adjust : bool
    ignore_na : bool
    bias : bool

    Returns
    -------
    np.ndarray[float64_t]
    """
    cdef:
        Py_ssize_t i, j, s, e, win_size, nobs
        Py_ssize_t N = len(input_x), M = len(input_y), L = len(start)
        float64_t alpha, old_wt_factor, new_wt, mean_x, mean_y, cov
        float64_t sum_wt, sum_wt2, old_wt, cur_x, cur_y, old_mean_x, old_mean_y
        float64_t numerator, denominator
        const float64_t[:] sub_x_vals, sub_y_vals
        ndarray[float64_t] sub_out, output = np.empty(N, dtype=np.float64)
        bint is_observation

    if M != N:
        raise ValueError(f"arrays are of different lengths ({N} and {M})")

    if N == 0:
        return output

    alpha = 1. / (1. + com)
    old_wt_factor = 1. - alpha
    new_wt = 1. if adjust else alpha

    for j in range(L):
        s = start[j]
        e = end[j]
        sub_x_vals = input_x[s:e]
        sub_y_vals = input_y[s:e]
        win_size = len(sub_x_vals)
        sub_out = np.empty(win_size, dtype=np.float64)

        mean_x = sub_x_vals[0]
        mean_y = sub_y_vals[0]
        is_observation = (mean_x == mean_x) and (mean_y == mean_y)
        nobs = int(is_observation)
        if not is_observation:
            mean_x = NaN
            mean_y = NaN
        sub_out[0] = (0. if bias else NaN) if nobs >= minp else NaN
        cov = 0.
        sum_wt = 1.
        sum_wt2 = 1.
        old_wt = 1.

        with nogil:
            for i in range(1, win_size):
                cur_x = sub_x_vals[i]
                cur_y = sub_y_vals[i]
                is_observation = (cur_x == cur_x) and (cur_y == cur_y)
                nobs += is_observation
                if mean_x == mean_x:
                    if is_observation or not ignore_na:
                        sum_wt *= old_wt_factor
                        sum_wt2 *= (old_wt_factor * old_wt_factor)
                        old_wt *= old_wt_factor
                        if is_observation:
                            old_mean_x = mean_x
                            old_mean_y = mean_y

                            # avoid numerical errors on constant series
                            if mean_x != cur_x:
                                mean_x = ((old_wt * old_mean_x) +
                                          (new_wt * cur_x)) / (old_wt + new_wt)

                            # avoid numerical errors on constant series
                            if mean_y != cur_y:
                                mean_y = ((old_wt * old_mean_y) +
                                          (new_wt * cur_y)) / (old_wt + new_wt)
                            cov = ((old_wt * (cov + ((old_mean_x - mean_x) *
                                                     (old_mean_y - mean_y)))) +
                                   (new_wt * ((cur_x - mean_x) *
                                              (cur_y - mean_y)))) / (old_wt + new_wt)
                            sum_wt += new_wt
                            sum_wt2 += (new_wt * new_wt)
                            old_wt += new_wt
                            if not adjust:
                                sum_wt /= old_wt
                                sum_wt2 /= (old_wt * old_wt)
                                old_wt = 1.
                elif is_observation:
                    mean_x = cur_x
                    mean_y = cur_y

                if nobs >= minp:
                    if not bias:
                        numerator = sum_wt * sum_wt
                        denominator = numerator - sum_wt2
                        if denominator > 0:
                            sub_out[i] = (numerator / denominator) * cov
                        else:
                            sub_out[i] = NaN
                    else:
                        sub_out[i] = cov
                else:
                    sub_out[i] = NaN

        output[s:e] = sub_out

    return output
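
# Editor's note: the bias correction above is the standard
# reliability-weights factor. With W1 = sum of weights and W2 = sum of
# squared weights (tracked as sum_wt and sum_wt2),
#   unbiased_cov = W1**2 / (W1**2 - W2) * cov
# which reduces to the familiar n / (n - 1) when all weights are equal;
# when the denominator is not positive, too few effective observations
# exist and NaN is returned.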