# test_algos.py

from datetime import datetime
from itertools import permutations
import struct

import numpy as np
import pytest

from pandas._libs import (
    algos as libalgos,
    hashtable as ht,
)
import pandas.util._test_decorators as td

from pandas.core.dtypes.common import (
    is_bool_dtype,
    is_complex_dtype,
    is_float_dtype,
    is_integer_dtype,
    is_object_dtype,
)
from pandas.core.dtypes.dtypes import CategoricalDtype as CDT

import pandas as pd
from pandas import (
    Categorical,
    CategoricalIndex,
    DataFrame,
    DatetimeIndex,
    Index,
    IntervalIndex,
    MultiIndex,
    NaT,
    Period,
    PeriodIndex,
    Series,
    Timedelta,
    Timestamp,
    date_range,
    timedelta_range,
    to_datetime,
    to_timedelta,
)
import pandas._testing as tm
import pandas.core.algorithms as algos
from pandas.core.arrays import (
    DatetimeArray,
    TimedeltaArray,
)
import pandas.core.common as com


class TestFactorize:
    @pytest.mark.parametrize("sort", [True, False])
    def test_factorize(self, index_or_series_obj, sort):
        obj = index_or_series_obj
        result_codes, result_uniques = obj.factorize(sort=sort)
        constructor = Index
        if isinstance(obj, MultiIndex):
            constructor = MultiIndex.from_tuples
        expected_arr = obj.unique()
        if expected_arr.dtype == np.float16:
            expected_arr = expected_arr.astype(np.float32)
        expected_uniques = constructor(expected_arr)
        if (
            isinstance(obj, Index)
            and expected_uniques.dtype == bool
            and obj.dtype == object
        ):
            expected_uniques = expected_uniques.astype(object)
        if sort:
            expected_uniques = expected_uniques.sort_values()
        # construct an integer ndarray so that
        # `expected_uniques.take(expected_codes)` is equal to `obj`
        expected_uniques_list = list(expected_uniques)
        expected_codes = [expected_uniques_list.index(val) for val in obj]
        expected_codes = np.asarray(expected_codes, dtype=np.intp)
        tm.assert_numpy_array_equal(result_codes, expected_codes)
        tm.assert_index_equal(result_uniques, expected_uniques, exact=True)

    def test_series_factorize_use_na_sentinel_false(self):
        # GH#35667
        values = np.array([1, 2, 1, np.nan])
        ser = Series(values)
        codes, uniques = ser.factorize(use_na_sentinel=False)
        expected_codes = np.array([0, 1, 0, 2], dtype=np.intp)
        expected_uniques = Index([1.0, 2.0, np.nan])
        tm.assert_numpy_array_equal(codes, expected_codes)
        tm.assert_index_equal(uniques, expected_uniques)

    def test_basic(self):
        codes, uniques = algos.factorize(["a", "b", "b", "a", "a", "c", "c", "c"])
        tm.assert_numpy_array_equal(uniques, np.array(["a", "b", "c"], dtype=object))
        codes, uniques = algos.factorize(
            ["a", "b", "b", "a", "a", "c", "c", "c"], sort=True
        )
        exp = np.array([0, 1, 1, 0, 0, 2, 2, 2], dtype=np.intp)
        tm.assert_numpy_array_equal(codes, exp)
        exp = np.array(["a", "b", "c"], dtype=object)
        tm.assert_numpy_array_equal(uniques, exp)
        arr = np.arange(5, dtype=np.intp)[::-1]
        codes, uniques = algos.factorize(arr)
        exp = np.array([0, 1, 2, 3, 4], dtype=np.intp)
        tm.assert_numpy_array_equal(codes, exp)
        exp = np.array([4, 3, 2, 1, 0], dtype=arr.dtype)
        tm.assert_numpy_array_equal(uniques, exp)
        codes, uniques = algos.factorize(arr, sort=True)
        exp = np.array([4, 3, 2, 1, 0], dtype=np.intp)
        tm.assert_numpy_array_equal(codes, exp)
        exp = np.array([0, 1, 2, 3, 4], dtype=arr.dtype)
        tm.assert_numpy_array_equal(uniques, exp)
        arr = np.arange(5.0)[::-1]
        codes, uniques = algos.factorize(arr)
        exp = np.array([0, 1, 2, 3, 4], dtype=np.intp)
        tm.assert_numpy_array_equal(codes, exp)
        exp = np.array([4.0, 3.0, 2.0, 1.0, 0.0], dtype=arr.dtype)
        tm.assert_numpy_array_equal(uniques, exp)
        codes, uniques = algos.factorize(arr, sort=True)
        exp = np.array([4, 3, 2, 1, 0], dtype=np.intp)
        tm.assert_numpy_array_equal(codes, exp)
        exp = np.array([0.0, 1.0, 2.0, 3.0, 4.0], dtype=arr.dtype)
        tm.assert_numpy_array_equal(uniques, exp)

    def test_mixed(self):
        # doc example reshaping.rst
        x = Series(["A", "A", np.nan, "B", 3.14, np.inf])
        codes, uniques = algos.factorize(x)
        exp = np.array([0, 0, -1, 1, 2, 3], dtype=np.intp)
        tm.assert_numpy_array_equal(codes, exp)
        exp = Index(["A", "B", 3.14, np.inf])
        tm.assert_index_equal(uniques, exp)
        codes, uniques = algos.factorize(x, sort=True)
        exp = np.array([2, 2, -1, 3, 0, 1], dtype=np.intp)
        tm.assert_numpy_array_equal(codes, exp)
        exp = Index([3.14, np.inf, "A", "B"])
        tm.assert_index_equal(uniques, exp)

    def test_datelike(self):
        # M8
        v1 = Timestamp("20130101 09:00:00.00004")
        v2 = Timestamp("20130101")
        x = Series([v1, v1, v1, v2, v2, v1])
        codes, uniques = algos.factorize(x)
        exp = np.array([0, 0, 0, 1, 1, 0], dtype=np.intp)
        tm.assert_numpy_array_equal(codes, exp)
        exp = DatetimeIndex([v1, v2])
        tm.assert_index_equal(uniques, exp)
        codes, uniques = algos.factorize(x, sort=True)
        exp = np.array([1, 1, 1, 0, 0, 1], dtype=np.intp)
        tm.assert_numpy_array_equal(codes, exp)
        exp = DatetimeIndex([v2, v1])
        tm.assert_index_equal(uniques, exp)
        # period
        v1 = Period("201302", freq="M")
        v2 = Period("201303", freq="M")
        x = Series([v1, v1, v1, v2, v2, v1])
        # periods are not 'sorted' as they are converted back into an index
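        # (note: v1 and v2 already appear in sorted order here, so the
        # sort=True call below yields the same codes and uniques as the
        # unsorted call)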
        codes, uniques = algos.factorize(x)
        exp = np.array([0, 0, 0, 1, 1, 0], dtype=np.intp)
        tm.assert_numpy_array_equal(codes, exp)
        tm.assert_index_equal(uniques, PeriodIndex([v1, v2]))
        codes, uniques = algos.factorize(x, sort=True)
        exp = np.array([0, 0, 0, 1, 1, 0], dtype=np.intp)
        tm.assert_numpy_array_equal(codes, exp)
        tm.assert_index_equal(uniques, PeriodIndex([v1, v2]))
        # GH 5986
        v1 = to_timedelta("1 day 1 min")
        v2 = to_timedelta("1 day")
        x = Series([v1, v2, v1, v1, v2, v2, v1])
        codes, uniques = algos.factorize(x)
        exp = np.array([0, 1, 0, 0, 1, 1, 0], dtype=np.intp)
        tm.assert_numpy_array_equal(codes, exp)
        tm.assert_index_equal(uniques, to_timedelta([v1, v2]))
        codes, uniques = algos.factorize(x, sort=True)
        exp = np.array([1, 0, 1, 1, 0, 0, 1], dtype=np.intp)
        tm.assert_numpy_array_equal(codes, exp)
        tm.assert_index_equal(uniques, to_timedelta([v2, v1]))

    def test_factorize_nan(self):
        # nan should map to na_sentinel, not reverse_indexer[na_sentinel]
        # rizer.factorize should not raise an exception if na_sentinel indexes
        # outside of reverse_indexer
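        # (e.g. with na_sentinel=20 below, NaN is coded as 20 even though the
        # factorizer only ever records two uniques, 1 and 2)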
        key = np.array([1, 2, 1, np.nan], dtype="O")
        rizer = ht.ObjectFactorizer(len(key))
        for na_sentinel in (-1, 20):
            ids = rizer.factorize(key, na_sentinel=na_sentinel)
            expected = np.array([0, 1, 0, na_sentinel], dtype=np.intp)
            assert len(set(key)) == len(set(expected))
            tm.assert_numpy_array_equal(pd.isna(key), expected == na_sentinel)
            tm.assert_numpy_array_equal(ids, expected)

    def test_factorizer_with_mask(self):
        # GH#49549
        data = np.array([1, 2, 3, 1, 1, 0], dtype="int64")
        mask = np.array([False, False, False, False, False, True])
        rizer = ht.Int64Factorizer(len(data))
        result = rizer.factorize(data, mask=mask)
        expected = np.array([0, 1, 2, 0, 0, -1], dtype=np.intp)
        tm.assert_numpy_array_equal(result, expected)
        expected_uniques = np.array([1, 2, 3], dtype="int64")
        tm.assert_numpy_array_equal(rizer.uniques.to_array(), expected_uniques)

    def test_factorizer_object_with_nan(self):
        # GH#49549
        data = np.array([1, 2, 3, 1, np.nan])
        rizer = ht.ObjectFactorizer(len(data))
        result = rizer.factorize(data.astype(object))
        expected = np.array([0, 1, 2, 0, -1], dtype=np.intp)
        tm.assert_numpy_array_equal(result, expected)
        expected_uniques = np.array([1, 2, 3], dtype=object)
        tm.assert_numpy_array_equal(rizer.uniques.to_array(), expected_uniques)

    @pytest.mark.parametrize(
        "data, expected_codes, expected_uniques",
        [
            (
                [(1, 1), (1, 2), (0, 0), (1, 2), "nonsense"],
                [0, 1, 2, 1, 3],
                [(1, 1), (1, 2), (0, 0), "nonsense"],
            ),
            (
                [(1, 1), (1, 2), (0, 0), (1, 2), (1, 2, 3)],
                [0, 1, 2, 1, 3],
                [(1, 1), (1, 2), (0, 0), (1, 2, 3)],
            ),
            ([(1, 1), (1, 2), (0, 0), (1, 2)], [0, 1, 2, 1], [(1, 1), (1, 2), (0, 0)]),
        ],
    )
    def test_factorize_tuple_list(self, data, expected_codes, expected_uniques):
        # GH9454
        codes, uniques = pd.factorize(data)
        tm.assert_numpy_array_equal(codes, np.array(expected_codes, dtype=np.intp))
        expected_uniques_array = com.asarray_tuplesafe(expected_uniques, dtype=object)
        tm.assert_numpy_array_equal(uniques, expected_uniques_array)

    def test_complex_sorting(self):
        # gh 12666 - check no segfault
        x17 = np.array([complex(i) for i in range(17)], dtype=object)
        msg = "'[<>]' not supported between instances of .*"
        with pytest.raises(TypeError, match=msg):
            algos.factorize(x17[::-1], sort=True)

    def test_numeric_dtype_factorize(self, any_real_numpy_dtype):
        # GH41132
        dtype = any_real_numpy_dtype
        data = np.array([1, 2, 2, 1], dtype=dtype)
        expected_codes = np.array([0, 1, 1, 0], dtype=np.intp)
        expected_uniques = np.array([1, 2], dtype=dtype)
        codes, uniques = algos.factorize(data)
        tm.assert_numpy_array_equal(codes, expected_codes)
        tm.assert_numpy_array_equal(uniques, expected_uniques)

    def test_float64_factorize(self, writable):
        data = np.array([1.0, 1e8, 1.0, 1e-8, 1e8, 1.0], dtype=np.float64)
        data.setflags(write=writable)
        expected_codes = np.array([0, 1, 0, 2, 1, 0], dtype=np.intp)
        expected_uniques = np.array([1.0, 1e8, 1e-8], dtype=np.float64)
        codes, uniques = algos.factorize(data)
        tm.assert_numpy_array_equal(codes, expected_codes)
        tm.assert_numpy_array_equal(uniques, expected_uniques)

    def test_uint64_factorize(self, writable):
        data = np.array([2**64 - 1, 1, 2**64 - 1], dtype=np.uint64)
        data.setflags(write=writable)
        expected_codes = np.array([0, 1, 0], dtype=np.intp)
        expected_uniques = np.array([2**64 - 1, 1], dtype=np.uint64)
        codes, uniques = algos.factorize(data)
        tm.assert_numpy_array_equal(codes, expected_codes)
        tm.assert_numpy_array_equal(uniques, expected_uniques)

    def test_int64_factorize(self, writable):
        data = np.array([2**63 - 1, -(2**63), 2**63 - 1], dtype=np.int64)
        data.setflags(write=writable)
        expected_codes = np.array([0, 1, 0], dtype=np.intp)
        expected_uniques = np.array([2**63 - 1, -(2**63)], dtype=np.int64)
        codes, uniques = algos.factorize(data)
        tm.assert_numpy_array_equal(codes, expected_codes)
        tm.assert_numpy_array_equal(uniques, expected_uniques)

    def test_string_factorize(self, writable):
        data = np.array(["a", "c", "a", "b", "c"], dtype=object)
        data.setflags(write=writable)
        expected_codes = np.array([0, 1, 0, 2, 1], dtype=np.intp)
        expected_uniques = np.array(["a", "c", "b"], dtype=object)
        codes, uniques = algos.factorize(data)
        tm.assert_numpy_array_equal(codes, expected_codes)
        tm.assert_numpy_array_equal(uniques, expected_uniques)

    def test_object_factorize(self, writable):
        data = np.array(["a", "c", None, np.nan, "a", "b", NaT, "c"], dtype=object)
        data.setflags(write=writable)
        expected_codes = np.array([0, 1, -1, -1, 0, 2, -1, 1], dtype=np.intp)
        expected_uniques = np.array(["a", "c", "b"], dtype=object)
        codes, uniques = algos.factorize(data)
        tm.assert_numpy_array_equal(codes, expected_codes)
        tm.assert_numpy_array_equal(uniques, expected_uniques)

    def test_datetime64_factorize(self, writable):
        # GH35650 Verify whether read-only datetime64 array can be factorized
        data = np.array([np.datetime64("2020-01-01T00:00:00.000")], dtype="M8[ns]")
        data.setflags(write=writable)
        expected_codes = np.array([0], dtype=np.intp)
        expected_uniques = np.array(
            ["2020-01-01T00:00:00.000000000"], dtype="datetime64[ns]"
        )
        codes, uniques = pd.factorize(data)
        tm.assert_numpy_array_equal(codes, expected_codes)
        tm.assert_numpy_array_equal(uniques, expected_uniques)

    @pytest.mark.parametrize("sort", [True, False])
    def test_factorize_rangeindex(self, sort):
        # increasing -> sort doesn't matter
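        # (the codes come out as 0..9 in order either way, so the sorted and
        # unsorted calls below share the same expected result)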
        ri = pd.RangeIndex.from_range(range(10))
        expected = np.arange(10, dtype=np.intp), ri
        result = algos.factorize(ri, sort=sort)
        tm.assert_numpy_array_equal(result[0], expected[0])
        tm.assert_index_equal(result[1], expected[1], exact=True)
        result = ri.factorize(sort=sort)
        tm.assert_numpy_array_equal(result[0], expected[0])
        tm.assert_index_equal(result[1], expected[1], exact=True)

    @pytest.mark.parametrize("sort", [True, False])
    def test_factorize_rangeindex_decreasing(self, sort):
        # decreasing -> sort matters
        ri = pd.RangeIndex.from_range(range(10))
        expected = np.arange(10, dtype=np.intp), ri
        ri2 = ri[::-1]
        expected = expected[0], ri2
        if sort:
            expected = expected[0][::-1], expected[1][::-1]
        result = algos.factorize(ri2, sort=sort)
        tm.assert_numpy_array_equal(result[0], expected[0])
        tm.assert_index_equal(result[1], expected[1], exact=True)
        result = ri2.factorize(sort=sort)
        tm.assert_numpy_array_equal(result[0], expected[0])
        tm.assert_index_equal(result[1], expected[1], exact=True)

    def test_deprecate_order(self):
        # gh 19727 - check warning is raised for deprecated keyword, order.
        # Test not valid once order keyword is removed.
        data = np.array([2**63, 1, 2**63], dtype=np.uint64)
        with pytest.raises(TypeError, match="got an unexpected keyword"):
            algos.factorize(data, order=True)
        with tm.assert_produces_warning(False):
            algos.factorize(data)

    @pytest.mark.parametrize(
        "data",
        [
            np.array([0, 1, 0], dtype="u8"),
            np.array([-(2**63), 1, -(2**63)], dtype="i8"),
            np.array(["__nan__", "foo", "__nan__"], dtype="object"),
        ],
    )
    def test_parametrized_factorize_na_value_default(self, data):
        # arrays that include the NA default for that type, but it is not used
        codes, uniques = algos.factorize(data)
        expected_uniques = data[[0, 1]]
        expected_codes = np.array([0, 1, 0], dtype=np.intp)
        tm.assert_numpy_array_equal(codes, expected_codes)
        tm.assert_numpy_array_equal(uniques, expected_uniques)

    @pytest.mark.parametrize(
        "data, na_value",
        [
            (np.array([0, 1, 0, 2], dtype="u8"), 0),
            (np.array([1, 0, 1, 2], dtype="u8"), 1),
            (np.array([-(2**63), 1, -(2**63), 0], dtype="i8"), -(2**63)),
            (np.array([1, -(2**63), 1, 0], dtype="i8"), 1),
            (np.array(["a", "", "a", "b"], dtype=object), "a"),
            (np.array([(), ("a", 1), (), ("a", 2)], dtype=object), ()),
            (np.array([("a", 1), (), ("a", 1), ("a", 2)], dtype=object), ("a", 1)),
        ],
    )
    def test_parametrized_factorize_na_value(self, data, na_value):
        codes, uniques = algos.factorize_array(data, na_value=na_value)
        expected_uniques = data[[1, 3]]
        expected_codes = np.array([-1, 0, -1, 1], dtype=np.intp)
        tm.assert_numpy_array_equal(codes, expected_codes)
        tm.assert_numpy_array_equal(uniques, expected_uniques)

    @pytest.mark.parametrize("sort", [True, False])
    @pytest.mark.parametrize(
        "data, uniques",
        [
            (
                np.array(["b", "a", None, "b"], dtype=object),
                np.array(["b", "a"], dtype=object),
            ),
            (
                pd.array([2, 1, np.nan, 2], dtype="Int64"),
                pd.array([2, 1], dtype="Int64"),
            ),
        ],
        ids=["numpy_array", "extension_array"],
    )
    def test_factorize_use_na_sentinel(self, sort, data, uniques):
        codes, uniques = algos.factorize(data, sort=sort, use_na_sentinel=True)
        if sort:
            expected_codes = np.array([1, 0, -1, 1], dtype=np.intp)
            expected_uniques = algos.safe_sort(uniques)
        else:
            expected_codes = np.array([0, 1, -1, 0], dtype=np.intp)
            expected_uniques = uniques
        tm.assert_numpy_array_equal(codes, expected_codes)
        if isinstance(data, np.ndarray):
            tm.assert_numpy_array_equal(uniques, expected_uniques)
        else:
            tm.assert_extension_array_equal(uniques, expected_uniques)

    @pytest.mark.parametrize(
        "data, expected_codes, expected_uniques",
        [
            (
                ["a", None, "b", "a"],
                np.array([0, 1, 2, 0], dtype=np.dtype("intp")),
                np.array(["a", np.nan, "b"], dtype=object),
            ),
            (
                ["a", np.nan, "b", "a"],
                np.array([0, 1, 2, 0], dtype=np.dtype("intp")),
                np.array(["a", np.nan, "b"], dtype=object),
            ),
        ],
    )
    def test_object_factorize_use_na_sentinel_false(
        self, data, expected_codes, expected_uniques
    ):
        codes, uniques = algos.factorize(data, use_na_sentinel=False)
        tm.assert_numpy_array_equal(uniques, expected_uniques, strict_nan=True)
        tm.assert_numpy_array_equal(codes, expected_codes, strict_nan=True)

    @pytest.mark.parametrize(
        "data, expected_codes, expected_uniques",
        [
            (
                [1, None, 1, 2],
                np.array([0, 1, 0, 2], dtype=np.dtype("intp")),
                np.array([1, np.nan, 2], dtype="O"),
            ),
            (
                [1, np.nan, 1, 2],
                np.array([0, 1, 0, 2], dtype=np.dtype("intp")),
                np.array([1, np.nan, 2], dtype=np.float64),
            ),
        ],
    )
    def test_int_factorize_use_na_sentinel_false(
        self, data, expected_codes, expected_uniques
    ):
        codes, uniques = algos.factorize(data, use_na_sentinel=False)
        tm.assert_numpy_array_equal(uniques, expected_uniques, strict_nan=True)
        tm.assert_numpy_array_equal(codes, expected_codes, strict_nan=True)

    @pytest.mark.parametrize(
        "data, expected_codes, expected_uniques",
        [
            (
                Index(Categorical(["a", "a", "b"])),
                np.array([0, 0, 1], dtype=np.intp),
                CategoricalIndex(["a", "b"], categories=["a", "b"], dtype="category"),
            ),
            (
                Series(Categorical(["a", "a", "b"])),
                np.array([0, 0, 1], dtype=np.intp),
                CategoricalIndex(["a", "b"], categories=["a", "b"], dtype="category"),
            ),
            (
                Series(DatetimeIndex(["2017", "2017"], tz="US/Eastern")),
                np.array([0, 0], dtype=np.intp),
                DatetimeIndex(["2017"], tz="US/Eastern"),
            ),
        ],
    )
    def test_factorize_mixed_values(self, data, expected_codes, expected_uniques):
        # GH 19721
        codes, uniques = algos.factorize(data)
        tm.assert_numpy_array_equal(codes, expected_codes)
        tm.assert_index_equal(uniques, expected_uniques)


class TestUnique:
    def test_ints(self):
        arr = np.random.randint(0, 100, size=50)
        result = algos.unique(arr)
        assert isinstance(result, np.ndarray)

    def test_objects(self):
        arr = np.random.randint(0, 100, size=50).astype("O")
        result = algos.unique(arr)
        assert isinstance(result, np.ndarray)

    def test_object_refcount_bug(self):
        lst = ["A", "B", "C", "D", "E"]
        for i in range(1000):
            len(algos.unique(lst))

    def test_on_index_object(self):
        mindex = MultiIndex.from_arrays(
            [np.arange(5).repeat(5), np.tile(np.arange(5), 5)]
        )
        expected = mindex.values
        expected.sort()
        mindex = mindex.repeat(2)
        result = pd.unique(mindex)
        result.sort()
        tm.assert_almost_equal(result, expected)

    def test_dtype_preservation(self, any_numpy_dtype):
        # GH 15442
        if any_numpy_dtype in (tm.BYTES_DTYPES + tm.STRING_DTYPES):
            data = [1, 2, 2]
            uniques = [1, 2]
        elif is_integer_dtype(any_numpy_dtype):
            data = [1, 2, 2]
            uniques = [1, 2]
        elif is_float_dtype(any_numpy_dtype):
            data = [1, 2, 2]
            uniques = [1.0, 2.0]
        elif is_complex_dtype(any_numpy_dtype):
            data = [complex(1, 0), complex(2, 0), complex(2, 0)]
            uniques = [complex(1, 0), complex(2, 0)]
        elif is_bool_dtype(any_numpy_dtype):
            data = [True, True, False]
            uniques = [True, False]
        elif is_object_dtype(any_numpy_dtype):
            data = ["A", "B", "B"]
            uniques = ["A", "B"]
        else:
            # datetime64[ns]/M8[ns]/timedelta64[ns]/m8[ns] tested elsewhere
            data = [1, 2, 2]
            uniques = [1, 2]
        result = Series(data, dtype=any_numpy_dtype).unique()
        expected = np.array(uniques, dtype=any_numpy_dtype)
        if any_numpy_dtype in tm.STRING_DTYPES:
            expected = expected.astype(object)
        if expected.dtype.kind in ["m", "M"]:
            # We get TimedeltaArray/DatetimeArray
            assert isinstance(result, (DatetimeArray, TimedeltaArray))
            result = np.array(result)
        tm.assert_numpy_array_equal(result, expected)

    def test_datetime64_dtype_array_returned(self):
        # GH 9431
        expected = np.array(
            [
                "2015-01-03T00:00:00.000000000",
                "2015-01-01T00:00:00.000000000",
            ],
            dtype="M8[ns]",
        )
        dt_index = to_datetime(
            [
                "2015-01-03T00:00:00.000000000",
                "2015-01-01T00:00:00.000000000",
                "2015-01-01T00:00:00.000000000",
            ]
        )
        result = algos.unique(dt_index)
        tm.assert_numpy_array_equal(result, expected)
        assert result.dtype == expected.dtype
        s = Series(dt_index)
        result = algos.unique(s)
        tm.assert_numpy_array_equal(result, expected)
        assert result.dtype == expected.dtype
        arr = s.values
        result = algos.unique(arr)
        tm.assert_numpy_array_equal(result, expected)
        assert result.dtype == expected.dtype

    def test_datetime_non_ns(self):
        a = np.array(["2000", "2000", "2001"], dtype="datetime64[s]")
        result = pd.unique(a)
        expected = np.array(["2000", "2001"], dtype="datetime64[s]")
        tm.assert_numpy_array_equal(result, expected)

    def test_timedelta_non_ns(self):
        a = np.array(["2000", "2000", "2001"], dtype="timedelta64[s]")
        result = pd.unique(a)
        expected = np.array([2000, 2001], dtype="timedelta64[s]")
        tm.assert_numpy_array_equal(result, expected)

    def test_timedelta64_dtype_array_returned(self):
        # GH 9431
        expected = np.array([31200, 45678, 10000], dtype="m8[ns]")
        td_index = to_timedelta([31200, 45678, 31200, 10000, 45678])
        result = algos.unique(td_index)
        tm.assert_numpy_array_equal(result, expected)
        assert result.dtype == expected.dtype
        s = Series(td_index)
        result = algos.unique(s)
        tm.assert_numpy_array_equal(result, expected)
        assert result.dtype == expected.dtype
        arr = s.values
        result = algos.unique(arr)
        tm.assert_numpy_array_equal(result, expected)
        assert result.dtype == expected.dtype

    def test_uint64_overflow(self):
        s = Series([1, 2, 2**63, 2**63], dtype=np.uint64)
        exp = np.array([1, 2, 2**63], dtype=np.uint64)
        tm.assert_numpy_array_equal(algos.unique(s), exp)

    def test_nan_in_object_array(self):
        duplicated_items = ["a", np.nan, "c", "c"]
        result = pd.unique(duplicated_items)
        expected = np.array(["a", np.nan, "c"], dtype=object)
        tm.assert_numpy_array_equal(result, expected)

    def test_categorical(self):
        # we are expecting to return in the order
        # of appearance
        expected = Categorical(list("bac"))
        # we are expecting to return in the order
        # of the categories
        expected_o = Categorical(list("bac"), categories=list("abc"), ordered=True)
        # GH 15939
        c = Categorical(list("baabc"))
        result = c.unique()
        tm.assert_categorical_equal(result, expected)
        result = algos.unique(c)
        tm.assert_categorical_equal(result, expected)
        c = Categorical(list("baabc"), ordered=True)
        result = c.unique()
        tm.assert_categorical_equal(result, expected_o)
        result = algos.unique(c)
        tm.assert_categorical_equal(result, expected_o)
        # Series of categorical dtype
        s = Series(Categorical(list("baabc")), name="foo")
        result = s.unique()
        tm.assert_categorical_equal(result, expected)
        result = pd.unique(s)
        tm.assert_categorical_equal(result, expected)
        # CI -> return CI
        ci = CategoricalIndex(Categorical(list("baabc"), categories=list("abc")))
        expected = CategoricalIndex(expected)
        result = ci.unique()
        tm.assert_index_equal(result, expected)
        result = pd.unique(ci)
        tm.assert_index_equal(result, expected)

    def test_datetime64tz_aware(self):
        # GH 15939
        result = Series(
            Index(
                [
                    Timestamp("20160101", tz="US/Eastern"),
                    Timestamp("20160101", tz="US/Eastern"),
                ]
            )
        ).unique()
        expected = DatetimeArray._from_sequence(
            np.array([Timestamp("2016-01-01 00:00:00-0500", tz="US/Eastern")])
        )
        tm.assert_extension_array_equal(result, expected)
        result = Index(
            [
                Timestamp("20160101", tz="US/Eastern"),
                Timestamp("20160101", tz="US/Eastern"),
            ]
        ).unique()
        expected = DatetimeIndex(
            ["2016-01-01 00:00:00"], dtype="datetime64[ns, US/Eastern]", freq=None
        )
        tm.assert_index_equal(result, expected)
        result = pd.unique(
            Series(
                Index(
                    [
                        Timestamp("20160101", tz="US/Eastern"),
                        Timestamp("20160101", tz="US/Eastern"),
                    ]
                )
            )
        )
        expected = DatetimeArray._from_sequence(
            np.array([Timestamp("2016-01-01", tz="US/Eastern")])
        )
        tm.assert_extension_array_equal(result, expected)
        result = pd.unique(
            Index(
                [
                    Timestamp("20160101", tz="US/Eastern"),
                    Timestamp("20160101", tz="US/Eastern"),
                ]
            )
        )
        expected = DatetimeIndex(
            ["2016-01-01 00:00:00"], dtype="datetime64[ns, US/Eastern]", freq=None
        )
        tm.assert_index_equal(result, expected)

    def test_order_of_appearance(self):
        # 9346
        # light testing of guarantee of order of appearance
        # these also are the doc-examples
        result = pd.unique(Series([2, 1, 3, 3]))
        tm.assert_numpy_array_equal(result, np.array([2, 1, 3], dtype="int64"))
        result = pd.unique(Series([2] + [1] * 5))
        tm.assert_numpy_array_equal(result, np.array([2, 1], dtype="int64"))
        result = pd.unique(Series([Timestamp("20160101"), Timestamp("20160101")]))
        expected = np.array(["2016-01-01T00:00:00.000000000"], dtype="datetime64[ns]")
        tm.assert_numpy_array_equal(result, expected)
        result = pd.unique(
            Index(
                [
                    Timestamp("20160101", tz="US/Eastern"),
                    Timestamp("20160101", tz="US/Eastern"),
                ]
            )
        )
        expected = DatetimeIndex(
            ["2016-01-01 00:00:00"], dtype="datetime64[ns, US/Eastern]", freq=None
        )
        tm.assert_index_equal(result, expected)
        result = pd.unique(list("aabc"))
        expected = np.array(["a", "b", "c"], dtype=object)
        tm.assert_numpy_array_equal(result, expected)
        result = pd.unique(Series(Categorical(list("aabc"))))
        expected = Categorical(list("abc"))
        tm.assert_categorical_equal(result, expected)

    @pytest.mark.parametrize(
        "arg ,expected",
        [
            (("1", "1", "2"), np.array(["1", "2"], dtype=object)),
            (("foo",), np.array(["foo"], dtype=object)),
        ],
    )
    def test_tuple_with_strings(self, arg, expected):
        # see GH 17108
        result = pd.unique(arg)
        tm.assert_numpy_array_equal(result, expected)

    def test_obj_none_preservation(self):
        # GH 20866
        arr = np.array(["foo", None], dtype=object)
        result = pd.unique(arr)
        expected = np.array(["foo", None], dtype=object)
        tm.assert_numpy_array_equal(result, expected, strict_nan=True)

    def test_signed_zero(self):
        # GH 21866
        a = np.array([-0.0, 0.0])
        result = pd.unique(a)
        expected = np.array([-0.0])  # 0.0 and -0.0 are equivalent
        tm.assert_numpy_array_equal(result, expected)

    def test_different_nans(self):
        # GH 21866
        # create different nans from bit-patterns:
        NAN1 = struct.unpack("d", struct.pack("=Q", 0x7FF8000000000000))[0]
        NAN2 = struct.unpack("d", struct.pack("=Q", 0x7FF8000000000001))[0]
        assert NAN1 != NAN1
        assert NAN2 != NAN2
        a = np.array([NAN1, NAN2])  # NAN1 and NAN2 are equivalent
        result = pd.unique(a)
        expected = np.array([np.nan])
        tm.assert_numpy_array_equal(result, expected)

    @pytest.mark.parametrize("el_type", [np.float64, object])
    def test_first_nan_kept(self, el_type):
        # GH 22295
        # create different nans from bit-patterns:
        bits_for_nan1 = 0xFFF8000000000001
        bits_for_nan2 = 0x7FF8000000000001
        NAN1 = struct.unpack("d", struct.pack("=Q", bits_for_nan1))[0]
        NAN2 = struct.unpack("d", struct.pack("=Q", bits_for_nan2))[0]
        assert NAN1 != NAN1
        assert NAN2 != NAN2
        a = np.array([NAN1, NAN2], dtype=el_type)
        result = pd.unique(a)
        assert result.size == 1
        # use bit patterns to identify which nan was kept:
        result_nan_bits = struct.unpack("=Q", struct.pack("d", result[0]))[0]
        assert result_nan_bits == bits_for_nan1

    def test_do_not_mangle_na_values(self, unique_nulls_fixture, unique_nulls_fixture2):
        # GH 22295
        if unique_nulls_fixture is unique_nulls_fixture2:
            return  # skip it, values not unique
        a = np.array([unique_nulls_fixture, unique_nulls_fixture2], dtype=object)
        result = pd.unique(a)
        assert result.size == 2
        assert a[0] is unique_nulls_fixture
        assert a[1] is unique_nulls_fixture2

    def test_unique_masked(self, any_numeric_ea_dtype):
        # GH#48019
        ser = Series([1, pd.NA, 2] * 3, dtype=any_numeric_ea_dtype)
        result = pd.unique(ser)
        expected = pd.array([1, pd.NA, 2], dtype=any_numeric_ea_dtype)
        tm.assert_extension_array_equal(result, expected)


def test_nunique_ints(index_or_series_or_array):
    # GH#36327
    values = index_or_series_or_array(np.random.randint(0, 20, 30))
    result = algos.nunique_ints(values)
    expected = len(algos.unique(values))
    assert result == expected


class TestIsin:
    def test_invalid(self):
        msg = (
            r"only list-like objects are allowed to be passed to isin\(\), "
            r"you passed a \[int\]"
        )
        with pytest.raises(TypeError, match=msg):
            algos.isin(1, 1)
        with pytest.raises(TypeError, match=msg):
            algos.isin(1, [1])
        with pytest.raises(TypeError, match=msg):
            algos.isin([1], 1)

    def test_basic(self):
        result = algos.isin([1, 2], [1])
        expected = np.array([True, False])
        tm.assert_numpy_array_equal(result, expected)
        result = algos.isin(np.array([1, 2]), [1])
        expected = np.array([True, False])
        tm.assert_numpy_array_equal(result, expected)
        result = algos.isin(Series([1, 2]), [1])
        expected = np.array([True, False])
        tm.assert_numpy_array_equal(result, expected)
        result = algos.isin(Series([1, 2]), Series([1]))
        expected = np.array([True, False])
        tm.assert_numpy_array_equal(result, expected)
        result = algos.isin(Series([1, 2]), {1})
        expected = np.array([True, False])
        tm.assert_numpy_array_equal(result, expected)
        result = algos.isin(["a", "b"], ["a"])
        expected = np.array([True, False])
        tm.assert_numpy_array_equal(result, expected)
        result = algos.isin(Series(["a", "b"]), Series(["a"]))
        expected = np.array([True, False])
        tm.assert_numpy_array_equal(result, expected)
        result = algos.isin(Series(["a", "b"]), {"a"})
        expected = np.array([True, False])
        tm.assert_numpy_array_equal(result, expected)
        result = algos.isin(["a", "b"], [1])
        expected = np.array([False, False])
        tm.assert_numpy_array_equal(result, expected)

    def test_i8(self):
        arr = date_range("20130101", periods=3).values
        result = algos.isin(arr, [arr[0]])
        expected = np.array([True, False, False])
        tm.assert_numpy_array_equal(result, expected)
        result = algos.isin(arr, arr[0:2])
        expected = np.array([True, True, False])
        tm.assert_numpy_array_equal(result, expected)
        result = algos.isin(arr, set(arr[0:2]))
        expected = np.array([True, True, False])
        tm.assert_numpy_array_equal(result, expected)
        arr = timedelta_range("1 day", periods=3).values
        result = algos.isin(arr, [arr[0]])
        expected = np.array([True, False, False])
        tm.assert_numpy_array_equal(result, expected)
        result = algos.isin(arr, arr[0:2])
        expected = np.array([True, True, False])
        tm.assert_numpy_array_equal(result, expected)
        result = algos.isin(arr, set(arr[0:2]))
        expected = np.array([True, True, False])
        tm.assert_numpy_array_equal(result, expected)

    @pytest.mark.parametrize("dtype1", ["m8[ns]", "M8[ns]", "M8[ns, UTC]", "period[D]"])
    @pytest.mark.parametrize("dtype", ["i8", "f8", "u8"])
    def test_isin_datetimelike_values_numeric_comps(self, dtype, dtype1):
        # Anything but object and we get all-False shortcut
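        # (plain i8/f8/u8 numbers can never equal datetimelike values, so the
        # expected result below is all False)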
        dta = date_range("2013-01-01", periods=3)._values
        if dtype1 == "period[D]":
            # TODO: fix Series.view to get this on its own
            arr = dta.to_period("D")
        elif dtype1 == "M8[ns, UTC]":
            # TODO: fix Series.view to get this on its own
            arr = dta.tz_localize("UTC")
        else:
            arr = Series(dta.view("i8")).view(dtype1)._values
        comps = arr.view("i8").astype(dtype)
        result = algos.isin(comps, arr)
        expected = np.zeros(comps.shape, dtype=bool)
        tm.assert_numpy_array_equal(result, expected)

    def test_large(self):
        s = date_range("20000101", periods=2000000, freq="s").values
        result = algos.isin(s, s[0:2])
        expected = np.zeros(len(s), dtype=bool)
        expected[0] = True
        expected[1] = True
        tm.assert_numpy_array_equal(result, expected)

    def test_categorical_from_codes(self):
        # GH 16639
        vals = np.array([0, 1, 2, 0])
        cats = ["a", "b", "c"]
        Sd = Series(Categorical([1]).from_codes(vals, cats))
        St = Series(Categorical([1]).from_codes(np.array([0, 1]), cats))
        expected = np.array([True, True, False, True])
        result = algos.isin(Sd, St)
        tm.assert_numpy_array_equal(expected, result)

    def test_categorical_isin(self):
        vals = np.array([0, 1, 2, 0])
        cats = ["a", "b", "c"]
        cat = Categorical([1]).from_codes(vals, cats)
        other = Categorical([1]).from_codes(np.array([0, 1]), cats)
        expected = np.array([True, True, False, True])
        result = algos.isin(cat, other)
        tm.assert_numpy_array_equal(expected, result)

    def test_same_nan_is_in(self):
        # GH 22160
        # nan is special: "a is b" does not imply "a == b",
        # but isin() should at least follow Python's "np.nan in [np.nan]" being True
        # casting to np.float64 (or to another float object) somewhere along
        # the way could jeopardize this behavior
        comps = [np.nan]  # could be cast to float64
        values = [np.nan]
        expected = np.array([True])
        result = algos.isin(comps, values)
        tm.assert_numpy_array_equal(expected, result)

    def test_same_nan_is_in_large(self):
        # https://github.com/pandas-dev/pandas/issues/22205
        s = np.tile(1.0, 1_000_001)
        s[0] = np.nan
        result = algos.isin(s, [np.nan, 1])
        expected = np.ones(len(s), dtype=bool)
        tm.assert_numpy_array_equal(result, expected)

    def test_same_nan_is_in_large_series(self):
        # https://github.com/pandas-dev/pandas/issues/22205
        s = np.tile(1.0, 1_000_001)
        series = Series(s)
        s[0] = np.nan
        result = series.isin([np.nan, 1])
        expected = Series(np.ones(len(s), dtype=bool))
        tm.assert_series_equal(result, expected)

    def test_same_object_is_in(self):
        # GH 22160
        # nans may get special treatment, but a user could define a custom
        # class with similar behavior; in that case we should at least fall
        # back to Python's usual behavior: "a in [a]" is True
        class LikeNan:
            def __eq__(self, other) -> bool:
                return False

            def __hash__(self):
                return 0

        a, b = LikeNan(), LikeNan()
        # same object -> True
        tm.assert_numpy_array_equal(algos.isin([a], [a]), np.array([True]))
        # different objects -> False
        tm.assert_numpy_array_equal(algos.isin([a], [b]), np.array([False]))

    def test_different_nans(self):
        # GH 22160
        # all nans are handled as equivalent
        comps = [float("nan")]
        values = [float("nan")]
        assert comps[0] is not values[0]  # different nan-objects
        # as list of python-objects:
        result = algos.isin(comps, values)
        tm.assert_numpy_array_equal(np.array([True]), result)
        # as object-array:
        result = algos.isin(
            np.asarray(comps, dtype=object), np.asarray(values, dtype=object)
        )
        tm.assert_numpy_array_equal(np.array([True]), result)
        # as float64-array:
        result = algos.isin(
            np.asarray(comps, dtype=np.float64), np.asarray(values, dtype=np.float64)
        )
        tm.assert_numpy_array_equal(np.array([True]), result)

    def test_no_cast(self):
        # GH 22160
        # ensure 42 is not cast to a string
        comps = ["ss", 42]
        values = ["42"]
        expected = np.array([False, False])
        result = algos.isin(comps, values)
        tm.assert_numpy_array_equal(expected, result)

    @pytest.mark.parametrize("empty", [[], Series(dtype=object), np.array([])])
    def test_empty(self, empty):
        # see gh-16991
        vals = Index(["a", "b"])
        expected = np.array([False, False])
        result = algos.isin(vals, empty)
        tm.assert_numpy_array_equal(expected, result)

    def test_different_nan_objects(self):
        # GH 22119
        comps = np.array(["nan", np.nan * 1j, float("nan")], dtype=object)
        vals = np.array([float("nan")], dtype=object)
        expected = np.array([False, False, True])
        result = algos.isin(comps, vals)
        tm.assert_numpy_array_equal(expected, result)

    def test_different_nans_as_float64(self):
        # GH 21866
        # create different nans from bit-patterns,
        # these nans will land in different buckets in the hash-table
        # if no special care is taken
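        # (the two bit patterns below differ only in the lowest bit, so a hash
        # table keyed on the raw bits would treat them as distinct values; the
        # assertions check that isin still matches them as the same nan)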
        NAN1 = struct.unpack("d", struct.pack("=Q", 0x7FF8000000000000))[0]
        NAN2 = struct.unpack("d", struct.pack("=Q", 0x7FF8000000000001))[0]
        assert NAN1 != NAN1
        assert NAN2 != NAN2
        # check that NAN1 and NAN2 are equivalent:
        arr = np.array([NAN1, NAN2], dtype=np.float64)
        lookup1 = np.array([NAN1], dtype=np.float64)
        result = algos.isin(arr, lookup1)
        expected = np.array([True, True])
        tm.assert_numpy_array_equal(result, expected)
        lookup2 = np.array([NAN2], dtype=np.float64)
        result = algos.isin(arr, lookup2)
        expected = np.array([True, True])
        tm.assert_numpy_array_equal(result, expected)

    def test_isin_int_df_string_search(self):
  951. """Comparing df with int`s (1,2) with a string at isin() ("1")
  952. -> should not match values because int 1 is not equal str 1"""
        df = DataFrame({"values": [1, 2]})
        result = df.isin(["1"])
        expected_false = DataFrame({"values": [False, False]})
        tm.assert_frame_equal(result, expected_false)

    def test_isin_nan_df_string_search(self):
  958. """Comparing df with nan value (np.nan,2) with a string at isin() ("NaN")
  959. -> should not match values because np.nan is not equal str NaN"""
        df = DataFrame({"values": [np.nan, 2]})
        result = df.isin(["NaN"])
        expected_false = DataFrame({"values": [False, False]})
        tm.assert_frame_equal(result, expected_false)

    def test_isin_float_df_string_search(self):
  965. """Comparing df with floats (1.4245,2.32441) with a string at isin() ("1.4245")
  966. -> should not match values because float 1.4245 is not equal str 1.4245"""
        df = DataFrame({"values": [1.4245, 2.32441]})
        result = df.isin(["1.4245"])
        expected_false = DataFrame({"values": [False, False]})
        tm.assert_frame_equal(result, expected_false)

    def test_isin_unsigned_dtype(self):
        # GH#46485
        ser = Series([1378774140726870442], dtype=np.uint64)
        result = ser.isin([1378774140726870528])
        expected = Series(False)
        tm.assert_series_equal(result, expected)


class TestValueCounts:
    def test_value_counts(self):
        np.random.seed(1234)
        from pandas.core.reshape.tile import cut

        arr = np.random.randn(4)
        factor = cut(arr, 4)
        # assert isinstance(factor, n)
        result = algos.value_counts(factor)
        breaks = [-1.194, -0.535, 0.121, 0.777, 1.433]
        index = IntervalIndex.from_breaks(breaks).astype(CDT(ordered=True))
        expected = Series([1, 1, 1, 1], index=index, name="count")
        tm.assert_series_equal(result.sort_index(), expected.sort_index())

    def test_value_counts_bins(self):
        s = [1, 2, 3, 4]
        result = algos.value_counts(s, bins=1)
        expected = Series(
            [4], index=IntervalIndex.from_tuples([(0.996, 4.0)]), name="count"
        )
        tm.assert_series_equal(result, expected)
        result = algos.value_counts(s, bins=2, sort=False)
        expected = Series(
            [2, 2],
            index=IntervalIndex.from_tuples([(0.996, 2.5), (2.5, 4.0)]),
            name="count",
        )
        tm.assert_series_equal(result, expected)

    def test_value_counts_dtypes(self):
        result = algos.value_counts([1, 1.0])
        assert len(result) == 1
        result = algos.value_counts([1, 1.0], bins=1)
        assert len(result) == 1
        result = algos.value_counts(Series([1, 1.0, "1"]))  # object
        assert len(result) == 2
        msg = "bins argument only works with numeric data"
        with pytest.raises(TypeError, match=msg):
            algos.value_counts(["1", 1], bins=1)

    def test_value_counts_nat(self):
        td = Series([np.timedelta64(10000), NaT], dtype="timedelta64[ns]")
        dt = to_datetime(["NaT", "2014-01-01"])
        for s in [td, dt]:
            vc = algos.value_counts(s)
            vc_with_na = algos.value_counts(s, dropna=False)
            assert len(vc) == 1
            assert len(vc_with_na) == 2
        exp_dt = Series({Timestamp("2014-01-01 00:00:00"): 1}, name="count")
        tm.assert_series_equal(algos.value_counts(dt), exp_dt)
        # TODO same for (timedelta)

    def test_value_counts_datetime_outofbounds(self):
        # GH 13663
        s = Series(
            [
                datetime(3000, 1, 1),
                datetime(5000, 1, 1),
                datetime(5000, 1, 1),
                datetime(6000, 1, 1),
                datetime(3000, 1, 1),
                datetime(3000, 1, 1),
            ]
        )
        res = s.value_counts()
        exp_index = Index(
            [datetime(3000, 1, 1), datetime(5000, 1, 1), datetime(6000, 1, 1)],
            dtype=object,
        )
        exp = Series([3, 2, 1], index=exp_index, name="count")
        tm.assert_series_equal(res, exp)
        # GH 12424
        res = to_datetime(Series(["2362-01-01", np.nan]), errors="ignore")
        exp = Series(["2362-01-01", np.nan], dtype=object)
        tm.assert_series_equal(res, exp)

    def test_categorical(self):
        s = Series(Categorical(list("aaabbc")))
        result = s.value_counts()
        expected = Series(
            [3, 2, 1], index=CategoricalIndex(["a", "b", "c"]), name="count"
        )
        tm.assert_series_equal(result, expected, check_index_type=True)
        # preserve order?
        s = s.cat.as_ordered()
        result = s.value_counts()
        expected.index = expected.index.as_ordered()
        tm.assert_series_equal(result, expected, check_index_type=True)

    def test_categorical_nans(self):
        s = Series(Categorical(list("aaaaabbbcc")))  # 4,3,2,1 (nan)
        s.iloc[1] = np.nan
        result = s.value_counts()
        expected = Series(
            [4, 3, 2],
            index=CategoricalIndex(["a", "b", "c"], categories=["a", "b", "c"]),
            name="count",
        )
        tm.assert_series_equal(result, expected, check_index_type=True)
        result = s.value_counts(dropna=False)
        expected = Series(
            [4, 3, 2, 1], index=CategoricalIndex(["a", "b", "c", np.nan]), name="count"
        )
        tm.assert_series_equal(result, expected, check_index_type=True)
        # out of order
        s = Series(
            Categorical(list("aaaaabbbcc"), ordered=True, categories=["b", "a", "c"])
        )
        s.iloc[1] = np.nan
        result = s.value_counts()
        expected = Series(
            [4, 3, 2],
            index=CategoricalIndex(
                ["a", "b", "c"],
                categories=["b", "a", "c"],
                ordered=True,
            ),
            name="count",
        )
        tm.assert_series_equal(result, expected, check_index_type=True)
        result = s.value_counts(dropna=False)
        expected = Series(
            [4, 3, 2, 1],
            index=CategoricalIndex(
                ["a", "b", "c", np.nan], categories=["b", "a", "c"], ordered=True
            ),
            name="count",
        )
        tm.assert_series_equal(result, expected, check_index_type=True)

    def test_categorical_zeroes(self):
        # keep the `d` category with 0
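        # (value_counts on a Categorical reports every category, so "d" shows
        # up below with a count of 0 even though it never occurs in the data)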
  1101. s = Series(Categorical(list("bbbaac"), categories=list("abcd"), ordered=True))
  1102. result = s.value_counts()
  1103. expected = Series(
  1104. [3, 2, 1, 0],
  1105. index=Categorical(
  1106. ["b", "a", "c", "d"], categories=list("abcd"), ordered=True
  1107. ),
  1108. name="count",
  1109. )
  1110. tm.assert_series_equal(result, expected, check_index_type=True)
  1111. def test_dropna(self):
  1112. # https://github.com/pandas-dev/pandas/issues/9443#issuecomment-73719328
  1113. tm.assert_series_equal(
  1114. Series([True, True, False]).value_counts(dropna=True),
  1115. Series([2, 1], index=[True, False], name="count"),
  1116. )
  1117. tm.assert_series_equal(
  1118. Series([True, True, False]).value_counts(dropna=False),
  1119. Series([2, 1], index=[True, False], name="count"),
  1120. )
  1121. tm.assert_series_equal(
  1122. Series([True] * 3 + [False] * 2 + [None] * 5).value_counts(dropna=True),
  1123. Series([3, 2], index=Index([True, False], dtype=object), name="count"),
  1124. )
  1125. tm.assert_series_equal(
  1126. Series([True] * 5 + [False] * 3 + [None] * 2).value_counts(dropna=False),
  1127. Series([5, 3, 2], index=[True, False, np.nan], name="count"),
  1128. )
  1129. tm.assert_series_equal(
  1130. Series([10.3, 5.0, 5.0]).value_counts(dropna=True),
  1131. Series([2, 1], index=[5.0, 10.3], name="count"),
  1132. )
  1133. tm.assert_series_equal(
  1134. Series([10.3, 5.0, 5.0]).value_counts(dropna=False),
  1135. Series([2, 1], index=[5.0, 10.3], name="count"),
  1136. )
  1137. tm.assert_series_equal(
  1138. Series([10.3, 5.0, 5.0, None]).value_counts(dropna=True),
  1139. Series([2, 1], index=[5.0, 10.3], name="count"),
  1140. )
  1141. result = Series([10.3, 10.3, 5.0, 5.0, 5.0, None]).value_counts(dropna=False)
  1142. expected = Series([3, 2, 1], index=[5.0, 10.3, np.nan], name="count")
  1143. tm.assert_series_equal(result, expected)
  1144. @pytest.mark.parametrize("dtype", (np.float64, object, "M8[ns]"))
  1145. def test_value_counts_normalized(self, dtype):
  1146. # GH12558
  1147. s = Series([1] * 2 + [2] * 3 + [np.nan] * 5)
  1148. s_typed = s.astype(dtype)
  1149. result = s_typed.value_counts(normalize=True, dropna=False)
  1150. expected = Series(
  1151. [0.5, 0.3, 0.2],
  1152. index=Series([np.nan, 2.0, 1.0], dtype=dtype),
  1153. name="proportion",
  1154. )
  1155. tm.assert_series_equal(result, expected)
  1156. result = s_typed.value_counts(normalize=True, dropna=True)
  1157. expected = Series(
  1158. [0.6, 0.4], index=Series([2.0, 1.0], dtype=dtype), name="proportion"
  1159. )
  1160. tm.assert_series_equal(result, expected)
  1161. def test_value_counts_uint64(self):
  1162. arr = np.array([2**63], dtype=np.uint64)
  1163. expected = Series([1], index=[2**63], name="count")
  1164. result = algos.value_counts(arr)
  1165. tm.assert_series_equal(result, expected)
  1166. arr = np.array([-1, 2**63], dtype=object)
  1167. expected = Series([1, 1], index=[-1, 2**63], name="count")
  1168. result = algos.value_counts(arr)
  1169. tm.assert_series_equal(result, expected)
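
        # Context for the magic number: 2**63 is one past np.iinfo(np.int64).max,
        # so it only round-trips through a uint64 array or as a Python int inside
        # an object array -- which is exactly what the two cases above check.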


class TestDuplicated:
    def test_duplicated_with_nas(self):
        keys = np.array([0, 1, np.nan, 0, 2, np.nan], dtype=object)

        result = algos.duplicated(keys)
        expected = np.array([False, False, False, True, False, True])
        tm.assert_numpy_array_equal(result, expected)

        result = algos.duplicated(keys, keep="first")
        expected = np.array([False, False, False, True, False, True])
        tm.assert_numpy_array_equal(result, expected)

        result = algos.duplicated(keys, keep="last")
        expected = np.array([True, False, True, False, False, False])
        tm.assert_numpy_array_equal(result, expected)

        result = algos.duplicated(keys, keep=False)
        expected = np.array([True, False, True, True, False, True])
        tm.assert_numpy_array_equal(result, expected)

        keys = np.empty(8, dtype=object)
        for i, t in enumerate(
            zip([0, 0, np.nan, np.nan] * 2, [0, np.nan, 0, np.nan] * 2)
        ):
            keys[i] = t

        result = algos.duplicated(keys)
        falses = [False] * 4
        trues = [True] * 4
        expected = np.array(falses + trues)
        tm.assert_numpy_array_equal(result, expected)

        result = algos.duplicated(keys, keep="last")
        expected = np.array(trues + falses)
        tm.assert_numpy_array_equal(result, expected)

        result = algos.duplicated(keys, keep=False)
        expected = np.array(trues + trues)
        tm.assert_numpy_array_equal(result, expected)

    @pytest.mark.parametrize(
        "case",
        [
            np.array([1, 2, 1, 5, 3, 2, 4, 1, 5, 6]),
            np.array([1.1, 2.2, 1.1, np.nan, 3.3, 2.2, 4.4, 1.1, np.nan, 6.6]),
            np.array(
                [
                    1 + 1j,
                    2 + 2j,
                    1 + 1j,
                    5 + 5j,
                    3 + 3j,
                    2 + 2j,
                    4 + 4j,
                    1 + 1j,
                    5 + 5j,
                    6 + 6j,
                ]
            ),
            np.array(["a", "b", "a", "e", "c", "b", "d", "a", "e", "f"], dtype=object),
            np.array(
                [1, 2**63, 1, 3**5, 10, 2**63, 39, 1, 3**5, 7], dtype=np.uint64
            ),
        ],
    )
    def test_numeric_object_likes(self, case):
        exp_first = np.array(
            [False, False, True, False, False, True, False, True, True, False]
        )
        exp_last = np.array(
            [True, True, True, True, False, False, False, False, False, False]
        )
        exp_false = exp_first | exp_last

        res_first = algos.duplicated(case, keep="first")
        tm.assert_numpy_array_equal(res_first, exp_first)

        res_last = algos.duplicated(case, keep="last")
        tm.assert_numpy_array_equal(res_last, exp_last)

        res_false = algos.duplicated(case, keep=False)
        tm.assert_numpy_array_equal(res_false, exp_false)

        # index
        for idx in [Index(case), Index(case, dtype="category")]:
            res_first = idx.duplicated(keep="first")
            tm.assert_numpy_array_equal(res_first, exp_first)

            res_last = idx.duplicated(keep="last")
            tm.assert_numpy_array_equal(res_last, exp_last)

            res_false = idx.duplicated(keep=False)
            tm.assert_numpy_array_equal(res_false, exp_false)

        # series
        for s in [Series(case), Series(case, dtype="category")]:
            res_first = s.duplicated(keep="first")
            tm.assert_series_equal(res_first, Series(exp_first))

            res_last = s.duplicated(keep="last")
            tm.assert_series_equal(res_last, Series(exp_last))

            res_false = s.duplicated(keep=False)
            tm.assert_series_equal(res_false, Series(exp_false))

    def test_datetime_likes(self):
        dt = [
            "2011-01-01",
            "2011-01-02",
            "2011-01-01",
            "NaT",
            "2011-01-03",
            "2011-01-02",
            "2011-01-04",
            "2011-01-01",
            "NaT",
            "2011-01-06",
        ]
        td = [
            "1 days",
            "2 days",
            "1 days",
            "NaT",
            "3 days",
            "2 days",
            "4 days",
            "1 days",
            "NaT",
            "6 days",
        ]

        cases = [
            np.array([Timestamp(d) for d in dt]),
            np.array([Timestamp(d, tz="US/Eastern") for d in dt]),
            np.array([Period(d, freq="D") for d in dt]),
            np.array([np.datetime64(d) for d in dt]),
            np.array([Timedelta(d) for d in td]),
        ]

        exp_first = np.array(
            [False, False, True, False, False, True, False, True, True, False]
        )
        exp_last = np.array(
            [True, True, True, True, False, False, False, False, False, False]
        )
        exp_false = exp_first | exp_last

        for case in cases:
            res_first = algos.duplicated(case, keep="first")
            tm.assert_numpy_array_equal(res_first, exp_first)

            res_last = algos.duplicated(case, keep="last")
            tm.assert_numpy_array_equal(res_last, exp_last)

            res_false = algos.duplicated(case, keep=False)
            tm.assert_numpy_array_equal(res_false, exp_false)

            # index
            for idx in [
                Index(case),
                Index(case, dtype="category"),
                Index(case, dtype=object),
            ]:
                res_first = idx.duplicated(keep="first")
                tm.assert_numpy_array_equal(res_first, exp_first)

                res_last = idx.duplicated(keep="last")
                tm.assert_numpy_array_equal(res_last, exp_last)

                res_false = idx.duplicated(keep=False)
                tm.assert_numpy_array_equal(res_false, exp_false)

            # series
            for s in [
                Series(case),
                Series(case, dtype="category"),
                Series(case, dtype=object),
            ]:
                res_first = s.duplicated(keep="first")
                tm.assert_series_equal(res_first, Series(exp_first))

                res_last = s.duplicated(keep="last")
                tm.assert_series_equal(res_last, Series(exp_last))

                res_false = s.duplicated(keep=False)
                tm.assert_series_equal(res_false, Series(exp_false))

    @pytest.mark.parametrize("case", [Index([1, 2, 3]), pd.RangeIndex(0, 3)])
    def test_unique_index(self, case):
        assert case.is_unique is True
        tm.assert_numpy_array_equal(case.duplicated(), np.array([False, False, False]))

    @pytest.mark.parametrize(
        "arr, uniques",
        [
            (
                [(0, 0), (0, 1), (1, 0), (1, 1), (0, 0), (0, 1), (1, 0), (1, 1)],
                [(0, 0), (0, 1), (1, 0), (1, 1)],
            ),
            (
                [("b", "c"), ("a", "b"), ("a", "b"), ("b", "c")],
                [("b", "c"), ("a", "b")],
            ),
            ([("a", 1), ("b", 2), ("a", 3), ("a", 1)], [("a", 1), ("b", 2), ("a", 3)]),
        ],
    )
    def test_unique_tuples(self, arr, uniques):
        # https://github.com/pandas-dev/pandas/issues/16519
        expected = np.empty(len(uniques), dtype=object)
        expected[:] = uniques

        result = pd.unique(arr)
        tm.assert_numpy_array_equal(result, expected)

    @pytest.mark.parametrize(
        "array,expected",
        [
            (
                [1 + 1j, 0, 1, 1j, 1 + 2j, 1 + 2j],
                # Should return a complex dtype in the future
                np.array([(1 + 1j), 0j, (1 + 0j), 1j, (1 + 2j)], dtype=object),
            )
        ],
    )
    def test_unique_complex_numbers(self, array, expected):
        # GH 17927
        result = pd.unique(array)
        tm.assert_numpy_array_equal(result, expected)


class TestHashTable:
    @pytest.mark.parametrize(
        "htable, tm_dtype",
        [
            (ht.PyObjectHashTable, "String"),
            (ht.StringHashTable, "String"),
            (ht.Float64HashTable, "Float"),
            (ht.Int64HashTable, "Int"),
            (ht.UInt64HashTable, "UInt"),
        ],
    )
    def test_hashtable_unique(self, htable, tm_dtype, writable):
        # output of maker has guaranteed unique elements
        maker = getattr(tm, "make" + tm_dtype + "Index")
        s = Series(maker(1000))
        if htable == ht.Float64HashTable:
            # add NaN for float column
            s.loc[500] = np.nan
        elif htable == ht.PyObjectHashTable:
            # use different NaN types for object column
            s.loc[500:502] = [np.nan, None, NaT]

        # create duplicated selection
        s_duplicated = s.sample(frac=3, replace=True).reset_index(drop=True)
        s_duplicated.values.setflags(write=writable)

        # drop_duplicates has own cython code (hash_table_func_helper.pxi)
        # and is tested separately; keeps first occurrence like ht.unique()
        expected_unique = s_duplicated.drop_duplicates(keep="first").values
        result_unique = htable().unique(s_duplicated.values)
        tm.assert_numpy_array_equal(result_unique, expected_unique)

        # test return_inverse=True
        # reconstruction can only succeed if the inverse is correct
        result_unique, result_inverse = htable().unique(
            s_duplicated.values, return_inverse=True
        )
        tm.assert_numpy_array_equal(result_unique, expected_unique)
        reconstr = result_unique[result_inverse]
        tm.assert_numpy_array_equal(reconstr, s_duplicated.values)
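
        # Rough sketch of the return_inverse contract checked above, with made-up
        # values: ht.Int64HashTable().unique(np.array([3, 1, 3, 2], dtype=np.int64),
        # return_inverse=True) gives uniques [3, 1, 2] (first-occurrence order) and
        # inverse [0, 1, 0, 2], so uniques[inverse] reproduces the input exactly.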

    @pytest.mark.parametrize(
        "htable, tm_dtype",
        [
            (ht.PyObjectHashTable, "String"),
            (ht.StringHashTable, "String"),
            (ht.Float64HashTable, "Float"),
            (ht.Int64HashTable, "Int"),
            (ht.UInt64HashTable, "UInt"),
        ],
    )
    def test_hashtable_factorize(self, htable, tm_dtype, writable):
        # output of maker has guaranteed unique elements
        maker = getattr(tm, "make" + tm_dtype + "Index")
        s = Series(maker(1000))
        if htable == ht.Float64HashTable:
            # add NaN for float column
            s.loc[500] = np.nan
        elif htable == ht.PyObjectHashTable:
            # use different NaN types for object column
            s.loc[500:502] = [np.nan, None, NaT]

        # create duplicated selection
        s_duplicated = s.sample(frac=3, replace=True).reset_index(drop=True)
        s_duplicated.values.setflags(write=writable)
        na_mask = s_duplicated.isna().values

        result_unique, result_inverse = htable().factorize(s_duplicated.values)

        # drop_duplicates has own cython code (hash_table_func_helper.pxi)
        # and is tested separately; keeps first occurrence like ht.factorize()
        # since factorize removes all NaNs, we do the same here
        expected_unique = s_duplicated.dropna().drop_duplicates().values
        tm.assert_numpy_array_equal(result_unique, expected_unique)

        # reconstruction can only succeed if the inverse is correct. Since
        # factorize removes the NaNs, those have to be excluded here as well
        result_reconstruct = result_unique[result_inverse[~na_mask]]
        expected_reconstruct = s_duplicated.dropna().values
        tm.assert_numpy_array_equal(result_reconstruct, expected_reconstruct)
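
        # Unlike unique(), factorize() drops NaNs from the uniques and marks their
        # positions in the inverse with a sentinel, so the reconstruction above
        # first strips the NaN slots via ~na_mask -- e.g. [1.0, nan, 1.0, 2.0]
        # factorizes to uniques [1.0, 2.0], and only the non-NaN positions are
        # recoverable from the inverse.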


class TestRank:
    @td.skip_if_no_scipy
    @pytest.mark.parametrize(
        "arr",
        [
            [np.nan, np.nan, 5.0, 5.0, 5.0, np.nan, 1, 2, 3, np.nan],
            [4.0, np.nan, 5.0, 5.0, 5.0, np.nan, 1, 2, 4.0, np.nan],
        ],
    )
    def test_scipy_compat(self, arr):
        from scipy.stats import rankdata

        arr = np.array(arr)

        mask = ~np.isfinite(arr)
        arr = arr.copy()
        result = libalgos.rank_1d(arr)
        arr[mask] = np.inf
        exp = rankdata(arr)
        exp[mask] = np.nan
        tm.assert_almost_equal(result, exp)

    @pytest.mark.parametrize("dtype", np.typecodes["AllInteger"])
    def test_basic(self, writable, dtype):
        exp = np.array([1, 2], dtype=np.float64)

        data = np.array([1, 100], dtype=dtype)
        data.setflags(write=writable)
        ser = Series(data)
        result = algos.rank(ser)
        tm.assert_numpy_array_equal(result, exp)

    @pytest.mark.parametrize("dtype", [np.float64, np.uint64])
    def test_uint64_overflow(self, dtype):
        exp = np.array([1, 2], dtype=np.float64)

        s = Series([1, 2**63], dtype=dtype)
        tm.assert_numpy_array_equal(algos.rank(s), exp)

    def test_too_many_ndims(self):
        arr = np.array([[[1, 2, 3], [4, 5, 6], [7, 8, 9]]])
        msg = "Array with ndim > 2 are not supported"

        with pytest.raises(TypeError, match=msg):
            algos.rank(arr)

    @pytest.mark.single_cpu
    def test_pct_max_many_rows(self):
        # GH 18271
        values = np.arange(2**24 + 1)
        result = algos.rank(values, pct=True).max()
        assert result == 1

        values = np.arange(2**25 + 2).reshape(2**24 + 1, 2)
        result = algos.rank(values, pct=True).max()
        assert result == 1


def test_pad_backfill_object_segfault():
    old = np.array([], dtype="O")
    new = np.array([datetime(2010, 12, 31)], dtype="O")

    result = libalgos.pad["object"](old, new)
    expected = np.array([-1], dtype=np.intp)
    tm.assert_numpy_array_equal(result, expected)

    result = libalgos.pad["object"](new, old)
    expected = np.array([], dtype=np.intp)
    tm.assert_numpy_array_equal(result, expected)

    result = libalgos.backfill["object"](old, new)
    expected = np.array([-1], dtype=np.intp)
    tm.assert_numpy_array_equal(result, expected)

    result = libalgos.backfill["object"](new, old)
    expected = np.array([], dtype=np.intp)
    tm.assert_numpy_array_equal(result, expected)


class TestTseriesUtil:
    def test_backfill(self):
        old = Index([1, 5, 10])
        new = Index(list(range(12)))

        filler = libalgos.backfill["int64_t"](old.values, new.values)

        expect_filler = np.array([0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, -1], dtype=np.intp)
        tm.assert_numpy_array_equal(filler, expect_filler)

        # corner case
        old = Index([1, 4])
        new = Index(list(range(5, 10)))
        filler = libalgos.backfill["int64_t"](old.values, new.values)

        expect_filler = np.array([-1, -1, -1, -1, -1], dtype=np.intp)
        tm.assert_numpy_array_equal(filler, expect_filler)

    def test_pad(self):
        old = Index([1, 5, 10])
        new = Index(list(range(12)))

        filler = libalgos.pad["int64_t"](old.values, new.values)

        expect_filler = np.array([-1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2], dtype=np.intp)
        tm.assert_numpy_array_equal(filler, expect_filler)

        # corner case
        old = Index([5, 10])
        new = Index(np.arange(5, dtype=np.int64))
        filler = libalgos.pad["int64_t"](old.values, new.values)
        expect_filler = np.array([-1, -1, -1, -1, -1], dtype=np.intp)
        tm.assert_numpy_array_equal(filler, expect_filler)
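
        # Reading the expected arrays: pad maps each new label to the position of
        # the closest old label at or before it (forward fill), backfill to the
        # closest old label at or after it, and -1 marks "no such label" -- hence
        # the corner cases above that come back as all -1 when the old labels sit
        # entirely after (pad) or before (backfill) the new ones.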


def test_is_lexsorted():
    failure = [
        np.array(
            [3] * 31 + [2] * 31 + [1] * 31 + [0] * 31,
            dtype="int64",
        ),
        np.array(
            list(range(30, -1, -1)) * 4,
            dtype="int64",
        ),
    ]

    assert not libalgos.is_lexsorted(failure)
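
    # For contrast, a lexsorted input would be e.g.
    # [np.array([0, 0, 1, 1]), np.array([0, 1, 0, 1])]: the row-wise tuples
    # (0, 0), (0, 1), (1, 0), (1, 1) are non-decreasing. Here the first key runs
    # from 3 down to 0, so is_lexsorted must return False.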


def test_groupsort_indexer():
    a = np.random.randint(0, 1000, 100).astype(np.intp)
    b = np.random.randint(0, 1000, 100).astype(np.intp)

    result = libalgos.groupsort_indexer(a, 1000)[0]

    # need to use a stable sort
    # np.argsort returns int, groupsort_indexer
    # always returns intp
    expected = np.argsort(a, kind="mergesort")
    expected = expected.astype(np.intp)
    tm.assert_numpy_array_equal(result, expected)

    # compare with lexsort
    # np.lexsort returns int, groupsort_indexer
    # always returns intp
    key = a * 1000 + b
    result = libalgos.groupsort_indexer(key, 1000000)[0]
    expected = np.lexsort((b, a))
    expected = expected.astype(np.intp)
    tm.assert_numpy_array_equal(result, expected)
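
    # groupsort_indexer returns an (indexer, counts) pair; only the indexer is
    # compared here. It amounts to a stable counting sort over the group labels,
    # which is why a stable mergesort / np.lexsort produces the same ordering.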


def test_infinity_sort():
    # GH 13445
    # numpy's argsort can be unhappy if something is less than
    # itself.  Instead, let's give our infinities a self-consistent
    # ordering, but outside the float extended real line.
    Inf = libalgos.Infinity()
    NegInf = libalgos.NegInfinity()

    ref_nums = [NegInf, float("-inf"), -1e100, 0, 1e100, float("inf"), Inf]

    assert all(Inf >= x for x in ref_nums)
    assert all(Inf > x or x is Inf for x in ref_nums)
    assert Inf >= Inf and Inf == Inf
    assert not Inf < Inf and not Inf > Inf
    assert libalgos.Infinity() == libalgos.Infinity()
    assert not libalgos.Infinity() != libalgos.Infinity()

    assert all(NegInf <= x for x in ref_nums)
    assert all(NegInf < x or x is NegInf for x in ref_nums)
    assert NegInf <= NegInf and NegInf == NegInf
    assert not NegInf < NegInf and not NegInf > NegInf
    assert libalgos.NegInfinity() == libalgos.NegInfinity()
    assert not libalgos.NegInfinity() != libalgos.NegInfinity()

    for perm in permutations(ref_nums):
        assert sorted(perm) == ref_nums

    # smoke tests
    np.array([libalgos.Infinity()] * 32).argsort()
    np.array([libalgos.NegInfinity()] * 32).argsort()


def test_infinity_against_nan():
    Inf = libalgos.Infinity()
    NegInf = libalgos.NegInfinity()

    assert not Inf > np.nan
    assert not Inf >= np.nan
    assert not Inf < np.nan
    assert not Inf <= np.nan
    assert not Inf == np.nan
    assert Inf != np.nan

    assert not NegInf > np.nan
    assert not NegInf >= np.nan
    assert not NegInf < np.nan
    assert not NegInf <= np.nan
    assert not NegInf == np.nan
    assert NegInf != np.nan


def test_ensure_platform_int():
    arr = np.arange(100, dtype=np.intp)
    result = libalgos.ensure_platform_int(arr)
    assert result is arr


def test_int64_add_overflow():
    # see gh-14068
    msg = "Overflow in int64 addition"
    m = np.iinfo(np.int64).max
    n = np.iinfo(np.int64).min

    with pytest.raises(OverflowError, match=msg):
        algos.checked_add_with_arr(np.array([m, m]), m)
    with pytest.raises(OverflowError, match=msg):
        algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]))
    with pytest.raises(OverflowError, match=msg):
        algos.checked_add_with_arr(np.array([n, n]), n)
    with pytest.raises(OverflowError, match=msg):
        algos.checked_add_with_arr(np.array([n, n]), np.array([n, n]))
    with pytest.raises(OverflowError, match=msg):
        algos.checked_add_with_arr(np.array([m, n]), np.array([n, n]))
    with pytest.raises(OverflowError, match=msg):
        algos.checked_add_with_arr(
            np.array([m, m]), np.array([m, m]), arr_mask=np.array([False, True])
        )
    with pytest.raises(OverflowError, match=msg):
        algos.checked_add_with_arr(
            np.array([m, m]), np.array([m, m]), b_mask=np.array([False, True])
        )
    with pytest.raises(OverflowError, match=msg):
        algos.checked_add_with_arr(
            np.array([m, m]),
            np.array([m, m]),
            arr_mask=np.array([False, True]),
            b_mask=np.array([False, True]),
        )
    with pytest.raises(OverflowError, match=msg):
        algos.checked_add_with_arr(np.array([m, m]), np.array([np.nan, m]))

    # Check that the nan boolean arrays override whether or not
    # the addition overflows. We don't check the result but just
    # the fact that an OverflowError is not raised.
    algos.checked_add_with_arr(
        np.array([m, m]), np.array([m, m]), arr_mask=np.array([True, True])
    )
    algos.checked_add_with_arr(
        np.array([m, m]), np.array([m, m]), b_mask=np.array([True, True])
    )
    algos.checked_add_with_arr(
        np.array([m, m]),
        np.array([m, m]),
        arr_mask=np.array([True, False]),
        b_mask=np.array([False, True]),
    )
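
    # In other words, a True entry in arr_mask/b_mask flags that slot as missing
    # and exempts it from the overflow check; in the last call every slot is
    # covered by one of the two masks, so no OverflowError is raised.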


class TestMode:
    def test_no_mode(self):
        exp = Series([], dtype=np.float64, index=Index([], dtype=int))
        tm.assert_numpy_array_equal(algos.mode([]), exp.values)

    @pytest.mark.parametrize("dt", np.typecodes["AllInteger"] + np.typecodes["Float"])
    def test_mode_single(self, dt):
        # GH 15714
        exp_single = [1]
        data_single = [1]

        exp_multi = [1]
        data_multi = [1, 1]

        ser = Series(data_single, dtype=dt)
        exp = Series(exp_single, dtype=dt)
        tm.assert_numpy_array_equal(algos.mode(ser.values), exp.values)
        tm.assert_series_equal(ser.mode(), exp)

        ser = Series(data_multi, dtype=dt)
        exp = Series(exp_multi, dtype=dt)
        tm.assert_numpy_array_equal(algos.mode(ser.values), exp.values)
        tm.assert_series_equal(ser.mode(), exp)

    def test_mode_obj_int(self):
        exp = Series([1], dtype=int)
        tm.assert_numpy_array_equal(algos.mode([1]), exp.values)

        exp = Series(["a", "b", "c"], dtype=object)
        tm.assert_numpy_array_equal(algos.mode(["a", "b", "c"]), exp.values)

    @pytest.mark.parametrize("dt", np.typecodes["AllInteger"] + np.typecodes["Float"])
    def test_number_mode(self, dt):
        exp_single = [1]
        data_single = [1] * 5 + [2] * 3

        exp_multi = [1, 3]
        data_multi = [1] * 5 + [2] * 3 + [3] * 5

        ser = Series(data_single, dtype=dt)
        exp = Series(exp_single, dtype=dt)
        tm.assert_numpy_array_equal(algos.mode(ser.values), exp.values)
        tm.assert_series_equal(ser.mode(), exp)

        ser = Series(data_multi, dtype=dt)
        exp = Series(exp_multi, dtype=dt)
        tm.assert_numpy_array_equal(algos.mode(ser.values), exp.values)
        tm.assert_series_equal(ser.mode(), exp)

    def test_strobj_mode(self):
        exp = ["b"]
        data = ["a"] * 2 + ["b"] * 3

        ser = Series(data, dtype="c")
        exp = Series(exp, dtype="c")
        tm.assert_numpy_array_equal(algos.mode(ser.values), exp.values)
        tm.assert_series_equal(ser.mode(), exp)

    @pytest.mark.parametrize("dt", [str, object])
    def test_strobj_multi_char(self, dt):
        exp = ["bar"]
        data = ["foo"] * 2 + ["bar"] * 3

        ser = Series(data, dtype=dt)
        exp = Series(exp, dtype=dt)
        tm.assert_numpy_array_equal(algos.mode(ser.values), exp.values)
        tm.assert_series_equal(ser.mode(), exp)

    def test_datelike_mode(self):
        exp = Series(["1900-05-03", "2011-01-03", "2013-01-02"], dtype="M8[ns]")
        ser = Series(["2011-01-03", "2013-01-02", "1900-05-03"], dtype="M8[ns]")
        tm.assert_extension_array_equal(algos.mode(ser.values), exp._values)
        tm.assert_series_equal(ser.mode(), exp)

        exp = Series(["2011-01-03", "2013-01-02"], dtype="M8[ns]")
        ser = Series(
            ["2011-01-03", "2013-01-02", "1900-05-03", "2011-01-03", "2013-01-02"],
            dtype="M8[ns]",
        )
        tm.assert_extension_array_equal(algos.mode(ser.values), exp._values)
        tm.assert_series_equal(ser.mode(), exp)

    def test_timedelta_mode(self):
        exp = Series(["-1 days", "0 days", "1 days"], dtype="timedelta64[ns]")
        ser = Series(["1 days", "-1 days", "0 days"], dtype="timedelta64[ns]")
        tm.assert_extension_array_equal(algos.mode(ser.values), exp._values)
        tm.assert_series_equal(ser.mode(), exp)

        exp = Series(["2 min", "1 day"], dtype="timedelta64[ns]")
        ser = Series(
            ["1 day", "1 day", "-1 day", "-1 day 2 min", "2 min", "2 min"],
            dtype="timedelta64[ns]",
        )
        tm.assert_extension_array_equal(algos.mode(ser.values), exp._values)
        tm.assert_series_equal(ser.mode(), exp)

    def test_mixed_dtype(self):
        exp = Series(["foo"])
        ser = Series([1, "foo", "foo"])
        tm.assert_numpy_array_equal(algos.mode(ser.values), exp.values)
        tm.assert_series_equal(ser.mode(), exp)

    def test_uint64_overflow(self):
        exp = Series([2**63], dtype=np.uint64)
        ser = Series([1, 2**63, 2**63], dtype=np.uint64)
        tm.assert_numpy_array_equal(algos.mode(ser.values), exp.values)
        tm.assert_series_equal(ser.mode(), exp)

        exp = Series([1, 2**63], dtype=np.uint64)
        ser = Series([1, 2**63], dtype=np.uint64)
        tm.assert_numpy_array_equal(algos.mode(ser.values), exp.values)
        tm.assert_series_equal(ser.mode(), exp)

    def test_categorical(self):
        c = Categorical([1, 2])
        exp = c
        res = Series(c).mode()._values
        tm.assert_categorical_equal(res, exp)

        c = Categorical([1, "a", "a"])
        exp = Categorical(["a"], categories=[1, "a"])
        res = Series(c).mode()._values
        tm.assert_categorical_equal(res, exp)

        c = Categorical([1, 1, 2, 3, 3])
        exp = Categorical([1, 3], categories=[1, 2, 3])
        res = Series(c).mode()._values
        tm.assert_categorical_equal(res, exp)

    def test_index(self):
        idx = Index([1, 2, 3])
        exp = Series([1, 2, 3], dtype=np.int64)
        tm.assert_numpy_array_equal(algos.mode(idx), exp.values)

        idx = Index([1, "a", "a"])
        exp = Series(["a"], dtype=object)
        tm.assert_numpy_array_equal(algos.mode(idx), exp.values)

        idx = Index([1, 1, 2, 3, 3])
        exp = Series([1, 3], dtype=np.int64)
        tm.assert_numpy_array_equal(algos.mode(idx), exp.values)

        idx = Index(
            ["1 day", "1 day", "-1 day", "-1 day 2 min", "2 min", "2 min"],
            dtype="timedelta64[ns]",
        )
        with pytest.raises(AttributeError, match="TimedeltaIndex"):
            # algos.mode expects Arraylike, does *not* unwrap TimedeltaIndex
            algos.mode(idx)

    def test_ser_mode_with_name(self):
        # GH 46737
        ser = Series([1, 1, 3], name="foo")
        result = ser.mode()
        expected = Series([1], name="foo")
        tm.assert_series_equal(result, expected)


class TestDiff:
    @pytest.mark.parametrize("dtype", ["M8[ns]", "m8[ns]"])
    def test_diff_datetimelike_nat(self, dtype):
        # NaT - NaT is NaT, not 0
        arr = np.arange(12).astype(np.int64).view(dtype).reshape(3, 4)
        arr[:, 2] = arr.dtype.type("NaT", "ns")
        result = algos.diff(arr, 1, axis=0)

        expected = np.ones(arr.shape, dtype="timedelta64[ns]") * 4
        expected[:, 2] = np.timedelta64("NaT", "ns")
        expected[0, :] = np.timedelta64("NaT", "ns")

        tm.assert_numpy_array_equal(result, expected)

        result = algos.diff(arr.T, 1, axis=1)
        tm.assert_numpy_array_equal(result, expected.T)

    def test_diff_ea_axis(self):
        dta = date_range("2016-01-01", periods=3, tz="US/Pacific")._data

        msg = "cannot diff DatetimeArray on axis=1"
        with pytest.raises(ValueError, match=msg):
            algos.diff(dta, 1, axis=1)

    @pytest.mark.parametrize("dtype", ["int8", "int16"])
    def test_diff_low_precision_int(self, dtype):
        arr = np.array([0, 1, 1, 0, 0], dtype=dtype)
        result = algos.diff(arr, 1)
        expected = np.array([np.nan, 1, 0, -1, 0], dtype="float32")
        tm.assert_numpy_array_equal(result, expected)
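
        # The expected dtype is the point here: diff on int8/int16 upcasts only to
        # float32, which is already wide enough to hold both the original values
        # and the leading NaN.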


@pytest.mark.parametrize("op", [np.array, pd.array])
def test_union_with_duplicates(op):
    # GH#36289
    lvals = op([3, 1, 3, 4])
    rvals = op([2, 3, 1, 1])
    expected = op([3, 3, 1, 1, 4, 2])
    if isinstance(expected, np.ndarray):
        result = algos.union_with_duplicates(lvals, rvals)
        tm.assert_numpy_array_equal(result, expected)
    else:
        result = algos.union_with_duplicates(lvals, rvals)
        tm.assert_extension_array_equal(result, expected)