12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407 |
- from datetime import datetime
- from itertools import permutations
- import struct
- import numpy as np
- import pytest
- from pandas._libs import (
- algos as libalgos,
- hashtable as ht,
- )
- import pandas.util._test_decorators as td
- from pandas.core.dtypes.common import (
- is_bool_dtype,
- is_complex_dtype,
- is_float_dtype,
- is_integer_dtype,
- is_object_dtype,
- )
- from pandas.core.dtypes.dtypes import CategoricalDtype as CDT
- import pandas as pd
- from pandas import (
- Categorical,
- CategoricalIndex,
- DataFrame,
- DatetimeIndex,
- Index,
- IntervalIndex,
- MultiIndex,
- NaT,
- Period,
- PeriodIndex,
- Series,
- Timedelta,
- Timestamp,
- date_range,
- timedelta_range,
- to_datetime,
- to_timedelta,
- )
- import pandas._testing as tm
- import pandas.core.algorithms as algos
- from pandas.core.arrays import (
- DatetimeArray,
- TimedeltaArray,
- )
- import pandas.core.common as com
class TestFactorize:
    """Tests for ``factorize`` on Index/Series objects, ``algos.factorize``,
    ``pd.factorize`` and the low-level hashtable factorizers."""

    @pytest.mark.parametrize("sort", [True, False])
    def test_factorize(self, index_or_series_obj, sort):
        """``obj.factorize`` yields codes/uniques that reconstruct ``obj``."""
        obj = index_or_series_obj
        result_codes, result_uniques = obj.factorize(sort=sort)

        constructor = Index
        if isinstance(obj, MultiIndex):
            constructor = MultiIndex.from_tuples
        expected_arr = obj.unique()
        if expected_arr.dtype == np.float16:
            expected_arr = expected_arr.astype(np.float32)
        expected_uniques = constructor(expected_arr)
        if (
            isinstance(obj, Index)
            and expected_uniques.dtype == bool
            and obj.dtype == object
        ):
            expected_uniques = expected_uniques.astype(object)

        if sort:
            expected_uniques = expected_uniques.sort_values()

        # construct an integer ndarray so that
        # `expected_uniques.take(expected_codes)` is equal to `obj`
        expected_uniques_list = list(expected_uniques)
        expected_codes = [expected_uniques_list.index(val) for val in obj]
        expected_codes = np.asarray(expected_codes, dtype=np.intp)

        tm.assert_numpy_array_equal(result_codes, expected_codes)
        tm.assert_index_equal(result_uniques, expected_uniques, exact=True)

    def test_series_factorize_use_na_sentinel_false(self):
        """With ``use_na_sentinel=False`` NaN gets its own code and unique."""
        # GH#35667
        values = np.array([1, 2, 1, np.nan])
        ser = Series(values)
        codes, uniques = ser.factorize(use_na_sentinel=False)

        expected_codes = np.array([0, 1, 0, 2], dtype=np.intp)
        expected_uniques = Index([1.0, 2.0, np.nan])

        tm.assert_numpy_array_equal(codes, expected_codes)
        tm.assert_index_equal(uniques, expected_uniques)

    def test_basic(self):
        """Factorize string, integer and float inputs, sorted and unsorted."""
        codes, uniques = algos.factorize(["a", "b", "b", "a", "a", "c", "c", "c"])
        # The codes of the unsorted call were previously never checked.
        exp = np.array([0, 1, 1, 0, 0, 2, 2, 2], dtype=np.intp)
        tm.assert_numpy_array_equal(codes, exp)
        tm.assert_numpy_array_equal(uniques, np.array(["a", "b", "c"], dtype=object))

        codes, uniques = algos.factorize(
            ["a", "b", "b", "a", "a", "c", "c", "c"], sort=True
        )
        exp = np.array([0, 1, 1, 0, 0, 2, 2, 2], dtype=np.intp)
        tm.assert_numpy_array_equal(codes, exp)
        exp = np.array(["a", "b", "c"], dtype=object)
        tm.assert_numpy_array_equal(uniques, exp)

        # Descending integers: codes follow appearance unless sort=True.
        arr = np.arange(5, dtype=np.intp)[::-1]

        codes, uniques = algos.factorize(arr)
        exp = np.array([0, 1, 2, 3, 4], dtype=np.intp)
        tm.assert_numpy_array_equal(codes, exp)
        exp = np.array([4, 3, 2, 1, 0], dtype=arr.dtype)
        tm.assert_numpy_array_equal(uniques, exp)

        codes, uniques = algos.factorize(arr, sort=True)
        exp = np.array([4, 3, 2, 1, 0], dtype=np.intp)
        tm.assert_numpy_array_equal(codes, exp)
        exp = np.array([0, 1, 2, 3, 4], dtype=arr.dtype)
        tm.assert_numpy_array_equal(uniques, exp)

        # Same checks for descending floats.
        arr = np.arange(5.0)[::-1]

        codes, uniques = algos.factorize(arr)
        exp = np.array([0, 1, 2, 3, 4], dtype=np.intp)
        tm.assert_numpy_array_equal(codes, exp)
        exp = np.array([4.0, 3.0, 2.0, 1.0, 0.0], dtype=arr.dtype)
        tm.assert_numpy_array_equal(uniques, exp)

        codes, uniques = algos.factorize(arr, sort=True)
        exp = np.array([4, 3, 2, 1, 0], dtype=np.intp)
        tm.assert_numpy_array_equal(codes, exp)
        exp = np.array([0.0, 1.0, 2.0, 3.0, 4.0], dtype=arr.dtype)
        tm.assert_numpy_array_equal(uniques, exp)

    def test_mixed(self):
        """Factorize a Series mixing strings, floats, inf and NaN."""
        # doc example reshaping.rst
        x = Series(["A", "A", np.nan, "B", 3.14, np.inf])
        codes, uniques = algos.factorize(x)

        exp = np.array([0, 0, -1, 1, 2, 3], dtype=np.intp)
        tm.assert_numpy_array_equal(codes, exp)
        exp = Index(["A", "B", 3.14, np.inf])
        tm.assert_index_equal(uniques, exp)

        codes, uniques = algos.factorize(x, sort=True)
        exp = np.array([2, 2, -1, 3, 0, 1], dtype=np.intp)
        tm.assert_numpy_array_equal(codes, exp)
        exp = Index([3.14, np.inf, "A", "B"])
        tm.assert_index_equal(uniques, exp)

    def test_datelike(self):
        """Factorize Series of Timestamps, Periods and Timedeltas."""
        # M8
        v1 = Timestamp("20130101 09:00:00.00004")
        v2 = Timestamp("20130101")
        x = Series([v1, v1, v1, v2, v2, v1])
        codes, uniques = algos.factorize(x)

        exp = np.array([0, 0, 0, 1, 1, 0], dtype=np.intp)
        tm.assert_numpy_array_equal(codes, exp)
        exp = DatetimeIndex([v1, v2])
        tm.assert_index_equal(uniques, exp)

        codes, uniques = algos.factorize(x, sort=True)
        exp = np.array([1, 1, 1, 0, 0, 1], dtype=np.intp)
        tm.assert_numpy_array_equal(codes, exp)
        exp = DatetimeIndex([v2, v1])
        tm.assert_index_equal(uniques, exp)

        # period
        v1 = Period("201302", freq="M")
        v2 = Period("201303", freq="M")
        x = Series([v1, v1, v1, v2, v2, v1])

        # periods are not 'sorted' as they are converted back into an index
        codes, uniques = algos.factorize(x)
        exp = np.array([0, 0, 0, 1, 1, 0], dtype=np.intp)
        tm.assert_numpy_array_equal(codes, exp)
        tm.assert_index_equal(uniques, PeriodIndex([v1, v2]))

        codes, uniques = algos.factorize(x, sort=True)
        exp = np.array([0, 0, 0, 1, 1, 0], dtype=np.intp)
        tm.assert_numpy_array_equal(codes, exp)
        tm.assert_index_equal(uniques, PeriodIndex([v1, v2]))

        # GH 5986
        v1 = to_timedelta("1 day 1 min")
        v2 = to_timedelta("1 day")
        x = Series([v1, v2, v1, v1, v2, v2, v1])
        codes, uniques = algos.factorize(x)
        exp = np.array([0, 1, 0, 0, 1, 1, 0], dtype=np.intp)
        tm.assert_numpy_array_equal(codes, exp)
        tm.assert_index_equal(uniques, to_timedelta([v1, v2]))

        codes, uniques = algos.factorize(x, sort=True)
        exp = np.array([1, 0, 1, 1, 0, 0, 1], dtype=np.intp)
        tm.assert_numpy_array_equal(codes, exp)
        tm.assert_index_equal(uniques, to_timedelta([v2, v1]))

    def test_factorize_nan(self):
        """``ObjectFactorizer`` maps NaN to whatever ``na_sentinel`` is given."""
        # nan should map to na_sentinel, not reverse_indexer[na_sentinel]
        # rizer.factorize should not raise an exception if na_sentinel indexes
        # outside of reverse_indexer
        key = np.array([1, 2, 1, np.nan], dtype="O")
        rizer = ht.ObjectFactorizer(len(key))
        for na_sentinel in (-1, 20):
            ids = rizer.factorize(key, na_sentinel=na_sentinel)
            expected = np.array([0, 1, 0, na_sentinel], dtype=np.intp)
            assert len(set(key)) == len(set(expected))
            tm.assert_numpy_array_equal(pd.isna(key), expected == na_sentinel)
            tm.assert_numpy_array_equal(ids, expected)

    def test_factorizer_with_mask(self):
        """Masked positions get code -1 and are left out of the uniques."""
        # GH#49549
        data = np.array([1, 2, 3, 1, 1, 0], dtype="int64")
        mask = np.array([False, False, False, False, False, True])
        rizer = ht.Int64Factorizer(len(data))
        result = rizer.factorize(data, mask=mask)
        expected = np.array([0, 1, 2, 0, 0, -1], dtype=np.intp)
        tm.assert_numpy_array_equal(result, expected)
        expected_uniques = np.array([1, 2, 3], dtype="int64")
        tm.assert_numpy_array_equal(rizer.uniques.to_array(), expected_uniques)

    def test_factorizer_object_with_nan(self):
        """``ObjectFactorizer`` codes NaN as -1 and omits it from uniques."""
        # GH#49549
        data = np.array([1, 2, 3, 1, np.nan])
        rizer = ht.ObjectFactorizer(len(data))
        result = rizer.factorize(data.astype(object))
        expected = np.array([0, 1, 2, 0, -1], dtype=np.intp)
        tm.assert_numpy_array_equal(result, expected)
        expected_uniques = np.array([1, 2, 3], dtype=object)
        tm.assert_numpy_array_equal(rizer.uniques.to_array(), expected_uniques)

    @pytest.mark.parametrize(
        "data, expected_codes, expected_uniques",
        [
            (
                [(1, 1), (1, 2), (0, 0), (1, 2), "nonsense"],
                [0, 1, 2, 1, 3],
                [(1, 1), (1, 2), (0, 0), "nonsense"],
            ),
            (
                [(1, 1), (1, 2), (0, 0), (1, 2), (1, 2, 3)],
                [0, 1, 2, 1, 3],
                [(1, 1), (1, 2), (0, 0), (1, 2, 3)],
            ),
            ([(1, 1), (1, 2), (0, 0), (1, 2)], [0, 1, 2, 1], [(1, 1), (1, 2), (0, 0)]),
        ],
    )
    def test_factorize_tuple_list(self, data, expected_codes, expected_uniques):
        """``pd.factorize`` treats each tuple in a list as a single value."""
        # GH9454
        codes, uniques = pd.factorize(data)

        tm.assert_numpy_array_equal(codes, np.array(expected_codes, dtype=np.intp))

        expected_uniques_array = com.asarray_tuplesafe(expected_uniques, dtype=object)
        tm.assert_numpy_array_equal(uniques, expected_uniques_array)

    def test_complex_sorting(self):
        """Sorting complex objects raises ``TypeError``."""
        # gh 12666 - check no segfault
        x17 = np.array([complex(i) for i in range(17)], dtype=object)

        msg = "'[<>]' not supported between instances of .*"
        with pytest.raises(TypeError, match=msg):
            algos.factorize(x17[::-1], sort=True)

    def test_numeric_dtype_factorize(self, any_real_numpy_dtype):
        """Uniques keep the input's real numpy dtype."""
        # GH41132
        dtype = any_real_numpy_dtype
        data = np.array([1, 2, 2, 1], dtype=dtype)
        expected_codes = np.array([0, 1, 1, 0], dtype=np.intp)
        expected_uniques = np.array([1, 2], dtype=dtype)

        codes, uniques = algos.factorize(data)
        tm.assert_numpy_array_equal(codes, expected_codes)
        tm.assert_numpy_array_equal(uniques, expected_uniques)

    def test_float64_factorize(self, writable):
        """Factorize a float64 array with the write flag set by the fixture."""
        data = np.array([1.0, 1e8, 1.0, 1e-8, 1e8, 1.0], dtype=np.float64)
        data.setflags(write=writable)
        expected_codes = np.array([0, 1, 0, 2, 1, 0], dtype=np.intp)
        expected_uniques = np.array([1.0, 1e8, 1e-8], dtype=np.float64)

        codes, uniques = algos.factorize(data)

        tm.assert_numpy_array_equal(codes, expected_codes)
        tm.assert_numpy_array_equal(uniques, expected_uniques)

    def test_uint64_factorize(self, writable):
        """Factorize uint64 data containing the maximum uint64 value."""
        data = np.array([2**64 - 1, 1, 2**64 - 1], dtype=np.uint64)
        data.setflags(write=writable)
        expected_codes = np.array([0, 1, 0], dtype=np.intp)
        expected_uniques = np.array([2**64 - 1, 1], dtype=np.uint64)

        codes, uniques = algos.factorize(data)

        tm.assert_numpy_array_equal(codes, expected_codes)
        tm.assert_numpy_array_equal(uniques, expected_uniques)

    def test_int64_factorize(self, writable):
        """Factorize int64 data containing both int64 extremes."""
        data = np.array([2**63 - 1, -(2**63), 2**63 - 1], dtype=np.int64)
        data.setflags(write=writable)
        expected_codes = np.array([0, 1, 0], dtype=np.intp)
        expected_uniques = np.array([2**63 - 1, -(2**63)], dtype=np.int64)

        codes, uniques = algos.factorize(data)

        tm.assert_numpy_array_equal(codes, expected_codes)
        tm.assert_numpy_array_equal(uniques, expected_uniques)

    def test_string_factorize(self, writable):
        """Factorize an object array of strings."""
        data = np.array(["a", "c", "a", "b", "c"], dtype=object)
        data.setflags(write=writable)
        expected_codes = np.array([0, 1, 0, 2, 1], dtype=np.intp)
        expected_uniques = np.array(["a", "c", "b"], dtype=object)

        codes, uniques = algos.factorize(data)

        tm.assert_numpy_array_equal(codes, expected_codes)
        tm.assert_numpy_array_equal(uniques, expected_uniques)

    def test_object_factorize(self, writable):
        """``None``, NaN and ``NaT`` in an object array all get code -1."""
        data = np.array(["a", "c", None, np.nan, "a", "b", NaT, "c"], dtype=object)
        data.setflags(write=writable)
        expected_codes = np.array([0, 1, -1, -1, 0, 2, -1, 1], dtype=np.intp)
        expected_uniques = np.array(["a", "c", "b"], dtype=object)

        codes, uniques = algos.factorize(data)

        tm.assert_numpy_array_equal(codes, expected_codes)
        tm.assert_numpy_array_equal(uniques, expected_uniques)

    def test_datetime64_factorize(self, writable):
        """Factorize a datetime64[ns] array, read-only or writable."""
        # GH35650 Verify whether read-only datetime64 array can be factorized
        data = np.array([np.datetime64("2020-01-01T00:00:00.000")], dtype="M8[ns]")
        data.setflags(write=writable)
        expected_codes = np.array([0], dtype=np.intp)
        expected_uniques = np.array(
            ["2020-01-01T00:00:00.000000000"], dtype="datetime64[ns]"
        )

        codes, uniques = pd.factorize(data)

        tm.assert_numpy_array_equal(codes, expected_codes)
        tm.assert_numpy_array_equal(uniques, expected_uniques)

    @pytest.mark.parametrize("sort", [True, False])
    def test_factorize_rangeindex(self, sort):
        """An increasing RangeIndex factorizes to itself for either ``sort``."""
        # increasing -> sort doesn't matter
        ri = pd.RangeIndex.from_range(range(10))
        expected = np.arange(10, dtype=np.intp), ri

        result = algos.factorize(ri, sort=sort)
        tm.assert_numpy_array_equal(result[0], expected[0])
        tm.assert_index_equal(result[1], expected[1], exact=True)

        result = ri.factorize(sort=sort)
        tm.assert_numpy_array_equal(result[0], expected[0])
        tm.assert_index_equal(result[1], expected[1], exact=True)

    @pytest.mark.parametrize("sort", [True, False])
    def test_factorize_rangeindex_decreasing(self, sort):
        """A decreasing RangeIndex is reversed only when ``sort=True``."""
        # decreasing -> sort matters
        ri = pd.RangeIndex.from_range(range(10))
        expected = np.arange(10, dtype=np.intp), ri

        ri2 = ri[::-1]
        expected = expected[0], ri2
        if sort:
            expected = expected[0][::-1], expected[1][::-1]

        result = algos.factorize(ri2, sort=sort)
        tm.assert_numpy_array_equal(result[0], expected[0])
        tm.assert_index_equal(result[1], expected[1], exact=True)

        result = ri2.factorize(sort=sort)
        tm.assert_numpy_array_equal(result[0], expected[0])
        tm.assert_index_equal(result[1], expected[1], exact=True)

    def test_deprecate_order(self):
        """The ``order`` keyword is rejected; a plain call emits no warning."""
        # gh 19727 - check warning is raised for deprecated keyword, order.
        # Test not valid once order keyword is removed.
        data = np.array([2**63, 1, 2**63], dtype=np.uint64)
        with pytest.raises(TypeError, match="got an unexpected keyword"):
            algos.factorize(data, order=True)
        with tm.assert_produces_warning(False):
            algos.factorize(data)

    @pytest.mark.parametrize(
        "data",
        [
            np.array([0, 1, 0], dtype="u8"),
            np.array([-(2**63), 1, -(2**63)], dtype="i8"),
            np.array(["__nan__", "foo", "__nan__"], dtype="object"),
        ],
    )
    def test_parametrized_factorize_na_value_default(self, data):
        """Values equal to a type's NA default are kept as ordinary uniques."""
        # arrays that include the NA default for that type, but isn't used.
        codes, uniques = algos.factorize(data)
        expected_uniques = data[[0, 1]]
        expected_codes = np.array([0, 1, 0], dtype=np.intp)
        tm.assert_numpy_array_equal(codes, expected_codes)
        tm.assert_numpy_array_equal(uniques, expected_uniques)

    @pytest.mark.parametrize(
        "data, na_value",
        [
            (np.array([0, 1, 0, 2], dtype="u8"), 0),
            (np.array([1, 0, 1, 2], dtype="u8"), 1),
            (np.array([-(2**63), 1, -(2**63), 0], dtype="i8"), -(2**63)),
            (np.array([1, -(2**63), 1, 0], dtype="i8"), 1),
            (np.array(["a", "", "a", "b"], dtype=object), "a"),
            (np.array([(), ("a", 1), (), ("a", 2)], dtype=object), ()),
            (np.array([("a", 1), (), ("a", 1), ("a", 2)], dtype=object), ("a", 1)),
        ],
    )
    def test_parametrized_factorize_na_value(self, data, na_value):
        """``factorize_array`` codes entries equal to ``na_value`` as -1."""
        codes, uniques = algos.factorize_array(data, na_value=na_value)
        expected_uniques = data[[1, 3]]
        expected_codes = np.array([-1, 0, -1, 1], dtype=np.intp)
        tm.assert_numpy_array_equal(codes, expected_codes)
        tm.assert_numpy_array_equal(uniques, expected_uniques)

    @pytest.mark.parametrize("sort", [True, False])
    @pytest.mark.parametrize(
        "data, uniques",
        [
            (
                np.array(["b", "a", None, "b"], dtype=object),
                np.array(["b", "a"], dtype=object),
            ),
            (
                pd.array([2, 1, np.nan, 2], dtype="Int64"),
                pd.array([2, 1], dtype="Int64"),
            ),
        ],
        ids=["numpy_array", "extension_array"],
    )
    def test_factorize_use_na_sentinel(self, sort, data, uniques):
        """With ``use_na_sentinel=True`` missing values get code -1.

        ``uniques`` is the parametrized expectation in order of appearance.
        """
        # Bind the result to its own name: rebinding ``uniques`` here made the
        # uniques assertion below compare the result with itself.
        codes, result_uniques = algos.factorize(data, sort=sort, use_na_sentinel=True)
        if sort:
            expected_codes = np.array([1, 0, -1, 1], dtype=np.intp)
            expected_uniques = algos.safe_sort(uniques)
        else:
            expected_codes = np.array([0, 1, -1, 0], dtype=np.intp)
            expected_uniques = uniques
        tm.assert_numpy_array_equal(codes, expected_codes)
        if isinstance(data, np.ndarray):
            tm.assert_numpy_array_equal(result_uniques, expected_uniques)
        else:
            tm.assert_extension_array_equal(result_uniques, expected_uniques)

    @pytest.mark.parametrize(
        "data, expected_codes, expected_uniques",
        [
            (
                ["a", None, "b", "a"],
                np.array([0, 1, 2, 0], dtype=np.dtype("intp")),
                np.array(["a", np.nan, "b"], dtype=object),
            ),
            (
                ["a", np.nan, "b", "a"],
                np.array([0, 1, 2, 0], dtype=np.dtype("intp")),
                np.array(["a", np.nan, "b"], dtype=object),
            ),
        ],
    )
    def test_object_factorize_use_na_sentinel_false(
        self, data, expected_codes, expected_uniques
    ):
        """String lists with a missing value keep it as a unique (NaN)."""
        codes, uniques = algos.factorize(data, use_na_sentinel=False)

        tm.assert_numpy_array_equal(uniques, expected_uniques, strict_nan=True)
        tm.assert_numpy_array_equal(codes, expected_codes, strict_nan=True)

    @pytest.mark.parametrize(
        "data, expected_codes, expected_uniques",
        [
            (
                [1, None, 1, 2],
                np.array([0, 1, 0, 2], dtype=np.dtype("intp")),
                np.array([1, np.nan, 2], dtype="O"),
            ),
            (
                [1, np.nan, 1, 2],
                np.array([0, 1, 0, 2], dtype=np.dtype("intp")),
                np.array([1, np.nan, 2], dtype=np.float64),
            ),
        ],
    )
    def test_int_factorize_use_na_sentinel_false(
        self, data, expected_codes, expected_uniques
    ):
        """Integer lists with a missing value keep it as a unique (NaN)."""
        codes, uniques = algos.factorize(data, use_na_sentinel=False)

        tm.assert_numpy_array_equal(uniques, expected_uniques, strict_nan=True)
        tm.assert_numpy_array_equal(codes, expected_codes, strict_nan=True)

    @pytest.mark.parametrize(
        "data, expected_codes, expected_uniques",
        [
            (
                Index(Categorical(["a", "a", "b"])),
                np.array([0, 0, 1], dtype=np.intp),
                CategoricalIndex(["a", "b"], categories=["a", "b"], dtype="category"),
            ),
            (
                Series(Categorical(["a", "a", "b"])),
                np.array([0, 0, 1], dtype=np.intp),
                CategoricalIndex(["a", "b"], categories=["a", "b"], dtype="category"),
            ),
            (
                Series(DatetimeIndex(["2017", "2017"], tz="US/Eastern")),
                np.array([0, 0], dtype=np.intp),
                DatetimeIndex(["2017"], tz="US/Eastern"),
            ),
        ],
    )
    def test_factorize_mixed_values(self, data, expected_codes, expected_uniques):
        """Categorical and tz-aware inputs factorize to Index uniques."""
        # GH 19721
        codes, uniques = algos.factorize(data)
        tm.assert_numpy_array_equal(codes, expected_codes)
        tm.assert_index_equal(uniques, expected_uniques)
- class TestUnique:
- def test_ints(self):
- arr = np.random.randint(0, 100, size=50)
- result = algos.unique(arr)
- assert isinstance(result, np.ndarray)
- def test_objects(self):
- arr = np.random.randint(0, 100, size=50).astype("O")
- result = algos.unique(arr)
- assert isinstance(result, np.ndarray)
- def test_object_refcount_bug(self):
- lst = ["A", "B", "C", "D", "E"]
- for i in range(1000):
- len(algos.unique(lst))
- def test_on_index_object(self):
- mindex = MultiIndex.from_arrays(
- [np.arange(5).repeat(5), np.tile(np.arange(5), 5)]
- )
- expected = mindex.values
- expected.sort()
- mindex = mindex.repeat(2)
- result = pd.unique(mindex)
- result.sort()
- tm.assert_almost_equal(result, expected)
- def test_dtype_preservation(self, any_numpy_dtype):
- # GH 15442
- if any_numpy_dtype in (tm.BYTES_DTYPES + tm.STRING_DTYPES):
- data = [1, 2, 2]
- uniques = [1, 2]
- elif is_integer_dtype(any_numpy_dtype):
- data = [1, 2, 2]
- uniques = [1, 2]
- elif is_float_dtype(any_numpy_dtype):
- data = [1, 2, 2]
- uniques = [1.0, 2.0]
- elif is_complex_dtype(any_numpy_dtype):
- data = [complex(1, 0), complex(2, 0), complex(2, 0)]
- uniques = [complex(1, 0), complex(2, 0)]
- elif is_bool_dtype(any_numpy_dtype):
- data = [True, True, False]
- uniques = [True, False]
- elif is_object_dtype(any_numpy_dtype):
- data = ["A", "B", "B"]
- uniques = ["A", "B"]
- else:
- # datetime64[ns]/M8[ns]/timedelta64[ns]/m8[ns] tested elsewhere
- data = [1, 2, 2]
- uniques = [1, 2]
- result = Series(data, dtype=any_numpy_dtype).unique()
- expected = np.array(uniques, dtype=any_numpy_dtype)
- if any_numpy_dtype in tm.STRING_DTYPES:
- expected = expected.astype(object)
- if expected.dtype.kind in ["m", "M"]:
- # We get TimedeltaArray/DatetimeArray
- assert isinstance(result, (DatetimeArray, TimedeltaArray))
- result = np.array(result)
- tm.assert_numpy_array_equal(result, expected)
- def test_datetime64_dtype_array_returned(self):
- # GH 9431
- expected = np.array(
- [
- "2015-01-03T00:00:00.000000000",
- "2015-01-01T00:00:00.000000000",
- ],
- dtype="M8[ns]",
- )
- dt_index = to_datetime(
- [
- "2015-01-03T00:00:00.000000000",
- "2015-01-01T00:00:00.000000000",
- "2015-01-01T00:00:00.000000000",
- ]
- )
- result = algos.unique(dt_index)
- tm.assert_numpy_array_equal(result, expected)
- assert result.dtype == expected.dtype
- s = Series(dt_index)
- result = algos.unique(s)
- tm.assert_numpy_array_equal(result, expected)
- assert result.dtype == expected.dtype
- arr = s.values
- result = algos.unique(arr)
- tm.assert_numpy_array_equal(result, expected)
- assert result.dtype == expected.dtype
- def test_datetime_non_ns(self):
- a = np.array(["2000", "2000", "2001"], dtype="datetime64[s]")
- result = pd.unique(a)
- expected = np.array(["2000", "2001"], dtype="datetime64[s]")
- tm.assert_numpy_array_equal(result, expected)
- def test_timedelta_non_ns(self):
- a = np.array(["2000", "2000", "2001"], dtype="timedelta64[s]")
- result = pd.unique(a)
- expected = np.array([2000, 2001], dtype="timedelta64[s]")
- tm.assert_numpy_array_equal(result, expected)
- def test_timedelta64_dtype_array_returned(self):
- # GH 9431
- expected = np.array([31200, 45678, 10000], dtype="m8[ns]")
- td_index = to_timedelta([31200, 45678, 31200, 10000, 45678])
- result = algos.unique(td_index)
- tm.assert_numpy_array_equal(result, expected)
- assert result.dtype == expected.dtype
- s = Series(td_index)
- result = algos.unique(s)
- tm.assert_numpy_array_equal(result, expected)
- assert result.dtype == expected.dtype
- arr = s.values
- result = algos.unique(arr)
- tm.assert_numpy_array_equal(result, expected)
- assert result.dtype == expected.dtype
- def test_uint64_overflow(self):
- s = Series([1, 2, 2**63, 2**63], dtype=np.uint64)
- exp = np.array([1, 2, 2**63], dtype=np.uint64)
- tm.assert_numpy_array_equal(algos.unique(s), exp)
- def test_nan_in_object_array(self):
- duplicated_items = ["a", np.nan, "c", "c"]
- result = pd.unique(duplicated_items)
- expected = np.array(["a", np.nan, "c"], dtype=object)
- tm.assert_numpy_array_equal(result, expected)
- def test_categorical(self):
- # we are expecting to return in the order
- # of appearance
- expected = Categorical(list("bac"))
- # we are expecting to return in the order
- # of the categories
- expected_o = Categorical(list("bac"), categories=list("abc"), ordered=True)
- # GH 15939
- c = Categorical(list("baabc"))
- result = c.unique()
- tm.assert_categorical_equal(result, expected)
- result = algos.unique(c)
- tm.assert_categorical_equal(result, expected)
- c = Categorical(list("baabc"), ordered=True)
- result = c.unique()
- tm.assert_categorical_equal(result, expected_o)
- result = algos.unique(c)
- tm.assert_categorical_equal(result, expected_o)
- # Series of categorical dtype
- s = Series(Categorical(list("baabc")), name="foo")
- result = s.unique()
- tm.assert_categorical_equal(result, expected)
- result = pd.unique(s)
- tm.assert_categorical_equal(result, expected)
- # CI -> return CI
- ci = CategoricalIndex(Categorical(list("baabc"), categories=list("abc")))
- expected = CategoricalIndex(expected)
- result = ci.unique()
- tm.assert_index_equal(result, expected)
- result = pd.unique(ci)
- tm.assert_index_equal(result, expected)
- def test_datetime64tz_aware(self):
- # GH 15939
- result = Series(
- Index(
- [
- Timestamp("20160101", tz="US/Eastern"),
- Timestamp("20160101", tz="US/Eastern"),
- ]
- )
- ).unique()
- expected = DatetimeArray._from_sequence(
- np.array([Timestamp("2016-01-01 00:00:00-0500", tz="US/Eastern")])
- )
- tm.assert_extension_array_equal(result, expected)
- result = Index(
- [
- Timestamp("20160101", tz="US/Eastern"),
- Timestamp("20160101", tz="US/Eastern"),
- ]
- ).unique()
- expected = DatetimeIndex(
- ["2016-01-01 00:00:00"], dtype="datetime64[ns, US/Eastern]", freq=None
- )
- tm.assert_index_equal(result, expected)
- result = pd.unique(
- Series(
- Index(
- [
- Timestamp("20160101", tz="US/Eastern"),
- Timestamp("20160101", tz="US/Eastern"),
- ]
- )
- )
- )
- expected = DatetimeArray._from_sequence(
- np.array([Timestamp("2016-01-01", tz="US/Eastern")])
- )
- tm.assert_extension_array_equal(result, expected)
- result = pd.unique(
- Index(
- [
- Timestamp("20160101", tz="US/Eastern"),
- Timestamp("20160101", tz="US/Eastern"),
- ]
- )
- )
- expected = DatetimeIndex(
- ["2016-01-01 00:00:00"], dtype="datetime64[ns, US/Eastern]", freq=None
- )
- tm.assert_index_equal(result, expected)
- def test_order_of_appearance(self):
- # 9346
- # light testing of guarantee of order of appearance
- # these also are the doc-examples
- result = pd.unique(Series([2, 1, 3, 3]))
- tm.assert_numpy_array_equal(result, np.array([2, 1, 3], dtype="int64"))
- result = pd.unique(Series([2] + [1] * 5))
- tm.assert_numpy_array_equal(result, np.array([2, 1], dtype="int64"))
- result = pd.unique(Series([Timestamp("20160101"), Timestamp("20160101")]))
- expected = np.array(["2016-01-01T00:00:00.000000000"], dtype="datetime64[ns]")
- tm.assert_numpy_array_equal(result, expected)
- result = pd.unique(
- Index(
- [
- Timestamp("20160101", tz="US/Eastern"),
- Timestamp("20160101", tz="US/Eastern"),
- ]
- )
- )
- expected = DatetimeIndex(
- ["2016-01-01 00:00:00"], dtype="datetime64[ns, US/Eastern]", freq=None
- )
- tm.assert_index_equal(result, expected)
- result = pd.unique(list("aabc"))
- expected = np.array(["a", "b", "c"], dtype=object)
- tm.assert_numpy_array_equal(result, expected)
- result = pd.unique(Series(Categorical(list("aabc"))))
- expected = Categorical(list("abc"))
- tm.assert_categorical_equal(result, expected)
- @pytest.mark.parametrize(
- "arg ,expected",
- [
- (("1", "1", "2"), np.array(["1", "2"], dtype=object)),
- (("foo",), np.array(["foo"], dtype=object)),
- ],
- )
- def test_tuple_with_strings(self, arg, expected):
- # see GH 17108
- result = pd.unique(arg)
- tm.assert_numpy_array_equal(result, expected)
- def test_obj_none_preservation(self):
- # GH 20866
- arr = np.array(["foo", None], dtype=object)
- result = pd.unique(arr)
- expected = np.array(["foo", None], dtype=object)
- tm.assert_numpy_array_equal(result, expected, strict_nan=True)
- def test_signed_zero(self):
- # GH 21866
- a = np.array([-0.0, 0.0])
- result = pd.unique(a)
- expected = np.array([-0.0]) # 0.0 and -0.0 are equivalent
- tm.assert_numpy_array_equal(result, expected)
- def test_different_nans(self):
- # GH 21866
- # create different nans from bit-patterns:
- NAN1 = struct.unpack("d", struct.pack("=Q", 0x7FF8000000000000))[0]
- NAN2 = struct.unpack("d", struct.pack("=Q", 0x7FF8000000000001))[0]
- assert NAN1 != NAN1
- assert NAN2 != NAN2
- a = np.array([NAN1, NAN2]) # NAN1 and NAN2 are equivalent
- result = pd.unique(a)
- expected = np.array([np.nan])
- tm.assert_numpy_array_equal(result, expected)
- @pytest.mark.parametrize("el_type", [np.float64, object])
- def test_first_nan_kept(self, el_type):
- # GH 22295
- # create different nans from bit-patterns:
- bits_for_nan1 = 0xFFF8000000000001
- bits_for_nan2 = 0x7FF8000000000001
- NAN1 = struct.unpack("d", struct.pack("=Q", bits_for_nan1))[0]
- NAN2 = struct.unpack("d", struct.pack("=Q", bits_for_nan2))[0]
- assert NAN1 != NAN1
- assert NAN2 != NAN2
- a = np.array([NAN1, NAN2], dtype=el_type)
- result = pd.unique(a)
- assert result.size == 1
- # use bit patterns to identify which nan was kept:
- result_nan_bits = struct.unpack("=Q", struct.pack("d", result[0]))[0]
- assert result_nan_bits == bits_for_nan1
- def test_do_not_mangle_na_values(self, unique_nulls_fixture, unique_nulls_fixture2):
- # GH 22295
- if unique_nulls_fixture is unique_nulls_fixture2:
- return # skip it, values not unique
- a = np.array([unique_nulls_fixture, unique_nulls_fixture2], dtype=object)
- result = pd.unique(a)
- assert result.size == 2
- assert a[0] is unique_nulls_fixture
- assert a[1] is unique_nulls_fixture2
- def test_unique_masked(self, any_numeric_ea_dtype):
- # GH#48019
- ser = Series([1, pd.NA, 2] * 3, dtype=any_numeric_ea_dtype)
- result = pd.unique(ser)
- expected = pd.array([1, pd.NA, 2], dtype=any_numeric_ea_dtype)
- tm.assert_extension_array_equal(result, expected)
- def test_nunique_ints(index_or_series_or_array):
- # GH#36327
- values = index_or_series_or_array(np.random.randint(0, 20, 30))
- result = algos.nunique_ints(values)
- expected = len(algos.unique(values))
- assert result == expected
class TestIsin:
    """Tests for ``algos.isin`` and the ``Series``/``DataFrame`` ``isin`` methods."""

    def test_invalid(self):
        """A scalar for ``comps`` or ``values`` raises ``TypeError``."""
        msg = (
            r"only list-like objects are allowed to be passed to isin\(\), "
            r"you passed a \[int\]"
        )
        with pytest.raises(TypeError, match=msg):
            algos.isin(1, 1)
        with pytest.raises(TypeError, match=msg):
            algos.isin(1, [1])
        with pytest.raises(TypeError, match=msg):
            algos.isin([1], 1)

    def test_basic(self):
        """Membership for lists, arrays, Series and sets of ints or strings."""
        result = algos.isin([1, 2], [1])
        expected = np.array([True, False])
        tm.assert_numpy_array_equal(result, expected)

        result = algos.isin(np.array([1, 2]), [1])
        expected = np.array([True, False])
        tm.assert_numpy_array_equal(result, expected)

        result = algos.isin(Series([1, 2]), [1])
        expected = np.array([True, False])
        tm.assert_numpy_array_equal(result, expected)

        result = algos.isin(Series([1, 2]), Series([1]))
        expected = np.array([True, False])
        tm.assert_numpy_array_equal(result, expected)

        result = algos.isin(Series([1, 2]), {1})
        expected = np.array([True, False])
        tm.assert_numpy_array_equal(result, expected)

        result = algos.isin(["a", "b"], ["a"])
        expected = np.array([True, False])
        tm.assert_numpy_array_equal(result, expected)

        result = algos.isin(Series(["a", "b"]), Series(["a"]))
        expected = np.array([True, False])
        tm.assert_numpy_array_equal(result, expected)

        result = algos.isin(Series(["a", "b"]), {"a"})
        expected = np.array([True, False])
        tm.assert_numpy_array_equal(result, expected)

        # strings never match an int lookup value
        result = algos.isin(["a", "b"], [1])
        expected = np.array([False, False])
        tm.assert_numpy_array_equal(result, expected)

    def test_i8(self):
        """datetime64 and timedelta64 arrays against list, slice and set values."""
        arr = date_range("20130101", periods=3).values
        result = algos.isin(arr, [arr[0]])
        expected = np.array([True, False, False])
        tm.assert_numpy_array_equal(result, expected)

        result = algos.isin(arr, arr[0:2])
        expected = np.array([True, True, False])
        tm.assert_numpy_array_equal(result, expected)

        result = algos.isin(arr, set(arr[0:2]))
        expected = np.array([True, True, False])
        tm.assert_numpy_array_equal(result, expected)

        arr = timedelta_range("1 day", periods=3).values
        result = algos.isin(arr, [arr[0]])
        expected = np.array([True, False, False])
        tm.assert_numpy_array_equal(result, expected)

        result = algos.isin(arr, arr[0:2])
        expected = np.array([True, True, False])
        tm.assert_numpy_array_equal(result, expected)

        result = algos.isin(arr, set(arr[0:2]))
        expected = np.array([True, True, False])
        tm.assert_numpy_array_equal(result, expected)

    @pytest.mark.parametrize("dtype1", ["m8[ns]", "M8[ns]", "M8[ns, UTC]", "period[D]"])
    @pytest.mark.parametrize("dtype", ["i8", "f8", "u8"])
    def test_isin_datetimelike_values_numeric_comps(self, dtype, dtype1):
        """Numeric ``comps`` against datetime-like ``values`` gives all-False."""
        # Anything but object and we get all-False shortcut

        dta = date_range("2013-01-01", periods=3)._values
        if dtype1 == "period[D]":
            # TODO: fix Series.view to get this on its own
            arr = dta.to_period("D")
        elif dtype1 == "M8[ns, UTC]":
            # TODO: fix Series.view to get this on its own
            arr = dta.tz_localize("UTC")
        else:
            arr = Series(dta.view("i8")).view(dtype1)._values

        comps = arr.view("i8").astype(dtype)

        result = algos.isin(comps, arr)
        expected = np.zeros(comps.shape, dtype=bool)
        tm.assert_numpy_array_equal(result, expected)

    def test_large(self):
        """Two-million-element datetime array; only the first two values match."""
        s = date_range("20000101", periods=2000000, freq="s").values
        result = algos.isin(s, s[0:2])
        expected = np.zeros(len(s), dtype=bool)
        expected[0] = True
        expected[1] = True
        tm.assert_numpy_array_equal(result, expected)

    def test_categorical_from_codes(self):
        """``isin`` between Series of categoricals built from codes (GH 16639)."""
        vals = np.array([0, 1, 2, 0])
        cats = ["a", "b", "c"]
        # from_codes is called on the class; no throwaway instance is needed
        Sd = Series(Categorical.from_codes(vals, cats))
        St = Series(Categorical.from_codes(np.array([0, 1]), cats))
        expected = np.array([True, True, False, True])
        result = algos.isin(Sd, St)
        tm.assert_numpy_array_equal(expected, result)

    def test_categorical_isin(self):
        """``isin`` between two Categoricals built from codes."""
        vals = np.array([0, 1, 2, 0])
        cats = ["a", "b", "c"]
        cat = Categorical.from_codes(vals, cats)
        other = Categorical.from_codes(np.array([0, 1]), cats)

        expected = np.array([True, True, False, True])
        result = algos.isin(cat, other)
        tm.assert_numpy_array_equal(expected, result)

    def test_same_nan_is_in(self):
        """The same ``np.nan`` object is found in a list containing it (GH 22160)."""
        # nan is special, because "a is b" does not imply "a == b";
        # at least, isin() should follow python's "np.nan in [nan] == True".
        # Casting to np.float64 (i.e. creating another float object)
        # somewhere on the way could jeopardize this behavior.
        comps = [np.nan]  # could be cast to float64
        values = [np.nan]
        expected = np.array([True])
        result = algos.isin(comps, values)
        tm.assert_numpy_array_equal(expected, result)

    def test_same_nan_is_in_large(self):
        """NaN is matched in an array of more than one million elements."""
        # https://github.com/pandas-dev/pandas/issues/22205
        s = np.tile(1.0, 1_000_001)
        s[0] = np.nan
        result = algos.isin(s, [np.nan, 1])
        expected = np.ones(len(s), dtype=bool)
        tm.assert_numpy_array_equal(result, expected)

    def test_same_nan_is_in_large_series(self):
        """``Series.isin`` matches NaN in a Series of more than one million rows."""
        # https://github.com/pandas-dev/pandas/issues/22205
        s = np.tile(1.0, 1_000_001)
        # Put the NaN in place before building the Series, so the Series
        # contains it whether or not the constructor copies the array.
        s[0] = np.nan
        series = Series(s)
        result = series.isin([np.nan, 1])
        expected = Series(np.ones(len(s), dtype=bool))
        tm.assert_series_equal(result, expected)

    def test_same_object_is_in(self):
        """An object that never equals anything is still found by identity."""
        # GH 22160
        # there could be special treatment for nans
        # the user however could define a custom class
        # with similar behavior, then we at least should
        # fall back to usual python's behavior: "a in [a] == True"
        class LikeNan:
            def __eq__(self, other) -> bool:
                return False

            def __hash__(self):
                return 0

        a, b = LikeNan(), LikeNan()
        # same object -> True
        tm.assert_numpy_array_equal(algos.isin([a], [a]), np.array([True]))
        # different objects -> False
        tm.assert_numpy_array_equal(algos.isin([a], [b]), np.array([False]))

    def test_different_nans(self):
        """Distinct float NaN objects are treated as equivalent (GH 22160)."""
        # all nans are handled as equivalent

        comps = [float("nan")]
        values = [float("nan")]
        assert comps[0] is not values[0]  # different nan-objects

        # as list of python-objects:
        result = algos.isin(comps, values)
        tm.assert_numpy_array_equal(np.array([True]), result)

        # as object-array:
        result = algos.isin(
            np.asarray(comps, dtype=object), np.asarray(values, dtype=object)
        )
        tm.assert_numpy_array_equal(np.array([True]), result)

        # as float64-array:
        result = algos.isin(
            np.asarray(comps, dtype=np.float64), np.asarray(values, dtype=np.float64)
        )
        tm.assert_numpy_array_equal(np.array([True]), result)

    def test_no_cast(self):
        """The int 42 must not match the string "42" (GH 22160)."""
        # ensure 42 is not cast to a string
        comps = ["ss", 42]
        values = ["42"]
        expected = np.array([False, False])
        result = algos.isin(comps, values)
        tm.assert_numpy_array_equal(expected, result)

    @pytest.mark.parametrize("empty", [[], Series(dtype=object), np.array([])])
    def test_empty(self, empty):
        """Empty ``values`` of any container type give all-False (gh-16991)."""
        vals = Index(["a", "b"])
        expected = np.array([False, False])

        result = algos.isin(vals, empty)
        tm.assert_numpy_array_equal(expected, result)

    def test_different_nan_objects(self):
        """Only the float NaN matches a float NaN lookup value (GH 22119)."""
        comps = np.array(["nan", np.nan * 1j, float("nan")], dtype=object)
        vals = np.array([float("nan")], dtype=object)
        expected = np.array([False, False, True])
        result = algos.isin(comps, vals)
        tm.assert_numpy_array_equal(expected, result)

    def test_different_nans_as_float64(self):
        """float64 NaNs with different bit patterns match each other (GH 21866)."""
        # create different nans from bit-patterns,
        # these nans will land in different buckets in the hash-table
        # if no special care is taken
        NAN1 = struct.unpack("d", struct.pack("=Q", 0x7FF8000000000000))[0]
        NAN2 = struct.unpack("d", struct.pack("=Q", 0x7FF8000000000001))[0]
        assert NAN1 != NAN1
        assert NAN2 != NAN2

        # check that NAN1 and NAN2 are equivalent:
        arr = np.array([NAN1, NAN2], dtype=np.float64)
        lookup1 = np.array([NAN1], dtype=np.float64)
        result = algos.isin(arr, lookup1)
        expected = np.array([True, True])
        tm.assert_numpy_array_equal(result, expected)

        lookup2 = np.array([NAN2], dtype=np.float64)
        result = algos.isin(arr, lookup2)
        expected = np.array([True, True])
        tm.assert_numpy_array_equal(result, expected)

    def test_isin_int_df_string_search(self):
        """Comparing df with int`s (1,2) with a string at isin() ("1")
        -> should not match values because int 1 is not equal str 1"""
        df = DataFrame({"values": [1, 2]})
        result = df.isin(["1"])
        expected_false = DataFrame({"values": [False, False]})
        tm.assert_frame_equal(result, expected_false)

    def test_isin_nan_df_string_search(self):
        """Comparing df with nan value (np.nan,2) with a string at isin() ("NaN")
        -> should not match values because np.nan is not equal str NaN"""
        df = DataFrame({"values": [np.nan, 2]})
        result = df.isin(["NaN"])
        expected_false = DataFrame({"values": [False, False]})
        tm.assert_frame_equal(result, expected_false)

    def test_isin_float_df_string_search(self):
        """Comparing df with floats (1.4245,2.32441) with a string at isin() ("1.4245")
        -> should not match values because float 1.4245 is not equal str 1.4245"""
        df = DataFrame({"values": [1.4245, 2.32441]})
        result = df.isin(["1.4245"])
        expected_false = DataFrame({"values": [False, False]})
        tm.assert_frame_equal(result, expected_false)

    def test_isin_unsigned_dtype(self):
        """Two close but different large uint64 values do not match (GH#46485)."""
        ser = Series([1378774140726870442], dtype=np.uint64)
        result = ser.isin([1378774140726870528])
        expected = Series(False)
        tm.assert_series_equal(result, expected)
class TestValueCounts:
    """Tests for ``algos.value_counts`` and ``Series.value_counts``."""

    def test_value_counts(self):
        """Counting ``cut`` output gives one count per interval."""
        np.random.seed(1234)
        from pandas.core.reshape.tile import cut

        samples = np.random.randn(4)
        binned = cut(samples, 4)
        counts = algos.value_counts(binned)

        edges = [-1.194, -0.535, 0.121, 0.777, 1.433]
        intervals = IntervalIndex.from_breaks(edges).astype(CDT(ordered=True))
        wanted = Series([1, 1, 1, 1], index=intervals, name="count")
        tm.assert_series_equal(counts.sort_index(), wanted.sort_index())

    def test_value_counts_bins(self):
        """``bins`` groups plain integers into interval buckets."""
        data = [1, 2, 3, 4]

        one_bin = algos.value_counts(data, bins=1)
        wanted = Series(
            [4], index=IntervalIndex.from_tuples([(0.996, 4.0)]), name="count"
        )
        tm.assert_series_equal(one_bin, wanted)

        two_bins = algos.value_counts(data, bins=2, sort=False)
        wanted = Series(
            [2, 2],
            index=IntervalIndex.from_tuples([(0.996, 2.5), (2.5, 4.0)]),
            name="count",
        )
        tm.assert_series_equal(two_bins, wanted)

    def test_value_counts_dtypes(self):
        """Mixed int/float collapse to one entry; ``bins`` rejects strings."""
        assert len(algos.value_counts([1, 1.0])) == 1
        assert len(algos.value_counts([1, 1.0], bins=1)) == 1
        assert len(algos.value_counts(Series([1, 1.0, "1"]))) == 2  # object

        msg = "bins argument only works with numeric data"
        with pytest.raises(TypeError, match=msg):
            algos.value_counts(["1", 1], bins=1)

    def test_value_counts_nat(self):
        """``NaT`` is dropped by default and kept with ``dropna=False``."""
        timedeltas = Series([np.timedelta64(10000), NaT], dtype="timedelta64[ns]")
        datetimes = to_datetime(["NaT", "2014-01-01"])

        for values in (timedeltas, datetimes):
            without_na = algos.value_counts(values)
            with_na = algos.value_counts(values, dropna=False)
            assert len(without_na) == 1
            assert len(with_na) == 2

        wanted = Series({Timestamp("2014-01-01 00:00:00"): 1}, name="count")
        tm.assert_series_equal(algos.value_counts(datetimes), wanted)
        # TODO same for (timedelta)

    def test_value_counts_datetime_outofbounds(self):
        """Out-of-bounds datetimes are counted as objects (GH 13663)."""
        y3000, y5000, y6000 = (datetime(year, 1, 1) for year in (3000, 5000, 6000))
        far_future = Series([y3000, y5000, y5000, y6000, y3000, y3000])
        counts = far_future.value_counts()

        wanted_index = Index(
            [datetime(3000, 1, 1), datetime(5000, 1, 1), datetime(6000, 1, 1)],
            dtype=object,
        )
        wanted = Series([3, 2, 1], index=wanted_index, name="count")
        tm.assert_series_equal(counts, wanted)

        # GH 12424
        parsed = to_datetime(Series(["2362-01-01", np.nan]), errors="ignore")
        wanted = Series(["2362-01-01", np.nan], dtype=object)
        tm.assert_series_equal(parsed, wanted)

    def test_categorical(self):
        """Categorical data is counted with a CategoricalIndex result."""
        ser = Series(Categorical(list("aaabbc")))
        counts = ser.value_counts()

        wanted = Series(
            [3, 2, 1], index=CategoricalIndex(["a", "b", "c"]), name="count"
        )
        tm.assert_series_equal(counts, wanted, check_index_type=True)

        # preserve order?
        ser = ser.cat.as_ordered()
        counts = ser.value_counts()
        wanted.index = wanted.index.as_ordered()
        tm.assert_series_equal(counts, wanted, check_index_type=True)

    def test_categorical_nans(self):
        """NaN in categorical data follows ``dropna``, ordered or not."""
        ser = Series(Categorical(list("aaaaabbbcc")))  # 4,3,2,1 (nan)
        ser.iloc[1] = np.nan

        counts = ser.value_counts()
        wanted = Series(
            [4, 3, 2],
            index=CategoricalIndex(["a", "b", "c"], categories=["a", "b", "c"]),
            name="count",
        )
        tm.assert_series_equal(counts, wanted, check_index_type=True)

        counts = ser.value_counts(dropna=False)
        wanted = Series(
            [4, 3, 2, 1], index=CategoricalIndex(["a", "b", "c", np.nan]), name="count"
        )
        tm.assert_series_equal(counts, wanted, check_index_type=True)

        # out of order
        ser = Series(
            Categorical(list("aaaaabbbcc"), ordered=True, categories=["b", "a", "c"])
        )
        ser.iloc[1] = np.nan

        counts = ser.value_counts()
        wanted = Series(
            [4, 3, 2],
            index=CategoricalIndex(
                ["a", "b", "c"],
                categories=["b", "a", "c"],
                ordered=True,
            ),
            name="count",
        )
        tm.assert_series_equal(counts, wanted, check_index_type=True)

        counts = ser.value_counts(dropna=False)
        wanted = Series(
            [4, 3, 2, 1],
            index=CategoricalIndex(
                ["a", "b", "c", np.nan], categories=["b", "a", "c"], ordered=True
            ),
            name="count",
        )
        tm.assert_series_equal(counts, wanted, check_index_type=True)

    def test_categorical_zeroes(self):
        """An unused category is reported with a count of zero."""
        # keep the `d` category with 0
        ser = Series(Categorical(list("bbbaac"), categories=list("abcd"), ordered=True))
        counts = ser.value_counts()
        wanted_index = Categorical(
            ["b", "a", "c", "d"], categories=list("abcd"), ordered=True
        )
        wanted = Series([3, 2, 1, 0], index=wanted_index, name="count")
        tm.assert_series_equal(counts, wanted, check_index_type=True)

    def test_dropna(self):
        """``dropna`` only changes the result when missing values are present."""
        # https://github.com/pandas-dev/pandas/issues/9443#issuecomment-73719328
        bool_counts = Series([2, 1], index=[True, False], name="count")
        float_counts = Series([2, 1], index=[5.0, 10.3], name="count")

        # each entry: (input values, dropna flag, expected counts)
        cases = [
            ([True, True, False], True, bool_counts),
            ([True, True, False], False, bool_counts),
            (
                [True] * 3 + [False] * 2 + [None] * 5,
                True,
                Series([3, 2], index=Index([True, False], dtype=object), name="count"),
            ),
            (
                [True] * 5 + [False] * 3 + [None] * 2,
                False,
                Series([5, 3, 2], index=[True, False, np.nan], name="count"),
            ),
            ([10.3, 5.0, 5.0], True, float_counts),
            ([10.3, 5.0, 5.0], False, float_counts),
            ([10.3, 5.0, 5.0, None], True, float_counts),
            (
                [10.3, 10.3, 5.0, 5.0, 5.0, None],
                False,
                Series([3, 2, 1], index=[5.0, 10.3, np.nan], name="count"),
            ),
        ]
        for values, dropna, wanted in cases:
            counts = Series(values).value_counts(dropna=dropna)
            tm.assert_series_equal(counts, wanted)

    @pytest.mark.parametrize("dtype", (np.float64, object, "M8[ns]"))
    def test_value_counts_normalized(self, dtype):
        """``normalize=True`` returns proportions, with or without NaN (GH12558)."""
        raw = Series([1] * 2 + [2] * 3 + [np.nan] * 5)
        typed = raw.astype(dtype)

        proportions = typed.value_counts(normalize=True, dropna=False)
        wanted = Series(
            [0.5, 0.3, 0.2],
            index=Series([np.nan, 2.0, 1.0], dtype=dtype),
            name="proportion",
        )
        tm.assert_series_equal(proportions, wanted)

        proportions = typed.value_counts(normalize=True, dropna=True)
        wanted = Series(
            [0.6, 0.4], index=Series([2.0, 1.0], dtype=dtype), name="proportion"
        )
        tm.assert_series_equal(proportions, wanted)

    def test_value_counts_uint64(self):
        """Values at ``2**63`` are counted for uint64 and object arrays."""
        big = 2**63

        unsigned = np.array([big], dtype=np.uint64)
        wanted = Series([1], index=[big], name="count")
        tm.assert_series_equal(algos.value_counts(unsigned), wanted)

        mixed = np.array([-1, big], dtype=object)
        wanted = Series([1, 1], index=[-1, big], name="count")
        tm.assert_series_equal(algos.value_counts(mixed), wanted)
class TestDuplicated:
    """Tests for ``algos.duplicated``, ``duplicated`` methods and ``pd.unique``."""

    def test_duplicated_with_nas(self):
        """NaN keys, alone or inside tuples, are treated as duplicates of each other."""
        keys = np.array([0, 1, np.nan, 0, 2, np.nan], dtype=object)

        default_mask = algos.duplicated(keys)
        tm.assert_numpy_array_equal(
            default_mask, np.array([False, False, False, True, False, True])
        )

        first_mask = algos.duplicated(keys, keep="first")
        tm.assert_numpy_array_equal(
            first_mask, np.array([False, False, False, True, False, True])
        )

        last_mask = algos.duplicated(keys, keep="last")
        tm.assert_numpy_array_equal(
            last_mask, np.array([True, False, True, False, False, False])
        )

        all_mask = algos.duplicated(keys, keep=False)
        tm.assert_numpy_array_equal(
            all_mask, np.array([True, False, True, True, False, True])
        )

        # tuple keys: four distinct pairs, each repeated once
        left = [0, 0, np.nan, np.nan] * 2
        right = [0, np.nan, 0, np.nan] * 2
        keys = np.empty(8, dtype=object)
        for position, pair in enumerate(zip(left, right)):
            keys[position] = pair

        all_false = [False] * 4
        all_true = [True] * 4

        default_mask = algos.duplicated(keys)
        tm.assert_numpy_array_equal(default_mask, np.array(all_false + all_true))

        last_mask = algos.duplicated(keys, keep="last")
        tm.assert_numpy_array_equal(last_mask, np.array(all_true + all_false))

        all_mask = algos.duplicated(keys, keep=False)
        tm.assert_numpy_array_equal(all_mask, np.array(all_true + all_true))

    @pytest.mark.parametrize(
        "case",
        [
            np.array([1, 2, 1, 5, 3, 2, 4, 1, 5, 6]),
            np.array([1.1, 2.2, 1.1, np.nan, 3.3, 2.2, 4.4, 1.1, np.nan, 6.6]),
            np.array(
                [
                    1 + 1j,
                    2 + 2j,
                    1 + 1j,
                    5 + 5j,
                    3 + 3j,
                    2 + 2j,
                    4 + 4j,
                    1 + 1j,
                    5 + 5j,
                    6 + 6j,
                ]
            ),
            np.array(["a", "b", "a", "e", "c", "b", "d", "a", "e", "f"], dtype=object),
            np.array(
                [1, 2**63, 1, 3**5, 10, 2**63, 39, 1, 3**5, 7], dtype=np.uint64
            ),
        ],
    )
    def test_numeric_object_likes(self, case):
        """Same duplicate pattern for arrays, Index and Series of several dtypes."""
        exp_first = np.array(
            [False, False, True, False, False, True, False, True, True, False]
        )
        exp_last = np.array(
            [True, True, True, True, False, False, False, False, False, False]
        )
        exp_false = exp_first | exp_last
        # checked in this order: keep="first", keep="last", keep=False
        expectations = (("first", exp_first), ("last", exp_last), (False, exp_false))

        for keep, expected in expectations:
            tm.assert_numpy_array_equal(algos.duplicated(case, keep=keep), expected)

        # index
        for idx in [Index(case), Index(case, dtype="category")]:
            for keep, expected in expectations:
                tm.assert_numpy_array_equal(idx.duplicated(keep=keep), expected)

        # series
        for ser in [Series(case), Series(case, dtype="category")]:
            for keep, expected in expectations:
                tm.assert_series_equal(ser.duplicated(keep=keep), Series(expected))

    def test_datetime_likes(self):
        """Same duplicate pattern for datetime, period and timedelta values."""
        dt = [
            "2011-01-01",
            "2011-01-02",
            "2011-01-01",
            "NaT",
            "2011-01-03",
            "2011-01-02",
            "2011-01-04",
            "2011-01-01",
            "NaT",
            "2011-01-06",
        ]
        td = [
            "1 days",
            "2 days",
            "1 days",
            "NaT",
            "3 days",
            "2 days",
            "4 days",
            "1 days",
            "NaT",
            "6 days",
        ]

        cases = [
            np.array([Timestamp(d) for d in dt]),
            np.array([Timestamp(d, tz="US/Eastern") for d in dt]),
            np.array([Period(d, freq="D") for d in dt]),
            np.array([np.datetime64(d) for d in dt]),
            np.array([Timedelta(d) for d in td]),
        ]

        exp_first = np.array(
            [False, False, True, False, False, True, False, True, True, False]
        )
        exp_last = np.array(
            [True, True, True, True, False, False, False, False, False, False]
        )
        exp_false = exp_first | exp_last
        # checked in this order: keep="first", keep="last", keep=False
        expectations = (("first", exp_first), ("last", exp_last), (False, exp_false))

        for case in cases:
            for keep, expected in expectations:
                tm.assert_numpy_array_equal(algos.duplicated(case, keep=keep), expected)

            # index
            for idx in [
                Index(case),
                Index(case, dtype="category"),
                Index(case, dtype=object),
            ]:
                for keep, expected in expectations:
                    tm.assert_numpy_array_equal(idx.duplicated(keep=keep), expected)

            # series
            for ser in [
                Series(case),
                Series(case, dtype="category"),
                Series(case, dtype=object),
            ]:
                for keep, expected in expectations:
                    tm.assert_series_equal(ser.duplicated(keep=keep), Series(expected))

    @pytest.mark.parametrize("case", [Index([1, 2, 3]), pd.RangeIndex(0, 3)])
    def test_unique_index(self, case):
        """A unique index reports ``is_unique`` and no duplicates."""
        assert case.is_unique is True
        no_duplicates = np.array([False, False, False])
        tm.assert_numpy_array_equal(case.duplicated(), no_duplicates)

    @pytest.mark.parametrize(
        "arr, uniques",
        [
            (
                [(0, 0), (0, 1), (1, 0), (1, 1), (0, 0), (0, 1), (1, 0), (1, 1)],
                [(0, 0), (0, 1), (1, 0), (1, 1)],
            ),
            (
                [("b", "c"), ("a", "b"), ("a", "b"), ("b", "c")],
                [("b", "c"), ("a", "b")],
            ),
            ([("a", 1), ("b", 2), ("a", 3), ("a", 1)], [("a", 1), ("b", 2), ("a", 3)]),
        ],
    )
    def test_unique_tuples(self, arr, uniques):
        """``pd.unique`` on a list of tuples returns an object array of tuples."""
        # https://github.com/pandas-dev/pandas/issues/16519
        wanted = np.empty(len(uniques), dtype=object)
        wanted[:] = uniques

        tm.assert_numpy_array_equal(pd.unique(arr), wanted)

    @pytest.mark.parametrize(
        "array,expected",
        [
            (
                [1 + 1j, 0, 1, 1j, 1 + 2j, 1 + 2j],
                # Should return a complex dtype in the future
                np.array([(1 + 1j), 0j, (1 + 0j), 1j, (1 + 2j)], dtype=object),
            )
        ],
    )
    def test_unique_complex_numbers(self, array, expected):
        """``pd.unique`` on a list containing complex numbers (GH 17927)."""
        uniques = pd.unique(array)
        tm.assert_numpy_array_equal(uniques, expected)
- class TestHashTable:
- @pytest.mark.parametrize(
- "htable, tm_dtype",
- [
- (ht.PyObjectHashTable, "String"),
- (ht.StringHashTable, "String"),
- (ht.Float64HashTable, "Float"),
- (ht.Int64HashTable, "Int"),
- (ht.UInt64HashTable, "UInt"),
- ],
- )
- def test_hashtable_unique(self, htable, tm_dtype, writable):
- # output of maker has guaranteed unique elements
- maker = getattr(tm, "make" + tm_dtype + "Index")
- s = Series(maker(1000))
- if htable == ht.Float64HashTable:
- # add NaN for float column
- s.loc[500] = np.nan
- elif htable == ht.PyObjectHashTable:
- # use different NaN types for object column
- s.loc[500:502] = [np.nan, None, NaT]
- # create duplicated selection
- s_duplicated = s.sample(frac=3, replace=True).reset_index(drop=True)
- s_duplicated.values.setflags(write=writable)
- # drop_duplicates has own cython code (hash_table_func_helper.pxi)
- # and is tested separately; keeps first occurrence like ht.unique()
- expected_unique = s_duplicated.drop_duplicates(keep="first").values
- result_unique = htable().unique(s_duplicated.values)
- tm.assert_numpy_array_equal(result_unique, expected_unique)
- # test return_inverse=True
- # reconstruction can only succeed if the inverse is correct
- result_unique, result_inverse = htable().unique(
- s_duplicated.values, return_inverse=True
- )
- tm.assert_numpy_array_equal(result_unique, expected_unique)
- reconstr = result_unique[result_inverse]
- tm.assert_numpy_array_equal(reconstr, s_duplicated.values)
- @pytest.mark.parametrize(
- "htable, tm_dtype",
- [
- (ht.PyObjectHashTable, "String"),
- (ht.StringHashTable, "String"),
- (ht.Float64HashTable, "Float"),
- (ht.Int64HashTable, "Int"),
- (ht.UInt64HashTable, "UInt"),
- ],
- )
- def test_hashtable_factorize(self, htable, tm_dtype, writable):
- # output of maker has guaranteed unique elements
- maker = getattr(tm, "make" + tm_dtype + "Index")
- s = Series(maker(1000))
- if htable == ht.Float64HashTable:
- # add NaN for float column
- s.loc[500] = np.nan
- elif htable == ht.PyObjectHashTable:
- # use different NaN types for object column
- s.loc[500:502] = [np.nan, None, NaT]
- # create duplicated selection
- s_duplicated = s.sample(frac=3, replace=True).reset_index(drop=True)
- s_duplicated.values.setflags(write=writable)
- na_mask = s_duplicated.isna().values
- result_unique, result_inverse = htable().factorize(s_duplicated.values)
- # drop_duplicates has own cython code (hash_table_func_helper.pxi)
- # and is tested separately; keeps first occurrence like ht.factorize()
- # since factorize removes all NaNs, we do the same here
- expected_unique = s_duplicated.dropna().drop_duplicates().values
- tm.assert_numpy_array_equal(result_unique, expected_unique)
- # reconstruction can only succeed if the inverse is correct. Since
- # factorize removes the NaNs, those have to be excluded here as well
- result_reconstruct = result_unique[result_inverse[~na_mask]]
- expected_reconstruct = s_duplicated.dropna().values
- tm.assert_numpy_array_equal(result_reconstruct, expected_reconstruct)
- class TestRank:
- @td.skip_if_no_scipy
- @pytest.mark.parametrize(
- "arr",
- [
- [np.nan, np.nan, 5.0, 5.0, 5.0, np.nan, 1, 2, 3, np.nan],
- [4.0, np.nan, 5.0, 5.0, 5.0, np.nan, 1, 2, 4.0, np.nan],
- ],
- )
- def test_scipy_compat(self, arr):
- from scipy.stats import rankdata
- arr = np.array(arr)
- mask = ~np.isfinite(arr)
- arr = arr.copy()
- result = libalgos.rank_1d(arr)
- arr[mask] = np.inf
- exp = rankdata(arr)
- exp[mask] = np.nan
- tm.assert_almost_equal(result, exp)
- @pytest.mark.parametrize("dtype", np.typecodes["AllInteger"])
- def test_basic(self, writable, dtype):
- exp = np.array([1, 2], dtype=np.float64)
- data = np.array([1, 100], dtype=dtype)
- data.setflags(write=writable)
- ser = Series(data)
- result = algos.rank(ser)
- tm.assert_numpy_array_equal(result, exp)
- @pytest.mark.parametrize("dtype", [np.float64, np.uint64])
- def test_uint64_overflow(self, dtype):
- exp = np.array([1, 2], dtype=np.float64)
- s = Series([1, 2**63], dtype=dtype)
- tm.assert_numpy_array_equal(algos.rank(s), exp)
- def test_too_many_ndims(self):
- arr = np.array([[[1, 2, 3], [4, 5, 6], [7, 8, 9]]])
- msg = "Array with ndim > 2 are not supported"
- with pytest.raises(TypeError, match=msg):
- algos.rank(arr)
- @pytest.mark.single_cpu
- def test_pct_max_many_rows(self):
- # GH 18271
- values = np.arange(2**24 + 1)
- result = algos.rank(values, pct=True).max()
- assert result == 1
- values = np.arange(2**25 + 2).reshape(2**24 + 1, 2)
- result = algos.rank(values, pct=True).max()
- assert result == 1
def test_pad_backfill_object_segfault():
    """Object ``pad``/``backfill`` handle an empty array on either side."""
    empty = np.array([], dtype="O")
    single = np.array([datetime(2010, 12, 31)], dtype="O")

    for fill in (libalgos.pad["object"], libalgos.backfill["object"]):
        # empty "old" side: the single "new" entry has no match
        indexer = fill(empty, single)
        tm.assert_numpy_array_equal(indexer, np.array([-1], dtype=np.intp))

        # empty "new" side: the indexer is empty
        indexer = fill(single, empty)
        tm.assert_numpy_array_equal(indexer, np.array([], dtype=np.intp))
class TestTseriesUtil:
    """Tests for the int64 ``backfill`` and ``pad`` indexers in ``libalgos``."""

    def test_backfill(self):
        """``backfill`` maps each new label to the next old position, else -1."""
        source = Index([1, 5, 10])
        target = Index(list(range(12)))

        indexer = libalgos.backfill["int64_t"](source.values, target.values)
        wanted = np.array([0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, -1], dtype=np.intp)
        tm.assert_numpy_array_equal(indexer, wanted)

        # corner case
        source = Index([1, 4])
        target = Index(list(range(5, 10)))
        indexer = libalgos.backfill["int64_t"](source.values, target.values)
        wanted = np.array([-1, -1, -1, -1, -1], dtype=np.intp)
        tm.assert_numpy_array_equal(indexer, wanted)

    def test_pad(self):
        """``pad`` maps each new label to the previous old position, else -1."""
        source = Index([1, 5, 10])
        target = Index(list(range(12)))

        indexer = libalgos.pad["int64_t"](source.values, target.values)
        wanted = np.array([-1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2], dtype=np.intp)
        tm.assert_numpy_array_equal(indexer, wanted)

        # corner case
        source = Index([5, 10])
        target = Index(np.arange(5, dtype=np.int64))
        indexer = libalgos.pad["int64_t"](source.values, target.values)
        wanted = np.array([-1, -1, -1, -1, -1], dtype=np.intp)
        tm.assert_numpy_array_equal(indexer, wanted)
def test_is_lexsorted():
    """``is_lexsorted`` reports False for a pair of descending int64 keys."""
    # First key: 3, 2, 1, 0 with each value repeated 31 times (124 entries).
    first_key = np.repeat(np.arange(3, -1, -1, dtype="int64"), 31)
    # Second key: the run 30, 29, ..., 0 repeated 4 times (124 entries).
    second_key = np.tile(np.arange(30, -1, -1, dtype="int64"), 4)

    failure = [first_key, second_key]
    assert not libalgos.is_lexsorted(failure)
def test_groupsort_indexer():
    """Compare ``groupsort_indexer`` against numpy's stable sorts.

    The first element of the result is checked against a mergesort
    ``np.argsort`` for a single key, and against ``np.lexsort`` for a
    composite key built from two arrays.
    """
    # Seeded generator: the inputs are the same on every run, so any
    # failure can be reproduced exactly.
    rng = np.random.default_rng(2)
    a = rng.integers(0, 1000, 100).astype(np.intp)
    b = rng.integers(0, 1000, 100).astype(np.intp)

    result = libalgos.groupsort_indexer(a, 1000)[0]

    # need to use a stable sort
    # np.argsort returns int, groupsort_indexer
    # always returns intp
    expected = np.argsort(a, kind="mergesort")
    expected = expected.astype(np.intp)

    tm.assert_numpy_array_equal(result, expected)

    # compare with lexsort
    # np.lexsort returns int, groupsort_indexer
    # always returns intp
    key = a * 1000 + b
    result = libalgos.groupsort_indexer(key, 1000000)[0]
    expected = np.lexsort((b, a))
    expected = expected.astype(np.intp)

    tm.assert_numpy_array_equal(result, expected)
def test_infinity_sort():
    """Check that ``Infinity`` and ``NegInfinity`` bracket every float.

    GH 13445: numpy's argsort can be unhappy if something is less than
    itself, so the two sentinels are given a self-consistent ordering that
    lies outside the float extended real line.
    """
    top = libalgos.Infinity()
    bottom = libalgos.NegInfinity()
    ascending = [bottom, float("-inf"), -1e100, 0, 1e100, float("inf"), top]

    # Infinity: >= everything, strictly greater than all but itself.
    for value in ascending:
        assert top >= value
        assert top > value or value is top
    assert top >= top
    assert top == top
    assert not top < top
    assert not top > top
    assert libalgos.Infinity() == libalgos.Infinity()
    assert not libalgos.Infinity() != libalgos.Infinity()

    # NegInfinity: <= everything, strictly less than all but itself.
    for value in ascending:
        assert bottom <= value
        assert bottom < value or value is bottom
    assert bottom <= bottom
    assert bottom == bottom
    assert not bottom < bottom
    assert not bottom > bottom
    assert libalgos.NegInfinity() == libalgos.NegInfinity()
    assert not libalgos.NegInfinity() != libalgos.NegInfinity()

    # Every ordering of the reference values sorts back to the same list.
    for shuffled in permutations(ascending):
        assert sorted(shuffled) == ascending

    # smoke tests
    for sentinel_type in (libalgos.Infinity, libalgos.NegInfinity):
        np.array([sentinel_type()] * 32).argsort()
def test_infinity_against_nan():
    """Comparing either sentinel with NaN is False for all but ``!=``."""
    for sentinel in (libalgos.Infinity(), libalgos.NegInfinity()):
        assert not sentinel > np.nan
        assert not sentinel >= np.nan
        assert not sentinel < np.nan
        assert not sentinel <= np.nan
        assert not sentinel == np.nan
        assert sentinel != np.nan
def test_ensure_platform_int():
    """An ``intp`` array is handed back as the very same object."""
    already_intp = np.arange(100, dtype=np.intp)
    assert libalgos.ensure_platform_int(already_intp) is already_intp
def test_int64_add_overflow():
    """``checked_add_with_arr`` raises ``OverflowError`` unless masked out.

    See gh-14068. The first group of calls must each raise; the second
    group must complete without raising.
    """
    msg = "Overflow in int64 addition"
    m = np.iinfo(np.int64).max
    n = np.iinfo(np.int64).min

    # (positional args, keyword args) for calls that must overflow.
    overflowing = [
        ((np.array([m, m]), m), {}),
        ((np.array([m, m]), np.array([m, m])), {}),
        ((np.array([n, n]), n), {}),
        ((np.array([n, n]), np.array([n, n])), {}),
        ((np.array([m, n]), np.array([n, n])), {}),
        (
            (np.array([m, m]), np.array([m, m])),
            {"arr_mask": np.array([False, True])},
        ),
        (
            (np.array([m, m]), np.array([m, m])),
            {"b_mask": np.array([False, True])},
        ),
        (
            (np.array([m, m]), np.array([m, m])),
            {
                "arr_mask": np.array([False, True]),
                "b_mask": np.array([False, True]),
            },
        ),
        ((np.array([m, m]), np.array([np.nan, m])), {}),
    ]
    for args, kwargs in overflowing:
        with pytest.raises(OverflowError, match=msg):
            algos.checked_add_with_arr(*args, **kwargs)

    # Check that the nan boolean arrays override whether or not
    # the addition overflows. We don't check the result but just
    # the fact that an OverflowError is not raised.
    masked_out = [
        {"arr_mask": np.array([True, True])},
        {"b_mask": np.array([True, True])},
        {
            "arr_mask": np.array([True, False]),
            "b_mask": np.array([False, True]),
        },
    ]
    for masks in masked_out:
        algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]), **masks)
class TestMode:
    """Checks of ``algos.mode`` and ``Series.mode`` across dtypes."""

    def test_no_mode(self):
        """An empty list gives the values of an empty float64 Series."""
        expected = Series([], dtype=np.float64, index=Index([], dtype=int)).values
        tm.assert_numpy_array_equal(algos.mode([]), expected)

    @pytest.mark.parametrize("dt", np.typecodes["AllInteger"] + np.typecodes["Float"])
    def test_mode_single(self, dt):
        """A lone value, or one value repeated, is its own mode (GH 15714)."""
        for data, modes in [([1], [1]), ([1, 1], [1])]:
            ser = Series(data, dtype=dt)
            expected = Series(modes, dtype=dt)
            tm.assert_numpy_array_equal(algos.mode(ser.values), expected.values)
            tm.assert_series_equal(ser.mode(), expected)

    def test_mode_obj_int(self):
        """Plain lists of ints and of strings are accepted by ``algos.mode``."""
        int_expected = Series([1], dtype=int).values
        tm.assert_numpy_array_equal(algos.mode([1]), int_expected)

        str_expected = Series(["a", "b", "c"], dtype=object).values
        tm.assert_numpy_array_equal(algos.mode(["a", "b", "c"]), str_expected)

    @pytest.mark.parametrize("dt", np.typecodes["AllInteger"] + np.typecodes["Float"])
    def test_number_mode(self, dt):
        """Numeric data with one clear mode, then with two tied modes."""
        one_mode = [1] * 5 + [2] * 3
        two_modes = [1] * 5 + [2] * 3 + [3] * 5
        for data, modes in [(one_mode, [1]), (two_modes, [1, 3])]:
            ser = Series(data, dtype=dt)
            expected = Series(modes, dtype=dt)
            tm.assert_numpy_array_equal(algos.mode(ser.values), expected.values)
            tm.assert_series_equal(ser.mode(), expected)

    def test_strobj_mode(self):
        """Single-character strings stored with dtype ``"c"``."""
        ser = Series(["a"] * 2 + ["b"] * 3, dtype="c")
        expected = Series(["b"], dtype="c")
        tm.assert_numpy_array_equal(algos.mode(ser.values), expected.values)
        tm.assert_series_equal(ser.mode(), expected)

    @pytest.mark.parametrize("dt", [str, object])
    def test_strobj_multi_char(self, dt):
        """Multi-character strings stored as ``str`` or ``object``."""
        ser = Series(["foo"] * 2 + ["bar"] * 3, dtype=dt)
        expected = Series(["bar"], dtype=dt)
        tm.assert_numpy_array_equal(algos.mode(ser.values), expected.values)
        tm.assert_series_equal(ser.mode(), expected)

    def test_datelike_mode(self):
        """Datetime data: all values distinct, then two tied modes."""
        cases = [
            (
                ["2011-01-03", "2013-01-02", "1900-05-03"],
                ["1900-05-03", "2011-01-03", "2013-01-02"],
            ),
            (
                ["2011-01-03", "2013-01-02", "1900-05-03", "2011-01-03", "2013-01-02"],
                ["2011-01-03", "2013-01-02"],
            ),
        ]
        for data, modes in cases:
            ser = Series(data, dtype="M8[ns]")
            expected = Series(modes, dtype="M8[ns]")
            tm.assert_extension_array_equal(algos.mode(ser.values), expected._values)
            tm.assert_series_equal(ser.mode(), expected)

    def test_timedelta_mode(self):
        """Timedelta data: all values distinct, then two tied modes."""
        cases = [
            (
                ["1 days", "-1 days", "0 days"],
                ["-1 days", "0 days", "1 days"],
            ),
            (
                ["1 day", "1 day", "-1 day", "-1 day 2 min", "2 min", "2 min"],
                ["2 min", "1 day"],
            ),
        ]
        for data, modes in cases:
            ser = Series(data, dtype="timedelta64[ns]")
            expected = Series(modes, dtype="timedelta64[ns]")
            tm.assert_extension_array_equal(algos.mode(ser.values), expected._values)
            tm.assert_series_equal(ser.mode(), expected)

    def test_mixed_dtype(self):
        """A mix of an int and repeated strings."""
        ser = Series([1, "foo", "foo"])
        expected = Series(["foo"])
        tm.assert_numpy_array_equal(algos.mode(ser.values), expected.values)
        tm.assert_series_equal(ser.mode(), expected)

    def test_uint64_overflow(self):
        """uint64 data containing 2**63, with and without a repeated value."""
        cases = [
            ([1, 2**63, 2**63], [2**63]),
            ([1, 2**63], [1, 2**63]),
        ]
        for data, modes in cases:
            ser = Series(data, dtype=np.uint64)
            expected = Series(modes, dtype=np.uint64)
            tm.assert_numpy_array_equal(algos.mode(ser.values), expected.values)
            tm.assert_series_equal(ser.mode(), expected)

    def test_categorical(self):
        """``Series.mode`` on categorical data returns a Categorical."""
        all_distinct = Categorical([1, 2])
        cases = [
            (all_distinct, all_distinct),
            (Categorical([1, "a", "a"]), Categorical(["a"], categories=[1, "a"])),
            (Categorical([1, 1, 2, 3, 3]), Categorical([1, 3], categories=[1, 2, 3])),
        ]
        for cat, expected in cases:
            result = Series(cat).mode()._values
            tm.assert_categorical_equal(result, expected)

    def test_index(self):
        """``algos.mode`` on Index input, including a TimedeltaIndex error."""
        cases = [
            (Index([1, 2, 3]), Series([1, 2, 3], dtype=np.int64)),
            (Index([1, "a", "a"]), Series(["a"], dtype=object)),
            (Index([1, 1, 2, 3, 3]), Series([1, 3], dtype=np.int64)),
        ]
        for idx, expected in cases:
            tm.assert_numpy_array_equal(algos.mode(idx), expected.values)

        tdi = Index(
            ["1 day", "1 day", "-1 day", "-1 day 2 min", "2 min", "2 min"],
            dtype="timedelta64[ns]",
        )
        with pytest.raises(AttributeError, match="TimedeltaIndex"):
            # algos.mode expects Arraylike, does *not* unwrap TimedeltaIndex
            algos.mode(tdi)

    def test_ser_mode_with_name(self):
        """The Series name is carried over to the mode result (GH 46737)."""
        named = Series([1, 1, 3], name="foo")
        expected = Series([1], name="foo")
        tm.assert_series_equal(named.mode(), expected)
class TestDiff:
    """Checks of ``algos.diff`` on datetime-like and small-integer data."""

    @pytest.mark.parametrize("dtype", ["M8[ns]", "m8[ns]"])
    def test_diff_datetimelike_nat(self, dtype):
        """A NaT column stays NaT after diff: NaT - NaT is NaT, not 0."""
        nat = np.timedelta64("NaT", "ns")

        values = np.arange(12).astype(np.int64).view(dtype).reshape(3, 4)
        values[:, 2] = values.dtype.type("NaT", "ns")

        # Consecutive rows differ by 4; the first row and the NaT column
        # have no defined difference.
        expected = np.ones(values.shape, dtype="timedelta64[ns]") * 4
        expected[:, 2] = nat
        expected[0, :] = nat

        by_rows = algos.diff(values, 1, axis=0)
        tm.assert_numpy_array_equal(by_rows, expected)

        # Same data transposed, differenced along the other axis.
        by_columns = algos.diff(values.T, 1, axis=1)
        tm.assert_numpy_array_equal(by_columns, expected.T)

    def test_diff_ea_axis(self):
        """Diffing a DatetimeArray along axis=1 raises ``ValueError``."""
        dta = date_range("2016-01-01", periods=3, tz="US/Pacific")._data
        with pytest.raises(ValueError, match="cannot diff DatetimeArray on axis=1"):
            algos.diff(dta, 1, axis=1)

    @pytest.mark.parametrize("dtype", ["int8", "int16"])
    def test_diff_low_precision_int(self, dtype):
        """int8 and int16 input produces a float32 result."""
        small_ints = np.array([0, 1, 1, 0, 0], dtype=dtype)
        expected = np.array([np.nan, 1, 0, -1, 0], dtype="float32")
        tm.assert_numpy_array_equal(algos.diff(small_ints, 1), expected)
@pytest.mark.parametrize("op", [np.array, pd.array])
def test_union_with_duplicates(op):
    """Union of two arrays that both contain repeated values (GH#36289).

    The comparison helper is picked by whether ``op`` built a numpy array
    or an extension array.
    """
    left = op([3, 1, 3, 4])
    right = op([2, 3, 1, 1])
    expected = op([3, 3, 1, 1, 4, 2])

    result = algos.union_with_duplicates(left, right)

    if isinstance(expected, np.ndarray):
        check_equal = tm.assert_numpy_array_equal
    else:
        check_equal = tm.assert_extension_array_equal
    check_equal(result, expected)
|