test_indexing.py 37 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092
  1. """ test fancy indexing & misc """
  2. import array
  3. from datetime import datetime
  4. import re
  5. import weakref
  6. import numpy as np
  7. import pytest
  8. from pandas.errors import IndexingError
  9. from pandas.core.dtypes.common import (
  10. is_float_dtype,
  11. is_integer_dtype,
  12. is_object_dtype,
  13. )
  14. import pandas as pd
  15. from pandas import (
  16. DataFrame,
  17. Index,
  18. NaT,
  19. Series,
  20. date_range,
  21. offsets,
  22. timedelta_range,
  23. )
  24. import pandas._testing as tm
  25. from pandas.tests.indexing.common import _mklbl
  26. from pandas.tests.indexing.test_floats import gen_obj
  27. # ------------------------------------------------------------------------
  28. # Indexing test cases
  29. class TestFancy:
  30. """pure get/set item & fancy indexing"""
  31. def test_setitem_ndarray_1d(self):
  32. # GH5508
  33. # len of indexer vs length of the 1d ndarray
  34. df = DataFrame(index=Index(np.arange(1, 11), dtype=np.int64))
  35. df["foo"] = np.zeros(10, dtype=np.float64)
  36. df["bar"] = np.zeros(10, dtype=complex)
  37. # invalid
  38. msg = "Must have equal len keys and value when setting with an iterable"
  39. with pytest.raises(ValueError, match=msg):
  40. df.loc[df.index[2:5], "bar"] = np.array([2.33j, 1.23 + 0.1j, 2.2, 1.0])
  41. # valid
  42. df.loc[df.index[2:6], "bar"] = np.array([2.33j, 1.23 + 0.1j, 2.2, 1.0])
  43. result = df.loc[df.index[2:6], "bar"]
  44. expected = Series(
  45. [2.33j, 1.23 + 0.1j, 2.2, 1.0], index=[3, 4, 5, 6], name="bar"
  46. )
  47. tm.assert_series_equal(result, expected)
  48. def test_setitem_ndarray_1d_2(self):
  49. # GH5508
  50. # dtype getting changed?
  51. df = DataFrame(index=Index(np.arange(1, 11)))
  52. df["foo"] = np.zeros(10, dtype=np.float64)
  53. df["bar"] = np.zeros(10, dtype=complex)
  54. msg = "Must have equal len keys and value when setting with an iterable"
  55. with pytest.raises(ValueError, match=msg):
  56. df[2:5] = np.arange(1, 4) * 1j
  57. def test_getitem_ndarray_3d(
  58. self, index, frame_or_series, indexer_sli, using_array_manager
  59. ):
  60. # GH 25567
  61. obj = gen_obj(frame_or_series, index)
  62. idxr = indexer_sli(obj)
  63. nd3 = np.random.randint(5, size=(2, 2, 2))
  64. msgs = []
  65. if frame_or_series is Series and indexer_sli in [tm.setitem, tm.iloc]:
  66. msgs.append(r"Wrong number of dimensions. values.ndim > ndim \[3 > 1\]")
  67. if using_array_manager:
  68. msgs.append("Passed array should be 1-dimensional")
  69. if frame_or_series is Series or indexer_sli is tm.iloc:
  70. msgs.append(r"Buffer has wrong number of dimensions \(expected 1, got 3\)")
  71. if using_array_manager:
  72. msgs.append("indexer should be 1-dimensional")
  73. if indexer_sli is tm.loc or (
  74. frame_or_series is Series and indexer_sli is tm.setitem
  75. ):
  76. msgs.append("Cannot index with multidimensional key")
  77. if frame_or_series is DataFrame and indexer_sli is tm.setitem:
  78. msgs.append("Index data must be 1-dimensional")
  79. if isinstance(index, pd.IntervalIndex) and indexer_sli is tm.iloc:
  80. msgs.append("Index data must be 1-dimensional")
  81. if isinstance(index, (pd.TimedeltaIndex, pd.DatetimeIndex, pd.PeriodIndex)):
  82. msgs.append("Data must be 1-dimensional")
  83. if len(index) == 0 or isinstance(index, pd.MultiIndex):
  84. msgs.append("positional indexers are out-of-bounds")
  85. if type(index) is Index and not isinstance(index._values, np.ndarray):
  86. # e.g. Int64
  87. msgs.append("values must be a 1D array")
  88. # string[pyarrow]
  89. msgs.append("only handle 1-dimensional arrays")
  90. msg = "|".join(msgs)
  91. potential_errors = (IndexError, ValueError, NotImplementedError)
  92. with pytest.raises(potential_errors, match=msg):
  93. idxr[nd3]
  94. def test_setitem_ndarray_3d(self, index, frame_or_series, indexer_sli):
  95. # GH 25567
  96. obj = gen_obj(frame_or_series, index)
  97. idxr = indexer_sli(obj)
  98. nd3 = np.random.randint(5, size=(2, 2, 2))
  99. if indexer_sli is tm.iloc:
  100. err = ValueError
  101. msg = f"Cannot set values with ndim > {obj.ndim}"
  102. else:
  103. err = ValueError
  104. msg = "|".join(
  105. [
  106. r"Buffer has wrong number of dimensions \(expected 1, got 3\)",
  107. "Cannot set values with ndim > 1",
  108. "Index data must be 1-dimensional",
  109. "Data must be 1-dimensional",
  110. "Array conditional must be same shape as self",
  111. ]
  112. )
  113. with pytest.raises(err, match=msg):
  114. idxr[nd3] = 0
  115. def test_getitem_ndarray_0d(self):
  116. # GH#24924
  117. key = np.array(0)
  118. # dataframe __getitem__
  119. df = DataFrame([[1, 2], [3, 4]])
  120. result = df[key]
  121. expected = Series([1, 3], name=0)
  122. tm.assert_series_equal(result, expected)
  123. # series __getitem__
  124. ser = Series([1, 2])
  125. result = ser[key]
  126. assert result == 1
  127. def test_inf_upcast(self):
  128. # GH 16957
  129. # We should be able to use np.inf as a key
  130. # np.inf should cause an index to convert to float
  131. # Test with np.inf in rows
  132. df = DataFrame(columns=[0])
  133. df.loc[1] = 1
  134. df.loc[2] = 2
  135. df.loc[np.inf] = 3
  136. # make sure we can look up the value
  137. assert df.loc[np.inf, 0] == 3
  138. result = df.index
  139. expected = Index([1, 2, np.inf], dtype=np.float64)
  140. tm.assert_index_equal(result, expected)
  141. def test_setitem_dtype_upcast(self):
  142. # GH3216
  143. df = DataFrame([{"a": 1}, {"a": 3, "b": 2}])
  144. df["c"] = np.nan
  145. assert df["c"].dtype == np.float64
  146. df.loc[0, "c"] = "foo"
  147. expected = DataFrame(
  148. [{"a": 1, "b": np.nan, "c": "foo"}, {"a": 3, "b": 2, "c": np.nan}]
  149. )
  150. tm.assert_frame_equal(df, expected)
  151. @pytest.mark.parametrize("val", [3.14, "wxyz"])
  152. def test_setitem_dtype_upcast2(self, val):
  153. # GH10280
  154. df = DataFrame(
  155. np.arange(6, dtype="int64").reshape(2, 3),
  156. index=list("ab"),
  157. columns=["foo", "bar", "baz"],
  158. )
  159. left = df.copy()
  160. left.loc["a", "bar"] = val
  161. right = DataFrame(
  162. [[0, val, 2], [3, 4, 5]],
  163. index=list("ab"),
  164. columns=["foo", "bar", "baz"],
  165. )
  166. tm.assert_frame_equal(left, right)
  167. assert is_integer_dtype(left["foo"])
  168. assert is_integer_dtype(left["baz"])
  169. def test_setitem_dtype_upcast3(self):
  170. left = DataFrame(
  171. np.arange(6, dtype="int64").reshape(2, 3) / 10.0,
  172. index=list("ab"),
  173. columns=["foo", "bar", "baz"],
  174. )
  175. left.loc["a", "bar"] = "wxyz"
  176. right = DataFrame(
  177. [[0, "wxyz", 0.2], [0.3, 0.4, 0.5]],
  178. index=list("ab"),
  179. columns=["foo", "bar", "baz"],
  180. )
  181. tm.assert_frame_equal(left, right)
  182. assert is_float_dtype(left["foo"])
  183. assert is_float_dtype(left["baz"])
  184. def test_dups_fancy_indexing(self):
  185. # GH 3455
  186. df = tm.makeCustomDataframe(10, 3)
  187. df.columns = ["a", "a", "b"]
  188. result = df[["b", "a"]].columns
  189. expected = Index(["b", "a", "a"])
  190. tm.assert_index_equal(result, expected)
  191. def test_dups_fancy_indexing_across_dtypes(self):
  192. # across dtypes
  193. df = DataFrame([[1, 2, 1.0, 2.0, 3.0, "foo", "bar"]], columns=list("aaaaaaa"))
  194. df.head()
  195. str(df)
  196. result = DataFrame([[1, 2, 1.0, 2.0, 3.0, "foo", "bar"]])
  197. result.columns = list("aaaaaaa") # GH#3468
  198. # GH#3509 smoke tests for indexing with duplicate columns
  199. df.iloc[:, 4]
  200. result.iloc[:, 4]
  201. tm.assert_frame_equal(df, result)
  202. def test_dups_fancy_indexing_not_in_order(self):
  203. # GH 3561, dups not in selected order
  204. df = DataFrame(
  205. {"test": [5, 7, 9, 11], "test1": [4.0, 5, 6, 7], "other": list("abcd")},
  206. index=["A", "A", "B", "C"],
  207. )
  208. rows = ["C", "B"]
  209. expected = DataFrame(
  210. {"test": [11, 9], "test1": [7.0, 6], "other": ["d", "c"]}, index=rows
  211. )
  212. result = df.loc[rows]
  213. tm.assert_frame_equal(result, expected)
  214. result = df.loc[Index(rows)]
  215. tm.assert_frame_equal(result, expected)
  216. rows = ["C", "B", "E"]
  217. with pytest.raises(KeyError, match="not in index"):
  218. df.loc[rows]
  219. # see GH5553, make sure we use the right indexer
  220. rows = ["F", "G", "H", "C", "B", "E"]
  221. with pytest.raises(KeyError, match="not in index"):
  222. df.loc[rows]
  223. def test_dups_fancy_indexing_only_missing_label(self):
  224. # List containing only missing label
  225. dfnu = DataFrame(np.random.randn(5, 3), index=list("AABCD"))
  226. with pytest.raises(
  227. KeyError,
  228. match=re.escape(
  229. "\"None of [Index(['E'], dtype='object')] are in the [index]\""
  230. ),
  231. ):
  232. dfnu.loc[["E"]]
  233. @pytest.mark.parametrize("vals", [[0, 1, 2], list("abc")])
  234. def test_dups_fancy_indexing_missing_label(self, vals):
  235. # GH 4619; duplicate indexer with missing label
  236. df = DataFrame({"A": vals})
  237. with pytest.raises(KeyError, match="not in index"):
  238. df.loc[[0, 8, 0]]
  239. def test_dups_fancy_indexing_non_unique(self):
  240. # non unique with non unique selector
  241. df = DataFrame({"test": [5, 7, 9, 11]}, index=["A", "A", "B", "C"])
  242. with pytest.raises(KeyError, match="not in index"):
  243. df.loc[["A", "A", "E"]]
  244. def test_dups_fancy_indexing2(self):
  245. # GH 5835
  246. # dups on index and missing values
  247. df = DataFrame(np.random.randn(5, 5), columns=["A", "B", "B", "B", "A"])
  248. with pytest.raises(KeyError, match="not in index"):
  249. df.loc[:, ["A", "B", "C"]]
  250. def test_dups_fancy_indexing3(self):
  251. # GH 6504, multi-axis indexing
  252. df = DataFrame(
  253. np.random.randn(9, 2), index=[1, 1, 1, 2, 2, 2, 3, 3, 3], columns=["a", "b"]
  254. )
  255. expected = df.iloc[0:6]
  256. result = df.loc[[1, 2]]
  257. tm.assert_frame_equal(result, expected)
  258. expected = df
  259. result = df.loc[:, ["a", "b"]]
  260. tm.assert_frame_equal(result, expected)
  261. expected = df.iloc[0:6, :]
  262. result = df.loc[[1, 2], ["a", "b"]]
  263. tm.assert_frame_equal(result, expected)
  264. def test_duplicate_int_indexing(self, indexer_sl):
  265. # GH 17347
  266. ser = Series(range(3), index=[1, 1, 3])
  267. expected = Series(range(2), index=[1, 1])
  268. result = indexer_sl(ser)[[1]]
  269. tm.assert_series_equal(result, expected)
  270. def test_indexing_mixed_frame_bug(self):
  271. # GH3492
  272. df = DataFrame(
  273. {"a": {1: "aaa", 2: "bbb", 3: "ccc"}, "b": {1: 111, 2: 222, 3: 333}}
  274. )
  275. # this works, new column is created correctly
  276. df["test"] = df["a"].apply(lambda x: "_" if x == "aaa" else x)
  277. # this does not work, ie column test is not changed
  278. idx = df["test"] == "_"
  279. temp = df.loc[idx, "a"].apply(lambda x: "-----" if x == "aaa" else x)
  280. df.loc[idx, "test"] = temp
  281. assert df.iloc[0, 2] == "-----"
  282. def test_multitype_list_index_access(self):
  283. # GH 10610
  284. df = DataFrame(np.random.random((10, 5)), columns=["a"] + [20, 21, 22, 23])
  285. with pytest.raises(KeyError, match=re.escape("'[26, -8] not in index'")):
  286. df[[22, 26, -8]]
  287. assert df[21].shape[0] == df.shape[0]
  288. def test_set_index_nan(self):
  289. # GH 3586
  290. df = DataFrame(
  291. {
  292. "PRuid": {
  293. 17: "nonQC",
  294. 18: "nonQC",
  295. 19: "nonQC",
  296. 20: "10",
  297. 21: "11",
  298. 22: "12",
  299. 23: "13",
  300. 24: "24",
  301. 25: "35",
  302. 26: "46",
  303. 27: "47",
  304. 28: "48",
  305. 29: "59",
  306. 30: "10",
  307. },
  308. "QC": {
  309. 17: 0.0,
  310. 18: 0.0,
  311. 19: 0.0,
  312. 20: np.nan,
  313. 21: np.nan,
  314. 22: np.nan,
  315. 23: np.nan,
  316. 24: 1.0,
  317. 25: np.nan,
  318. 26: np.nan,
  319. 27: np.nan,
  320. 28: np.nan,
  321. 29: np.nan,
  322. 30: np.nan,
  323. },
  324. "data": {
  325. 17: 7.9544899999999998,
  326. 18: 8.0142609999999994,
  327. 19: 7.8591520000000008,
  328. 20: 0.86140349999999999,
  329. 21: 0.87853110000000001,
  330. 22: 0.8427041999999999,
  331. 23: 0.78587700000000005,
  332. 24: 0.73062459999999996,
  333. 25: 0.81668560000000001,
  334. 26: 0.81927080000000008,
  335. 27: 0.80705009999999999,
  336. 28: 0.81440240000000008,
  337. 29: 0.80140849999999997,
  338. 30: 0.81307740000000006,
  339. },
  340. "year": {
  341. 17: 2006,
  342. 18: 2007,
  343. 19: 2008,
  344. 20: 1985,
  345. 21: 1985,
  346. 22: 1985,
  347. 23: 1985,
  348. 24: 1985,
  349. 25: 1985,
  350. 26: 1985,
  351. 27: 1985,
  352. 28: 1985,
  353. 29: 1985,
  354. 30: 1986,
  355. },
  356. }
  357. ).reset_index()
  358. result = (
  359. df.set_index(["year", "PRuid", "QC"])
  360. .reset_index()
  361. .reindex(columns=df.columns)
  362. )
  363. tm.assert_frame_equal(result, df)
  364. def test_multi_assign(self):
  365. # GH 3626, an assignment of a sub-df to a df
  366. df = DataFrame(
  367. {
  368. "FC": ["a", "b", "a", "b", "a", "b"],
  369. "PF": [0, 0, 0, 0, 1, 1],
  370. "col1": list(range(6)),
  371. "col2": list(range(6, 12)),
  372. }
  373. )
  374. df.iloc[1, 0] = np.nan
  375. df2 = df.copy()
  376. mask = ~df2.FC.isna()
  377. cols = ["col1", "col2"]
  378. dft = df2 * 2
  379. dft.iloc[3, 3] = np.nan
  380. expected = DataFrame(
  381. {
  382. "FC": ["a", np.nan, "a", "b", "a", "b"],
  383. "PF": [0, 0, 0, 0, 1, 1],
  384. "col1": Series([0, 1, 4, 6, 8, 10]),
  385. "col2": [12, 7, 16, np.nan, 20, 22],
  386. }
  387. )
  388. # frame on rhs
  389. df2.loc[mask, cols] = dft.loc[mask, cols]
  390. tm.assert_frame_equal(df2, expected)
  391. # with an ndarray on rhs
  392. # coerces to float64 because values has float64 dtype
  393. # GH 14001
  394. expected = DataFrame(
  395. {
  396. "FC": ["a", np.nan, "a", "b", "a", "b"],
  397. "PF": [0, 0, 0, 0, 1, 1],
  398. "col1": [0, 1, 4, 6, 8, 10],
  399. "col2": [12, 7, 16, np.nan, 20, 22],
  400. }
  401. )
  402. df2 = df.copy()
  403. df2.loc[mask, cols] = dft.loc[mask, cols].values
  404. tm.assert_frame_equal(df2, expected)
  405. def test_multi_assign_broadcasting_rhs(self):
  406. # broadcasting on the rhs is required
  407. df = DataFrame(
  408. {
  409. "A": [1, 2, 0, 0, 0],
  410. "B": [0, 0, 0, 10, 11],
  411. "C": [0, 0, 0, 10, 11],
  412. "D": [3, 4, 5, 6, 7],
  413. }
  414. )
  415. expected = df.copy()
  416. mask = expected["A"] == 0
  417. for col in ["A", "B"]:
  418. expected.loc[mask, col] = df["D"]
  419. df.loc[df["A"] == 0, ["A", "B"]] = df["D"]
  420. tm.assert_frame_equal(df, expected)
  421. def test_setitem_list(self):
  422. # GH 6043
  423. # iloc with a list
  424. df = DataFrame(index=[0, 1], columns=[0])
  425. df.iloc[1, 0] = [1, 2, 3]
  426. df.iloc[1, 0] = [1, 2]
  427. result = DataFrame(index=[0, 1], columns=[0])
  428. result.iloc[1, 0] = [1, 2]
  429. tm.assert_frame_equal(result, df)
  430. def test_string_slice(self):
  431. # GH 14424
  432. # string indexing against datetimelike with object
  433. # dtype should properly raises KeyError
  434. df = DataFrame([1], Index([pd.Timestamp("2011-01-01")], dtype=object))
  435. assert df.index._is_all_dates
  436. with pytest.raises(KeyError, match="'2011'"):
  437. df["2011"]
  438. with pytest.raises(KeyError, match="'2011'"):
  439. df.loc["2011", 0]
  440. def test_string_slice_empty(self):
  441. # GH 14424
  442. df = DataFrame()
  443. assert not df.index._is_all_dates
  444. with pytest.raises(KeyError, match="'2011'"):
  445. df["2011"]
  446. with pytest.raises(KeyError, match="^0$"):
  447. df.loc["2011", 0]
  448. def test_astype_assignment(self):
  449. # GH4312 (iloc)
  450. df_orig = DataFrame(
  451. [["1", "2", "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG")
  452. )
  453. df = df_orig.copy()
  454. # with the enforcement of GH#45333 in 2.0, this setting is attempted inplace,
  455. # so object dtype is retained
  456. df.iloc[:, 0:2] = df.iloc[:, 0:2].astype(np.int64)
  457. expected = DataFrame(
  458. [[1, 2, "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG")
  459. )
  460. expected["A"] = expected["A"].astype(object)
  461. expected["B"] = expected["B"].astype(object)
  462. tm.assert_frame_equal(df, expected)
  463. # GH5702 (loc)
  464. df = df_orig.copy()
  465. df.loc[:, "A"] = df.loc[:, "A"].astype(np.int64)
  466. expected = DataFrame(
  467. [[1, "2", "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG")
  468. )
  469. expected["A"] = expected["A"].astype(object)
  470. tm.assert_frame_equal(df, expected)
  471. df = df_orig.copy()
  472. df.loc[:, ["B", "C"]] = df.loc[:, ["B", "C"]].astype(np.int64)
  473. expected = DataFrame(
  474. [["1", 2, 3, ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG")
  475. )
  476. expected["B"] = expected["B"].astype(object)
  477. expected["C"] = expected["C"].astype(object)
  478. tm.assert_frame_equal(df, expected)
  479. def test_astype_assignment_full_replacements(self):
  480. # full replacements / no nans
  481. df = DataFrame({"A": [1.0, 2.0, 3.0, 4.0]})
  482. # With the enforcement of GH#45333 in 2.0, this assignment occurs inplace,
  483. # so float64 is retained
  484. df.iloc[:, 0] = df["A"].astype(np.int64)
  485. expected = DataFrame({"A": [1.0, 2.0, 3.0, 4.0]})
  486. tm.assert_frame_equal(df, expected)
  487. df = DataFrame({"A": [1.0, 2.0, 3.0, 4.0]})
  488. df.loc[:, "A"] = df["A"].astype(np.int64)
  489. tm.assert_frame_equal(df, expected)
  490. @pytest.mark.parametrize("indexer", [tm.getitem, tm.loc])
  491. def test_index_type_coercion(self, indexer):
  492. # GH 11836
  493. # if we have an index type and set it with something that looks
  494. # to numpy like the same, but is actually, not
  495. # (e.g. setting with a float or string '0')
  496. # then we need to coerce to object
  497. # integer indexes
  498. for s in [Series(range(5)), Series(range(5), index=range(1, 6))]:
  499. assert is_integer_dtype(s.index)
  500. s2 = s.copy()
  501. indexer(s2)[0.1] = 0
  502. assert is_float_dtype(s2.index)
  503. assert indexer(s2)[0.1] == 0
  504. s2 = s.copy()
  505. indexer(s2)[0.0] = 0
  506. exp = s.index
  507. if 0 not in s:
  508. exp = Index(s.index.tolist() + [0])
  509. tm.assert_index_equal(s2.index, exp)
  510. s2 = s.copy()
  511. indexer(s2)["0"] = 0
  512. assert is_object_dtype(s2.index)
  513. for s in [Series(range(5), index=np.arange(5.0))]:
  514. assert is_float_dtype(s.index)
  515. s2 = s.copy()
  516. indexer(s2)[0.1] = 0
  517. assert is_float_dtype(s2.index)
  518. assert indexer(s2)[0.1] == 0
  519. s2 = s.copy()
  520. indexer(s2)[0.0] = 0
  521. tm.assert_index_equal(s2.index, s.index)
  522. s2 = s.copy()
  523. indexer(s2)["0"] = 0
  524. assert is_object_dtype(s2.index)
  525. class TestMisc:
  526. def test_float_index_to_mixed(self):
  527. df = DataFrame({0.0: np.random.rand(10), 1.0: np.random.rand(10)})
  528. df["a"] = 10
  529. expected = DataFrame({0.0: df[0.0], 1.0: df[1.0], "a": [10] * 10})
  530. tm.assert_frame_equal(expected, df)
  531. def test_float_index_non_scalar_assignment(self):
  532. df = DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]}, index=[1.0, 2.0, 3.0])
  533. df.loc[df.index[:2]] = 1
  534. expected = DataFrame({"a": [1, 1, 3], "b": [1, 1, 5]}, index=df.index)
  535. tm.assert_frame_equal(expected, df)
  536. def test_loc_setitem_fullindex_views(self):
  537. df = DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]}, index=[1.0, 2.0, 3.0])
  538. df2 = df.copy()
  539. df.loc[df.index] = df.loc[df.index]
  540. tm.assert_frame_equal(df, df2)
  541. def test_rhs_alignment(self):
  542. # GH8258, tests that both rows & columns are aligned to what is
  543. # assigned to. covers both uniform data-type & multi-type cases
  544. def run_tests(df, rhs, right_loc, right_iloc):
  545. # label, index, slice
  546. lbl_one, idx_one, slice_one = list("bcd"), [1, 2, 3], slice(1, 4)
  547. lbl_two, idx_two, slice_two = ["joe", "jolie"], [1, 2], slice(1, 3)
  548. left = df.copy()
  549. left.loc[lbl_one, lbl_two] = rhs
  550. tm.assert_frame_equal(left, right_loc)
  551. left = df.copy()
  552. left.iloc[idx_one, idx_two] = rhs
  553. tm.assert_frame_equal(left, right_iloc)
  554. left = df.copy()
  555. left.iloc[slice_one, slice_two] = rhs
  556. tm.assert_frame_equal(left, right_iloc)
  557. xs = np.arange(20).reshape(5, 4)
  558. cols = ["jim", "joe", "jolie", "joline"]
  559. df = DataFrame(xs, columns=cols, index=list("abcde"), dtype="int64")
  560. # right hand side; permute the indices and multiplpy by -2
  561. rhs = -2 * df.iloc[3:0:-1, 2:0:-1]
  562. # expected `right` result; just multiply by -2
  563. right_iloc = df.copy()
  564. right_iloc["joe"] = [1, 14, 10, 6, 17]
  565. right_iloc["jolie"] = [2, 13, 9, 5, 18]
  566. right_iloc.iloc[1:4, 1:3] *= -2
  567. right_loc = df.copy()
  568. right_loc.iloc[1:4, 1:3] *= -2
  569. # run tests with uniform dtypes
  570. run_tests(df, rhs, right_loc, right_iloc)
  571. # make frames multi-type & re-run tests
  572. for frame in [df, rhs, right_loc, right_iloc]:
  573. frame["joe"] = frame["joe"].astype("float64")
  574. frame["jolie"] = frame["jolie"].map(lambda x: f"@{x}")
  575. right_iloc["joe"] = [1.0, "@-28", "@-20", "@-12", 17.0]
  576. right_iloc["jolie"] = ["@2", -26.0, -18.0, -10.0, "@18"]
  577. run_tests(df, rhs, right_loc, right_iloc)
  578. @pytest.mark.parametrize(
  579. "idx", [_mklbl("A", 20), np.arange(20) + 100, np.linspace(100, 150, 20)]
  580. )
  581. def test_str_label_slicing_with_negative_step(self, idx):
  582. SLC = pd.IndexSlice
  583. idx = Index(idx)
  584. ser = Series(np.arange(20), index=idx)
  585. tm.assert_indexing_slices_equivalent(ser, SLC[idx[9] :: -1], SLC[9::-1])
  586. tm.assert_indexing_slices_equivalent(ser, SLC[: idx[9] : -1], SLC[:8:-1])
  587. tm.assert_indexing_slices_equivalent(
  588. ser, SLC[idx[13] : idx[9] : -1], SLC[13:8:-1]
  589. )
  590. tm.assert_indexing_slices_equivalent(ser, SLC[idx[9] : idx[13] : -1], SLC[:0])
  591. def test_slice_with_zero_step_raises(self, index, indexer_sl, frame_or_series):
  592. obj = frame_or_series(np.arange(len(index)), index=index)
  593. with pytest.raises(ValueError, match="slice step cannot be zero"):
  594. indexer_sl(obj)[::0]
  595. def test_loc_setitem_indexing_assignment_dict_already_exists(self):
  596. index = Index([-5, 0, 5], name="z")
  597. df = DataFrame({"x": [1, 2, 6], "y": [2, 2, 8]}, index=index)
  598. expected = df.copy()
  599. rhs = {"x": 9, "y": 99}
  600. df.loc[5] = rhs
  601. expected.loc[5] = [9, 99]
  602. tm.assert_frame_equal(df, expected)
  603. # GH#38335 same thing, mixed dtypes
  604. df = DataFrame({"x": [1, 2, 6], "y": [2.0, 2.0, 8.0]}, index=index)
  605. df.loc[5] = rhs
  606. expected = DataFrame({"x": [1, 2, 9], "y": [2.0, 2.0, 99.0]}, index=index)
  607. tm.assert_frame_equal(df, expected)
  608. def test_iloc_getitem_indexing_dtypes_on_empty(self):
  609. # Check that .iloc returns correct dtypes GH9983
  610. df = DataFrame({"a": [1, 2, 3], "b": ["b", "b2", "b3"]})
  611. df2 = df.iloc[[], :]
  612. assert df2.loc[:, "a"].dtype == np.int64
  613. tm.assert_series_equal(df2.loc[:, "a"], df2.iloc[:, 0])
  614. @pytest.mark.parametrize("size", [5, 999999, 1000000])
  615. def test_loc_range_in_series_indexing(self, size):
  616. # range can cause an indexing error
  617. # GH 11652
  618. s = Series(index=range(size), dtype=np.float64)
  619. s.loc[range(1)] = 42
  620. tm.assert_series_equal(s.loc[range(1)], Series(42.0, index=[0]))
  621. s.loc[range(2)] = 43
  622. tm.assert_series_equal(s.loc[range(2)], Series(43.0, index=[0, 1]))
  623. def test_partial_boolean_frame_indexing(self):
  624. # GH 17170
  625. df = DataFrame(
  626. np.arange(9.0).reshape(3, 3), index=list("abc"), columns=list("ABC")
  627. )
  628. index_df = DataFrame(1, index=list("ab"), columns=list("AB"))
  629. result = df[index_df.notnull()]
  630. expected = DataFrame(
  631. np.array([[0.0, 1.0, np.nan], [3.0, 4.0, np.nan], [np.nan] * 3]),
  632. index=list("abc"),
  633. columns=list("ABC"),
  634. )
  635. tm.assert_frame_equal(result, expected)
  636. def test_no_reference_cycle(self):
  637. df = DataFrame({"a": [0, 1], "b": [2, 3]})
  638. for name in ("loc", "iloc", "at", "iat"):
  639. getattr(df, name)
  640. wr = weakref.ref(df)
  641. del df
  642. assert wr() is None
  643. def test_label_indexing_on_nan(self, nulls_fixture):
  644. # GH 32431
  645. df = Series([1, "{1,2}", 1, nulls_fixture])
  646. vc = df.value_counts(dropna=False)
  647. result1 = vc.loc[nulls_fixture]
  648. result2 = vc[nulls_fixture]
  649. expected = 1
  650. assert result1 == expected
  651. assert result2 == expected
  652. class TestDataframeNoneCoercion:
  653. EXPECTED_SINGLE_ROW_RESULTS = [
  654. # For numeric series, we should coerce to NaN.
  655. ([1, 2, 3], [np.nan, 2, 3]),
  656. ([1.0, 2.0, 3.0], [np.nan, 2.0, 3.0]),
  657. # For datetime series, we should coerce to NaT.
  658. (
  659. [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
  660. [NaT, datetime(2000, 1, 2), datetime(2000, 1, 3)],
  661. ),
  662. # For objects, we should preserve the None value.
  663. (["foo", "bar", "baz"], [None, "bar", "baz"]),
  664. ]
  665. @pytest.mark.parametrize("expected", EXPECTED_SINGLE_ROW_RESULTS)
  666. def test_coercion_with_loc(self, expected):
  667. start_data, expected_result = expected
  668. start_dataframe = DataFrame({"foo": start_data})
  669. start_dataframe.loc[0, ["foo"]] = None
  670. expected_dataframe = DataFrame({"foo": expected_result})
  671. tm.assert_frame_equal(start_dataframe, expected_dataframe)
  672. @pytest.mark.parametrize("expected", EXPECTED_SINGLE_ROW_RESULTS)
  673. def test_coercion_with_setitem_and_dataframe(self, expected):
  674. start_data, expected_result = expected
  675. start_dataframe = DataFrame({"foo": start_data})
  676. start_dataframe[start_dataframe["foo"] == start_dataframe["foo"][0]] = None
  677. expected_dataframe = DataFrame({"foo": expected_result})
  678. tm.assert_frame_equal(start_dataframe, expected_dataframe)
  679. @pytest.mark.parametrize("expected", EXPECTED_SINGLE_ROW_RESULTS)
  680. def test_none_coercion_loc_and_dataframe(self, expected):
  681. start_data, expected_result = expected
  682. start_dataframe = DataFrame({"foo": start_data})
  683. start_dataframe.loc[start_dataframe["foo"] == start_dataframe["foo"][0]] = None
  684. expected_dataframe = DataFrame({"foo": expected_result})
  685. tm.assert_frame_equal(start_dataframe, expected_dataframe)
  686. def test_none_coercion_mixed_dtypes(self):
  687. start_dataframe = DataFrame(
  688. {
  689. "a": [1, 2, 3],
  690. "b": [1.0, 2.0, 3.0],
  691. "c": [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
  692. "d": ["a", "b", "c"],
  693. }
  694. )
  695. start_dataframe.iloc[0] = None
  696. exp = DataFrame(
  697. {
  698. "a": [np.nan, 2, 3],
  699. "b": [np.nan, 2.0, 3.0],
  700. "c": [NaT, datetime(2000, 1, 2), datetime(2000, 1, 3)],
  701. "d": [None, "b", "c"],
  702. }
  703. )
  704. tm.assert_frame_equal(start_dataframe, exp)
  705. class TestDatetimelikeCoercion:
  706. def test_setitem_dt64_string_scalar(self, tz_naive_fixture, indexer_sli):
  707. # dispatching _can_hold_element to underlying DatetimeArray
  708. tz = tz_naive_fixture
  709. dti = date_range("2016-01-01", periods=3, tz=tz)
  710. ser = Series(dti.copy(deep=True))
  711. values = ser._values
  712. newval = "2018-01-01"
  713. values._validate_setitem_value(newval)
  714. indexer_sli(ser)[0] = newval
  715. if tz is None:
  716. # TODO(EA2D): we can make this no-copy in tz-naive case too
  717. assert ser.dtype == dti.dtype
  718. assert ser._values._ndarray is values._ndarray
  719. else:
  720. assert ser._values is values
  721. @pytest.mark.parametrize("box", [list, np.array, pd.array, pd.Categorical, Index])
  722. @pytest.mark.parametrize(
  723. "key", [[0, 1], slice(0, 2), np.array([True, True, False])]
  724. )
  725. def test_setitem_dt64_string_values(self, tz_naive_fixture, indexer_sli, key, box):
  726. # dispatching _can_hold_element to underling DatetimeArray
  727. tz = tz_naive_fixture
  728. if isinstance(key, slice) and indexer_sli is tm.loc:
  729. key = slice(0, 1)
  730. dti = date_range("2016-01-01", periods=3, tz=tz)
  731. ser = Series(dti.copy(deep=True))
  732. values = ser._values
  733. newvals = box(["2019-01-01", "2010-01-02"])
  734. values._validate_setitem_value(newvals)
  735. indexer_sli(ser)[key] = newvals
  736. if tz is None:
  737. # TODO(EA2D): we can make this no-copy in tz-naive case too
  738. assert ser.dtype == dti.dtype
  739. assert ser._values._ndarray is values._ndarray
  740. else:
  741. assert ser._values is values
  742. @pytest.mark.parametrize("scalar", ["3 Days", offsets.Hour(4)])
  743. def test_setitem_td64_scalar(self, indexer_sli, scalar):
  744. # dispatching _can_hold_element to underling TimedeltaArray
  745. tdi = timedelta_range("1 Day", periods=3)
  746. ser = Series(tdi.copy(deep=True))
  747. values = ser._values
  748. values._validate_setitem_value(scalar)
  749. indexer_sli(ser)[0] = scalar
  750. assert ser._values._ndarray is values._ndarray
  751. @pytest.mark.parametrize("box", [list, np.array, pd.array, pd.Categorical, Index])
  752. @pytest.mark.parametrize(
  753. "key", [[0, 1], slice(0, 2), np.array([True, True, False])]
  754. )
  755. def test_setitem_td64_string_values(self, indexer_sli, key, box):
  756. # dispatching _can_hold_element to underling TimedeltaArray
  757. if isinstance(key, slice) and indexer_sli is tm.loc:
  758. key = slice(0, 1)
  759. tdi = timedelta_range("1 Day", periods=3)
  760. ser = Series(tdi.copy(deep=True))
  761. values = ser._values
  762. newvals = box(["10 Days", "44 hours"])
  763. values._validate_setitem_value(newvals)
  764. indexer_sli(ser)[key] = newvals
  765. assert ser._values._ndarray is values._ndarray
  766. def test_extension_array_cross_section():
  767. # A cross-section of a homogeneous EA should be an EA
  768. df = DataFrame(
  769. {
  770. "A": pd.array([1, 2], dtype="Int64"),
  771. "B": pd.array([3, 4], dtype="Int64"),
  772. },
  773. index=["a", "b"],
  774. )
  775. expected = Series(pd.array([1, 3], dtype="Int64"), index=["A", "B"], name="a")
  776. result = df.loc["a"]
  777. tm.assert_series_equal(result, expected)
  778. result = df.iloc[0]
  779. tm.assert_series_equal(result, expected)
  780. def test_extension_array_cross_section_converts():
  781. # all numeric columns -> numeric series
  782. df = DataFrame(
  783. {
  784. "A": pd.array([1, 2], dtype="Int64"),
  785. "B": np.array([1, 2], dtype="int64"),
  786. },
  787. index=["a", "b"],
  788. )
  789. result = df.loc["a"]
  790. expected = Series([1, 1], dtype="Int64", index=["A", "B"], name="a")
  791. tm.assert_series_equal(result, expected)
  792. result = df.iloc[0]
  793. tm.assert_series_equal(result, expected)
  794. # mixed columns -> object series
  795. df = DataFrame(
  796. {"A": pd.array([1, 2], dtype="Int64"), "B": np.array(["a", "b"])},
  797. index=["a", "b"],
  798. )
  799. result = df.loc["a"]
  800. expected = Series([1, "a"], dtype=object, index=["A", "B"], name="a")
  801. tm.assert_series_equal(result, expected)
  802. result = df.iloc[0]
  803. tm.assert_series_equal(result, expected)
  804. @pytest.mark.parametrize(
  805. "ser, keys",
  806. [(Series([10]), (0, 0)), (Series([1, 2, 3], index=list("abc")), (0, 1))],
  807. )
  808. def test_ser_tup_indexer_exceeds_dimensions(ser, keys, indexer_li):
  809. # GH#13831
  810. exp_err, exp_msg = IndexingError, "Too many indexers"
  811. with pytest.raises(exp_err, match=exp_msg):
  812. indexer_li(ser)[keys]
  813. if indexer_li == tm.iloc:
  814. # For iloc.__setitem__ we let numpy handle the error reporting.
  815. exp_err, exp_msg = IndexError, "too many indices for array"
  816. with pytest.raises(exp_err, match=exp_msg):
  817. indexer_li(ser)[keys] = 0
  818. def test_ser_list_indexer_exceeds_dimensions(indexer_li):
  819. # GH#13831
  820. # Make sure an exception is raised when a tuple exceeds the dimension of the series,
  821. # but not list when a list is used.
  822. ser = Series([10])
  823. res = indexer_li(ser)[[0, 0]]
  824. exp = Series([10, 10], index=Index([0, 0]))
  825. tm.assert_series_equal(res, exp)
  826. @pytest.mark.parametrize(
  827. "value", [(0, 1), [0, 1], np.array([0, 1]), array.array("b", [0, 1])]
  828. )
  829. def test_scalar_setitem_with_nested_value(value):
  830. # For numeric data, we try to unpack and thus raise for mismatching length
  831. df = DataFrame({"A": [1, 2, 3]})
  832. msg = "|".join(
  833. [
  834. "Must have equal len keys and value",
  835. "setting an array element with a sequence",
  836. ]
  837. )
  838. with pytest.raises(ValueError, match=msg):
  839. df.loc[0, "B"] = value
  840. # TODO For object dtype this happens as well, but should we rather preserve
  841. # the nested data and set as such?
  842. df = DataFrame({"A": [1, 2, 3], "B": np.array([1, "a", "b"], dtype=object)})
  843. with pytest.raises(ValueError, match="Must have equal len keys and value"):
  844. df.loc[0, "B"] = value
  845. # if isinstance(value, np.ndarray):
  846. # assert (df.loc[0, "B"] == value).all()
  847. # else:
  848. # assert df.loc[0, "B"] == value
  849. @pytest.mark.parametrize(
  850. "value", [(0, 1), [0, 1], np.array([0, 1]), array.array("b", [0, 1])]
  851. )
  852. def test_scalar_setitem_series_with_nested_value(value, indexer_sli):
  853. # For numeric data, we try to unpack and thus raise for mismatching length
  854. ser = Series([1, 2, 3])
  855. with pytest.raises(ValueError, match="setting an array element with a sequence"):
  856. indexer_sli(ser)[0] = value
  857. # but for object dtype we preserve the nested data and set as such
  858. ser = Series([1, "a", "b"], dtype=object)
  859. indexer_sli(ser)[0] = value
  860. if isinstance(value, np.ndarray):
  861. assert (ser.loc[0] == value).all()
  862. else:
  863. assert ser.loc[0] == value
  864. @pytest.mark.parametrize(
  865. "value", [(0.0,), [0.0], np.array([0.0]), array.array("d", [0.0])]
  866. )
  867. def test_scalar_setitem_with_nested_value_length1(value):
  868. # https://github.com/pandas-dev/pandas/issues/46268
  869. # For numeric data, assigning length-1 array to scalar position gets unpacked
  870. df = DataFrame({"A": [1, 2, 3]})
  871. df.loc[0, "B"] = value
  872. expected = DataFrame({"A": [1, 2, 3], "B": [0.0, np.nan, np.nan]})
  873. tm.assert_frame_equal(df, expected)
  874. # but for object dtype we preserve the nested data
  875. df = DataFrame({"A": [1, 2, 3], "B": np.array([1, "a", "b"], dtype=object)})
  876. df.loc[0, "B"] = value
  877. if isinstance(value, np.ndarray):
  878. assert (df.loc[0, "B"] == value).all()
  879. else:
  880. assert df.loc[0, "B"] == value
  881. @pytest.mark.parametrize(
  882. "value", [(0.0,), [0.0], np.array([0.0]), array.array("d", [0.0])]
  883. )
  884. def test_scalar_setitem_series_with_nested_value_length1(value, indexer_sli):
  885. # For numeric data, assigning length-1 array to scalar position gets unpacked
  886. # TODO this only happens in case of ndarray, should we make this consistent
  887. # for all list-likes? (as happens for DataFrame.(i)loc, see test above)
  888. ser = Series([1.0, 2.0, 3.0])
  889. if isinstance(value, np.ndarray):
  890. indexer_sli(ser)[0] = value
  891. expected = Series([0.0, 2.0, 3.0])
  892. tm.assert_series_equal(ser, expected)
  893. else:
  894. with pytest.raises(
  895. ValueError, match="setting an array element with a sequence"
  896. ):
  897. indexer_sli(ser)[0] = value
  898. # but for object dtype we preserve the nested data
  899. ser = Series([1, "a", "b"], dtype=object)
  900. indexer_sli(ser)[0] = value
  901. if isinstance(value, np.ndarray):
  902. assert (ser.loc[0] == value).all()
  903. else:
  904. assert ser.loc[0] == value