test_replace.py 57 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577
  1. from __future__ import annotations
  2. from datetime import datetime
  3. import re
  4. import numpy as np
  5. import pytest
  6. import pandas as pd
  7. from pandas import (
  8. DataFrame,
  9. Index,
  10. Series,
  11. Timestamp,
  12. date_range,
  13. )
  14. import pandas._testing as tm
  15. @pytest.fixture
  16. def mix_ab() -> dict[str, list[int | str]]:
  17. return {"a": list(range(4)), "b": list("ab..")}
  18. @pytest.fixture
  19. def mix_abc() -> dict[str, list[float | str]]:
  20. return {"a": list(range(4)), "b": list("ab.."), "c": ["a", "b", np.nan, "d"]}
  21. class TestDataFrameReplace:
  22. def test_replace_inplace(self, datetime_frame, float_string_frame):
  23. datetime_frame.loc[datetime_frame.index[:5], "A"] = np.nan
  24. datetime_frame.loc[datetime_frame.index[-5:], "A"] = np.nan
  25. tsframe = datetime_frame.copy()
  26. return_value = tsframe.replace(np.nan, 0, inplace=True)
  27. assert return_value is None
  28. tm.assert_frame_equal(tsframe, datetime_frame.fillna(0))
  29. # mixed type
  30. mf = float_string_frame
  31. mf.iloc[5:20, mf.columns.get_loc("foo")] = np.nan
  32. mf.iloc[-10:, mf.columns.get_loc("A")] = np.nan
  33. result = float_string_frame.replace(np.nan, 0)
  34. expected = float_string_frame.fillna(value=0)
  35. tm.assert_frame_equal(result, expected)
  36. tsframe = datetime_frame.copy()
  37. return_value = tsframe.replace([np.nan], [0], inplace=True)
  38. assert return_value is None
  39. tm.assert_frame_equal(tsframe, datetime_frame.fillna(0))
  40. @pytest.mark.parametrize(
  41. "to_replace,values,expected",
  42. [
  43. # lists of regexes and values
  44. # list of [re1, re2, ..., reN] -> [v1, v2, ..., vN]
  45. (
  46. [r"\s*\.\s*", r"e|f|g"],
  47. [np.nan, "crap"],
  48. {
  49. "a": ["a", "b", np.nan, np.nan],
  50. "b": ["crap"] * 3 + ["h"],
  51. "c": ["h", "crap", "l", "o"],
  52. },
  53. ),
  54. # list of [re1, re2, ..., reN] -> [re1, re2, .., reN]
  55. (
  56. [r"\s*(\.)\s*", r"(e|f|g)"],
  57. [r"\1\1", r"\1_crap"],
  58. {
  59. "a": ["a", "b", "..", ".."],
  60. "b": ["e_crap", "f_crap", "g_crap", "h"],
  61. "c": ["h", "e_crap", "l", "o"],
  62. },
  63. ),
  64. # list of [re1, re2, ..., reN] -> [(re1 or v1), (re2 or v2), ..., (reN
  65. # or vN)]
  66. (
  67. [r"\s*(\.)\s*", r"e"],
  68. [r"\1\1", r"crap"],
  69. {
  70. "a": ["a", "b", "..", ".."],
  71. "b": ["crap", "f", "g", "h"],
  72. "c": ["h", "crap", "l", "o"],
  73. },
  74. ),
  75. ],
  76. )
  77. @pytest.mark.parametrize("inplace", [True, False])
  78. @pytest.mark.parametrize("use_value_regex_args", [True, False])
  79. def test_regex_replace_list_obj(
  80. self, to_replace, values, expected, inplace, use_value_regex_args
  81. ):
  82. df = DataFrame({"a": list("ab.."), "b": list("efgh"), "c": list("helo")})
  83. if use_value_regex_args:
  84. result = df.replace(value=values, regex=to_replace, inplace=inplace)
  85. else:
  86. result = df.replace(to_replace, values, regex=True, inplace=inplace)
  87. if inplace:
  88. assert result is None
  89. result = df
  90. expected = DataFrame(expected)
  91. tm.assert_frame_equal(result, expected)
  92. def test_regex_replace_list_mixed(self, mix_ab):
  93. # mixed frame to make sure this doesn't break things
  94. dfmix = DataFrame(mix_ab)
  95. # lists of regexes and values
  96. # list of [re1, re2, ..., reN] -> [v1, v2, ..., vN]
  97. to_replace_res = [r"\s*\.\s*", r"a"]
  98. values = [np.nan, "crap"]
  99. mix2 = {"a": list(range(4)), "b": list("ab.."), "c": list("halo")}
  100. dfmix2 = DataFrame(mix2)
  101. res = dfmix2.replace(to_replace_res, values, regex=True)
  102. expec = DataFrame(
  103. {
  104. "a": mix2["a"],
  105. "b": ["crap", "b", np.nan, np.nan],
  106. "c": ["h", "crap", "l", "o"],
  107. }
  108. )
  109. tm.assert_frame_equal(res, expec)
  110. # list of [re1, re2, ..., reN] -> [re1, re2, .., reN]
  111. to_replace_res = [r"\s*(\.)\s*", r"(a|b)"]
  112. values = [r"\1\1", r"\1_crap"]
  113. res = dfmix.replace(to_replace_res, values, regex=True)
  114. expec = DataFrame({"a": mix_ab["a"], "b": ["a_crap", "b_crap", "..", ".."]})
  115. tm.assert_frame_equal(res, expec)
  116. # list of [re1, re2, ..., reN] -> [(re1 or v1), (re2 or v2), ..., (reN
  117. # or vN)]
  118. to_replace_res = [r"\s*(\.)\s*", r"a", r"(b)"]
  119. values = [r"\1\1", r"crap", r"\1_crap"]
  120. res = dfmix.replace(to_replace_res, values, regex=True)
  121. expec = DataFrame({"a": mix_ab["a"], "b": ["crap", "b_crap", "..", ".."]})
  122. tm.assert_frame_equal(res, expec)
  123. to_replace_res = [r"\s*(\.)\s*", r"a", r"(b)"]
  124. values = [r"\1\1", r"crap", r"\1_crap"]
  125. res = dfmix.replace(regex=to_replace_res, value=values)
  126. expec = DataFrame({"a": mix_ab["a"], "b": ["crap", "b_crap", "..", ".."]})
  127. tm.assert_frame_equal(res, expec)
  128. def test_regex_replace_list_mixed_inplace(self, mix_ab):
  129. dfmix = DataFrame(mix_ab)
  130. # the same inplace
  131. # lists of regexes and values
  132. # list of [re1, re2, ..., reN] -> [v1, v2, ..., vN]
  133. to_replace_res = [r"\s*\.\s*", r"a"]
  134. values = [np.nan, "crap"]
  135. res = dfmix.copy()
  136. return_value = res.replace(to_replace_res, values, inplace=True, regex=True)
  137. assert return_value is None
  138. expec = DataFrame({"a": mix_ab["a"], "b": ["crap", "b", np.nan, np.nan]})
  139. tm.assert_frame_equal(res, expec)
  140. # list of [re1, re2, ..., reN] -> [re1, re2, .., reN]
  141. to_replace_res = [r"\s*(\.)\s*", r"(a|b)"]
  142. values = [r"\1\1", r"\1_crap"]
  143. res = dfmix.copy()
  144. return_value = res.replace(to_replace_res, values, inplace=True, regex=True)
  145. assert return_value is None
  146. expec = DataFrame({"a": mix_ab["a"], "b": ["a_crap", "b_crap", "..", ".."]})
  147. tm.assert_frame_equal(res, expec)
  148. # list of [re1, re2, ..., reN] -> [(re1 or v1), (re2 or v2), ..., (reN
  149. # or vN)]
  150. to_replace_res = [r"\s*(\.)\s*", r"a", r"(b)"]
  151. values = [r"\1\1", r"crap", r"\1_crap"]
  152. res = dfmix.copy()
  153. return_value = res.replace(to_replace_res, values, inplace=True, regex=True)
  154. assert return_value is None
  155. expec = DataFrame({"a": mix_ab["a"], "b": ["crap", "b_crap", "..", ".."]})
  156. tm.assert_frame_equal(res, expec)
  157. to_replace_res = [r"\s*(\.)\s*", r"a", r"(b)"]
  158. values = [r"\1\1", r"crap", r"\1_crap"]
  159. res = dfmix.copy()
  160. return_value = res.replace(regex=to_replace_res, value=values, inplace=True)
  161. assert return_value is None
  162. expec = DataFrame({"a": mix_ab["a"], "b": ["crap", "b_crap", "..", ".."]})
  163. tm.assert_frame_equal(res, expec)
  164. def test_regex_replace_dict_mixed(self, mix_abc):
  165. dfmix = DataFrame(mix_abc)
  166. # dicts
  167. # single dict {re1: v1}, search the whole frame
  168. # need test for this...
  169. # list of dicts {re1: v1, re2: v2, ..., re3: v3}, search the whole
  170. # frame
  171. res = dfmix.replace({"b": r"\s*\.\s*"}, {"b": np.nan}, regex=True)
  172. res2 = dfmix.copy()
  173. return_value = res2.replace(
  174. {"b": r"\s*\.\s*"}, {"b": np.nan}, inplace=True, regex=True
  175. )
  176. assert return_value is None
  177. expec = DataFrame(
  178. {"a": mix_abc["a"], "b": ["a", "b", np.nan, np.nan], "c": mix_abc["c"]}
  179. )
  180. tm.assert_frame_equal(res, expec)
  181. tm.assert_frame_equal(res2, expec)
  182. # list of dicts {re1: re11, re2: re12, ..., reN: re1N}, search the
  183. # whole frame
  184. res = dfmix.replace({"b": r"\s*(\.)\s*"}, {"b": r"\1ty"}, regex=True)
  185. res2 = dfmix.copy()
  186. return_value = res2.replace(
  187. {"b": r"\s*(\.)\s*"}, {"b": r"\1ty"}, inplace=True, regex=True
  188. )
  189. assert return_value is None
  190. expec = DataFrame(
  191. {"a": mix_abc["a"], "b": ["a", "b", ".ty", ".ty"], "c": mix_abc["c"]}
  192. )
  193. tm.assert_frame_equal(res, expec)
  194. tm.assert_frame_equal(res2, expec)
  195. res = dfmix.replace(regex={"b": r"\s*(\.)\s*"}, value={"b": r"\1ty"})
  196. res2 = dfmix.copy()
  197. return_value = res2.replace(
  198. regex={"b": r"\s*(\.)\s*"}, value={"b": r"\1ty"}, inplace=True
  199. )
  200. assert return_value is None
  201. expec = DataFrame(
  202. {"a": mix_abc["a"], "b": ["a", "b", ".ty", ".ty"], "c": mix_abc["c"]}
  203. )
  204. tm.assert_frame_equal(res, expec)
  205. tm.assert_frame_equal(res2, expec)
  206. # scalar -> dict
  207. # to_replace regex, {value: value}
  208. expec = DataFrame(
  209. {"a": mix_abc["a"], "b": [np.nan, "b", ".", "."], "c": mix_abc["c"]}
  210. )
  211. res = dfmix.replace("a", {"b": np.nan}, regex=True)
  212. res2 = dfmix.copy()
  213. return_value = res2.replace("a", {"b": np.nan}, regex=True, inplace=True)
  214. assert return_value is None
  215. tm.assert_frame_equal(res, expec)
  216. tm.assert_frame_equal(res2, expec)
  217. res = dfmix.replace("a", {"b": np.nan}, regex=True)
  218. res2 = dfmix.copy()
  219. return_value = res2.replace(regex="a", value={"b": np.nan}, inplace=True)
  220. assert return_value is None
  221. expec = DataFrame(
  222. {"a": mix_abc["a"], "b": [np.nan, "b", ".", "."], "c": mix_abc["c"]}
  223. )
  224. tm.assert_frame_equal(res, expec)
  225. tm.assert_frame_equal(res2, expec)
  226. def test_regex_replace_dict_nested(self, mix_abc):
  227. # nested dicts will not work until this is implemented for Series
  228. dfmix = DataFrame(mix_abc)
  229. res = dfmix.replace({"b": {r"\s*\.\s*": np.nan}}, regex=True)
  230. res2 = dfmix.copy()
  231. res4 = dfmix.copy()
  232. return_value = res2.replace(
  233. {"b": {r"\s*\.\s*": np.nan}}, inplace=True, regex=True
  234. )
  235. assert return_value is None
  236. res3 = dfmix.replace(regex={"b": {r"\s*\.\s*": np.nan}})
  237. return_value = res4.replace(regex={"b": {r"\s*\.\s*": np.nan}}, inplace=True)
  238. assert return_value is None
  239. expec = DataFrame(
  240. {"a": mix_abc["a"], "b": ["a", "b", np.nan, np.nan], "c": mix_abc["c"]}
  241. )
  242. tm.assert_frame_equal(res, expec)
  243. tm.assert_frame_equal(res2, expec)
  244. tm.assert_frame_equal(res3, expec)
  245. tm.assert_frame_equal(res4, expec)
  246. def test_regex_replace_dict_nested_non_first_character(self, any_string_dtype):
  247. # GH 25259
  248. dtype = any_string_dtype
  249. df = DataFrame({"first": ["abc", "bca", "cab"]}, dtype=dtype)
  250. expected = DataFrame({"first": [".bc", "bc.", "c.b"]}, dtype=dtype)
  251. result = df.replace({"a": "."}, regex=True)
  252. tm.assert_frame_equal(result, expected)
  253. def test_regex_replace_dict_nested_gh4115(self):
  254. df = DataFrame({"Type": ["Q", "T", "Q", "Q", "T"], "tmp": 2})
  255. expected = DataFrame({"Type": [0, 1, 0, 0, 1], "tmp": 2})
  256. result = df.replace({"Type": {"Q": 0, "T": 1}})
  257. tm.assert_frame_equal(result, expected)
  258. def test_regex_replace_list_to_scalar(self, mix_abc):
  259. df = DataFrame(mix_abc)
  260. expec = DataFrame(
  261. {
  262. "a": mix_abc["a"],
  263. "b": np.array([np.nan] * 4),
  264. "c": [np.nan, np.nan, np.nan, "d"],
  265. }
  266. )
  267. res = df.replace([r"\s*\.\s*", "a|b"], np.nan, regex=True)
  268. res2 = df.copy()
  269. res3 = df.copy()
  270. return_value = res2.replace(
  271. [r"\s*\.\s*", "a|b"], np.nan, regex=True, inplace=True
  272. )
  273. assert return_value is None
  274. return_value = res3.replace(
  275. regex=[r"\s*\.\s*", "a|b"], value=np.nan, inplace=True
  276. )
  277. assert return_value is None
  278. tm.assert_frame_equal(res, expec)
  279. tm.assert_frame_equal(res2, expec)
  280. tm.assert_frame_equal(res3, expec)
  281. def test_regex_replace_str_to_numeric(self, mix_abc):
  282. # what happens when you try to replace a numeric value with a regex?
  283. df = DataFrame(mix_abc)
  284. res = df.replace(r"\s*\.\s*", 0, regex=True)
  285. res2 = df.copy()
  286. return_value = res2.replace(r"\s*\.\s*", 0, inplace=True, regex=True)
  287. assert return_value is None
  288. res3 = df.copy()
  289. return_value = res3.replace(regex=r"\s*\.\s*", value=0, inplace=True)
  290. assert return_value is None
  291. expec = DataFrame({"a": mix_abc["a"], "b": ["a", "b", 0, 0], "c": mix_abc["c"]})
  292. tm.assert_frame_equal(res, expec)
  293. tm.assert_frame_equal(res2, expec)
  294. tm.assert_frame_equal(res3, expec)
  295. def test_regex_replace_regex_list_to_numeric(self, mix_abc):
  296. df = DataFrame(mix_abc)
  297. res = df.replace([r"\s*\.\s*", "b"], 0, regex=True)
  298. res2 = df.copy()
  299. return_value = res2.replace([r"\s*\.\s*", "b"], 0, regex=True, inplace=True)
  300. assert return_value is None
  301. res3 = df.copy()
  302. return_value = res3.replace(regex=[r"\s*\.\s*", "b"], value=0, inplace=True)
  303. assert return_value is None
  304. expec = DataFrame(
  305. {"a": mix_abc["a"], "b": ["a", 0, 0, 0], "c": ["a", 0, np.nan, "d"]}
  306. )
  307. tm.assert_frame_equal(res, expec)
  308. tm.assert_frame_equal(res2, expec)
  309. tm.assert_frame_equal(res3, expec)
  310. def test_regex_replace_series_of_regexes(self, mix_abc):
  311. df = DataFrame(mix_abc)
  312. s1 = Series({"b": r"\s*\.\s*"})
  313. s2 = Series({"b": np.nan})
  314. res = df.replace(s1, s2, regex=True)
  315. res2 = df.copy()
  316. return_value = res2.replace(s1, s2, inplace=True, regex=True)
  317. assert return_value is None
  318. res3 = df.copy()
  319. return_value = res3.replace(regex=s1, value=s2, inplace=True)
  320. assert return_value is None
  321. expec = DataFrame(
  322. {"a": mix_abc["a"], "b": ["a", "b", np.nan, np.nan], "c": mix_abc["c"]}
  323. )
  324. tm.assert_frame_equal(res, expec)
  325. tm.assert_frame_equal(res2, expec)
  326. tm.assert_frame_equal(res3, expec)
  327. def test_regex_replace_numeric_to_object_conversion(self, mix_abc):
  328. df = DataFrame(mix_abc)
  329. expec = DataFrame({"a": ["a", 1, 2, 3], "b": mix_abc["b"], "c": mix_abc["c"]})
  330. res = df.replace(0, "a")
  331. tm.assert_frame_equal(res, expec)
  332. assert res.a.dtype == np.object_
  333. @pytest.mark.parametrize(
  334. "to_replace", [{"": np.nan, ",": ""}, {",": "", "": np.nan}]
  335. )
  336. def test_joint_simple_replace_and_regex_replace(self, to_replace):
  337. # GH-39338
  338. df = DataFrame(
  339. {
  340. "col1": ["1,000", "a", "3"],
  341. "col2": ["a", "", "b"],
  342. "col3": ["a", "b", "c"],
  343. }
  344. )
  345. result = df.replace(regex=to_replace)
  346. expected = DataFrame(
  347. {
  348. "col1": ["1000", "a", "3"],
  349. "col2": ["a", np.nan, "b"],
  350. "col3": ["a", "b", "c"],
  351. }
  352. )
  353. tm.assert_frame_equal(result, expected)
  354. @pytest.mark.parametrize("metachar", ["[]", "()", r"\d", r"\w", r"\s"])
  355. def test_replace_regex_metachar(self, metachar):
  356. df = DataFrame({"a": [metachar, "else"]})
  357. result = df.replace({"a": {metachar: "paren"}})
  358. expected = DataFrame({"a": ["paren", "else"]})
  359. tm.assert_frame_equal(result, expected)
  360. @pytest.mark.parametrize(
  361. "data,to_replace,expected",
  362. [
  363. (["xax", "xbx"], {"a": "c", "b": "d"}, ["xcx", "xdx"]),
  364. (["d", "", ""], {r"^\s*$": pd.NA}, ["d", pd.NA, pd.NA]),
  365. ],
  366. )
  367. def test_regex_replace_string_types(
  368. self, data, to_replace, expected, frame_or_series, any_string_dtype
  369. ):
  370. # GH-41333, GH-35977
  371. dtype = any_string_dtype
  372. obj = frame_or_series(data, dtype=dtype)
  373. result = obj.replace(to_replace, regex=True)
  374. expected = frame_or_series(expected, dtype=dtype)
  375. tm.assert_equal(result, expected)
  376. def test_replace(self, datetime_frame):
  377. datetime_frame.loc[datetime_frame.index[:5], "A"] = np.nan
  378. datetime_frame.loc[datetime_frame.index[-5:], "A"] = np.nan
  379. zero_filled = datetime_frame.replace(np.nan, -1e8)
  380. tm.assert_frame_equal(zero_filled, datetime_frame.fillna(-1e8))
  381. tm.assert_frame_equal(zero_filled.replace(-1e8, np.nan), datetime_frame)
  382. datetime_frame.loc[datetime_frame.index[:5], "A"] = np.nan
  383. datetime_frame.loc[datetime_frame.index[-5:], "A"] = np.nan
  384. datetime_frame.loc[datetime_frame.index[:5], "B"] = -1e8
  385. # empty
  386. df = DataFrame(index=["a", "b"])
  387. tm.assert_frame_equal(df, df.replace(5, 7))
  388. # GH 11698
  389. # test for mixed data types.
  390. df = DataFrame(
  391. [("-", pd.to_datetime("20150101")), ("a", pd.to_datetime("20150102"))]
  392. )
  393. df1 = df.replace("-", np.nan)
  394. expected_df = DataFrame(
  395. [(np.nan, pd.to_datetime("20150101")), ("a", pd.to_datetime("20150102"))]
  396. )
  397. tm.assert_frame_equal(df1, expected_df)
  398. def test_replace_list(self):
  399. obj = {"a": list("ab.."), "b": list("efgh"), "c": list("helo")}
  400. dfobj = DataFrame(obj)
  401. # lists of regexes and values
  402. # list of [v1, v2, ..., vN] -> [v1, v2, ..., vN]
  403. to_replace_res = [r".", r"e"]
  404. values = [np.nan, "crap"]
  405. res = dfobj.replace(to_replace_res, values)
  406. expec = DataFrame(
  407. {
  408. "a": ["a", "b", np.nan, np.nan],
  409. "b": ["crap", "f", "g", "h"],
  410. "c": ["h", "crap", "l", "o"],
  411. }
  412. )
  413. tm.assert_frame_equal(res, expec)
  414. # list of [v1, v2, ..., vN] -> [v1, v2, .., vN]
  415. to_replace_res = [r".", r"f"]
  416. values = [r"..", r"crap"]
  417. res = dfobj.replace(to_replace_res, values)
  418. expec = DataFrame(
  419. {
  420. "a": ["a", "b", "..", ".."],
  421. "b": ["e", "crap", "g", "h"],
  422. "c": ["h", "e", "l", "o"],
  423. }
  424. )
  425. tm.assert_frame_equal(res, expec)
  426. def test_replace_with_empty_list(self, frame_or_series):
  427. # GH 21977
  428. ser = Series([["a", "b"], [], np.nan, [1]])
  429. obj = DataFrame({"col": ser})
  430. obj = tm.get_obj(obj, frame_or_series)
  431. expected = obj
  432. result = obj.replace([], np.nan)
  433. tm.assert_equal(result, expected)
  434. # GH 19266
  435. msg = (
  436. "NumPy boolean array indexing assignment cannot assign {size} "
  437. "input values to the 1 output values where the mask is true"
  438. )
  439. with pytest.raises(ValueError, match=msg.format(size=0)):
  440. obj.replace({np.nan: []})
  441. with pytest.raises(ValueError, match=msg.format(size=2)):
  442. obj.replace({np.nan: ["dummy", "alt"]})
  443. def test_replace_series_dict(self):
  444. # from GH 3064
  445. df = DataFrame({"zero": {"a": 0.0, "b": 1}, "one": {"a": 2.0, "b": 0}})
  446. result = df.replace(0, {"zero": 0.5, "one": 1.0})
  447. expected = DataFrame({"zero": {"a": 0.5, "b": 1}, "one": {"a": 2.0, "b": 1.0}})
  448. tm.assert_frame_equal(result, expected)
  449. result = df.replace(0, df.mean())
  450. tm.assert_frame_equal(result, expected)
  451. # series to series/dict
  452. df = DataFrame({"zero": {"a": 0.0, "b": 1}, "one": {"a": 2.0, "b": 0}})
  453. s = Series({"zero": 0.0, "one": 2.0})
  454. result = df.replace(s, {"zero": 0.5, "one": 1.0})
  455. expected = DataFrame({"zero": {"a": 0.5, "b": 1}, "one": {"a": 1.0, "b": 0.0}})
  456. tm.assert_frame_equal(result, expected)
  457. result = df.replace(s, df.mean())
  458. tm.assert_frame_equal(result, expected)
  459. def test_replace_convert(self):
  460. # gh 3907
  461. df = DataFrame([["foo", "bar", "bah"], ["bar", "foo", "bah"]])
  462. m = {"foo": 1, "bar": 2, "bah": 3}
  463. rep = df.replace(m)
  464. expec = Series([np.int64] * 3)
  465. res = rep.dtypes
  466. tm.assert_series_equal(expec, res)
  467. def test_replace_mixed(self, float_string_frame):
  468. mf = float_string_frame
  469. mf.iloc[5:20, mf.columns.get_loc("foo")] = np.nan
  470. mf.iloc[-10:, mf.columns.get_loc("A")] = np.nan
  471. result = float_string_frame.replace(np.nan, -18)
  472. expected = float_string_frame.fillna(value=-18)
  473. tm.assert_frame_equal(result, expected)
  474. tm.assert_frame_equal(result.replace(-18, np.nan), float_string_frame)
  475. result = float_string_frame.replace(np.nan, -1e8)
  476. expected = float_string_frame.fillna(value=-1e8)
  477. tm.assert_frame_equal(result, expected)
  478. tm.assert_frame_equal(result.replace(-1e8, np.nan), float_string_frame)
  479. def test_replace_mixed_int_block_upcasting(self):
  480. # int block upcasting
  481. df = DataFrame(
  482. {
  483. "A": Series([1.0, 2.0], dtype="float64"),
  484. "B": Series([0, 1], dtype="int64"),
  485. }
  486. )
  487. expected = DataFrame(
  488. {
  489. "A": Series([1.0, 2.0], dtype="float64"),
  490. "B": Series([0.5, 1], dtype="float64"),
  491. }
  492. )
  493. result = df.replace(0, 0.5)
  494. tm.assert_frame_equal(result, expected)
  495. return_value = df.replace(0, 0.5, inplace=True)
  496. assert return_value is None
  497. tm.assert_frame_equal(df, expected)
  498. def test_replace_mixed_int_block_splitting(self):
  499. # int block splitting
  500. df = DataFrame(
  501. {
  502. "A": Series([1.0, 2.0], dtype="float64"),
  503. "B": Series([0, 1], dtype="int64"),
  504. "C": Series([1, 2], dtype="int64"),
  505. }
  506. )
  507. expected = DataFrame(
  508. {
  509. "A": Series([1.0, 2.0], dtype="float64"),
  510. "B": Series([0.5, 1], dtype="float64"),
  511. "C": Series([1, 2], dtype="int64"),
  512. }
  513. )
  514. result = df.replace(0, 0.5)
  515. tm.assert_frame_equal(result, expected)
  516. def test_replace_mixed2(self):
  517. # to object block upcasting
  518. df = DataFrame(
  519. {
  520. "A": Series([1.0, 2.0], dtype="float64"),
  521. "B": Series([0, 1], dtype="int64"),
  522. }
  523. )
  524. expected = DataFrame(
  525. {
  526. "A": Series([1, "foo"], dtype="object"),
  527. "B": Series([0, 1], dtype="int64"),
  528. }
  529. )
  530. result = df.replace(2, "foo")
  531. tm.assert_frame_equal(result, expected)
  532. expected = DataFrame(
  533. {
  534. "A": Series(["foo", "bar"], dtype="object"),
  535. "B": Series([0, "foo"], dtype="object"),
  536. }
  537. )
  538. result = df.replace([1, 2], ["foo", "bar"])
  539. tm.assert_frame_equal(result, expected)
  540. def test_replace_mixed3(self):
  541. # test case from
  542. df = DataFrame(
  543. {"A": Series([3, 0], dtype="int64"), "B": Series([0, 3], dtype="int64")}
  544. )
  545. result = df.replace(3, df.mean().to_dict())
  546. expected = df.copy().astype("float64")
  547. m = df.mean()
  548. expected.iloc[0, 0] = m[0]
  549. expected.iloc[1, 1] = m[1]
  550. tm.assert_frame_equal(result, expected)
  551. def test_replace_nullable_int_with_string_doesnt_cast(self):
  552. # GH#25438 don't cast df['a'] to float64
  553. df = DataFrame({"a": [1, 2, 3, np.nan], "b": ["some", "strings", "here", "he"]})
  554. df["a"] = df["a"].astype("Int64")
  555. res = df.replace("", np.nan)
  556. tm.assert_series_equal(res["a"], df["a"])
  557. @pytest.mark.parametrize("dtype", ["boolean", "Int64", "Float64"])
  558. def test_replace_with_nullable_column(self, dtype):
  559. # GH-44499
  560. nullable_ser = Series([1, 0, 1], dtype=dtype)
  561. df = DataFrame({"A": ["A", "B", "x"], "B": nullable_ser})
  562. result = df.replace("x", "X")
  563. expected = DataFrame({"A": ["A", "B", "X"], "B": nullable_ser})
  564. tm.assert_frame_equal(result, expected)
  565. def test_replace_simple_nested_dict(self):
  566. df = DataFrame({"col": range(1, 5)})
  567. expected = DataFrame({"col": ["a", 2, 3, "b"]})
  568. result = df.replace({"col": {1: "a", 4: "b"}})
  569. tm.assert_frame_equal(expected, result)
  570. # in this case, should be the same as the not nested version
  571. result = df.replace({1: "a", 4: "b"})
  572. tm.assert_frame_equal(expected, result)
  573. def test_replace_simple_nested_dict_with_nonexistent_value(self):
  574. df = DataFrame({"col": range(1, 5)})
  575. expected = DataFrame({"col": ["a", 2, 3, "b"]})
  576. result = df.replace({-1: "-", 1: "a", 4: "b"})
  577. tm.assert_frame_equal(expected, result)
  578. result = df.replace({"col": {-1: "-", 1: "a", 4: "b"}})
  579. tm.assert_frame_equal(expected, result)
  580. def test_replace_NA_with_None(self):
  581. # gh-45601
  582. df = DataFrame({"value": [42, None]}).astype({"value": "Int64"})
  583. result = df.replace({pd.NA: None})
  584. expected = DataFrame({"value": [42, None]}, dtype=object)
  585. tm.assert_frame_equal(result, expected)
  586. def test_replace_NAT_with_None(self):
  587. # gh-45836
  588. df = DataFrame([pd.NaT, pd.NaT])
  589. result = df.replace({pd.NaT: None, np.NaN: None})
  590. expected = DataFrame([None, None])
  591. tm.assert_frame_equal(result, expected)
  592. def test_replace_with_None_keeps_categorical(self):
  593. # gh-46634
  594. cat_series = Series(["b", "b", "b", "d"], dtype="category")
  595. df = DataFrame(
  596. {
  597. "id": Series([5, 4, 3, 2], dtype="float64"),
  598. "col": cat_series,
  599. }
  600. )
  601. result = df.replace({3: None})
  602. expected = DataFrame(
  603. {
  604. "id": Series([5.0, 4.0, None, 2.0], dtype="object"),
  605. "col": cat_series,
  606. }
  607. )
  608. tm.assert_frame_equal(result, expected)
  609. def test_replace_value_is_none(self, datetime_frame):
  610. orig_value = datetime_frame.iloc[0, 0]
  611. orig2 = datetime_frame.iloc[1, 0]
  612. datetime_frame.iloc[0, 0] = np.nan
  613. datetime_frame.iloc[1, 0] = 1
  614. result = datetime_frame.replace(to_replace={np.nan: 0})
  615. expected = datetime_frame.T.replace(to_replace={np.nan: 0}).T
  616. tm.assert_frame_equal(result, expected)
  617. result = datetime_frame.replace(to_replace={np.nan: 0, 1: -1e8})
  618. tsframe = datetime_frame.copy()
  619. tsframe.iloc[0, 0] = 0
  620. tsframe.iloc[1, 0] = -1e8
  621. expected = tsframe
  622. tm.assert_frame_equal(expected, result)
  623. datetime_frame.iloc[0, 0] = orig_value
  624. datetime_frame.iloc[1, 0] = orig2
  625. def test_replace_for_new_dtypes(self, datetime_frame):
  626. # dtypes
  627. tsframe = datetime_frame.copy().astype(np.float32)
  628. tsframe.loc[tsframe.index[:5], "A"] = np.nan
  629. tsframe.loc[tsframe.index[-5:], "A"] = np.nan
  630. zero_filled = tsframe.replace(np.nan, -1e8)
  631. tm.assert_frame_equal(zero_filled, tsframe.fillna(-1e8))
  632. tm.assert_frame_equal(zero_filled.replace(-1e8, np.nan), tsframe)
  633. tsframe.loc[tsframe.index[:5], "A"] = np.nan
  634. tsframe.loc[tsframe.index[-5:], "A"] = np.nan
  635. tsframe.loc[tsframe.index[:5], "B"] = -1e8
  636. b = tsframe["B"]
  637. b[b == -1e8] = np.nan
  638. tsframe["B"] = b
  639. result = tsframe.fillna(method="bfill")
  640. tm.assert_frame_equal(result, tsframe.fillna(method="bfill"))
  @pytest.mark.parametrize(
      "frame, to_replace, value, expected",
      [
          # default integer dtype
          (DataFrame({"ints": [1, 2, 3]}), 1, 0, DataFrame({"ints": [0, 2, 3]})),
          # explicit narrower integer dtypes
          *[
              (
                  DataFrame({"ints": [1, 2, 3]}, dtype=int_dtype),
                  1,
                  0,
                  DataFrame({"ints": [0, 2, 3]}, dtype=int_dtype),
              )
              for int_dtype in (np.int32, np.int16)
          ],
          # booleans
          (
              DataFrame({"bools": [True, False, True]}),
              False,
              True,
              DataFrame({"bools": [True, True, True]}),
          ),
          # complex numbers
          (
              DataFrame({"complex": [1j, 2j, 3j]}),
              1j,
              0,
              DataFrame({"complex": [0j, 2j, 3j]}),
          ),
          # naive datetimes held in an Index
          (
              DataFrame(
                  {
                      "datetime64": Index(
                          [
                              datetime(2018, 5, 28),
                              datetime(2018, 7, 28),
                              datetime(2018, 5, 28),
                          ]
                      )
                  }
              ),
              datetime(2018, 5, 28),
              datetime(2018, 7, 28),
              DataFrame({"datetime64": Index([datetime(2018, 7, 28)] * 3)}),
          ),
          # GH 20380
          (
              DataFrame({"dt": [datetime(3017, 12, 20)], "str": ["foo"]}),
              "foo",
              "bar",
              DataFrame({"dt": [datetime(3017, 12, 20)], "str": ["bar"]}),
          ),
          # GH 36782
          (
              DataFrame({"dt": [datetime(2920, 10, 1)]}),
              datetime(2920, 10, 1),
              datetime(2020, 10, 1),
              DataFrame({"dt": [datetime(2020, 10, 1)]}),
          ),
          # tz-aware timestamps next to a float column
          (
              DataFrame(
                  {
                      "A": date_range("20130101", periods=3, tz="US/Eastern"),
                      "B": [0, np.nan, 2],
                  }
              ),
              Timestamp("20130102", tz="US/Eastern"),
              Timestamp("20130104", tz="US/Eastern"),
              DataFrame(
                  {
                      "A": [
                          Timestamp("20130101", tz="US/Eastern"),
                          Timestamp("20130104", tz="US/Eastern"),
                          Timestamp("20130103", tz="US/Eastern"),
                      ],
                      "B": [0, np.nan, 2],
                  }
              ),
          ),
          # GH 35376: every int/float pairing of target and replacement
          *[
              (
                  DataFrame([[1, 1.0], [2, 2.0]]),
                  old,
                  new,
                  DataFrame([[5, 5.0], [2, 2.0]]),
              )
              for old, new in ((1.0, 5), (1, 5), (1.0, 5.0), (1, 5.0))
          ],
      ],
  )
  def test_replace_dtypes(self, frame, to_replace, value, expected):
      """Scalar replace on ``frame`` must yield exactly ``expected``."""
      tm.assert_frame_equal(frame.replace(to_replace, value), expected)
  def test_replace_input_formats_listlike(self):
      """Cover dict/dict, list/list and scalar/dict argument pairings."""
      # both dicts
      targets = {"A": np.nan, "B": 0, "C": ""}
      fills = {"A": 0, "B": -1, "C": "missing"}
      df = DataFrame(
          {"A": [np.nan, 0, np.inf], "B": [0, 2, 5], "C": ["", "asdf", "fd"]}
      )
      filled = df.replace(targets, fills)
      per_column = {}
      for name, column in df.items():
          per_column[name] = column.replace(targets[name], fills[name])
      tm.assert_frame_equal(filled, DataFrame(per_column))

      swapped = df.replace([0, 2, 5], [5, 2, 0])
      swapped_expected = DataFrame(
          {"A": [np.nan, 5, np.inf], "B": [5, 2, 0], "C": ["", "asdf", "fd"]}
      )
      tm.assert_frame_equal(swapped, swapped_expected)

      # scalar to dict
      fills = {"A": 0, "B": -1, "C": "missing"}
      df = DataFrame(
          {"A": [np.nan, 0, np.nan], "B": [0, 2, 5], "C": ["", "asdf", "fd"]}
      )
      filled = df.replace(np.nan, fills)
      per_column = {}
      for name, column in df.items():
          per_column[name] = column.replace(np.nan, fills[name])
      tm.assert_frame_equal(filled, DataFrame(per_column))

      # list to list
      targets = [np.nan, 0, ""]
      fills = [-2, -1, "missing"]
      result = df.replace(targets, fills)
      expected = df.copy()
      for position, target in enumerate(targets):
          outcome = expected.replace(target, fills[position], inplace=True)
          assert outcome is None
      tm.assert_frame_equal(result, expected)

      # Replacement list shorter than the target list is rejected.
      msg = r"Replacement lists must match in length\. Expecting 3 got 2"
      with pytest.raises(ValueError, match=msg):
          df.replace(targets, fills[1:])
  def test_replace_input_formats_scalar(self):
      """Cover dict/scalar and list/scalar argument pairings."""
      df = DataFrame(
          {"A": [np.nan, 0, np.inf], "B": [0, 2, 5], "C": ["", "asdf", "fd"]}
      )

      # dict to scalar
      targets = {"A": np.nan, "B": 0, "C": ""}
      filled = df.replace(targets, 0)
      per_column = {}
      for name, column in df.items():
          per_column[name] = column.replace(targets[name], 0)
      tm.assert_frame_equal(filled, DataFrame(per_column))

      # A dict of targets combined with a list of values raises.
      msg = "value argument must be scalar, dict, or Series"
      with pytest.raises(TypeError, match=msg):
          df.replace(targets, [np.nan, 0, ""])

      # list to scalar
      targets = [np.nan, 0, ""]
      result = df.replace(targets, -1)
      expected = df.copy()
      for target in targets:
          outcome = expected.replace(target, -1, inplace=True)
          assert outcome is None
      tm.assert_frame_equal(result, expected)
  804. def test_replace_limit(self):
  805. # TODO
  806. pass
  807. def test_replace_dict_no_regex(self):
  808. answer = Series(
  809. {
  810. 0: "Strongly Agree",
  811. 1: "Agree",
  812. 2: "Neutral",
  813. 3: "Disagree",
  814. 4: "Strongly Disagree",
  815. }
  816. )
  817. weights = {
  818. "Agree": 4,
  819. "Disagree": 2,
  820. "Neutral": 3,
  821. "Strongly Agree": 5,
  822. "Strongly Disagree": 1,
  823. }
  824. expected = Series({0: 5, 1: 4, 2: 3, 3: 2, 4: 1})
  825. result = answer.replace(weights)
  826. tm.assert_series_equal(result, expected)
  827. def test_replace_series_no_regex(self):
  828. answer = Series(
  829. {
  830. 0: "Strongly Agree",
  831. 1: "Agree",
  832. 2: "Neutral",
  833. 3: "Disagree",
  834. 4: "Strongly Disagree",
  835. }
  836. )
  837. weights = Series(
  838. {
  839. "Agree": 4,
  840. "Disagree": 2,
  841. "Neutral": 3,
  842. "Strongly Agree": 5,
  843. "Strongly Disagree": 1,
  844. }
  845. )
  846. expected = Series({0: 5, 1: 4, 2: 3, 3: 2, 4: 1})
  847. result = answer.replace(weights)
  848. tm.assert_series_equal(result, expected)
  849. def test_replace_dict_tuple_list_ordering_remains_the_same(self):
  850. df = DataFrame({"A": [np.nan, 1]})
  851. res1 = df.replace(to_replace={np.nan: 0, 1: -1e8})
  852. res2 = df.replace(to_replace=(1, np.nan), value=[-1e8, 0])
  853. res3 = df.replace(to_replace=[1, np.nan], value=[-1e8, 0])
  854. expected = DataFrame({"A": [0, -1e8]})
  855. tm.assert_frame_equal(res1, res2)
  856. tm.assert_frame_equal(res2, res3)
  857. tm.assert_frame_equal(res3, expected)
  858. def test_replace_doesnt_replace_without_regex(self):
  859. df = DataFrame(
  860. {
  861. "fol": [1, 2, 2, 3],
  862. "T_opp": ["0", "vr", "0", "0"],
  863. "T_Dir": ["0", "0", "0", "bt"],
  864. "T_Enh": ["vo", "0", "0", "0"],
  865. }
  866. )
  867. res = df.replace({r"\D": 1})
  868. tm.assert_frame_equal(df, res)
  869. def test_replace_bool_with_string(self):
  870. df = DataFrame({"a": [True, False], "b": list("ab")})
  871. result = df.replace(True, "a")
  872. expected = DataFrame({"a": ["a", False], "b": df.b})
  873. tm.assert_frame_equal(result, expected)
  874. def test_replace_pure_bool_with_string_no_op(self):
  875. df = DataFrame(np.random.rand(2, 2) > 0.5)
  876. result = df.replace("asdf", "fdsa")
  877. tm.assert_frame_equal(df, result)
  878. def test_replace_bool_with_bool(self):
  879. df = DataFrame(np.random.rand(2, 2) > 0.5)
  880. result = df.replace(False, True)
  881. expected = DataFrame(np.ones((2, 2), dtype=bool))
  882. tm.assert_frame_equal(result, expected)
  883. def test_replace_with_dict_with_bool_keys(self):
  884. df = DataFrame({0: [True, False], 1: [False, True]})
  885. result = df.replace({"asdf": "asdb", True: "yes"})
  886. expected = DataFrame({0: ["yes", False], 1: [False, "yes"]})
  887. tm.assert_frame_equal(result, expected)
  888. def test_replace_dict_strings_vs_ints(self):
  889. # GH#34789
  890. df = DataFrame({"Y0": [1, 2], "Y1": [3, 4]})
  891. result = df.replace({"replace_string": "test"})
  892. tm.assert_frame_equal(result, df)
  893. result = df["Y0"].replace({"replace_string": "test"})
  894. tm.assert_series_equal(result, df["Y0"])
  895. def test_replace_truthy(self):
  896. df = DataFrame({"a": [True, True]})
  897. r = df.replace([np.inf, -np.inf], np.nan)
  898. e = df
  899. tm.assert_frame_equal(r, e)
  900. def test_nested_dict_overlapping_keys_replace_int(self):
  901. # GH 27660 keep behaviour consistent for simple dictionary and
  902. # nested dictionary replacement
  903. df = DataFrame({"a": list(range(1, 5))})
  904. result = df.replace({"a": dict(zip(range(1, 5), range(2, 6)))})
  905. expected = df.replace(dict(zip(range(1, 5), range(2, 6))))
  906. tm.assert_frame_equal(result, expected)
  907. def test_nested_dict_overlapping_keys_replace_str(self):
  908. # GH 27660
  909. a = np.arange(1, 5)
  910. astr = a.astype(str)
  911. bstr = np.arange(2, 6).astype(str)
  912. df = DataFrame({"a": astr})
  913. result = df.replace(dict(zip(astr, bstr)))
  914. expected = df.replace({"a": dict(zip(astr, bstr))})
  915. tm.assert_frame_equal(result, expected)
  916. def test_replace_swapping_bug(self):
  917. df = DataFrame({"a": [True, False, True]})
  918. res = df.replace({"a": {True: "Y", False: "N"}})
  919. expect = DataFrame({"a": ["Y", "N", "Y"]})
  920. tm.assert_frame_equal(res, expect)
  921. df = DataFrame({"a": [0, 1, 0]})
  922. res = df.replace({"a": {0: "Y", 1: "N"}})
  923. expect = DataFrame({"a": ["Y", "N", "Y"]})
  924. tm.assert_frame_equal(res, expect)
  925. def test_replace_period(self):
  926. d = {
  927. "fname": {
  928. "out_augmented_AUG_2011.json": pd.Period(year=2011, month=8, freq="M"),
  929. "out_augmented_JAN_2011.json": pd.Period(year=2011, month=1, freq="M"),
  930. "out_augmented_MAY_2012.json": pd.Period(year=2012, month=5, freq="M"),
  931. "out_augmented_SUBSIDY_WEEK.json": pd.Period(
  932. year=2011, month=4, freq="M"
  933. ),
  934. "out_augmented_AUG_2012.json": pd.Period(year=2012, month=8, freq="M"),
  935. "out_augmented_MAY_2011.json": pd.Period(year=2011, month=5, freq="M"),
  936. "out_augmented_SEP_2013.json": pd.Period(year=2013, month=9, freq="M"),
  937. }
  938. }
  939. df = DataFrame(
  940. [
  941. "out_augmented_AUG_2012.json",
  942. "out_augmented_SEP_2013.json",
  943. "out_augmented_SUBSIDY_WEEK.json",
  944. "out_augmented_MAY_2012.json",
  945. "out_augmented_MAY_2011.json",
  946. "out_augmented_AUG_2011.json",
  947. "out_augmented_JAN_2011.json",
  948. ],
  949. columns=["fname"],
  950. )
  951. assert set(df.fname.values) == set(d["fname"].keys())
  952. expected = DataFrame({"fname": [d["fname"][k] for k in df.fname.values]})
  953. assert expected.dtypes[0] == "Period[M]"
  954. result = df.replace(d)
  955. tm.assert_frame_equal(result, expected)
  956. def test_replace_datetime(self):
  957. d = {
  958. "fname": {
  959. "out_augmented_AUG_2011.json": Timestamp("2011-08"),
  960. "out_augmented_JAN_2011.json": Timestamp("2011-01"),
  961. "out_augmented_MAY_2012.json": Timestamp("2012-05"),
  962. "out_augmented_SUBSIDY_WEEK.json": Timestamp("2011-04"),
  963. "out_augmented_AUG_2012.json": Timestamp("2012-08"),
  964. "out_augmented_MAY_2011.json": Timestamp("2011-05"),
  965. "out_augmented_SEP_2013.json": Timestamp("2013-09"),
  966. }
  967. }
  968. df = DataFrame(
  969. [
  970. "out_augmented_AUG_2012.json",
  971. "out_augmented_SEP_2013.json",
  972. "out_augmented_SUBSIDY_WEEK.json",
  973. "out_augmented_MAY_2012.json",
  974. "out_augmented_MAY_2011.json",
  975. "out_augmented_AUG_2011.json",
  976. "out_augmented_JAN_2011.json",
  977. ],
  978. columns=["fname"],
  979. )
  980. assert set(df.fname.values) == set(d["fname"].keys())
  981. expected = DataFrame({"fname": [d["fname"][k] for k in df.fname.values]})
  982. result = df.replace(d)
  983. tm.assert_frame_equal(result, expected)
  984. def test_replace_datetimetz(self):
  985. # GH 11326
  986. # behaving poorly when presented with a datetime64[ns, tz]
  987. df = DataFrame(
  988. {
  989. "A": date_range("20130101", periods=3, tz="US/Eastern"),
  990. "B": [0, np.nan, 2],
  991. }
  992. )
  993. result = df.replace(np.nan, 1)
  994. expected = DataFrame(
  995. {
  996. "A": date_range("20130101", periods=3, tz="US/Eastern"),
  997. "B": Series([0, 1, 2], dtype="float64"),
  998. }
  999. )
  1000. tm.assert_frame_equal(result, expected)
  1001. result = df.fillna(1)
  1002. tm.assert_frame_equal(result, expected)
  1003. result = df.replace(0, np.nan)
  1004. expected = DataFrame(
  1005. {
  1006. "A": date_range("20130101", periods=3, tz="US/Eastern"),
  1007. "B": [np.nan, np.nan, 2],
  1008. }
  1009. )
  1010. tm.assert_frame_equal(result, expected)
  1011. result = df.replace(
  1012. Timestamp("20130102", tz="US/Eastern"),
  1013. Timestamp("20130104", tz="US/Eastern"),
  1014. )
  1015. expected = DataFrame(
  1016. {
  1017. "A": [
  1018. Timestamp("20130101", tz="US/Eastern"),
  1019. Timestamp("20130104", tz="US/Eastern"),
  1020. Timestamp("20130103", tz="US/Eastern"),
  1021. ],
  1022. "B": [0, np.nan, 2],
  1023. }
  1024. )
  1025. tm.assert_frame_equal(result, expected)
  1026. result = df.copy()
  1027. result.iloc[1, 0] = np.nan
  1028. result = result.replace({"A": pd.NaT}, Timestamp("20130104", tz="US/Eastern"))
  1029. tm.assert_frame_equal(result, expected)
  1030. # pre-2.0 this would coerce to object with mismatched tzs
  1031. result = df.copy()
  1032. result.iloc[1, 0] = np.nan
  1033. result = result.replace({"A": pd.NaT}, Timestamp("20130104", tz="US/Pacific"))
  1034. expected = DataFrame(
  1035. {
  1036. "A": [
  1037. Timestamp("20130101", tz="US/Eastern"),
  1038. Timestamp("20130104", tz="US/Pacific").tz_convert("US/Eastern"),
  1039. Timestamp("20130103", tz="US/Eastern"),
  1040. ],
  1041. "B": [0, np.nan, 2],
  1042. }
  1043. )
  1044. tm.assert_frame_equal(result, expected)
  1045. result = df.copy()
  1046. result.iloc[1, 0] = np.nan
  1047. result = result.replace({"A": np.nan}, Timestamp("20130104"))
  1048. expected = DataFrame(
  1049. {
  1050. "A": [
  1051. Timestamp("20130101", tz="US/Eastern"),
  1052. Timestamp("20130104"),
  1053. Timestamp("20130103", tz="US/Eastern"),
  1054. ],
  1055. "B": [0, np.nan, 2],
  1056. }
  1057. )
  1058. tm.assert_frame_equal(result, expected)
  1059. def test_replace_with_empty_dictlike(self, mix_abc):
  1060. # GH 15289
  1061. df = DataFrame(mix_abc)
  1062. tm.assert_frame_equal(df, df.replace({}))
  1063. tm.assert_frame_equal(df, df.replace(Series([], dtype=object)))
  1064. tm.assert_frame_equal(df, df.replace({"b": {}}))
  1065. tm.assert_frame_equal(df, df.replace(Series({"b": {}})))
  @pytest.mark.parametrize(
      "to_replace, method, expected",
      [
          # scalar targets
          (0, "bfill", {"A": [1, 1, 2], "B": [5, np.nan, 7], "C": ["a", "b", "c"]}),
          (
              np.nan,
              "bfill",
              {"A": [0, 1, 2], "B": [5.0, 7.0, 7.0], "C": ["a", "b", "c"]},
          ),
          ("d", "ffill", {"A": [0, 1, 2], "B": [5, np.nan, 7], "C": ["a", "b", "c"]}),
          # list / tuple targets
          (
              [0, 2],
              "bfill",
              {"A": [1, 1, 2], "B": [5, np.nan, 7], "C": ["a", "b", "c"]},
          ),
          (
              [1, 2],
              "pad",
              {"A": [0, 0, 0], "B": [5, np.nan, 7], "C": ["a", "b", "c"]},
          ),
          (
              (1, 2),
              "bfill",
              {"A": [0, 2, 2], "B": [5, np.nan, 7], "C": ["a", "b", "c"]},
          ),
          (
              ["b", "c"],
              "ffill",
              {"A": [0, 1, 2], "B": [5, np.nan, 7], "C": ["a", "a", "a"]},
          ),
      ],
  )
  def test_replace_method(self, to_replace, method, expected):
      """``replace`` with ``value=None`` and a fill ``method`` gives ``expected``."""
      # GH 19632
      source = {"A": [0, 1, 2], "B": [5, np.nan, 7], "C": ["a", "b", "c"]}
      df = DataFrame(source)
      filled = df.replace(to_replace=to_replace, value=None, method=method)
      tm.assert_frame_equal(filled, DataFrame(expected))
  @pytest.mark.parametrize(
      "replace_dict, final_data",
      [({"a": 1, "b": 1}, [[3, 3], [2, 2]]), ({"a": 1, "b": 2}, [[3, 1], [2, 3]])],
  )
  def test_categorical_replace_with_dict(self, replace_dict, final_data):
      """Per-column dict replace on categorical columns, copy and inplace."""
      # GH 26988
      df = DataFrame([[1, 1], [2, 2]], columns=["a", "b"], dtype="category")

      values = np.array(final_data)
      if replace_dict["b"] == 1:
          b_categories = [3, 2]
      else:
          b_categories = [1, 3]
      expected = DataFrame(
          {
              "a": pd.Categorical(values[:, 0], categories=[3, 2]),
              "b": pd.Categorical(values[:, 1], categories=b_categories),
          }
      )

      result = df.replace(replace_dict, 3)
      tm.assert_frame_equal(result, expected)

      msg = (
          r"Attributes of DataFrame.iloc\[:, 0\] \(column name=\"a\"\) are "
          "different"
      )
      with pytest.raises(AssertionError, match=msg):
          # ensure non-inplace call does not affect original
          tm.assert_frame_equal(df, expected)

      outcome = df.replace(replace_dict, 3, inplace=True)
      assert outcome is None
      tm.assert_frame_equal(df, expected)
  @pytest.mark.parametrize(
      "df, to_replace, exp",
      [
          # integer column with chained keys
          (
              {"col1": [1, 2, 3], "col2": [4, 5, 6]},
              {4: 5, 5: 6, 6: 7},
              {"col1": [1, 2, 3], "col2": [5, 6, 7]},
          ),
          # string column with chained keys
          (
              {"col1": [1, 2, 3], "col2": ["4", "5", "6"]},
              {"4": "5", "5": "6", "6": "7"},
              {"col1": [1, 2, 3], "col2": ["5", "6", "7"]},
          ),
      ],
  )
  def test_replace_commutative(self, df, to_replace, exp):
      """Each value is replaced once even when keys and values overlap."""
      # GH 16051
      # DataFrame.replace() overwrites when values are non-numeric
      # also added to data frame whilst issue was for series
      frame = DataFrame(df)
      tm.assert_frame_equal(frame.replace(to_replace), DataFrame(exp))
  @pytest.mark.parametrize(
      "replacer",
      [
          Timestamp("20170827"),
          np.int8(1),
          np.int16(1),
          np.float32(1),
          np.float64(1),
      ],
  )
  def test_replace_replacer_dtype(self, request, replacer):
      """Replacing the single string cell gives the frame built from ``replacer``."""
      # GH26632
      df = DataFrame(["a"])
      mapping = {"a": replacer, "b": replacer}
      tm.assert_frame_equal(df.replace(mapping), DataFrame([replacer]))
  1167. def test_replace_after_convert_dtypes(self):
  1168. # GH31517
  1169. df = DataFrame({"grp": [1, 2, 3, 4, 5]}, dtype="Int64")
  1170. result = df.replace(1, 10)
  1171. expected = DataFrame({"grp": [10, 2, 3, 4, 5]}, dtype="Int64")
  1172. tm.assert_frame_equal(result, expected)
  def test_replace_invalid_to_replace(self):
      """Passing a callable as ``to_replace`` raises ``TypeError``."""
      # GH 18634
      # API: replace() should raise an exception if invalid argument is given
      df = DataFrame({"one": ["a", "b ", "c"], "two": ["d ", "e ", "f "]})

      def strip(x):
          return x.strip()

      msg = (
          r"Expecting 'to_replace' to be either a scalar, array-like, "
          r"dict or None, got invalid type.*"
      )
      with pytest.raises(TypeError, match=msg):
          df.replace(strip)
  @pytest.mark.parametrize("dtype", ["float", "float64", "int64", "Int64", "boolean"])
  @pytest.mark.parametrize("value", [np.nan, pd.NA])
  def test_replace_no_replacement_dtypes(self, dtype, value):
      """When no target is present, the frame comes back unchanged."""
      # https://github.com/pandas-dev/pandas/issues/32988
      df = DataFrame(np.eye(2), dtype=dtype)
      absent_targets = [None, -np.inf, np.inf]
      unchanged = df.replace(to_replace=absent_targets, value=value)
      tm.assert_frame_equal(unchanged, df)
  @pytest.mark.parametrize("replacement", [np.nan, 5])
  def test_replace_with_duplicate_columns(self, replacement):
      """Replace in column "B" of a frame with duplicate "A" labels."""
      # GH 24798
      duplicated_labels = ["A", "A", "B"]

      result = DataFrame({"A": [1, 2, 3], "A1": [4, 5, 6], "B": [7, 8, 9]})
      result.columns = duplicated_labels

      expected = DataFrame(
          {"A": [1, 2, 3], "A1": [4, 5, 6], "B": [replacement, 8, 9]}
      )
      expected.columns = duplicated_labels

      result["B"] = result["B"].replace(7, replacement)
      tm.assert_frame_equal(result, expected)
  @pytest.mark.parametrize("value", [pd.Period("2020-01"), pd.Interval(0, 5)])
  def test_replace_ea_ignore_float(self, frame_or_series, value):
      """Replacing float 1.0 leaves Period / Interval data unchanged."""
      # GH#34871
      frame = DataFrame({"Per": [value] * 3})
      obj = tm.get_obj(frame, frame_or_series)
      expected = obj.copy()
      tm.assert_equal(expected, obj.replace(1.0, 0.0))
  1209. def test_replace_value_category_type(self):
  1210. """
  1211. Test for #23305: to ensure category dtypes are maintained
  1212. after replace with direct values
  1213. """
  1214. # create input data
  1215. input_dict = {
  1216. "col1": [1, 2, 3, 4],
  1217. "col2": ["a", "b", "c", "d"],
  1218. "col3": [1.5, 2.5, 3.5, 4.5],
  1219. "col4": ["cat1", "cat2", "cat3", "cat4"],
  1220. "col5": ["obj1", "obj2", "obj3", "obj4"],
  1221. }
  1222. # explicitly cast columns as category and order them
  1223. input_df = DataFrame(data=input_dict).astype(
  1224. {"col2": "category", "col4": "category"}
  1225. )
  1226. input_df["col2"] = input_df["col2"].cat.reorder_categories(
  1227. ["a", "b", "c", "d"], ordered=True
  1228. )
  1229. input_df["col4"] = input_df["col4"].cat.reorder_categories(
  1230. ["cat1", "cat2", "cat3", "cat4"], ordered=True
  1231. )
  1232. # create expected dataframe
  1233. expected_dict = {
  1234. "col1": [1, 2, 3, 4],
  1235. "col2": ["a", "b", "c", "z"],
  1236. "col3": [1.5, 2.5, 3.5, 4.5],
  1237. "col4": ["cat1", "catX", "cat3", "cat4"],
  1238. "col5": ["obj9", "obj2", "obj3", "obj4"],
  1239. }
  1240. # explicitly cast columns as category and order them
  1241. expected = DataFrame(data=expected_dict).astype(
  1242. {"col2": "category", "col4": "category"}
  1243. )
  1244. expected["col2"] = expected["col2"].cat.reorder_categories(
  1245. ["a", "b", "c", "z"], ordered=True
  1246. )
  1247. expected["col4"] = expected["col4"].cat.reorder_categories(
  1248. ["cat1", "catX", "cat3", "cat4"], ordered=True
  1249. )
  1250. # replace values in input dataframe
  1251. input_df = input_df.replace("d", "z")
  1252. input_df = input_df.replace("obj1", "obj9")
  1253. result = input_df.replace("cat2", "catX")
  1254. tm.assert_frame_equal(result, expected)
  1255. def test_replace_dict_category_type(self):
  1256. """
  1257. Test to ensure category dtypes are maintained
  1258. after replace with dict values
  1259. """
  1260. # GH#35268, GH#44940
  1261. # create input dataframe
  1262. input_dict = {"col1": ["a"], "col2": ["obj1"], "col3": ["cat1"]}
  1263. # explicitly cast columns as category
  1264. input_df = DataFrame(data=input_dict).astype(
  1265. {"col1": "category", "col2": "category", "col3": "category"}
  1266. )
  1267. # create expected dataframe
  1268. expected_dict = {"col1": ["z"], "col2": ["obj9"], "col3": ["catX"]}
  1269. # explicitly cast columns as category
  1270. expected = DataFrame(data=expected_dict).astype(
  1271. {"col1": "category", "col2": "category", "col3": "category"}
  1272. )
  1273. # replace values in input dataframe using a dict
  1274. result = input_df.replace({"a": "z", "obj1": "obj9", "cat1": "catX"})
  1275. tm.assert_frame_equal(result, expected)
  1276. def test_replace_with_compiled_regex(self):
  1277. # https://github.com/pandas-dev/pandas/issues/35680
  1278. df = DataFrame(["a", "b", "c"])
  1279. regex = re.compile("^a$")
  1280. result = df.replace({regex: "z"}, regex=True)
  1281. expected = DataFrame(["z", "b", "c"])
  1282. tm.assert_frame_equal(result, expected)
  1283. def test_replace_intervals(self):
  1284. # https://github.com/pandas-dev/pandas/issues/35931
  1285. df = DataFrame({"a": [pd.Interval(0, 1), pd.Interval(0, 1)]})
  1286. result = df.replace({"a": {pd.Interval(0, 1): "x"}})
  1287. expected = DataFrame({"a": ["x", "x"]})
  1288. tm.assert_frame_equal(result, expected)
  1289. def test_replace_unicode(self):
  1290. # GH: 16784
  1291. columns_values_map = {"positive": {"正面": 1, "中立": 1, "负面": 0}}
  1292. df1 = DataFrame({"positive": np.ones(3)})
  1293. result = df1.replace(columns_values_map)
  1294. expected = DataFrame({"positive": np.ones(3)})
  1295. tm.assert_frame_equal(result, expected)
  1296. def test_replace_bytes(self, frame_or_series):
  1297. # GH#38900
  1298. obj = frame_or_series(["o"]).astype("|S")
  1299. expected = obj.copy()
  1300. obj = obj.replace({None: np.nan})
  1301. tm.assert_equal(obj, expected)
  @pytest.mark.parametrize(
      "data, to_replace, value, expected",
      [
          ([1], [1.0], [0], [0]),
          ([1], [1], [0], [0]),
          ([1.0], [1.0], [0], [0.0]),
          ([1.0], [1], [0], [0.0]),
      ],
  )
  @pytest.mark.parametrize("box", [list, tuple, np.array])
  def test_replace_list_with_mixed_type(
      self, data, to_replace, value, expected, box, frame_or_series
  ):
      """List-like targets with mixed int/float types, boxed three ways."""
      # GH#40371
      obj = frame_or_series(data)
      wanted = frame_or_series(expected)
      boxed_targets = box(to_replace)
      tm.assert_equal(obj.replace(boxed_targets, value), wanted)
  @pytest.mark.parametrize("val", [2, np.nan, 2.0])
  def test_replace_value_none_dtype_numeric(self, val):
      """Replacing a numeric value with ``None`` gives an object column."""
      # GH#48231
      expected = DataFrame({"a": [1, None]}, dtype=object)

      # positional (to_replace, value) form
      positional = DataFrame({"a": [1, val]}).replace(val, None)
      tm.assert_frame_equal(positional, expected)

      # dict form
      via_dict = DataFrame({"a": [1, val]}).replace({val: None})
      tm.assert_frame_equal(via_dict, expected)
  1330. class TestDataFrameReplaceRegex:
  @pytest.mark.parametrize(
      "data",
      [
          {"a": list("ab.."), "b": list("efgh")},
          {"a": list("ab.."), "b": list(range(4))},
      ],
  )
  @pytest.mark.parametrize(
      "to_replace,value", [(r"\s*\.\s*", np.nan), (r"\s*(\.)\s*", r"\1\1\1")]
  )
  @pytest.mark.parametrize("compile_regex", [True, False])
  @pytest.mark.parametrize("regex_kwarg", [True, False])
  @pytest.mark.parametrize("inplace", [True, False])
  def test_regex_replace_scalar(
      self, data, to_replace, value, compile_regex, regex_kwarg, inplace
  ):
      """Scalar regex replace, across every way of passing the pattern.

      The pattern may be a string or compiled, may be passed as
      ``to_replace`` (with ``regex=True``) or as ``regex`` itself, and the
      call may be in place or not.
      """
      df = DataFrame(data)
      expected = df.copy()

      pattern = re.compile(to_replace) if compile_regex else to_replace
      if regex_kwarg:
          # the pattern travels through the ``regex`` keyword
          to_replace, regex = None, pattern
      else:
          to_replace, regex = pattern, True

      result = df.replace(to_replace, value, inplace=inplace, regex=regex)
      if inplace:
          assert result is None
          result = df

      # Cells equal to "." in column "a" become NaN or "...".
      expected_replace_val = np.nan if value is np.nan else "..."
      dot_rows = expected["a"] == "."
      expected.loc[dot_rows, "a"] = expected_replace_val
      tm.assert_frame_equal(result, expected)
  @pytest.mark.parametrize("regex", [False, True])
  def test_replace_regex_dtype_frame(self, regex):
      """Replace the string "0" with int 1, with and without ``regex``."""
      # GH-48644
      # every cell matches
      all_match = DataFrame({"A": ["0"], "B": ["0"]})
      result_all = all_match.replace(to_replace="0", value=1, regex=regex)
      tm.assert_frame_equal(result_all, DataFrame({"A": [1], "B": [1]}))

      # only column "A" matches
      partial_match = DataFrame({"A": ["0"], "B": ["1"]})
      result_partial = partial_match.replace(to_replace="0", value=1, regex=regex)
      tm.assert_frame_equal(result_partial, DataFrame({"A": [1], "B": ["1"]}))
  1377. def test_replace_with_value_also_being_replaced(self):
  1378. # GH46306
  1379. df = DataFrame({"A": [0, 1, 2], "B": [1, 0, 2]})
  1380. result = df.replace({0: 1, 1: np.nan})
  1381. expected = DataFrame({"A": [1, np.nan, 2], "B": [np.nan, 1, 2]})
  1382. tm.assert_frame_equal(result, expected)
  1383. def test_replace_categorical_no_replacement(self):
  1384. # GH#46672
  1385. df = DataFrame(
  1386. {
  1387. "a": ["one", "two", None, "three"],
  1388. "b": ["one", None, "two", "three"],
  1389. },
  1390. dtype="category",
  1391. )
  1392. expected = df.copy()
  1393. result = df.replace(to_replace=[".", "def"], value=["_", None])
  1394. tm.assert_frame_equal(result, expected)