test_strings.py 24 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701
  1. from datetime import (
  2. datetime,
  3. timedelta,
  4. )
  5. import numpy as np
  6. import pytest
  7. from pandas import (
  8. DataFrame,
  9. Index,
  10. MultiIndex,
  11. Series,
  12. )
  13. import pandas._testing as tm
  14. from pandas.core.strings.accessor import StringMethods
  15. @pytest.mark.parametrize("pattern", [0, True, Series(["foo", "bar"])])
  16. def test_startswith_endswith_non_str_patterns(pattern):
  17. # GH3485
  18. ser = Series(["foo", "bar"])
  19. msg = f"expected a string or tuple, not {type(pattern).__name__}"
  20. with pytest.raises(TypeError, match=msg):
  21. ser.str.startswith(pattern)
  22. with pytest.raises(TypeError, match=msg):
  23. ser.str.endswith(pattern)
  24. # test integer/float dtypes (inferred by constructor) and mixed
  25. def test_count(any_string_dtype):
  26. ser = Series(["foo", "foofoo", np.nan, "foooofooofommmfoo"], dtype=any_string_dtype)
  27. result = ser.str.count("f[o]+")
  28. expected_dtype = np.float64 if any_string_dtype == "object" else "Int64"
  29. expected = Series([1, 2, np.nan, 4], dtype=expected_dtype)
  30. tm.assert_series_equal(result, expected)
  31. def test_count_mixed_object():
  32. ser = Series(
  33. ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0],
  34. dtype=object,
  35. )
  36. result = ser.str.count("a")
  37. expected = Series([1, np.nan, 0, np.nan, np.nan, 0, np.nan, np.nan, np.nan])
  38. tm.assert_series_equal(result, expected)
  39. def test_repeat(any_string_dtype):
  40. ser = Series(["a", "b", np.nan, "c", np.nan, "d"], dtype=any_string_dtype)
  41. result = ser.str.repeat(3)
  42. expected = Series(
  43. ["aaa", "bbb", np.nan, "ccc", np.nan, "ddd"], dtype=any_string_dtype
  44. )
  45. tm.assert_series_equal(result, expected)
  46. result = ser.str.repeat([1, 2, 3, 4, 5, 6])
  47. expected = Series(
  48. ["a", "bb", np.nan, "cccc", np.nan, "dddddd"], dtype=any_string_dtype
  49. )
  50. tm.assert_series_equal(result, expected)
  51. def test_repeat_mixed_object():
  52. ser = Series(["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0])
  53. result = ser.str.repeat(3)
  54. expected = Series(
  55. ["aaa", np.nan, "bbb", np.nan, np.nan, "foofoofoo", np.nan, np.nan, np.nan]
  56. )
  57. tm.assert_series_equal(result, expected)
  58. @pytest.mark.parametrize("arg, repeat", [[None, 4], ["b", None]])
  59. def test_repeat_with_null(any_string_dtype, arg, repeat):
  60. # GH: 31632
  61. ser = Series(["a", arg], dtype=any_string_dtype)
  62. result = ser.str.repeat([3, repeat])
  63. expected = Series(["aaa", np.nan], dtype=any_string_dtype)
  64. tm.assert_series_equal(result, expected)
  65. def test_empty_str_methods(any_string_dtype):
  66. empty_str = empty = Series(dtype=any_string_dtype)
  67. if any_string_dtype == "object":
  68. empty_int = Series(dtype="int64")
  69. empty_bool = Series(dtype=bool)
  70. else:
  71. empty_int = Series(dtype="Int64")
  72. empty_bool = Series(dtype="boolean")
  73. empty_object = Series(dtype=object)
  74. empty_bytes = Series(dtype=object)
  75. empty_df = DataFrame()
  76. # GH7241
  77. # (extract) on empty series
  78. tm.assert_series_equal(empty_str, empty.str.cat(empty))
  79. assert "" == empty.str.cat()
  80. tm.assert_series_equal(empty_str, empty.str.title())
  81. tm.assert_series_equal(empty_int, empty.str.count("a"))
  82. tm.assert_series_equal(empty_bool, empty.str.contains("a"))
  83. tm.assert_series_equal(empty_bool, empty.str.startswith("a"))
  84. tm.assert_series_equal(empty_bool, empty.str.endswith("a"))
  85. tm.assert_series_equal(empty_str, empty.str.lower())
  86. tm.assert_series_equal(empty_str, empty.str.upper())
  87. tm.assert_series_equal(empty_str, empty.str.replace("a", "b"))
  88. tm.assert_series_equal(empty_str, empty.str.repeat(3))
  89. tm.assert_series_equal(empty_bool, empty.str.match("^a"))
  90. tm.assert_frame_equal(
  91. DataFrame(columns=[0], dtype=any_string_dtype),
  92. empty.str.extract("()", expand=True),
  93. )
  94. tm.assert_frame_equal(
  95. DataFrame(columns=[0, 1], dtype=any_string_dtype),
  96. empty.str.extract("()()", expand=True),
  97. )
  98. tm.assert_series_equal(empty_str, empty.str.extract("()", expand=False))
  99. tm.assert_frame_equal(
  100. DataFrame(columns=[0, 1], dtype=any_string_dtype),
  101. empty.str.extract("()()", expand=False),
  102. )
  103. tm.assert_frame_equal(empty_df.set_axis([], axis=1), empty.str.get_dummies())
  104. tm.assert_series_equal(empty_str, empty_str.str.join(""))
  105. tm.assert_series_equal(empty_int, empty.str.len())
  106. tm.assert_series_equal(empty_object, empty_str.str.findall("a"))
  107. tm.assert_series_equal(empty_int, empty.str.find("a"))
  108. tm.assert_series_equal(empty_int, empty.str.rfind("a"))
  109. tm.assert_series_equal(empty_str, empty.str.pad(42))
  110. tm.assert_series_equal(empty_str, empty.str.center(42))
  111. tm.assert_series_equal(empty_object, empty.str.split("a"))
  112. tm.assert_series_equal(empty_object, empty.str.rsplit("a"))
  113. tm.assert_series_equal(empty_object, empty.str.partition("a", expand=False))
  114. tm.assert_frame_equal(empty_df, empty.str.partition("a"))
  115. tm.assert_series_equal(empty_object, empty.str.rpartition("a", expand=False))
  116. tm.assert_frame_equal(empty_df, empty.str.rpartition("a"))
  117. tm.assert_series_equal(empty_str, empty.str.slice(stop=1))
  118. tm.assert_series_equal(empty_str, empty.str.slice(step=1))
  119. tm.assert_series_equal(empty_str, empty.str.strip())
  120. tm.assert_series_equal(empty_str, empty.str.lstrip())
  121. tm.assert_series_equal(empty_str, empty.str.rstrip())
  122. tm.assert_series_equal(empty_str, empty.str.wrap(42))
  123. tm.assert_series_equal(empty_str, empty.str.get(0))
  124. tm.assert_series_equal(empty_object, empty_bytes.str.decode("ascii"))
  125. tm.assert_series_equal(empty_bytes, empty.str.encode("ascii"))
  126. # ismethods should always return boolean (GH 29624)
  127. tm.assert_series_equal(empty_bool, empty.str.isalnum())
  128. tm.assert_series_equal(empty_bool, empty.str.isalpha())
  129. tm.assert_series_equal(empty_bool, empty.str.isdigit())
  130. tm.assert_series_equal(empty_bool, empty.str.isspace())
  131. tm.assert_series_equal(empty_bool, empty.str.islower())
  132. tm.assert_series_equal(empty_bool, empty.str.isupper())
  133. tm.assert_series_equal(empty_bool, empty.str.istitle())
  134. tm.assert_series_equal(empty_bool, empty.str.isnumeric())
  135. tm.assert_series_equal(empty_bool, empty.str.isdecimal())
  136. tm.assert_series_equal(empty_str, empty.str.capitalize())
  137. tm.assert_series_equal(empty_str, empty.str.swapcase())
  138. tm.assert_series_equal(empty_str, empty.str.normalize("NFC"))
  139. table = str.maketrans("a", "b")
  140. tm.assert_series_equal(empty_str, empty.str.translate(table))
  141. @pytest.mark.parametrize(
  142. "method, expected",
  143. [
  144. ("isalnum", [True, True, True, True, True, False, True, True, False, False]),
  145. ("isalpha", [True, True, True, False, False, False, True, False, False, False]),
  146. (
  147. "isdigit",
  148. [False, False, False, True, False, False, False, True, False, False],
  149. ),
  150. (
  151. "isnumeric",
  152. [False, False, False, True, False, False, False, True, False, False],
  153. ),
  154. (
  155. "isspace",
  156. [False, False, False, False, False, False, False, False, False, True],
  157. ),
  158. (
  159. "islower",
  160. [False, True, False, False, False, False, False, False, False, False],
  161. ),
  162. (
  163. "isupper",
  164. [True, False, False, False, True, False, True, False, False, False],
  165. ),
  166. (
  167. "istitle",
  168. [True, False, True, False, True, False, False, False, False, False],
  169. ),
  170. ],
  171. )
  172. def test_ismethods(method, expected, any_string_dtype):
  173. ser = Series(
  174. ["A", "b", "Xy", "4", "3A", "", "TT", "55", "-", " "], dtype=any_string_dtype
  175. )
  176. expected_dtype = "bool" if any_string_dtype == "object" else "boolean"
  177. expected = Series(expected, dtype=expected_dtype)
  178. result = getattr(ser.str, method)()
  179. tm.assert_series_equal(result, expected)
  180. # compare with standard library
  181. expected = [getattr(item, method)() for item in ser]
  182. assert list(result) == expected
  183. @pytest.mark.parametrize(
  184. "method, expected",
  185. [
  186. ("isnumeric", [False, True, True, False, True, True, False]),
  187. ("isdecimal", [False, True, False, False, False, True, False]),
  188. ],
  189. )
  190. def test_isnumeric_unicode(method, expected, any_string_dtype):
  191. # 0x00bc: ¼ VULGAR FRACTION ONE QUARTER
  192. # 0x2605: ★ not number
  193. # 0x1378: ፸ ETHIOPIC NUMBER SEVENTY
  194. # 0xFF13: 3 Em 3
  195. ser = Series(["A", "3", "¼", "★", "፸", "3", "four"], dtype=any_string_dtype)
  196. expected_dtype = "bool" if any_string_dtype == "object" else "boolean"
  197. expected = Series(expected, dtype=expected_dtype)
  198. result = getattr(ser.str, method)()
  199. tm.assert_series_equal(result, expected)
  200. # compare with standard library
  201. expected = [getattr(item, method)() for item in ser]
  202. assert list(result) == expected
  203. @pytest.mark.parametrize(
  204. "method, expected",
  205. [
  206. ("isnumeric", [False, np.nan, True, False, np.nan, True, False]),
  207. ("isdecimal", [False, np.nan, False, False, np.nan, True, False]),
  208. ],
  209. )
  210. def test_isnumeric_unicode_missing(method, expected, any_string_dtype):
  211. values = ["A", np.nan, "¼", "★", np.nan, "3", "four"]
  212. ser = Series(values, dtype=any_string_dtype)
  213. expected_dtype = "object" if any_string_dtype == "object" else "boolean"
  214. expected = Series(expected, dtype=expected_dtype)
  215. result = getattr(ser.str, method)()
  216. tm.assert_series_equal(result, expected)
  217. def test_spilt_join_roundtrip(any_string_dtype):
  218. ser = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"], dtype=any_string_dtype)
  219. result = ser.str.split("_").str.join("_")
  220. expected = ser.astype(object)
  221. tm.assert_series_equal(result, expected)
  222. def test_spilt_join_roundtrip_mixed_object():
  223. ser = Series(
  224. ["a_b", np.nan, "asdf_cas_asdf", True, datetime.today(), "foo", None, 1, 2.0]
  225. )
  226. result = ser.str.split("_").str.join("_")
  227. expected = Series(
  228. ["a_b", np.nan, "asdf_cas_asdf", np.nan, np.nan, "foo", np.nan, np.nan, np.nan]
  229. )
  230. tm.assert_series_equal(result, expected)
  231. def test_len(any_string_dtype):
  232. ser = Series(
  233. ["foo", "fooo", "fooooo", np.nan, "fooooooo", "foo\n", "あ"],
  234. dtype=any_string_dtype,
  235. )
  236. result = ser.str.len()
  237. expected_dtype = "float64" if any_string_dtype == "object" else "Int64"
  238. expected = Series([3, 4, 6, np.nan, 8, 4, 1], dtype=expected_dtype)
  239. tm.assert_series_equal(result, expected)
  240. def test_len_mixed():
  241. ser = Series(
  242. ["a_b", np.nan, "asdf_cas_asdf", True, datetime.today(), "foo", None, 1, 2.0]
  243. )
  244. result = ser.str.len()
  245. expected = Series([3, np.nan, 13, np.nan, np.nan, 3, np.nan, np.nan, np.nan])
  246. tm.assert_series_equal(result, expected)
  247. @pytest.mark.parametrize(
  248. "method,sub,start,end,expected",
  249. [
  250. ("index", "EF", None, None, [4, 3, 1, 0]),
  251. ("rindex", "EF", None, None, [4, 5, 7, 4]),
  252. ("index", "EF", 3, None, [4, 3, 7, 4]),
  253. ("rindex", "EF", 3, None, [4, 5, 7, 4]),
  254. ("index", "E", 4, 8, [4, 5, 7, 4]),
  255. ("rindex", "E", 0, 5, [4, 3, 1, 4]),
  256. ],
  257. )
  258. def test_index(method, sub, start, end, index_or_series, any_string_dtype, expected):
  259. obj = index_or_series(
  260. ["ABCDEFG", "BCDEFEF", "DEFGHIJEF", "EFGHEF"], dtype=any_string_dtype
  261. )
  262. expected_dtype = np.int64 if any_string_dtype == "object" else "Int64"
  263. expected = index_or_series(expected, dtype=expected_dtype)
  264. result = getattr(obj.str, method)(sub, start, end)
  265. if index_or_series is Series:
  266. tm.assert_series_equal(result, expected)
  267. else:
  268. tm.assert_index_equal(result, expected)
  269. # compare with standard library
  270. expected = [getattr(item, method)(sub, start, end) for item in obj]
  271. assert list(result) == expected
  272. def test_index_not_found_raises(index_or_series, any_string_dtype):
  273. obj = index_or_series(
  274. ["ABCDEFG", "BCDEFEF", "DEFGHIJEF", "EFGHEF"], dtype=any_string_dtype
  275. )
  276. with pytest.raises(ValueError, match="substring not found"):
  277. obj.str.index("DE")
  278. @pytest.mark.parametrize("method", ["index", "rindex"])
  279. def test_index_wrong_type_raises(index_or_series, any_string_dtype, method):
  280. obj = index_or_series([], dtype=any_string_dtype)
  281. msg = "expected a string object, not int"
  282. with pytest.raises(TypeError, match=msg):
  283. getattr(obj.str, method)(0)
  284. @pytest.mark.parametrize(
  285. "method, exp",
  286. [
  287. ["index", [1, 1, 0]],
  288. ["rindex", [3, 1, 2]],
  289. ],
  290. )
  291. def test_index_missing(any_string_dtype, method, exp):
  292. ser = Series(["abcb", "ab", "bcbe", np.nan], dtype=any_string_dtype)
  293. expected_dtype = np.float64 if any_string_dtype == "object" else "Int64"
  294. result = getattr(ser.str, method)("b")
  295. expected = Series(exp + [np.nan], dtype=expected_dtype)
  296. tm.assert_series_equal(result, expected)
  297. def test_pipe_failures(any_string_dtype):
  298. # #2119
  299. ser = Series(["A|B|C"], dtype=any_string_dtype)
  300. result = ser.str.split("|")
  301. expected = Series([["A", "B", "C"]], dtype=object)
  302. tm.assert_series_equal(result, expected)
  303. result = ser.str.replace("|", " ", regex=False)
  304. expected = Series(["A B C"], dtype=any_string_dtype)
  305. tm.assert_series_equal(result, expected)
  306. @pytest.mark.parametrize(
  307. "start, stop, step, expected",
  308. [
  309. (2, 5, None, ["foo", "bar", np.nan, "baz"]),
  310. (0, 3, -1, ["", "", np.nan, ""]),
  311. (None, None, -1, ["owtoofaa", "owtrabaa", np.nan, "xuqzabaa"]),
  312. (3, 10, 2, ["oto", "ato", np.nan, "aqx"]),
  313. (3, 0, -1, ["ofa", "aba", np.nan, "aba"]),
  314. ],
  315. )
  316. def test_slice(start, stop, step, expected, any_string_dtype):
  317. ser = Series(["aafootwo", "aabartwo", np.nan, "aabazqux"], dtype=any_string_dtype)
  318. result = ser.str.slice(start, stop, step)
  319. expected = Series(expected, dtype=any_string_dtype)
  320. tm.assert_series_equal(result, expected)
  321. @pytest.mark.parametrize(
  322. "start, stop, step, expected",
  323. [
  324. (2, 5, None, ["foo", np.nan, "bar", np.nan, np.nan, np.nan, np.nan, np.nan]),
  325. (4, 1, -1, ["oof", np.nan, "rab", np.nan, np.nan, np.nan, np.nan, np.nan]),
  326. ],
  327. )
  328. def test_slice_mixed_object(start, stop, step, expected):
  329. ser = Series(["aafootwo", np.nan, "aabartwo", True, datetime.today(), None, 1, 2.0])
  330. result = ser.str.slice(start, stop, step)
  331. expected = Series(expected)
  332. tm.assert_series_equal(result, expected)
  333. @pytest.mark.parametrize(
  334. "start,stop,repl,expected",
  335. [
  336. (2, 3, None, ["shrt", "a it longer", "evnlongerthanthat", "", np.nan]),
  337. (2, 3, "z", ["shzrt", "a zit longer", "evznlongerthanthat", "z", np.nan]),
  338. (2, 2, "z", ["shzort", "a zbit longer", "evzenlongerthanthat", "z", np.nan]),
  339. (2, 1, "z", ["shzort", "a zbit longer", "evzenlongerthanthat", "z", np.nan]),
  340. (-1, None, "z", ["shorz", "a bit longez", "evenlongerthanthaz", "z", np.nan]),
  341. (None, -2, "z", ["zrt", "zer", "zat", "z", np.nan]),
  342. (6, 8, "z", ["shortz", "a bit znger", "evenlozerthanthat", "z", np.nan]),
  343. (-10, 3, "z", ["zrt", "a zit longer", "evenlongzerthanthat", "z", np.nan]),
  344. ],
  345. )
  346. def test_slice_replace(start, stop, repl, expected, any_string_dtype):
  347. ser = Series(
  348. ["short", "a bit longer", "evenlongerthanthat", "", np.nan],
  349. dtype=any_string_dtype,
  350. )
  351. expected = Series(expected, dtype=any_string_dtype)
  352. result = ser.str.slice_replace(start, stop, repl)
  353. tm.assert_series_equal(result, expected)
  354. @pytest.mark.parametrize(
  355. "method, exp",
  356. [
  357. ["strip", ["aa", "bb", np.nan, "cc"]],
  358. ["lstrip", ["aa ", "bb \n", np.nan, "cc "]],
  359. ["rstrip", [" aa", " bb", np.nan, "cc"]],
  360. ],
  361. )
  362. def test_strip_lstrip_rstrip(any_string_dtype, method, exp):
  363. ser = Series([" aa ", " bb \n", np.nan, "cc "], dtype=any_string_dtype)
  364. result = getattr(ser.str, method)()
  365. expected = Series(exp, dtype=any_string_dtype)
  366. tm.assert_series_equal(result, expected)
  367. @pytest.mark.parametrize(
  368. "method, exp",
  369. [
  370. ["strip", ["aa", np.nan, "bb"]],
  371. ["lstrip", ["aa ", np.nan, "bb \t\n"]],
  372. ["rstrip", [" aa", np.nan, " bb"]],
  373. ],
  374. )
  375. def test_strip_lstrip_rstrip_mixed_object(method, exp):
  376. ser = Series([" aa ", np.nan, " bb \t\n", True, datetime.today(), None, 1, 2.0])
  377. result = getattr(ser.str, method)()
  378. expected = Series(exp + [np.nan, np.nan, np.nan, np.nan, np.nan])
  379. tm.assert_series_equal(result, expected)
  380. @pytest.mark.parametrize(
  381. "method, exp",
  382. [
  383. ["strip", ["ABC", " BNSD", "LDFJH "]],
  384. ["lstrip", ["ABCxx", " BNSD", "LDFJH xx"]],
  385. ["rstrip", ["xxABC", "xx BNSD", "LDFJH "]],
  386. ],
  387. )
  388. def test_strip_lstrip_rstrip_args(any_string_dtype, method, exp):
  389. ser = Series(["xxABCxx", "xx BNSD", "LDFJH xx"], dtype=any_string_dtype)
  390. result = getattr(ser.str, method)("x")
  391. expected = Series(exp, dtype=any_string_dtype)
  392. tm.assert_series_equal(result, expected)
  393. @pytest.mark.parametrize(
  394. "prefix, expected", [("a", ["b", " b c", "bc"]), ("ab", ["", "a b c", "bc"])]
  395. )
  396. def test_removeprefix(any_string_dtype, prefix, expected):
  397. ser = Series(["ab", "a b c", "bc"], dtype=any_string_dtype)
  398. result = ser.str.removeprefix(prefix)
  399. ser_expected = Series(expected, dtype=any_string_dtype)
  400. tm.assert_series_equal(result, ser_expected)
  401. @pytest.mark.parametrize(
  402. "suffix, expected", [("c", ["ab", "a b ", "b"]), ("bc", ["ab", "a b c", ""])]
  403. )
  404. def test_removesuffix(any_string_dtype, suffix, expected):
  405. ser = Series(["ab", "a b c", "bc"], dtype=any_string_dtype)
  406. result = ser.str.removesuffix(suffix)
  407. ser_expected = Series(expected, dtype=any_string_dtype)
  408. tm.assert_series_equal(result, ser_expected)
  409. def test_string_slice_get_syntax(any_string_dtype):
  410. ser = Series(
  411. ["YYY", "B", "C", "YYYYYYbYYY", "BYYYcYYY", np.nan, "CYYYBYYY", "dog", "cYYYt"],
  412. dtype=any_string_dtype,
  413. )
  414. result = ser.str[0]
  415. expected = ser.str.get(0)
  416. tm.assert_series_equal(result, expected)
  417. result = ser.str[:3]
  418. expected = ser.str.slice(stop=3)
  419. tm.assert_series_equal(result, expected)
  420. result = ser.str[2::-1]
  421. expected = ser.str.slice(start=2, step=-1)
  422. tm.assert_series_equal(result, expected)
  423. def test_string_slice_out_of_bounds_nested():
  424. ser = Series([(1, 2), (1,), (3, 4, 5)])
  425. result = ser.str[1]
  426. expected = Series([2, np.nan, 4])
  427. tm.assert_series_equal(result, expected)
  428. def test_string_slice_out_of_bounds(any_string_dtype):
  429. ser = Series(["foo", "b", "ba"], dtype=any_string_dtype)
  430. result = ser.str[1]
  431. expected = Series(["o", np.nan, "a"], dtype=any_string_dtype)
  432. tm.assert_series_equal(result, expected)
  433. def test_encode_decode(any_string_dtype):
  434. ser = Series(["a", "b", "a\xe4"], dtype=any_string_dtype).str.encode("utf-8")
  435. result = ser.str.decode("utf-8")
  436. expected = ser.map(lambda x: x.decode("utf-8"))
  437. tm.assert_series_equal(result, expected)
  438. def test_encode_errors_kwarg(any_string_dtype):
  439. ser = Series(["a", "b", "a\x9d"], dtype=any_string_dtype)
  440. msg = (
  441. r"'charmap' codec can't encode character '\\x9d' in position 1: "
  442. "character maps to <undefined>"
  443. )
  444. with pytest.raises(UnicodeEncodeError, match=msg):
  445. ser.str.encode("cp1252")
  446. result = ser.str.encode("cp1252", "ignore")
  447. expected = ser.map(lambda x: x.encode("cp1252", "ignore"))
  448. tm.assert_series_equal(result, expected)
  449. def test_decode_errors_kwarg():
  450. ser = Series([b"a", b"b", b"a\x9d"])
  451. msg = (
  452. "'charmap' codec can't decode byte 0x9d in position 1: "
  453. "character maps to <undefined>"
  454. )
  455. with pytest.raises(UnicodeDecodeError, match=msg):
  456. ser.str.decode("cp1252")
  457. result = ser.str.decode("cp1252", "ignore")
  458. expected = ser.map(lambda x: x.decode("cp1252", "ignore"))
  459. tm.assert_series_equal(result, expected)
  460. @pytest.mark.parametrize(
  461. "form, expected",
  462. [
  463. ("NFKC", ["ABC", "ABC", "123", np.nan, "アイエ"]),
  464. ("NFC", ["ABC", "ABC", "123", np.nan, "アイエ"]),
  465. ],
  466. )
  467. def test_normalize(form, expected, any_string_dtype):
  468. ser = Series(
  469. ["ABC", "ABC", "123", np.nan, "アイエ"],
  470. index=["a", "b", "c", "d", "e"],
  471. dtype=any_string_dtype,
  472. )
  473. expected = Series(expected, index=["a", "b", "c", "d", "e"], dtype=any_string_dtype)
  474. result = ser.str.normalize(form)
  475. tm.assert_series_equal(result, expected)
  476. def test_normalize_bad_arg_raises(any_string_dtype):
  477. ser = Series(
  478. ["ABC", "ABC", "123", np.nan, "アイエ"],
  479. index=["a", "b", "c", "d", "e"],
  480. dtype=any_string_dtype,
  481. )
  482. with pytest.raises(ValueError, match="invalid normalization form"):
  483. ser.str.normalize("xxx")
  484. def test_normalize_index():
  485. idx = Index(["ABC", "123", "アイエ"])
  486. expected = Index(["ABC", "123", "アイエ"])
  487. result = idx.str.normalize("NFKC")
  488. tm.assert_index_equal(result, expected)
  489. @pytest.mark.parametrize(
  490. "values,inferred_type",
  491. [
  492. (["a", "b"], "string"),
  493. (["a", "b", 1], "mixed-integer"),
  494. (["a", "b", 1.3], "mixed"),
  495. (["a", "b", 1.3, 1], "mixed-integer"),
  496. (["aa", datetime(2011, 1, 1)], "mixed"),
  497. ],
  498. )
  499. def test_index_str_accessor_visibility(values, inferred_type, index_or_series):
  500. obj = index_or_series(values)
  501. if index_or_series is Index:
  502. assert obj.inferred_type == inferred_type
  503. assert isinstance(obj.str, StringMethods)
  504. @pytest.mark.parametrize(
  505. "values,inferred_type",
  506. [
  507. ([1, np.nan], "floating"),
  508. ([datetime(2011, 1, 1)], "datetime64"),
  509. ([timedelta(1)], "timedelta64"),
  510. ],
  511. )
  512. def test_index_str_accessor_non_string_values_raises(
  513. values, inferred_type, index_or_series
  514. ):
  515. obj = index_or_series(values)
  516. if index_or_series is Index:
  517. assert obj.inferred_type == inferred_type
  518. msg = "Can only use .str accessor with string values"
  519. with pytest.raises(AttributeError, match=msg):
  520. obj.str
  521. def test_index_str_accessor_multiindex_raises():
  522. # MultiIndex has mixed dtype, but not allow to use accessor
  523. idx = MultiIndex.from_tuples([("a", "b"), ("a", "b")])
  524. assert idx.inferred_type == "mixed"
  525. msg = "Can only use .str accessor with Index, not MultiIndex"
  526. with pytest.raises(AttributeError, match=msg):
  527. idx.str
  528. def test_str_accessor_no_new_attributes(any_string_dtype):
  529. # https://github.com/pandas-dev/pandas/issues/10673
  530. ser = Series(list("aabbcde"), dtype=any_string_dtype)
  531. with pytest.raises(AttributeError, match="You cannot add any new attribute"):
  532. ser.str.xlabel = "a"
  533. def test_cat_on_bytes_raises():
  534. lhs = Series(np.array(list("abc"), "S1").astype(object))
  535. rhs = Series(np.array(list("def"), "S1").astype(object))
  536. msg = "Cannot use .str.cat with values of inferred dtype 'bytes'"
  537. with pytest.raises(TypeError, match=msg):
  538. lhs.str.cat(rhs)
  539. def test_str_accessor_in_apply_func():
  540. # https://github.com/pandas-dev/pandas/issues/38979
  541. df = DataFrame(zip("abc", "def"))
  542. expected = Series(["A/D", "B/E", "C/F"])
  543. result = df.apply(lambda f: "/".join(f.str.upper()), axis=1)
  544. tm.assert_series_equal(result, expected)
  545. def test_zfill():
  546. # https://github.com/pandas-dev/pandas/issues/20868
  547. value = Series(["-1", "1", "1000", 10, np.nan])
  548. expected = Series(["-01", "001", "1000", np.nan, np.nan])
  549. tm.assert_series_equal(value.str.zfill(3), expected)
  550. value = Series(["-2", "+5"])
  551. expected = Series(["-0002", "+0005"])
  552. tm.assert_series_equal(value.str.zfill(5), expected)
  553. def test_zfill_with_non_integer_argument():
  554. value = Series(["-2", "+5"])
  555. wid = "a"
  556. msg = f"width must be of integer type, not {type(wid).__name__}"
  557. with pytest.raises(TypeError, match=msg):
  558. value.str.zfill(wid)
  559. def test_zfill_with_leading_sign():
  560. value = Series(["-cat", "-1", "+dog"])
  561. expected = Series(["-0cat", "-0001", "+0dog"])
  562. tm.assert_series_equal(value.str.zfill(5), expected)
  563. def test_get_with_dict_label():
  564. # GH47911
  565. s = Series(
  566. [
  567. {"name": "Hello", "value": "World"},
  568. {"name": "Goodbye", "value": "Planet"},
  569. {"value": "Sea"},
  570. ]
  571. )
  572. result = s.str.get("name")
  573. expected = Series(["Hello", "Goodbye", None])
  574. tm.assert_series_equal(result, expected)
  575. result = s.str.get("value")
  576. expected = Series(["World", "Planet", "Sea"])
  577. tm.assert_series_equal(result, expected)