test_split_partition.py 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719
  1. from datetime import datetime
  2. import re
  3. import numpy as np
  4. import pytest
  5. import pandas as pd
  6. from pandas import (
  7. DataFrame,
  8. Index,
  9. MultiIndex,
  10. Series,
  11. _testing as tm,
  12. )
  13. @pytest.mark.parametrize("method", ["split", "rsplit"])
  14. def test_split(any_string_dtype, method):
  15. values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"], dtype=any_string_dtype)
  16. result = getattr(values.str, method)("_")
  17. exp = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]])
  18. tm.assert_series_equal(result, exp)
  19. @pytest.mark.parametrize("method", ["split", "rsplit"])
  20. def test_split_more_than_one_char(any_string_dtype, method):
  21. # more than one char
  22. values = Series(["a__b__c", "c__d__e", np.nan, "f__g__h"], dtype=any_string_dtype)
  23. result = getattr(values.str, method)("__")
  24. exp = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]])
  25. tm.assert_series_equal(result, exp)
  26. result = getattr(values.str, method)("__", expand=False)
  27. tm.assert_series_equal(result, exp)
  28. def test_split_more_regex_split(any_string_dtype):
  29. # regex split
  30. values = Series(["a,b_c", "c_d,e", np.nan, "f,g,h"], dtype=any_string_dtype)
  31. result = values.str.split("[,_]")
  32. exp = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]])
  33. tm.assert_series_equal(result, exp)
  34. def test_split_regex(any_string_dtype):
  35. # GH 43563
  36. # explicit regex = True split
  37. values = Series("xxxjpgzzz.jpg", dtype=any_string_dtype)
  38. result = values.str.split(r"\.jpg", regex=True)
  39. exp = Series([["xxxjpgzzz", ""]])
  40. tm.assert_series_equal(result, exp)
  41. def test_split_regex_explicit(any_string_dtype):
  42. # explicit regex = True split with compiled regex
  43. regex_pat = re.compile(r".jpg")
  44. values = Series("xxxjpgzzz.jpg", dtype=any_string_dtype)
  45. result = values.str.split(regex_pat)
  46. exp = Series([["xx", "zzz", ""]])
  47. tm.assert_series_equal(result, exp)
  48. # explicit regex = False split
  49. result = values.str.split(r"\.jpg", regex=False)
  50. exp = Series([["xxxjpgzzz.jpg"]])
  51. tm.assert_series_equal(result, exp)
  52. # non explicit regex split, pattern length == 1
  53. result = values.str.split(r".")
  54. exp = Series([["xxxjpgzzz", "jpg"]])
  55. tm.assert_series_equal(result, exp)
  56. # non explicit regex split, pattern length != 1
  57. result = values.str.split(r".jpg")
  58. exp = Series([["xx", "zzz", ""]])
  59. tm.assert_series_equal(result, exp)
  60. # regex=False with pattern compiled regex raises error
  61. with pytest.raises(
  62. ValueError,
  63. match="Cannot use a compiled regex as replacement pattern with regex=False",
  64. ):
  65. values.str.split(regex_pat, regex=False)
  66. @pytest.mark.parametrize("expand", [None, False])
  67. @pytest.mark.parametrize("method", ["split", "rsplit"])
  68. def test_split_object_mixed(expand, method):
  69. mixed = Series(["a_b_c", np.nan, "d_e_f", True, datetime.today(), None, 1, 2.0])
  70. result = getattr(mixed.str, method)("_", expand=expand)
  71. exp = Series(
  72. [
  73. ["a", "b", "c"],
  74. np.nan,
  75. ["d", "e", "f"],
  76. np.nan,
  77. np.nan,
  78. np.nan,
  79. np.nan,
  80. np.nan,
  81. ]
  82. )
  83. assert isinstance(result, Series)
  84. tm.assert_almost_equal(result, exp)
  85. @pytest.mark.parametrize("method", ["split", "rsplit"])
  86. @pytest.mark.parametrize("n", [None, 0])
  87. def test_split_n(any_string_dtype, method, n):
  88. s = Series(["a b", pd.NA, "b c"], dtype=any_string_dtype)
  89. expected = Series([["a", "b"], pd.NA, ["b", "c"]])
  90. result = getattr(s.str, method)(" ", n=n)
  91. tm.assert_series_equal(result, expected)
  92. def test_rsplit(any_string_dtype):
  93. # regex split is not supported by rsplit
  94. values = Series(["a,b_c", "c_d,e", np.nan, "f,g,h"], dtype=any_string_dtype)
  95. result = values.str.rsplit("[,_]")
  96. exp = Series([["a,b_c"], ["c_d,e"], np.nan, ["f,g,h"]])
  97. tm.assert_series_equal(result, exp)
  98. def test_rsplit_max_number(any_string_dtype):
  99. # setting max number of splits, make sure it's from reverse
  100. values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"], dtype=any_string_dtype)
  101. result = values.str.rsplit("_", n=1)
  102. exp = Series([["a_b", "c"], ["c_d", "e"], np.nan, ["f_g", "h"]])
  103. tm.assert_series_equal(result, exp)
  104. def test_split_blank_string(any_string_dtype):
  105. # expand blank split GH 20067
  106. values = Series([""], name="test", dtype=any_string_dtype)
  107. result = values.str.split(expand=True)
  108. exp = DataFrame([[]], dtype=any_string_dtype) # NOTE: this is NOT an empty df
  109. tm.assert_frame_equal(result, exp)
  110. def test_split_blank_string_with_non_empty(any_string_dtype):
  111. values = Series(["a b c", "a b", "", " "], name="test", dtype=any_string_dtype)
  112. result = values.str.split(expand=True)
  113. exp = DataFrame(
  114. [
  115. ["a", "b", "c"],
  116. ["a", "b", np.nan],
  117. [np.nan, np.nan, np.nan],
  118. [np.nan, np.nan, np.nan],
  119. ],
  120. dtype=any_string_dtype,
  121. )
  122. tm.assert_frame_equal(result, exp)
  123. @pytest.mark.parametrize("method", ["split", "rsplit"])
  124. def test_split_noargs(any_string_dtype, method):
  125. # #1859
  126. s = Series(["Wes McKinney", "Travis Oliphant"], dtype=any_string_dtype)
  127. result = getattr(s.str, method)()
  128. expected = ["Travis", "Oliphant"]
  129. assert result[1] == expected
  130. @pytest.mark.parametrize(
  131. "data, pat",
  132. [
  133. (["bd asdf jfg", "kjasdflqw asdfnfk"], None),
  134. (["bd asdf jfg", "kjasdflqw asdfnfk"], "asdf"),
  135. (["bd_asdf_jfg", "kjasdflqw_asdfnfk"], "_"),
  136. ],
  137. )
  138. @pytest.mark.parametrize("n", [-1, 0])
  139. def test_split_maxsplit(data, pat, any_string_dtype, n):
  140. # re.split 0, str.split -1
  141. s = Series(data, dtype=any_string_dtype)
  142. result = s.str.split(pat=pat, n=n)
  143. xp = s.str.split(pat=pat)
  144. tm.assert_series_equal(result, xp)
  145. @pytest.mark.parametrize(
  146. "data, pat, expected",
  147. [
  148. (
  149. ["split once", "split once too!"],
  150. None,
  151. Series({0: ["split", "once"], 1: ["split", "once too!"]}),
  152. ),
  153. (
  154. ["split_once", "split_once_too!"],
  155. "_",
  156. Series({0: ["split", "once"], 1: ["split", "once_too!"]}),
  157. ),
  158. ],
  159. )
  160. def test_split_no_pat_with_nonzero_n(data, pat, expected, any_string_dtype):
  161. s = Series(data, dtype=any_string_dtype)
  162. result = s.str.split(pat=pat, n=1)
  163. tm.assert_series_equal(expected, result, check_index_type=False)
  164. def test_split_to_dataframe_no_splits(any_string_dtype):
  165. s = Series(["nosplit", "alsonosplit"], dtype=any_string_dtype)
  166. result = s.str.split("_", expand=True)
  167. exp = DataFrame({0: Series(["nosplit", "alsonosplit"], dtype=any_string_dtype)})
  168. tm.assert_frame_equal(result, exp)
  169. def test_split_to_dataframe(any_string_dtype):
  170. s = Series(["some_equal_splits", "with_no_nans"], dtype=any_string_dtype)
  171. result = s.str.split("_", expand=True)
  172. exp = DataFrame(
  173. {0: ["some", "with"], 1: ["equal", "no"], 2: ["splits", "nans"]},
  174. dtype=any_string_dtype,
  175. )
  176. tm.assert_frame_equal(result, exp)
  177. def test_split_to_dataframe_unequal_splits(any_string_dtype):
  178. s = Series(
  179. ["some_unequal_splits", "one_of_these_things_is_not"], dtype=any_string_dtype
  180. )
  181. result = s.str.split("_", expand=True)
  182. exp = DataFrame(
  183. {
  184. 0: ["some", "one"],
  185. 1: ["unequal", "of"],
  186. 2: ["splits", "these"],
  187. 3: [np.nan, "things"],
  188. 4: [np.nan, "is"],
  189. 5: [np.nan, "not"],
  190. },
  191. dtype=any_string_dtype,
  192. )
  193. tm.assert_frame_equal(result, exp)
  194. def test_split_to_dataframe_with_index(any_string_dtype):
  195. s = Series(
  196. ["some_splits", "with_index"], index=["preserve", "me"], dtype=any_string_dtype
  197. )
  198. result = s.str.split("_", expand=True)
  199. exp = DataFrame(
  200. {0: ["some", "with"], 1: ["splits", "index"]},
  201. index=["preserve", "me"],
  202. dtype=any_string_dtype,
  203. )
  204. tm.assert_frame_equal(result, exp)
  205. with pytest.raises(ValueError, match="expand must be"):
  206. s.str.split("_", expand="not_a_boolean")
  207. def test_split_to_multiindex_expand_no_splits():
  208. # https://github.com/pandas-dev/pandas/issues/23677
  209. idx = Index(["nosplit", "alsonosplit", np.nan])
  210. result = idx.str.split("_", expand=True)
  211. exp = idx
  212. tm.assert_index_equal(result, exp)
  213. assert result.nlevels == 1
  214. def test_split_to_multiindex_expand():
  215. idx = Index(["some_equal_splits", "with_no_nans", np.nan, None])
  216. result = idx.str.split("_", expand=True)
  217. exp = MultiIndex.from_tuples(
  218. [
  219. ("some", "equal", "splits"),
  220. ("with", "no", "nans"),
  221. [np.nan, np.nan, np.nan],
  222. [None, None, None],
  223. ]
  224. )
  225. tm.assert_index_equal(result, exp)
  226. assert result.nlevels == 3
  227. def test_split_to_multiindex_expand_unequal_splits():
  228. idx = Index(["some_unequal_splits", "one_of_these_things_is_not", np.nan, None])
  229. result = idx.str.split("_", expand=True)
  230. exp = MultiIndex.from_tuples(
  231. [
  232. ("some", "unequal", "splits", np.nan, np.nan, np.nan),
  233. ("one", "of", "these", "things", "is", "not"),
  234. (np.nan, np.nan, np.nan, np.nan, np.nan, np.nan),
  235. (None, None, None, None, None, None),
  236. ]
  237. )
  238. tm.assert_index_equal(result, exp)
  239. assert result.nlevels == 6
  240. with pytest.raises(ValueError, match="expand must be"):
  241. idx.str.split("_", expand="not_a_boolean")
  242. def test_rsplit_to_dataframe_expand_no_splits(any_string_dtype):
  243. s = Series(["nosplit", "alsonosplit"], dtype=any_string_dtype)
  244. result = s.str.rsplit("_", expand=True)
  245. exp = DataFrame({0: Series(["nosplit", "alsonosplit"])}, dtype=any_string_dtype)
  246. tm.assert_frame_equal(result, exp)
  247. def test_rsplit_to_dataframe_expand(any_string_dtype):
  248. s = Series(["some_equal_splits", "with_no_nans"], dtype=any_string_dtype)
  249. result = s.str.rsplit("_", expand=True)
  250. exp = DataFrame(
  251. {0: ["some", "with"], 1: ["equal", "no"], 2: ["splits", "nans"]},
  252. dtype=any_string_dtype,
  253. )
  254. tm.assert_frame_equal(result, exp)
  255. result = s.str.rsplit("_", expand=True, n=2)
  256. exp = DataFrame(
  257. {0: ["some", "with"], 1: ["equal", "no"], 2: ["splits", "nans"]},
  258. dtype=any_string_dtype,
  259. )
  260. tm.assert_frame_equal(result, exp)
  261. result = s.str.rsplit("_", expand=True, n=1)
  262. exp = DataFrame(
  263. {0: ["some_equal", "with_no"], 1: ["splits", "nans"]}, dtype=any_string_dtype
  264. )
  265. tm.assert_frame_equal(result, exp)
  266. def test_rsplit_to_dataframe_expand_with_index(any_string_dtype):
  267. s = Series(
  268. ["some_splits", "with_index"], index=["preserve", "me"], dtype=any_string_dtype
  269. )
  270. result = s.str.rsplit("_", expand=True)
  271. exp = DataFrame(
  272. {0: ["some", "with"], 1: ["splits", "index"]},
  273. index=["preserve", "me"],
  274. dtype=any_string_dtype,
  275. )
  276. tm.assert_frame_equal(result, exp)
  277. def test_rsplit_to_multiindex_expand_no_split():
  278. idx = Index(["nosplit", "alsonosplit"])
  279. result = idx.str.rsplit("_", expand=True)
  280. exp = idx
  281. tm.assert_index_equal(result, exp)
  282. assert result.nlevels == 1
  283. def test_rsplit_to_multiindex_expand():
  284. idx = Index(["some_equal_splits", "with_no_nans"])
  285. result = idx.str.rsplit("_", expand=True)
  286. exp = MultiIndex.from_tuples([("some", "equal", "splits"), ("with", "no", "nans")])
  287. tm.assert_index_equal(result, exp)
  288. assert result.nlevels == 3
  289. def test_rsplit_to_multiindex_expand_n():
  290. idx = Index(["some_equal_splits", "with_no_nans"])
  291. result = idx.str.rsplit("_", expand=True, n=1)
  292. exp = MultiIndex.from_tuples([("some_equal", "splits"), ("with_no", "nans")])
  293. tm.assert_index_equal(result, exp)
  294. assert result.nlevels == 2
  295. def test_split_nan_expand(any_string_dtype):
  296. # gh-18450
  297. s = Series(["foo,bar,baz", np.nan], dtype=any_string_dtype)
  298. result = s.str.split(",", expand=True)
  299. exp = DataFrame(
  300. [["foo", "bar", "baz"], [np.nan, np.nan, np.nan]], dtype=any_string_dtype
  301. )
  302. tm.assert_frame_equal(result, exp)
  303. # check that these are actually np.nan/pd.NA and not None
  304. # TODO see GH 18463
  305. # tm.assert_frame_equal does not differentiate
  306. if any_string_dtype == "object":
  307. assert all(np.isnan(x) for x in result.iloc[1])
  308. else:
  309. assert all(x is pd.NA for x in result.iloc[1])
  310. def test_split_with_name_series(any_string_dtype):
  311. # GH 12617
  312. # should preserve name
  313. s = Series(["a,b", "c,d"], name="xxx", dtype=any_string_dtype)
  314. res = s.str.split(",")
  315. exp = Series([["a", "b"], ["c", "d"]], name="xxx")
  316. tm.assert_series_equal(res, exp)
  317. res = s.str.split(",", expand=True)
  318. exp = DataFrame([["a", "b"], ["c", "d"]], dtype=any_string_dtype)
  319. tm.assert_frame_equal(res, exp)
  320. def test_split_with_name_index():
  321. # GH 12617
  322. idx = Index(["a,b", "c,d"], name="xxx")
  323. res = idx.str.split(",")
  324. exp = Index([["a", "b"], ["c", "d"]], name="xxx")
  325. assert res.nlevels == 1
  326. tm.assert_index_equal(res, exp)
  327. res = idx.str.split(",", expand=True)
  328. exp = MultiIndex.from_tuples([("a", "b"), ("c", "d")])
  329. assert res.nlevels == 2
  330. tm.assert_index_equal(res, exp)
  331. @pytest.mark.parametrize(
  332. "method, exp",
  333. [
  334. [
  335. "partition",
  336. [
  337. ("a", "__", "b__c"),
  338. ("c", "__", "d__e"),
  339. np.nan,
  340. ("f", "__", "g__h"),
  341. None,
  342. ],
  343. ],
  344. [
  345. "rpartition",
  346. [
  347. ("a__b", "__", "c"),
  348. ("c__d", "__", "e"),
  349. np.nan,
  350. ("f__g", "__", "h"),
  351. None,
  352. ],
  353. ],
  354. ],
  355. )
  356. def test_partition_series_more_than_one_char(method, exp, any_string_dtype):
  357. # https://github.com/pandas-dev/pandas/issues/23558
  358. # more than one char
  359. s = Series(["a__b__c", "c__d__e", np.nan, "f__g__h", None], dtype=any_string_dtype)
  360. result = getattr(s.str, method)("__", expand=False)
  361. expected = Series(exp)
  362. tm.assert_series_equal(result, expected)
  363. @pytest.mark.parametrize(
  364. "method, exp",
  365. [
  366. [
  367. "partition",
  368. [("a", " ", "b c"), ("c", " ", "d e"), np.nan, ("f", " ", "g h"), None],
  369. ],
  370. [
  371. "rpartition",
  372. [("a b", " ", "c"), ("c d", " ", "e"), np.nan, ("f g", " ", "h"), None],
  373. ],
  374. ],
  375. )
  376. def test_partition_series_none(any_string_dtype, method, exp):
  377. # https://github.com/pandas-dev/pandas/issues/23558
  378. # None
  379. s = Series(["a b c", "c d e", np.nan, "f g h", None], dtype=any_string_dtype)
  380. result = getattr(s.str, method)(expand=False)
  381. expected = Series(exp)
  382. tm.assert_series_equal(result, expected)
  383. @pytest.mark.parametrize(
  384. "method, exp",
  385. [
  386. [
  387. "partition",
  388. [("abc", "", ""), ("cde", "", ""), np.nan, ("fgh", "", ""), None],
  389. ],
  390. [
  391. "rpartition",
  392. [("", "", "abc"), ("", "", "cde"), np.nan, ("", "", "fgh"), None],
  393. ],
  394. ],
  395. )
  396. def test_partition_series_not_split(any_string_dtype, method, exp):
  397. # https://github.com/pandas-dev/pandas/issues/23558
  398. # Not split
  399. s = Series(["abc", "cde", np.nan, "fgh", None], dtype=any_string_dtype)
  400. result = getattr(s.str, method)("_", expand=False)
  401. expected = Series(exp)
  402. tm.assert_series_equal(result, expected)
  403. @pytest.mark.parametrize(
  404. "method, exp",
  405. [
  406. [
  407. "partition",
  408. [("a", "_", "b_c"), ("c", "_", "d_e"), np.nan, ("f", "_", "g_h")],
  409. ],
  410. [
  411. "rpartition",
  412. [("a_b", "_", "c"), ("c_d", "_", "e"), np.nan, ("f_g", "_", "h")],
  413. ],
  414. ],
  415. )
  416. def test_partition_series_unicode(any_string_dtype, method, exp):
  417. # https://github.com/pandas-dev/pandas/issues/23558
  418. # unicode
  419. s = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"], dtype=any_string_dtype)
  420. result = getattr(s.str, method)("_", expand=False)
  421. expected = Series(exp)
  422. tm.assert_series_equal(result, expected)
  423. @pytest.mark.parametrize("method", ["partition", "rpartition"])
  424. def test_partition_series_stdlib(any_string_dtype, method):
  425. # https://github.com/pandas-dev/pandas/issues/23558
  426. # compare to standard lib
  427. s = Series(["A_B_C", "B_C_D", "E_F_G", "EFGHEF"], dtype=any_string_dtype)
  428. result = getattr(s.str, method)("_", expand=False).tolist()
  429. assert result == [getattr(v, method)("_") for v in s]
  430. @pytest.mark.parametrize(
  431. "method, expand, exp, exp_levels",
  432. [
  433. [
  434. "partition",
  435. False,
  436. np.array(
  437. [("a", "_", "b_c"), ("c", "_", "d_e"), ("f", "_", "g_h"), np.nan, None],
  438. dtype=object,
  439. ),
  440. 1,
  441. ],
  442. [
  443. "rpartition",
  444. False,
  445. np.array(
  446. [("a_b", "_", "c"), ("c_d", "_", "e"), ("f_g", "_", "h"), np.nan, None],
  447. dtype=object,
  448. ),
  449. 1,
  450. ],
  451. ],
  452. )
  453. def test_partition_index(method, expand, exp, exp_levels):
  454. # https://github.com/pandas-dev/pandas/issues/23558
  455. values = Index(["a_b_c", "c_d_e", "f_g_h", np.nan, None])
  456. result = getattr(values.str, method)("_", expand=expand)
  457. exp = Index(exp)
  458. tm.assert_index_equal(result, exp)
  459. assert result.nlevels == exp_levels
  460. @pytest.mark.parametrize(
  461. "method, exp",
  462. [
  463. [
  464. "partition",
  465. {
  466. 0: ["a", "c", np.nan, "f", None],
  467. 1: ["_", "_", np.nan, "_", None],
  468. 2: ["b_c", "d_e", np.nan, "g_h", None],
  469. },
  470. ],
  471. [
  472. "rpartition",
  473. {
  474. 0: ["a_b", "c_d", np.nan, "f_g", None],
  475. 1: ["_", "_", np.nan, "_", None],
  476. 2: ["c", "e", np.nan, "h", None],
  477. },
  478. ],
  479. ],
  480. )
  481. def test_partition_to_dataframe(any_string_dtype, method, exp):
  482. # https://github.com/pandas-dev/pandas/issues/23558
  483. s = Series(["a_b_c", "c_d_e", np.nan, "f_g_h", None], dtype=any_string_dtype)
  484. result = getattr(s.str, method)("_")
  485. expected = DataFrame(
  486. exp,
  487. dtype=any_string_dtype,
  488. )
  489. tm.assert_frame_equal(result, expected)
  490. @pytest.mark.parametrize(
  491. "method, exp",
  492. [
  493. [
  494. "partition",
  495. {
  496. 0: ["a", "c", np.nan, "f", None],
  497. 1: ["_", "_", np.nan, "_", None],
  498. 2: ["b_c", "d_e", np.nan, "g_h", None],
  499. },
  500. ],
  501. [
  502. "rpartition",
  503. {
  504. 0: ["a_b", "c_d", np.nan, "f_g", None],
  505. 1: ["_", "_", np.nan, "_", None],
  506. 2: ["c", "e", np.nan, "h", None],
  507. },
  508. ],
  509. ],
  510. )
  511. def test_partition_to_dataframe_from_series(any_string_dtype, method, exp):
  512. # https://github.com/pandas-dev/pandas/issues/23558
  513. s = Series(["a_b_c", "c_d_e", np.nan, "f_g_h", None], dtype=any_string_dtype)
  514. result = getattr(s.str, method)("_", expand=True)
  515. expected = DataFrame(
  516. exp,
  517. dtype=any_string_dtype,
  518. )
  519. tm.assert_frame_equal(result, expected)
  520. def test_partition_with_name(any_string_dtype):
  521. # GH 12617
  522. s = Series(["a,b", "c,d"], name="xxx", dtype=any_string_dtype)
  523. result = s.str.partition(",")
  524. expected = DataFrame(
  525. {0: ["a", "c"], 1: [",", ","], 2: ["b", "d"]}, dtype=any_string_dtype
  526. )
  527. tm.assert_frame_equal(result, expected)
  528. def test_partition_with_name_expand(any_string_dtype):
  529. # GH 12617
  530. # should preserve name
  531. s = Series(["a,b", "c,d"], name="xxx", dtype=any_string_dtype)
  532. result = s.str.partition(",", expand=False)
  533. expected = Series([("a", ",", "b"), ("c", ",", "d")], name="xxx")
  534. tm.assert_series_equal(result, expected)
  535. def test_partition_index_with_name():
  536. idx = Index(["a,b", "c,d"], name="xxx")
  537. result = idx.str.partition(",")
  538. expected = MultiIndex.from_tuples([("a", ",", "b"), ("c", ",", "d")])
  539. assert result.nlevels == 3
  540. tm.assert_index_equal(result, expected)
  541. def test_partition_index_with_name_expand_false():
  542. idx = Index(["a,b", "c,d"], name="xxx")
  543. # should preserve name
  544. result = idx.str.partition(",", expand=False)
  545. expected = Index(np.array([("a", ",", "b"), ("c", ",", "d")]), name="xxx")
  546. assert result.nlevels == 1
  547. tm.assert_index_equal(result, expected)
  548. @pytest.mark.parametrize("method", ["partition", "rpartition"])
  549. def test_partition_sep_kwarg(any_string_dtype, method):
  550. # GH 22676; depr kwarg "pat" in favor of "sep"
  551. s = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"], dtype=any_string_dtype)
  552. expected = getattr(s.str, method)(sep="_")
  553. result = getattr(s.str, method)("_")
  554. tm.assert_frame_equal(result, expected)
  555. def test_get():
  556. ser = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"])
  557. result = ser.str.split("_").str.get(1)
  558. expected = Series(["b", "d", np.nan, "g"])
  559. tm.assert_series_equal(result, expected)
  560. def test_get_mixed_object():
  561. ser = Series(["a_b_c", np.nan, "c_d_e", True, datetime.today(), None, 1, 2.0])
  562. result = ser.str.split("_").str.get(1)
  563. expected = Series(["b", np.nan, "d", np.nan, np.nan, np.nan, np.nan, np.nan])
  564. tm.assert_series_equal(result, expected)
  565. @pytest.mark.parametrize("idx", [2, -3])
  566. def test_get_bounds(idx):
  567. ser = Series(["1_2_3_4_5", "6_7_8_9_10", "11_12"])
  568. result = ser.str.split("_").str.get(idx)
  569. expected = Series(["3", "8", np.nan])
  570. tm.assert_series_equal(result, expected)
  571. @pytest.mark.parametrize(
  572. "idx, exp", [[2, [3, 3, np.nan, "b"]], [-1, [3, 3, np.nan, np.nan]]]
  573. )
  574. def test_get_complex(idx, exp):
  575. # GH 20671, getting value not in dict raising `KeyError`
  576. ser = Series([(1, 2, 3), [1, 2, 3], {1, 2, 3}, {1: "a", 2: "b", 3: "c"}])
  577. result = ser.str.get(idx)
  578. expected = Series(exp)
  579. tm.assert_series_equal(result, expected)
  580. @pytest.mark.parametrize("to_type", [tuple, list, np.array])
  581. def test_get_complex_nested(to_type):
  582. ser = Series([to_type([to_type([1, 2])])])
  583. result = ser.str.get(0)
  584. expected = Series([to_type([1, 2])])
  585. tm.assert_series_equal(result, expected)
  586. result = ser.str.get(1)
  587. expected = Series([np.nan])
  588. tm.assert_series_equal(result, expected)
  589. def test_get_strings(any_string_dtype):
  590. ser = Series(["a", "ab", np.nan, "abc"], dtype=any_string_dtype)
  591. result = ser.str.get(2)
  592. expected = Series([np.nan, np.nan, np.nan, "c"], dtype=any_string_dtype)
  593. tm.assert_series_equal(result, expected)