test_extract.py 25 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708
  1. from datetime import datetime
  2. import re
  3. import numpy as np
  4. import pytest
  5. from pandas import (
  6. DataFrame,
  7. Index,
  8. MultiIndex,
  9. Series,
  10. _testing as tm,
  11. )
  12. def test_extract_expand_kwarg_wrong_type_raises(any_string_dtype):
  13. # TODO: should this raise TypeError
  14. values = Series(["fooBAD__barBAD", np.nan, "foo"], dtype=any_string_dtype)
  15. with pytest.raises(ValueError, match="expand must be True or False"):
  16. values.str.extract(".*(BAD[_]+).*(BAD)", expand=None)
  17. def test_extract_expand_kwarg(any_string_dtype):
  18. s = Series(["fooBAD__barBAD", np.nan, "foo"], dtype=any_string_dtype)
  19. expected = DataFrame(["BAD__", np.nan, np.nan], dtype=any_string_dtype)
  20. result = s.str.extract(".*(BAD[_]+).*")
  21. tm.assert_frame_equal(result, expected)
  22. result = s.str.extract(".*(BAD[_]+).*", expand=True)
  23. tm.assert_frame_equal(result, expected)
  24. expected = DataFrame(
  25. [["BAD__", "BAD"], [np.nan, np.nan], [np.nan, np.nan]], dtype=any_string_dtype
  26. )
  27. result = s.str.extract(".*(BAD[_]+).*(BAD)", expand=False)
  28. tm.assert_frame_equal(result, expected)
  29. def test_extract_expand_False_mixed_object():
  30. ser = Series(
  31. ["aBAD_BAD", np.nan, "BAD_b_BAD", True, datetime.today(), "foo", None, 1, 2.0]
  32. )
  33. # two groups
  34. result = ser.str.extract(".*(BAD[_]+).*(BAD)", expand=False)
  35. er = [np.nan, np.nan] # empty row
  36. expected = DataFrame([["BAD_", "BAD"], er, ["BAD_", "BAD"], er, er, er, er, er, er])
  37. tm.assert_frame_equal(result, expected)
  38. # single group
  39. result = ser.str.extract(".*(BAD[_]+).*BAD", expand=False)
  40. expected = Series(
  41. ["BAD_", np.nan, "BAD_", np.nan, np.nan, np.nan, np.nan, np.nan, np.nan]
  42. )
  43. tm.assert_series_equal(result, expected)
  44. def test_extract_expand_index_raises():
  45. # GH9980
  46. # Index only works with one regex group since
  47. # multi-group would expand to a frame
  48. idx = Index(["A1", "A2", "A3", "A4", "B5"])
  49. msg = "only one regex group is supported with Index"
  50. with pytest.raises(ValueError, match=msg):
  51. idx.str.extract("([AB])([123])", expand=False)
  52. def test_extract_expand_no_capture_groups_raises(index_or_series, any_string_dtype):
  53. s_or_idx = index_or_series(["A1", "B2", "C3"], dtype=any_string_dtype)
  54. msg = "pattern contains no capture groups"
  55. # no groups
  56. with pytest.raises(ValueError, match=msg):
  57. s_or_idx.str.extract("[ABC][123]", expand=False)
  58. # only non-capturing groups
  59. with pytest.raises(ValueError, match=msg):
  60. s_or_idx.str.extract("(?:[AB]).*", expand=False)
  61. def test_extract_expand_single_capture_group(index_or_series, any_string_dtype):
  62. # single group renames series/index properly
  63. s_or_idx = index_or_series(["A1", "A2"], dtype=any_string_dtype)
  64. result = s_or_idx.str.extract(r"(?P<uno>A)\d", expand=False)
  65. expected = index_or_series(["A", "A"], name="uno", dtype=any_string_dtype)
  66. if index_or_series == Series:
  67. tm.assert_series_equal(result, expected)
  68. else:
  69. tm.assert_index_equal(result, expected)
  70. def test_extract_expand_capture_groups(any_string_dtype):
  71. s = Series(["A1", "B2", "C3"], dtype=any_string_dtype)
  72. # one group, no matches
  73. result = s.str.extract("(_)", expand=False)
  74. expected = Series([np.nan, np.nan, np.nan], dtype=any_string_dtype)
  75. tm.assert_series_equal(result, expected)
  76. # two groups, no matches
  77. result = s.str.extract("(_)(_)", expand=False)
  78. expected = DataFrame(
  79. [[np.nan, np.nan], [np.nan, np.nan], [np.nan, np.nan]], dtype=any_string_dtype
  80. )
  81. tm.assert_frame_equal(result, expected)
  82. # one group, some matches
  83. result = s.str.extract("([AB])[123]", expand=False)
  84. expected = Series(["A", "B", np.nan], dtype=any_string_dtype)
  85. tm.assert_series_equal(result, expected)
  86. # two groups, some matches
  87. result = s.str.extract("([AB])([123])", expand=False)
  88. expected = DataFrame(
  89. [["A", "1"], ["B", "2"], [np.nan, np.nan]], dtype=any_string_dtype
  90. )
  91. tm.assert_frame_equal(result, expected)
  92. # one named group
  93. result = s.str.extract("(?P<letter>[AB])", expand=False)
  94. expected = Series(["A", "B", np.nan], name="letter", dtype=any_string_dtype)
  95. tm.assert_series_equal(result, expected)
  96. # two named groups
  97. result = s.str.extract("(?P<letter>[AB])(?P<number>[123])", expand=False)
  98. expected = DataFrame(
  99. [["A", "1"], ["B", "2"], [np.nan, np.nan]],
  100. columns=["letter", "number"],
  101. dtype=any_string_dtype,
  102. )
  103. tm.assert_frame_equal(result, expected)
  104. # mix named and unnamed groups
  105. result = s.str.extract("([AB])(?P<number>[123])", expand=False)
  106. expected = DataFrame(
  107. [["A", "1"], ["B", "2"], [np.nan, np.nan]],
  108. columns=[0, "number"],
  109. dtype=any_string_dtype,
  110. )
  111. tm.assert_frame_equal(result, expected)
  112. # one normal group, one non-capturing group
  113. result = s.str.extract("([AB])(?:[123])", expand=False)
  114. expected = Series(["A", "B", np.nan], dtype=any_string_dtype)
  115. tm.assert_series_equal(result, expected)
  116. # two normal groups, one non-capturing group
  117. s = Series(["A11", "B22", "C33"], dtype=any_string_dtype)
  118. result = s.str.extract("([AB])([123])(?:[123])", expand=False)
  119. expected = DataFrame(
  120. [["A", "1"], ["B", "2"], [np.nan, np.nan]], dtype=any_string_dtype
  121. )
  122. tm.assert_frame_equal(result, expected)
  123. # one optional group followed by one normal group
  124. s = Series(["A1", "B2", "3"], dtype=any_string_dtype)
  125. result = s.str.extract("(?P<letter>[AB])?(?P<number>[123])", expand=False)
  126. expected = DataFrame(
  127. [["A", "1"], ["B", "2"], [np.nan, "3"]],
  128. columns=["letter", "number"],
  129. dtype=any_string_dtype,
  130. )
  131. tm.assert_frame_equal(result, expected)
  132. # one normal group followed by one optional group
  133. s = Series(["A1", "B2", "C"], dtype=any_string_dtype)
  134. result = s.str.extract("(?P<letter>[ABC])(?P<number>[123])?", expand=False)
  135. expected = DataFrame(
  136. [["A", "1"], ["B", "2"], ["C", np.nan]],
  137. columns=["letter", "number"],
  138. dtype=any_string_dtype,
  139. )
  140. tm.assert_frame_equal(result, expected)
  141. def test_extract_expand_capture_groups_index(index, any_string_dtype):
  142. # https://github.com/pandas-dev/pandas/issues/6348
  143. # not passing index to the extractor
  144. data = ["A1", "B2", "C"]
  145. if len(index) == 0:
  146. pytest.skip("Test requires len(index) > 0")
  147. while len(index) < len(data):
  148. index = index.repeat(2)
  149. index = index[: len(data)]
  150. ser = Series(data, index=index, dtype=any_string_dtype)
  151. result = ser.str.extract(r"(\d)", expand=False)
  152. expected = Series(["1", "2", np.nan], index=index, dtype=any_string_dtype)
  153. tm.assert_series_equal(result, expected)
  154. result = ser.str.extract(r"(?P<letter>\D)(?P<number>\d)?", expand=False)
  155. expected = DataFrame(
  156. [["A", "1"], ["B", "2"], ["C", np.nan]],
  157. columns=["letter", "number"],
  158. index=index,
  159. dtype=any_string_dtype,
  160. )
  161. tm.assert_frame_equal(result, expected)
  162. def test_extract_single_series_name_is_preserved(any_string_dtype):
  163. s = Series(["a3", "b3", "c2"], name="bob", dtype=any_string_dtype)
  164. result = s.str.extract(r"(?P<sue>[a-z])", expand=False)
  165. expected = Series(["a", "b", "c"], name="sue", dtype=any_string_dtype)
  166. tm.assert_series_equal(result, expected)
  167. def test_extract_expand_True(any_string_dtype):
  168. # Contains tests like those in test_match and some others.
  169. s = Series(["fooBAD__barBAD", np.nan, "foo"], dtype=any_string_dtype)
  170. result = s.str.extract(".*(BAD[_]+).*(BAD)", expand=True)
  171. expected = DataFrame(
  172. [["BAD__", "BAD"], [np.nan, np.nan], [np.nan, np.nan]], dtype=any_string_dtype
  173. )
  174. tm.assert_frame_equal(result, expected)
  175. def test_extract_expand_True_mixed_object():
  176. er = [np.nan, np.nan] # empty row
  177. mixed = Series(
  178. [
  179. "aBAD_BAD",
  180. np.nan,
  181. "BAD_b_BAD",
  182. True,
  183. datetime.today(),
  184. "foo",
  185. None,
  186. 1,
  187. 2.0,
  188. ]
  189. )
  190. result = mixed.str.extract(".*(BAD[_]+).*(BAD)", expand=True)
  191. expected = DataFrame([["BAD_", "BAD"], er, ["BAD_", "BAD"], er, er, er, er, er, er])
  192. tm.assert_frame_equal(result, expected)
  193. def test_extract_expand_True_single_capture_group_raises(
  194. index_or_series, any_string_dtype
  195. ):
  196. # these should work for both Series and Index
  197. # no groups
  198. s_or_idx = index_or_series(["A1", "B2", "C3"], dtype=any_string_dtype)
  199. msg = "pattern contains no capture groups"
  200. with pytest.raises(ValueError, match=msg):
  201. s_or_idx.str.extract("[ABC][123]", expand=True)
  202. # only non-capturing groups
  203. with pytest.raises(ValueError, match=msg):
  204. s_or_idx.str.extract("(?:[AB]).*", expand=True)
  205. def test_extract_expand_True_single_capture_group(index_or_series, any_string_dtype):
  206. # single group renames series/index properly
  207. s_or_idx = index_or_series(["A1", "A2"], dtype=any_string_dtype)
  208. result = s_or_idx.str.extract(r"(?P<uno>A)\d", expand=True)
  209. expected = DataFrame({"uno": ["A", "A"]}, dtype=any_string_dtype)
  210. tm.assert_frame_equal(result, expected)
  211. @pytest.mark.parametrize("name", [None, "series_name"])
  212. def test_extract_series(name, any_string_dtype):
  213. # extract should give the same result whether or not the series has a name.
  214. s = Series(["A1", "B2", "C3"], name=name, dtype=any_string_dtype)
  215. # one group, no matches
  216. result = s.str.extract("(_)", expand=True)
  217. expected = DataFrame([np.nan, np.nan, np.nan], dtype=any_string_dtype)
  218. tm.assert_frame_equal(result, expected)
  219. # two groups, no matches
  220. result = s.str.extract("(_)(_)", expand=True)
  221. expected = DataFrame(
  222. [[np.nan, np.nan], [np.nan, np.nan], [np.nan, np.nan]], dtype=any_string_dtype
  223. )
  224. tm.assert_frame_equal(result, expected)
  225. # one group, some matches
  226. result = s.str.extract("([AB])[123]", expand=True)
  227. expected = DataFrame(["A", "B", np.nan], dtype=any_string_dtype)
  228. tm.assert_frame_equal(result, expected)
  229. # two groups, some matches
  230. result = s.str.extract("([AB])([123])", expand=True)
  231. expected = DataFrame(
  232. [["A", "1"], ["B", "2"], [np.nan, np.nan]], dtype=any_string_dtype
  233. )
  234. tm.assert_frame_equal(result, expected)
  235. # one named group
  236. result = s.str.extract("(?P<letter>[AB])", expand=True)
  237. expected = DataFrame({"letter": ["A", "B", np.nan]}, dtype=any_string_dtype)
  238. tm.assert_frame_equal(result, expected)
  239. # two named groups
  240. result = s.str.extract("(?P<letter>[AB])(?P<number>[123])", expand=True)
  241. expected = DataFrame(
  242. [["A", "1"], ["B", "2"], [np.nan, np.nan]],
  243. columns=["letter", "number"],
  244. dtype=any_string_dtype,
  245. )
  246. tm.assert_frame_equal(result, expected)
  247. # mix named and unnamed groups
  248. result = s.str.extract("([AB])(?P<number>[123])", expand=True)
  249. expected = DataFrame(
  250. [["A", "1"], ["B", "2"], [np.nan, np.nan]],
  251. columns=[0, "number"],
  252. dtype=any_string_dtype,
  253. )
  254. tm.assert_frame_equal(result, expected)
  255. # one normal group, one non-capturing group
  256. result = s.str.extract("([AB])(?:[123])", expand=True)
  257. expected = DataFrame(["A", "B", np.nan], dtype=any_string_dtype)
  258. tm.assert_frame_equal(result, expected)
  259. def test_extract_optional_groups(any_string_dtype):
  260. # two normal groups, one non-capturing group
  261. s = Series(["A11", "B22", "C33"], dtype=any_string_dtype)
  262. result = s.str.extract("([AB])([123])(?:[123])", expand=True)
  263. expected = DataFrame(
  264. [["A", "1"], ["B", "2"], [np.nan, np.nan]], dtype=any_string_dtype
  265. )
  266. tm.assert_frame_equal(result, expected)
  267. # one optional group followed by one normal group
  268. s = Series(["A1", "B2", "3"], dtype=any_string_dtype)
  269. result = s.str.extract("(?P<letter>[AB])?(?P<number>[123])", expand=True)
  270. expected = DataFrame(
  271. [["A", "1"], ["B", "2"], [np.nan, "3"]],
  272. columns=["letter", "number"],
  273. dtype=any_string_dtype,
  274. )
  275. tm.assert_frame_equal(result, expected)
  276. # one normal group followed by one optional group
  277. s = Series(["A1", "B2", "C"], dtype=any_string_dtype)
  278. result = s.str.extract("(?P<letter>[ABC])(?P<number>[123])?", expand=True)
  279. expected = DataFrame(
  280. [["A", "1"], ["B", "2"], ["C", np.nan]],
  281. columns=["letter", "number"],
  282. dtype=any_string_dtype,
  283. )
  284. tm.assert_frame_equal(result, expected)
  285. def test_extract_dataframe_capture_groups_index(index, any_string_dtype):
  286. # GH6348
  287. # not passing index to the extractor
  288. data = ["A1", "B2", "C"]
  289. if len(index) < len(data):
  290. pytest.skip("Index too short")
  291. index = index[: len(data)]
  292. s = Series(data, index=index, dtype=any_string_dtype)
  293. result = s.str.extract(r"(\d)", expand=True)
  294. expected = DataFrame(["1", "2", np.nan], index=index, dtype=any_string_dtype)
  295. tm.assert_frame_equal(result, expected)
  296. result = s.str.extract(r"(?P<letter>\D)(?P<number>\d)?", expand=True)
  297. expected = DataFrame(
  298. [["A", "1"], ["B", "2"], ["C", np.nan]],
  299. columns=["letter", "number"],
  300. index=index,
  301. dtype=any_string_dtype,
  302. )
  303. tm.assert_frame_equal(result, expected)
  304. def test_extract_single_group_returns_frame(any_string_dtype):
  305. # GH11386 extract should always return DataFrame, even when
  306. # there is only one group. Prior to v0.18.0, extract returned
  307. # Series when there was only one group in the regex.
  308. s = Series(["a3", "b3", "c2"], name="series_name", dtype=any_string_dtype)
  309. result = s.str.extract(r"(?P<letter>[a-z])", expand=True)
  310. expected = DataFrame({"letter": ["a", "b", "c"]}, dtype=any_string_dtype)
  311. tm.assert_frame_equal(result, expected)
  312. def test_extractall(any_string_dtype):
  313. data = [
  314. "dave@google.com",
  315. "tdhock5@gmail.com",
  316. "maudelaperriere@gmail.com",
  317. "rob@gmail.com some text steve@gmail.com",
  318. "a@b.com some text c@d.com and e@f.com",
  319. np.nan,
  320. "",
  321. ]
  322. expected_tuples = [
  323. ("dave", "google", "com"),
  324. ("tdhock5", "gmail", "com"),
  325. ("maudelaperriere", "gmail", "com"),
  326. ("rob", "gmail", "com"),
  327. ("steve", "gmail", "com"),
  328. ("a", "b", "com"),
  329. ("c", "d", "com"),
  330. ("e", "f", "com"),
  331. ]
  332. pat = r"""
  333. (?P<user>[a-z0-9]+)
  334. @
  335. (?P<domain>[a-z]+)
  336. \.
  337. (?P<tld>[a-z]{2,4})
  338. """
  339. expected_columns = ["user", "domain", "tld"]
  340. s = Series(data, dtype=any_string_dtype)
  341. # extractall should return a DataFrame with one row for each match, indexed by the
  342. # subject from which the match came.
  343. expected_index = MultiIndex.from_tuples(
  344. [(0, 0), (1, 0), (2, 0), (3, 0), (3, 1), (4, 0), (4, 1), (4, 2)],
  345. names=(None, "match"),
  346. )
  347. expected = DataFrame(
  348. expected_tuples, expected_index, expected_columns, dtype=any_string_dtype
  349. )
  350. result = s.str.extractall(pat, flags=re.VERBOSE)
  351. tm.assert_frame_equal(result, expected)
  352. # The index of the input Series should be used to construct the index of the output
  353. # DataFrame:
  354. mi = MultiIndex.from_tuples(
  355. [
  356. ("single", "Dave"),
  357. ("single", "Toby"),
  358. ("single", "Maude"),
  359. ("multiple", "robAndSteve"),
  360. ("multiple", "abcdef"),
  361. ("none", "missing"),
  362. ("none", "empty"),
  363. ]
  364. )
  365. s = Series(data, index=mi, dtype=any_string_dtype)
  366. expected_index = MultiIndex.from_tuples(
  367. [
  368. ("single", "Dave", 0),
  369. ("single", "Toby", 0),
  370. ("single", "Maude", 0),
  371. ("multiple", "robAndSteve", 0),
  372. ("multiple", "robAndSteve", 1),
  373. ("multiple", "abcdef", 0),
  374. ("multiple", "abcdef", 1),
  375. ("multiple", "abcdef", 2),
  376. ],
  377. names=(None, None, "match"),
  378. )
  379. expected = DataFrame(
  380. expected_tuples, expected_index, expected_columns, dtype=any_string_dtype
  381. )
  382. result = s.str.extractall(pat, flags=re.VERBOSE)
  383. tm.assert_frame_equal(result, expected)
  384. # MultiIndexed subject with names.
  385. s = Series(data, index=mi, dtype=any_string_dtype)
  386. s.index.names = ("matches", "description")
  387. expected_index.names = ("matches", "description", "match")
  388. expected = DataFrame(
  389. expected_tuples, expected_index, expected_columns, dtype=any_string_dtype
  390. )
  391. result = s.str.extractall(pat, flags=re.VERBOSE)
  392. tm.assert_frame_equal(result, expected)
  393. @pytest.mark.parametrize(
  394. "pat,expected_names",
  395. [
  396. # optional groups.
  397. ("(?P<letter>[AB])?(?P<number>[123])", ["letter", "number"]),
  398. # only one of two groups has a name.
  399. ("([AB])?(?P<number>[123])", [0, "number"]),
  400. ],
  401. )
  402. def test_extractall_column_names(pat, expected_names, any_string_dtype):
  403. s = Series(["", "A1", "32"], dtype=any_string_dtype)
  404. result = s.str.extractall(pat)
  405. expected = DataFrame(
  406. [("A", "1"), (np.nan, "3"), (np.nan, "2")],
  407. index=MultiIndex.from_tuples([(1, 0), (2, 0), (2, 1)], names=(None, "match")),
  408. columns=expected_names,
  409. dtype=any_string_dtype,
  410. )
  411. tm.assert_frame_equal(result, expected)
  412. def test_extractall_single_group(any_string_dtype):
  413. s = Series(["a3", "b3", "d4c2"], name="series_name", dtype=any_string_dtype)
  414. expected_index = MultiIndex.from_tuples(
  415. [(0, 0), (1, 0), (2, 0), (2, 1)], names=(None, "match")
  416. )
  417. # extractall(one named group) returns DataFrame with one named column.
  418. result = s.str.extractall(r"(?P<letter>[a-z])")
  419. expected = DataFrame(
  420. {"letter": ["a", "b", "d", "c"]}, index=expected_index, dtype=any_string_dtype
  421. )
  422. tm.assert_frame_equal(result, expected)
  423. # extractall(one un-named group) returns DataFrame with one un-named column.
  424. result = s.str.extractall(r"([a-z])")
  425. expected = DataFrame(
  426. ["a", "b", "d", "c"], index=expected_index, dtype=any_string_dtype
  427. )
  428. tm.assert_frame_equal(result, expected)
  429. def test_extractall_single_group_with_quantifier(any_string_dtype):
  430. # GH#13382
  431. # extractall(one un-named group with quantifier) returns DataFrame with one un-named
  432. # column.
  433. s = Series(["ab3", "abc3", "d4cd2"], name="series_name", dtype=any_string_dtype)
  434. result = s.str.extractall(r"([a-z]+)")
  435. expected = DataFrame(
  436. ["ab", "abc", "d", "cd"],
  437. index=MultiIndex.from_tuples(
  438. [(0, 0), (1, 0), (2, 0), (2, 1)], names=(None, "match")
  439. ),
  440. dtype=any_string_dtype,
  441. )
  442. tm.assert_frame_equal(result, expected)
  443. @pytest.mark.parametrize(
  444. "data, names",
  445. [
  446. ([], (None,)),
  447. ([], ("i1",)),
  448. ([], (None, "i2")),
  449. ([], ("i1", "i2")),
  450. (["a3", "b3", "d4c2"], (None,)),
  451. (["a3", "b3", "d4c2"], ("i1", "i2")),
  452. (["a3", "b3", "d4c2"], (None, "i2")),
  453. (["a3", "b3", "d4c2"], ("i1", "i2")),
  454. ],
  455. )
  456. def test_extractall_no_matches(data, names, any_string_dtype):
  457. # GH19075 extractall with no matches should return a valid MultiIndex
  458. n = len(data)
  459. if len(names) == 1:
  460. index = Index(range(n), name=names[0])
  461. else:
  462. tuples = (tuple([i] * (n - 1)) for i in range(n))
  463. index = MultiIndex.from_tuples(tuples, names=names)
  464. s = Series(data, name="series_name", index=index, dtype=any_string_dtype)
  465. expected_index = MultiIndex.from_tuples([], names=(names + ("match",)))
  466. # one un-named group.
  467. result = s.str.extractall("(z)")
  468. expected = DataFrame(columns=[0], index=expected_index, dtype=any_string_dtype)
  469. tm.assert_frame_equal(result, expected)
  470. # two un-named groups.
  471. result = s.str.extractall("(z)(z)")
  472. expected = DataFrame(columns=[0, 1], index=expected_index, dtype=any_string_dtype)
  473. tm.assert_frame_equal(result, expected)
  474. # one named group.
  475. result = s.str.extractall("(?P<first>z)")
  476. expected = DataFrame(
  477. columns=["first"], index=expected_index, dtype=any_string_dtype
  478. )
  479. tm.assert_frame_equal(result, expected)
  480. # two named groups.
  481. result = s.str.extractall("(?P<first>z)(?P<second>z)")
  482. expected = DataFrame(
  483. columns=["first", "second"], index=expected_index, dtype=any_string_dtype
  484. )
  485. tm.assert_frame_equal(result, expected)
  486. # one named, one un-named.
  487. result = s.str.extractall("(z)(?P<second>z)")
  488. expected = DataFrame(
  489. columns=[0, "second"], index=expected_index, dtype=any_string_dtype
  490. )
  491. tm.assert_frame_equal(result, expected)
  492. def test_extractall_stringindex(any_string_dtype):
  493. s = Series(["a1a2", "b1", "c1"], name="xxx", dtype=any_string_dtype)
  494. result = s.str.extractall(r"[ab](?P<digit>\d)")
  495. expected = DataFrame(
  496. {"digit": ["1", "2", "1"]},
  497. index=MultiIndex.from_tuples([(0, 0), (0, 1), (1, 0)], names=[None, "match"]),
  498. dtype=any_string_dtype,
  499. )
  500. tm.assert_frame_equal(result, expected)
  501. # index should return the same result as the default index without name thus
  502. # index.name doesn't affect to the result
  503. if any_string_dtype == "object":
  504. for idx in [
  505. Index(["a1a2", "b1", "c1"]),
  506. Index(["a1a2", "b1", "c1"], name="xxx"),
  507. ]:
  508. result = idx.str.extractall(r"[ab](?P<digit>\d)")
  509. tm.assert_frame_equal(result, expected)
  510. s = Series(
  511. ["a1a2", "b1", "c1"],
  512. name="s_name",
  513. index=Index(["XX", "yy", "zz"], name="idx_name"),
  514. dtype=any_string_dtype,
  515. )
  516. result = s.str.extractall(r"[ab](?P<digit>\d)")
  517. expected = DataFrame(
  518. {"digit": ["1", "2", "1"]},
  519. index=MultiIndex.from_tuples(
  520. [("XX", 0), ("XX", 1), ("yy", 0)], names=["idx_name", "match"]
  521. ),
  522. dtype=any_string_dtype,
  523. )
  524. tm.assert_frame_equal(result, expected)
  525. def test_extractall_no_capture_groups_raises(any_string_dtype):
  526. # Does not make sense to use extractall with a regex that has no capture groups.
  527. # (it returns DataFrame with one column for each capture group)
  528. s = Series(["a3", "b3", "d4c2"], name="series_name", dtype=any_string_dtype)
  529. with pytest.raises(ValueError, match="no capture groups"):
  530. s.str.extractall(r"[a-z]")
  531. def test_extract_index_one_two_groups():
  532. s = Series(["a3", "b3", "d4c2"], index=["A3", "B3", "D4"], name="series_name")
  533. r = s.index.str.extract(r"([A-Z])", expand=True)
  534. e = DataFrame(["A", "B", "D"])
  535. tm.assert_frame_equal(r, e)
  536. # Prior to v0.18.0, index.str.extract(regex with one group)
  537. # returned Index. With more than one group, extract raised an
  538. # error (GH9980). Now extract always returns DataFrame.
  539. r = s.index.str.extract(r"(?P<letter>[A-Z])(?P<digit>[0-9])", expand=True)
  540. e_list = [("A", "3"), ("B", "3"), ("D", "4")]
  541. e = DataFrame(e_list, columns=["letter", "digit"])
  542. tm.assert_frame_equal(r, e)
  543. def test_extractall_same_as_extract(any_string_dtype):
  544. s = Series(["a3", "b3", "c2"], name="series_name", dtype=any_string_dtype)
  545. pattern_two_noname = r"([a-z])([0-9])"
  546. extract_two_noname = s.str.extract(pattern_two_noname, expand=True)
  547. has_multi_index = s.str.extractall(pattern_two_noname)
  548. no_multi_index = has_multi_index.xs(0, level="match")
  549. tm.assert_frame_equal(extract_two_noname, no_multi_index)
  550. pattern_two_named = r"(?P<letter>[a-z])(?P<digit>[0-9])"
  551. extract_two_named = s.str.extract(pattern_two_named, expand=True)
  552. has_multi_index = s.str.extractall(pattern_two_named)
  553. no_multi_index = has_multi_index.xs(0, level="match")
  554. tm.assert_frame_equal(extract_two_named, no_multi_index)
  555. pattern_one_named = r"(?P<group_name>[a-z])"
  556. extract_one_named = s.str.extract(pattern_one_named, expand=True)
  557. has_multi_index = s.str.extractall(pattern_one_named)
  558. no_multi_index = has_multi_index.xs(0, level="match")
  559. tm.assert_frame_equal(extract_one_named, no_multi_index)
  560. pattern_one_noname = r"([a-z])"
  561. extract_one_noname = s.str.extract(pattern_one_noname, expand=True)
  562. has_multi_index = s.str.extractall(pattern_one_noname)
  563. no_multi_index = has_multi_index.xs(0, level="match")
  564. tm.assert_frame_equal(extract_one_noname, no_multi_index)
  565. def test_extractall_same_as_extract_subject_index(any_string_dtype):
  566. # same as above tests, but s has an MultiIndex.
  567. mi = MultiIndex.from_tuples(
  568. [("A", "first"), ("B", "second"), ("C", "third")],
  569. names=("capital", "ordinal"),
  570. )
  571. s = Series(["a3", "b3", "c2"], index=mi, name="series_name", dtype=any_string_dtype)
  572. pattern_two_noname = r"([a-z])([0-9])"
  573. extract_two_noname = s.str.extract(pattern_two_noname, expand=True)
  574. has_match_index = s.str.extractall(pattern_two_noname)
  575. no_match_index = has_match_index.xs(0, level="match")
  576. tm.assert_frame_equal(extract_two_noname, no_match_index)
  577. pattern_two_named = r"(?P<letter>[a-z])(?P<digit>[0-9])"
  578. extract_two_named = s.str.extract(pattern_two_named, expand=True)
  579. has_match_index = s.str.extractall(pattern_two_named)
  580. no_match_index = has_match_index.xs(0, level="match")
  581. tm.assert_frame_equal(extract_two_named, no_match_index)
  582. pattern_one_named = r"(?P<group_name>[a-z])"
  583. extract_one_named = s.str.extract(pattern_one_named, expand=True)
  584. has_match_index = s.str.extractall(pattern_one_named)
  585. no_match_index = has_match_index.xs(0, level="match")
  586. tm.assert_frame_equal(extract_one_named, no_match_index)
  587. pattern_one_noname = r"([a-z])"
  588. extract_one_noname = s.str.extract(pattern_one_noname, expand=True)
  589. has_match_index = s.str.extractall(pattern_one_noname)
  590. no_match_index = has_match_index.xs(0, level="match")
  591. tm.assert_frame_equal(extract_one_noname, no_match_index)