# pandas/core/strings/accessor.py
# (NOTE: the original page header and a run of concatenated line numbers —
# extraction residue from the source viewer — were removed here.)
  1. from __future__ import annotations
  2. import codecs
  3. from functools import wraps
  4. import re
  5. from typing import (
  6. TYPE_CHECKING,
  7. Callable,
  8. Hashable,
  9. Literal,
  10. cast,
  11. )
  12. import warnings
  13. import numpy as np
  14. from pandas._libs import lib
  15. from pandas._typing import (
  16. AlignJoin,
  17. DtypeObj,
  18. F,
  19. Scalar,
  20. )
  21. from pandas.util._decorators import Appender
  22. from pandas.util._exceptions import find_stack_level
  23. from pandas.core.dtypes.common import (
  24. ensure_object,
  25. is_bool_dtype,
  26. is_categorical_dtype,
  27. is_integer,
  28. is_list_like,
  29. is_object_dtype,
  30. is_re,
  31. )
  32. from pandas.core.dtypes.generic import (
  33. ABCDataFrame,
  34. ABCIndex,
  35. ABCMultiIndex,
  36. ABCSeries,
  37. )
  38. from pandas.core.dtypes.missing import isna
  39. from pandas.core.arrays.arrow.dtype import ArrowDtype
  40. from pandas.core.base import NoNewAttributesMixin
  41. from pandas.core.construction import extract_array
  42. if TYPE_CHECKING:
  43. from pandas import (
  44. DataFrame,
  45. Index,
  46. Series,
  47. )
# Shared docstring fragments, interpolated into method docstrings below
# via the ``Appender`` decorator.
_shared_docs: dict[str, str] = {}

# Encodings for which CPython provides fast-path codec implementations;
# used by the accessor's encode/decode paths to pick the cheap route.
_cpython_optimized_encoders = (
    "utf-8",
    "utf8",
    "latin-1",
    "latin1",
    "iso-8859-1",
    "mbcs",
    "ascii",
)
# Decoding additionally has fast paths for the UTF-16/UTF-32 codecs.
_cpython_optimized_decoders = _cpython_optimized_encoders + ("utf-16", "utf-32")
  59. def forbid_nonstring_types(
  60. forbidden: list[str] | None, name: str | None = None
  61. ) -> Callable[[F], F]:
  62. """
  63. Decorator to forbid specific types for a method of StringMethods.
  64. For calling `.str.{method}` on a Series or Index, it is necessary to first
  65. initialize the :class:`StringMethods` object, and then call the method.
  66. However, different methods allow different input types, and so this can not
  67. be checked during :meth:`StringMethods.__init__`, but must be done on a
  68. per-method basis. This decorator exists to facilitate this process, and
  69. make it explicit which (inferred) types are disallowed by the method.
  70. :meth:`StringMethods.__init__` allows the *union* of types its different
  71. methods allow (after skipping NaNs; see :meth:`StringMethods._validate`),
  72. namely: ['string', 'empty', 'bytes', 'mixed', 'mixed-integer'].
  73. The default string types ['string', 'empty'] are allowed for all methods.
  74. For the additional types ['bytes', 'mixed', 'mixed-integer'], each method
  75. then needs to forbid the types it is not intended for.
  76. Parameters
  77. ----------
  78. forbidden : list-of-str or None
  79. List of forbidden non-string types, may be one or more of
  80. `['bytes', 'mixed', 'mixed-integer']`.
  81. name : str, default None
  82. Name of the method to use in the error message. By default, this is
  83. None, in which case the name from the method being wrapped will be
  84. copied. However, for working with further wrappers (like _pat_wrapper
  85. and _noarg_wrapper), it is necessary to specify the name.
  86. Returns
  87. -------
  88. func : wrapper
  89. The method to which the decorator is applied, with an added check that
  90. enforces the inferred type to not be in the list of forbidden types.
  91. Raises
  92. ------
  93. TypeError
  94. If the inferred type of the underlying data is in `forbidden`.
  95. """
  96. # deal with None
  97. forbidden = [] if forbidden is None else forbidden
  98. allowed_types = {"string", "empty", "bytes", "mixed", "mixed-integer"} - set(
  99. forbidden
  100. )
  101. def _forbid_nonstring_types(func: F) -> F:
  102. func_name = func.__name__ if name is None else name
  103. @wraps(func)
  104. def wrapper(self, *args, **kwargs):
  105. if self._inferred_dtype not in allowed_types:
  106. msg = (
  107. f"Cannot use .str.{func_name} with values of "
  108. f"inferred dtype '{self._inferred_dtype}'."
  109. )
  110. raise TypeError(msg)
  111. return func(self, *args, **kwargs)
  112. wrapper.__name__ = func_name
  113. return cast(F, wrapper)
  114. return _forbid_nonstring_types
  115. def _map_and_wrap(name, docstring):
  116. @forbid_nonstring_types(["bytes"], name=name)
  117. def wrapper(self):
  118. result = getattr(self._data.array, f"_str_{name}")()
  119. return self._wrap_result(result)
  120. wrapper.__doc__ = docstring
  121. return wrapper
  122. class StringMethods(NoNewAttributesMixin):
  123. """
  124. Vectorized string functions for Series and Index.
  125. NAs stay NA unless handled otherwise by a particular method.
  126. Patterned after Python's string methods, with some inspiration from
  127. R's stringr package.
  128. Examples
  129. --------
  130. >>> s = pd.Series(["A_Str_Series"])
  131. >>> s
  132. 0 A_Str_Series
  133. dtype: object
  134. >>> s.str.split("_")
  135. 0 [A, Str, Series]
  136. dtype: object
  137. >>> s.str.replace("_", "")
  138. 0 AStrSeries
  139. dtype: object
  140. """
  141. # Note: see the docstring in pandas.core.strings.__init__
  142. # for an explanation of the implementation.
  143. # TODO: Dispatch all the methods
  144. # Currently the following are not dispatched to the array
  145. # * cat
  146. # * extractall
  147. def __init__(self, data) -> None:
  148. from pandas.core.arrays.string_ import StringDtype
  149. self._inferred_dtype = self._validate(data)
  150. self._is_categorical = is_categorical_dtype(data.dtype)
  151. self._is_string = isinstance(data.dtype, StringDtype)
  152. self._data = data
  153. self._index = self._name = None
  154. if isinstance(data, ABCSeries):
  155. self._index = data.index
  156. self._name = data.name
  157. # ._values.categories works for both Series/Index
  158. self._parent = data._values.categories if self._is_categorical else data
  159. # save orig to blow up categoricals to the right type
  160. self._orig = data
  161. self._freeze()
  162. @staticmethod
  163. def _validate(data):
  164. """
  165. Auxiliary function for StringMethods, infers and checks dtype of data.
  166. This is a "first line of defence" at the creation of the StringMethods-
  167. object, and just checks that the dtype is in the
  168. *union* of the allowed types over all string methods below; this
  169. restriction is then refined on a per-method basis using the decorator
  170. @forbid_nonstring_types (more info in the corresponding docstring).
  171. This really should exclude all series/index with any non-string values,
  172. but that isn't practical for performance reasons until we have a str
  173. dtype (GH 9343 / 13877)
  174. Parameters
  175. ----------
  176. data : The content of the Series
  177. Returns
  178. -------
  179. dtype : inferred dtype of data
  180. """
  181. if isinstance(data, ABCMultiIndex):
  182. raise AttributeError(
  183. "Can only use .str accessor with Index, not MultiIndex"
  184. )
  185. # see _libs/lib.pyx for list of inferred types
  186. allowed_types = ["string", "empty", "bytes", "mixed", "mixed-integer"]
  187. data = extract_array(data)
  188. values = getattr(data, "categories", data) # categorical / normal
  189. inferred_dtype = lib.infer_dtype(values, skipna=True)
  190. if inferred_dtype not in allowed_types:
  191. raise AttributeError("Can only use .str accessor with string values!")
  192. return inferred_dtype
  193. def __getitem__(self, key):
  194. result = self._data.array._str_getitem(key)
  195. return self._wrap_result(result)
    def _wrap_result(
        self,
        result,
        name=None,
        expand: bool | None = None,
        fill_value=np.nan,
        returns_string: bool = True,
        returns_bool: bool = False,
    ):
        """
        Wrap the raw output of a dispatched ``_str_*`` method into a pandas
        object matching the type of ``self._orig`` (Series/Index/DataFrame).

        Parameters
        ----------
        result : scalar, array-like, Series, Index or DataFrame
            Raw output of the array's string method.
        name : Hashable or sequence of labels, optional
            Output name (``expand=False``) or column/level labels
            (``expand=True``).
        expand : bool or None, default None
            Expand list-like elements into columns (DataFrame) / levels
            (MultiIndex).  When None, inferred from ``result.ndim``.
        fill_value : scalar, default np.nan
            NOTE(review): not referenced anywhere in this body — presumably
            kept for callers/overrides; confirm before relying on it.
        returns_string : bool, default True
            When the accessor wraps a string dtype, keep that dtype for the
            wrapped output (unless the result is boolean).
        returns_bool : bool, default False
            NOTE(review): not referenced anywhere in this body.
        """
        from pandas import (
            Index,
            MultiIndex,
        )

        # Scalars and other non-array results pass through unchanged
        # (DataFrames still get metadata propagated via __finalize__).
        if not hasattr(result, "ndim") or not hasattr(result, "dtype"):
            if isinstance(result, ABCDataFrame):
                result = result.__finalize__(self._orig, name="str")
            return result
        assert result.ndim < 3

        # We can be wrapping a string / object / categorical result, in which
        # case we'll want to return the same dtype as the input.
        # Or we can be wrapping a numeric output, in which case we don't want
        # to return a StringArray.
        # Ideally the array method returns the right array type.

        if expand is None:
            # infer from ndim if expand is not specified
            expand = result.ndim != 1
        elif expand is True and not isinstance(self._orig, ABCIndex):
            # required when expand=True is explicitly specified
            # not needed when inferred
            if isinstance(result.dtype, ArrowDtype):
                # Arrow-backed list result: pad rows to equal length, then
                # split into one ArrowExtensionArray per output column.
                import pyarrow as pa

                from pandas.compat import pa_version_under11p0

                from pandas.core.arrays.arrow.array import ArrowExtensionArray

                value_lengths = result._data.combine_chunks().value_lengths()
                max_len = pa.compute.max(value_lengths).as_py()
                min_len = pa.compute.min(value_lengths).as_py()
                if result._hasna:
                    # ArrowExtensionArray.fillna doesn't work for list scalars
                    result = ArrowExtensionArray(
                        result._data.fill_null([None] * max_len)
                    )
                if min_len < max_len:
                    # append nulls to each scalar list element up to max_len
                    if not pa_version_under11p0:
                        result = ArrowExtensionArray(
                            pa.compute.list_slice(
                                result._data,
                                start=0,
                                stop=max_len,
                                return_fixed_size_list=True,
                            )
                        )
                    else:
                        # older pyarrow: pad each row manually via numpy
                        all_null = np.full(max_len, fill_value=None, dtype=object)
                        values = result.to_numpy()
                        new_values = []
                        for row in values:
                            if len(row) < max_len:
                                nulls = all_null[: max_len - len(row)]
                                row = np.append(row, nulls)
                            new_values.append(row)
                        pa_type = result._data.type
                        result = ArrowExtensionArray(pa.array(new_values, type=pa_type))
                if name is not None:
                    labels = name
                else:
                    labels = range(max_len)
                # transpose rows -> columns; dict of label -> column array
                result = {
                    label: ArrowExtensionArray(pa.array(res))
                    for label, res in zip(labels, (zip(*result.tolist())))
                }
            elif is_object_dtype(result):

                def cons_row(x):
                    # wrap scalars so every row is list-like before expanding
                    if is_list_like(x):
                        return x
                    else:
                        return [x]

                result = [cons_row(x) for x in result]
                if result and not self._is_string:
                    # propagate nan values to match longest sequence (GH 18450)
                    max_len = max(len(x) for x in result)
                    result = [
                        x * max_len if len(x) == 0 or x[0] is np.nan else x
                        for x in result
                    ]

        if not isinstance(expand, bool):
            raise ValueError("expand must be True or False")

        if expand is False:
            # if expand is False, result should have the same name
            # as the original otherwise specified
            if name is None:
                name = getattr(result, "name", None)
            if name is None:
                # do not use logical or, _orig may be a DataFrame
                # which has "name" column
                name = self._orig.name

        # Wait until we are sure result is a Series or Index before
        # checking attributes (GH 12180)
        if isinstance(self._orig, ABCIndex):
            # if result is a boolean np.array, return the np.array
            # instead of wrapping it into a boolean Index (GH 8875)
            if is_bool_dtype(result):
                return result

            if expand:
                result = list(result)
                out = MultiIndex.from_tuples(result, names=name)
                if out.nlevels == 1:
                    # We had all tuples of length-one, which are
                    # better represented as a regular Index.
                    out = out.get_level_values(0)
                return out
            else:
                return Index(result, name=name)
        else:
            index = self._orig.index
            # This is a mess.
            dtype: DtypeObj | str | None
            vdtype = getattr(result, "dtype", None)
            if self._is_string:
                if is_bool_dtype(vdtype):
                    # boolean results keep their own dtype
                    dtype = result.dtype
                elif returns_string:
                    dtype = self._orig.dtype
                else:
                    dtype = vdtype
            else:
                dtype = vdtype

            if expand:
                cons = self._orig._constructor_expanddim
                result = cons(result, columns=name, index=index, dtype=dtype)
            else:
                # Must be a Series
                cons = self._orig._constructor
                result = cons(result, name=name, index=index, dtype=dtype)
            result = result.__finalize__(self._orig, method="str")
            if name is not None and result.ndim == 1:
                # __finalize__ might copy over the original name, but we may
                # want the new name (e.g. str.extract).
                result.name = name
            return result
  336. def _get_series_list(self, others):
  337. """
  338. Auxiliary function for :meth:`str.cat`. Turn potentially mixed input
  339. into a list of Series (elements without an index must match the length
  340. of the calling Series/Index).
  341. Parameters
  342. ----------
  343. others : Series, DataFrame, np.ndarray, list-like or list-like of
  344. Objects that are either Series, Index or np.ndarray (1-dim).
  345. Returns
  346. -------
  347. list of Series
  348. Others transformed into list of Series.
  349. """
  350. from pandas import (
  351. DataFrame,
  352. Series,
  353. )
  354. # self._orig is either Series or Index
  355. idx = self._orig if isinstance(self._orig, ABCIndex) else self._orig.index
  356. # Generally speaking, all objects without an index inherit the index
  357. # `idx` of the calling Series/Index - i.e. must have matching length.
  358. # Objects with an index (i.e. Series/Index/DataFrame) keep their own.
  359. if isinstance(others, ABCSeries):
  360. return [others]
  361. elif isinstance(others, ABCIndex):
  362. return [Series(others, index=idx, dtype=others.dtype)]
  363. elif isinstance(others, ABCDataFrame):
  364. return [others[x] for x in others]
  365. elif isinstance(others, np.ndarray) and others.ndim == 2:
  366. others = DataFrame(others, index=idx)
  367. return [others[x] for x in others]
  368. elif is_list_like(others, allow_sets=False):
  369. others = list(others) # ensure iterators do not get read twice etc
  370. # in case of list-like `others`, all elements must be
  371. # either Series/Index/np.ndarray (1-dim)...
  372. if all(
  373. isinstance(x, (ABCSeries, ABCIndex))
  374. or (isinstance(x, np.ndarray) and x.ndim == 1)
  375. for x in others
  376. ):
  377. los: list[Series] = []
  378. while others: # iterate through list and append each element
  379. los = los + self._get_series_list(others.pop(0))
  380. return los
  381. # ... or just strings
  382. elif all(not is_list_like(x) for x in others):
  383. return [Series(others, index=idx)]
  384. raise TypeError(
  385. "others must be Series, Index, DataFrame, np.ndarray "
  386. "or list-like (either containing only strings or "
  387. "containing only objects of type Series/Index/"
  388. "np.ndarray[1-dim])"
  389. )
  390. @forbid_nonstring_types(["bytes", "mixed", "mixed-integer"])
  391. def cat(
  392. self,
  393. others=None,
  394. sep=None,
  395. na_rep=None,
  396. join: AlignJoin = "left",
  397. ) -> str | Series | Index:
  398. """
  399. Concatenate strings in the Series/Index with given separator.
  400. If `others` is specified, this function concatenates the Series/Index
  401. and elements of `others` element-wise.
  402. If `others` is not passed, then all values in the Series/Index are
  403. concatenated into a single string with a given `sep`.
  404. Parameters
  405. ----------
  406. others : Series, Index, DataFrame, np.ndarray or list-like
  407. Series, Index, DataFrame, np.ndarray (one- or two-dimensional) and
  408. other list-likes of strings must have the same length as the
  409. calling Series/Index, with the exception of indexed objects (i.e.
  410. Series/Index/DataFrame) if `join` is not None.
  411. If others is a list-like that contains a combination of Series,
  412. Index or np.ndarray (1-dim), then all elements will be unpacked and
  413. must satisfy the above criteria individually.
  414. If others is None, the method returns the concatenation of all
  415. strings in the calling Series/Index.
  416. sep : str, default ''
  417. The separator between the different elements/columns. By default
  418. the empty string `''` is used.
  419. na_rep : str or None, default None
  420. Representation that is inserted for all missing values:
  421. - If `na_rep` is None, and `others` is None, missing values in the
  422. Series/Index are omitted from the result.
  423. - If `na_rep` is None, and `others` is not None, a row containing a
  424. missing value in any of the columns (before concatenation) will
  425. have a missing value in the result.
  426. join : {'left', 'right', 'outer', 'inner'}, default 'left'
  427. Determines the join-style between the calling Series/Index and any
  428. Series/Index/DataFrame in `others` (objects without an index need
  429. to match the length of the calling Series/Index). To disable
  430. alignment, use `.values` on any Series/Index/DataFrame in `others`.
  431. Returns
  432. -------
  433. str, Series or Index
  434. If `others` is None, `str` is returned, otherwise a `Series/Index`
  435. (same type as caller) of objects is returned.
  436. See Also
  437. --------
  438. split : Split each string in the Series/Index.
  439. join : Join lists contained as elements in the Series/Index.
  440. Examples
  441. --------
  442. When not passing `others`, all values are concatenated into a single
  443. string:
  444. >>> s = pd.Series(['a', 'b', np.nan, 'd'])
  445. >>> s.str.cat(sep=' ')
  446. 'a b d'
  447. By default, NA values in the Series are ignored. Using `na_rep`, they
  448. can be given a representation:
  449. >>> s.str.cat(sep=' ', na_rep='?')
  450. 'a b ? d'
  451. If `others` is specified, corresponding values are concatenated with
  452. the separator. Result will be a Series of strings.
  453. >>> s.str.cat(['A', 'B', 'C', 'D'], sep=',')
  454. 0 a,A
  455. 1 b,B
  456. 2 NaN
  457. 3 d,D
  458. dtype: object
  459. Missing values will remain missing in the result, but can again be
  460. represented using `na_rep`
  461. >>> s.str.cat(['A', 'B', 'C', 'D'], sep=',', na_rep='-')
  462. 0 a,A
  463. 1 b,B
  464. 2 -,C
  465. 3 d,D
  466. dtype: object
  467. If `sep` is not specified, the values are concatenated without
  468. separation.
  469. >>> s.str.cat(['A', 'B', 'C', 'D'], na_rep='-')
  470. 0 aA
  471. 1 bB
  472. 2 -C
  473. 3 dD
  474. dtype: object
  475. Series with different indexes can be aligned before concatenation. The
  476. `join`-keyword works as in other methods.
  477. >>> t = pd.Series(['d', 'a', 'e', 'c'], index=[3, 0, 4, 2])
  478. >>> s.str.cat(t, join='left', na_rep='-')
  479. 0 aa
  480. 1 b-
  481. 2 -c
  482. 3 dd
  483. dtype: object
  484. >>>
  485. >>> s.str.cat(t, join='outer', na_rep='-')
  486. 0 aa
  487. 1 b-
  488. 2 -c
  489. 3 dd
  490. 4 -e
  491. dtype: object
  492. >>>
  493. >>> s.str.cat(t, join='inner', na_rep='-')
  494. 0 aa
  495. 2 -c
  496. 3 dd
  497. dtype: object
  498. >>>
  499. >>> s.str.cat(t, join='right', na_rep='-')
  500. 3 dd
  501. 0 aa
  502. 4 -e
  503. 2 -c
  504. dtype: object
  505. For more examples, see :ref:`here <text.concatenate>`.
  506. """
  507. # TODO: dispatch
  508. from pandas import (
  509. Index,
  510. Series,
  511. concat,
  512. )
  513. if isinstance(others, str):
  514. raise ValueError("Did you mean to supply a `sep` keyword?")
  515. if sep is None:
  516. sep = ""
  517. if isinstance(self._orig, ABCIndex):
  518. data = Series(self._orig, index=self._orig, dtype=self._orig.dtype)
  519. else: # Series
  520. data = self._orig
  521. # concatenate Series/Index with itself if no "others"
  522. if others is None:
  523. # error: Incompatible types in assignment (expression has type
  524. # "ndarray", variable has type "Series")
  525. data = ensure_object(data) # type: ignore[assignment]
  526. na_mask = isna(data)
  527. if na_rep is None and na_mask.any():
  528. return sep.join(data[~na_mask])
  529. elif na_rep is not None and na_mask.any():
  530. return sep.join(np.where(na_mask, na_rep, data))
  531. else:
  532. return sep.join(data)
  533. try:
  534. # turn anything in "others" into lists of Series
  535. others = self._get_series_list(others)
  536. except ValueError as err: # do not catch TypeError raised by _get_series_list
  537. raise ValueError(
  538. "If `others` contains arrays or lists (or other "
  539. "list-likes without an index), these must all be "
  540. "of the same length as the calling Series/Index."
  541. ) from err
  542. # align if required
  543. if any(not data.index.equals(x.index) for x in others):
  544. # Need to add keys for uniqueness in case of duplicate columns
  545. others = concat(
  546. others,
  547. axis=1,
  548. join=(join if join == "inner" else "outer"),
  549. keys=range(len(others)),
  550. sort=False,
  551. copy=False,
  552. )
  553. data, others = data.align(others, join=join)
  554. others = [others[x] for x in others] # again list of Series
  555. all_cols = [ensure_object(x) for x in [data] + others]
  556. na_masks = np.array([isna(x) for x in all_cols])
  557. union_mask = np.logical_or.reduce(na_masks, axis=0)
  558. if na_rep is None and union_mask.any():
  559. # no na_rep means NaNs for all rows where any column has a NaN
  560. # only necessary if there are actually any NaNs
  561. result = np.empty(len(data), dtype=object)
  562. np.putmask(result, union_mask, np.nan)
  563. not_masked = ~union_mask
  564. result[not_masked] = cat_safe([x[not_masked] for x in all_cols], sep)
  565. elif na_rep is not None and union_mask.any():
  566. # fill NaNs with na_rep in case there are actually any NaNs
  567. all_cols = [
  568. np.where(nm, na_rep, col) for nm, col in zip(na_masks, all_cols)
  569. ]
  570. result = cat_safe(all_cols, sep)
  571. else:
  572. # no NaNs - can just concatenate
  573. result = cat_safe(all_cols, sep)
  574. out: Index | Series
  575. if isinstance(self._orig, ABCIndex):
  576. # add dtype for case that result is all-NA
  577. out = Index(result, dtype=object, name=self._orig.name)
  578. else: # Series
  579. if is_categorical_dtype(self._orig.dtype):
  580. # We need to infer the new categories.
  581. dtype = None
  582. else:
  583. dtype = self._orig.dtype
  584. res_ser = Series(
  585. result, dtype=dtype, index=data.index, name=self._orig.name, copy=False
  586. )
  587. out = res_ser.__finalize__(self._orig, method="str_cat")
  588. return out
# Shared docstring template for str.split / str.rsplit.  The %(...)s fields
# ("side", "pat_regex", "pat_description", "regex_argument", "raises_split",
# "regex_pat_note", "method", "regex_examples") are substituted per method by
# the @Appender decorators on split/rsplit below.
_shared_docs[
    "str_split"
] = r"""
Split strings around given separator/delimiter.
Splits the string in the Series/Index from the %(side)s,
at the specified delimiter string.
Parameters
----------
pat : str%(pat_regex)s, optional
%(pat_description)s.
If not specified, split on whitespace.
n : int, default -1 (all)
Limit number of splits in output.
``None``, 0 and -1 will be interpreted as return all splits.
expand : bool, default False
Expand the split strings into separate columns.
- If ``True``, return DataFrame/MultiIndex expanding dimensionality.
- If ``False``, return Series/Index, containing lists of strings.
%(regex_argument)s
Returns
-------
Series, Index, DataFrame or MultiIndex
Type matches caller unless ``expand=True`` (see Notes).
%(raises_split)s
See Also
--------
Series.str.split : Split strings around given separator/delimiter.
Series.str.rsplit : Splits string around given separator/delimiter,
starting from the right.
Series.str.join : Join lists contained as elements in the Series/Index
with passed delimiter.
str.split : Standard library version for split.
str.rsplit : Standard library version for rsplit.
Notes
-----
The handling of the `n` keyword depends on the number of found splits:
- If found splits > `n`, make first `n` splits only
- If found splits <= `n`, make all splits
- If for a certain row the number of found splits < `n`,
append `None` for padding up to `n` if ``expand=True``
If using ``expand=True``, Series and Index callers return DataFrame and
MultiIndex objects, respectively.
%(regex_pat_note)s
Examples
--------
>>> s = pd.Series(
... [
... "this is a regular sentence",
... "https://docs.python.org/3/tutorial/index.html",
... np.nan
... ]
... )
>>> s
0 this is a regular sentence
1 https://docs.python.org/3/tutorial/index.html
2 NaN
dtype: object
In the default setting, the string is split by whitespace.
>>> s.str.split()
0 [this, is, a, regular, sentence]
1 [https://docs.python.org/3/tutorial/index.html]
2 NaN
dtype: object
Without the `n` parameter, the outputs of `rsplit` and `split`
are identical.
>>> s.str.rsplit()
0 [this, is, a, regular, sentence]
1 [https://docs.python.org/3/tutorial/index.html]
2 NaN
dtype: object
The `n` parameter can be used to limit the number of splits on the
delimiter. The outputs of `split` and `rsplit` are different.
>>> s.str.split(n=2)
0 [this, is, a regular sentence]
1 [https://docs.python.org/3/tutorial/index.html]
2 NaN
dtype: object
>>> s.str.rsplit(n=2)
0 [this is a, regular, sentence]
1 [https://docs.python.org/3/tutorial/index.html]
2 NaN
dtype: object
The `pat` parameter can be used to split by other characters.
>>> s.str.split(pat="/")
0 [this is a regular sentence]
1 [https:, , docs.python.org, 3, tutorial, index...
2 NaN
dtype: object
When using ``expand=True``, the split elements will expand out into
separate columns. If NaN is present, it is propagated throughout
the columns during the split.
>>> s.str.split(expand=True)
0 1 2 3 4
0 this is a regular sentence
1 https://docs.python.org/3/tutorial/index.html None None None None
2 NaN NaN NaN NaN NaN
For slightly more complex use cases like splitting the html document name
from a url, a combination of parameter settings can be used.
>>> s.str.rsplit("/", n=1, expand=True)
0 1
0 this is a regular sentence None
1 https://docs.python.org/3/tutorial index.html
2 NaN NaN
%(regex_examples)s"""
  693. @Appender(
  694. _shared_docs["str_split"]
  695. % {
  696. "side": "beginning",
  697. "pat_regex": " or compiled regex",
  698. "pat_description": "String or regular expression to split on",
  699. "regex_argument": """
  700. regex : bool, default None
  701. Determines if the passed-in pattern is a regular expression:
  702. - If ``True``, assumes the passed-in pattern is a regular expression
  703. - If ``False``, treats the pattern as a literal string.
  704. - If ``None`` and `pat` length is 1, treats `pat` as a literal string.
  705. - If ``None`` and `pat` length is not 1, treats `pat` as a regular expression.
  706. - Cannot be set to False if `pat` is a compiled regex
  707. .. versionadded:: 1.4.0
  708. """,
  709. "raises_split": """
  710. Raises
  711. ------
  712. ValueError
  713. * if `regex` is False and `pat` is a compiled regex
  714. """,
  715. "regex_pat_note": """
  716. Use of `regex =False` with a `pat` as a compiled regex will raise an error.
  717. """,
  718. "method": "split",
  719. "regex_examples": r"""
  720. Remember to escape special characters when explicitly using regular expressions.
  721. >>> s = pd.Series(["foo and bar plus baz"])
  722. >>> s.str.split(r"and|plus", expand=True)
  723. 0 1 2
  724. 0 foo bar baz
  725. Regular expressions can be used to handle urls or file names.
  726. When `pat` is a string and ``regex=None`` (the default), the given `pat` is compiled
  727. as a regex only if ``len(pat) != 1``.
  728. >>> s = pd.Series(['foojpgbar.jpg'])
  729. >>> s.str.split(r".", expand=True)
  730. 0 1
  731. 0 foojpgbar jpg
  732. >>> s.str.split(r"\.jpg", expand=True)
  733. 0 1
  734. 0 foojpgbar
  735. When ``regex=True``, `pat` is interpreted as a regex
  736. >>> s.str.split(r"\.jpg", regex=True, expand=True)
  737. 0 1
  738. 0 foojpgbar
  739. A compiled regex can be passed as `pat`
  740. >>> import re
  741. >>> s.str.split(re.compile(r"\.jpg"), expand=True)
  742. 0 1
  743. 0 foojpgbar
  744. When ``regex=False``, `pat` is interpreted as the string itself
  745. >>> s.str.split(r"\.jpg", regex=False, expand=True)
  746. 0
  747. 0 foojpgbar.jpg
  748. """,
  749. }
  750. )
  751. @forbid_nonstring_types(["bytes"])
  752. def split(
  753. self,
  754. pat: str | re.Pattern | None = None,
  755. *,
  756. n=-1,
  757. expand: bool = False,
  758. regex: bool | None = None,
  759. ):
  760. if regex is False and is_re(pat):
  761. raise ValueError(
  762. "Cannot use a compiled regex as replacement pattern with regex=False"
  763. )
  764. if is_re(pat):
  765. regex = True
  766. result = self._data.array._str_split(pat, n, expand, regex)
  767. return self._wrap_result(result, returns_string=expand, expand=expand)
  768. @Appender(
  769. _shared_docs["str_split"]
  770. % {
  771. "side": "end",
  772. "pat_regex": "",
  773. "pat_description": "String to split on",
  774. "regex_argument": "",
  775. "raises_split": "",
  776. "regex_pat_note": "",
  777. "method": "rsplit",
  778. "regex_examples": "",
  779. }
  780. )
  781. @forbid_nonstring_types(["bytes"])
  782. def rsplit(self, pat=None, *, n=-1, expand: bool = False):
  783. result = self._data.array._str_rsplit(pat, n=n)
  784. return self._wrap_result(result, expand=expand, returns_string=expand)
# Shared docstring template for str.partition / str.rpartition.  The %(...)s
# fields ("side", "return", "also") are substituted per method by the
# @Appender decorators on partition/rpartition below.
_shared_docs[
    "str_partition"
] = """
Split the string at the %(side)s occurrence of `sep`.
This method splits the string at the %(side)s occurrence of `sep`,
and returns 3 elements containing the part before the separator,
the separator itself, and the part after the separator.
If the separator is not found, return %(return)s.
Parameters
----------
sep : str, default whitespace
String to split on.
expand : bool, default True
If True, return DataFrame/MultiIndex expanding dimensionality.
If False, return Series/Index.
Returns
-------
DataFrame/MultiIndex or Series/Index of objects
See Also
--------
%(also)s
Series.str.split : Split strings around given separators.
str.partition : Standard library version.
Examples
--------
>>> s = pd.Series(['Linda van der Berg', 'George Pitt-Rivers'])
>>> s
0 Linda van der Berg
1 George Pitt-Rivers
dtype: object
>>> s.str.partition()
0 1 2
0 Linda van der Berg
1 George Pitt-Rivers
To partition by the last space instead of the first one:
>>> s.str.rpartition()
0 1 2
0 Linda van der Berg
1 George Pitt-Rivers
To partition by something different than a space:
>>> s.str.partition('-')
0 1 2
0 Linda van der Berg
1 George Pitt - Rivers
To return a Series containing tuples instead of a DataFrame:
>>> s.str.partition('-', expand=False)
0 (Linda van der Berg, , )
1 (George Pitt, -, Rivers)
dtype: object
Also available on indices:
>>> idx = pd.Index(['X 123', 'Y 999'])
>>> idx
Index(['X 123', 'Y 999'], dtype='object')
Which will create a MultiIndex:
>>> idx.str.partition()
MultiIndex([('X', ' ', '123'),
('Y', ' ', '999')],
)
Or an index with tuples with ``expand=False``:
>>> idx.str.partition(expand=False)
Index([('X', ' ', '123'), ('Y', ' ', '999')], dtype='object')
"""
  847. @Appender(
  848. _shared_docs["str_partition"]
  849. % {
  850. "side": "first",
  851. "return": "3 elements containing the string itself, followed by two "
  852. "empty strings",
  853. "also": "rpartition : Split the string at the last occurrence of `sep`.",
  854. }
  855. )
  856. @forbid_nonstring_types(["bytes"])
  857. def partition(self, sep: str = " ", expand: bool = True):
  858. result = self._data.array._str_partition(sep, expand)
  859. return self._wrap_result(result, expand=expand, returns_string=expand)
  860. @Appender(
  861. _shared_docs["str_partition"]
  862. % {
  863. "side": "last",
  864. "return": "3 elements containing two empty strings, followed by the "
  865. "string itself",
  866. "also": "partition : Split the string at the first occurrence of `sep`.",
  867. }
  868. )
  869. @forbid_nonstring_types(["bytes"])
  870. def rpartition(self, sep: str = " ", expand: bool = True):
  871. result = self._data.array._str_rpartition(sep, expand)
  872. return self._wrap_result(result, expand=expand, returns_string=expand)
  873. def get(self, i):
  874. """
  875. Extract element from each component at specified position or with specified key.
  876. Extract element from lists, tuples, dict, or strings in each element in the
  877. Series/Index.
  878. Parameters
  879. ----------
  880. i : int or hashable dict label
  881. Position or key of element to extract.
  882. Returns
  883. -------
  884. Series or Index
  885. Examples
  886. --------
  887. >>> s = pd.Series(["String",
  888. ... (1, 2, 3),
  889. ... ["a", "b", "c"],
  890. ... 123,
  891. ... -456,
  892. ... {1: "Hello", "2": "World"}])
  893. >>> s
  894. 0 String
  895. 1 (1, 2, 3)
  896. 2 [a, b, c]
  897. 3 123
  898. 4 -456
  899. 5 {1: 'Hello', '2': 'World'}
  900. dtype: object
  901. >>> s.str.get(1)
  902. 0 t
  903. 1 2
  904. 2 b
  905. 3 NaN
  906. 4 NaN
  907. 5 Hello
  908. dtype: object
  909. >>> s.str.get(-1)
  910. 0 g
  911. 1 3
  912. 2 c
  913. 3 NaN
  914. 4 NaN
  915. 5 None
  916. dtype: object
  917. Return element with given key
  918. >>> s = pd.Series([{"name": "Hello", "value": "World"},
  919. ... {"name": "Goodbye", "value": "Planet"}])
  920. >>> s.str.get('name')
  921. 0 Hello
  922. 1 Goodbye
  923. dtype: object
  924. """
  925. result = self._data.array._str_get(i)
  926. return self._wrap_result(result)
  927. @forbid_nonstring_types(["bytes"])
  928. def join(self, sep):
  929. """
  930. Join lists contained as elements in the Series/Index with passed delimiter.
  931. If the elements of a Series are lists themselves, join the content of these
  932. lists using the delimiter passed to the function.
  933. This function is an equivalent to :meth:`str.join`.
  934. Parameters
  935. ----------
  936. sep : str
  937. Delimiter to use between list entries.
  938. Returns
  939. -------
  940. Series/Index: object
  941. The list entries concatenated by intervening occurrences of the
  942. delimiter.
  943. Raises
  944. ------
  945. AttributeError
  946. If the supplied Series contains neither strings nor lists.
  947. See Also
  948. --------
  949. str.join : Standard library version of this method.
  950. Series.str.split : Split strings around given separator/delimiter.
  951. Notes
  952. -----
  953. If any of the list items is not a string object, the result of the join
  954. will be `NaN`.
  955. Examples
  956. --------
  957. Example with a list that contains non-string elements.
  958. >>> s = pd.Series([['lion', 'elephant', 'zebra'],
  959. ... [1.1, 2.2, 3.3],
  960. ... ['cat', np.nan, 'dog'],
  961. ... ['cow', 4.5, 'goat'],
  962. ... ['duck', ['swan', 'fish'], 'guppy']])
  963. >>> s
  964. 0 [lion, elephant, zebra]
  965. 1 [1.1, 2.2, 3.3]
  966. 2 [cat, nan, dog]
  967. 3 [cow, 4.5, goat]
  968. 4 [duck, [swan, fish], guppy]
  969. dtype: object
  970. Join all lists using a '-'. The lists containing object(s) of types other
  971. than str will produce a NaN.
  972. >>> s.str.join('-')
  973. 0 lion-elephant-zebra
  974. 1 NaN
  975. 2 NaN
  976. 3 NaN
  977. 4 NaN
  978. dtype: object
  979. """
  980. result = self._data.array._str_join(sep)
  981. return self._wrap_result(result)
  982. @forbid_nonstring_types(["bytes"])
  983. def contains(
  984. self, pat, case: bool = True, flags: int = 0, na=None, regex: bool = True
  985. ):
  986. r"""
  987. Test if pattern or regex is contained within a string of a Series or Index.
  988. Return boolean Series or Index based on whether a given pattern or regex is
  989. contained within a string of a Series or Index.
  990. Parameters
  991. ----------
  992. pat : str
  993. Character sequence or regular expression.
  994. case : bool, default True
  995. If True, case sensitive.
  996. flags : int, default 0 (no flags)
  997. Flags to pass through to the re module, e.g. re.IGNORECASE.
  998. na : scalar, optional
  999. Fill value for missing values. The default depends on dtype of the
  1000. array. For object-dtype, ``numpy.nan`` is used. For ``StringDtype``,
  1001. ``pandas.NA`` is used.
  1002. regex : bool, default True
  1003. If True, assumes the pat is a regular expression.
  1004. If False, treats the pat as a literal string.
  1005. Returns
  1006. -------
  1007. Series or Index of boolean values
  1008. A Series or Index of boolean values indicating whether the
  1009. given pattern is contained within the string of each element
  1010. of the Series or Index.
  1011. See Also
  1012. --------
  1013. match : Analogous, but stricter, relying on re.match instead of re.search.
  1014. Series.str.startswith : Test if the start of each string element matches a
  1015. pattern.
  1016. Series.str.endswith : Same as startswith, but tests the end of string.
  1017. Examples
  1018. --------
  1019. Returning a Series of booleans using only a literal pattern.
  1020. >>> s1 = pd.Series(['Mouse', 'dog', 'house and parrot', '23', np.NaN])
  1021. >>> s1.str.contains('og', regex=False)
  1022. 0 False
  1023. 1 True
  1024. 2 False
  1025. 3 False
  1026. 4 NaN
  1027. dtype: object
  1028. Returning an Index of booleans using only a literal pattern.
  1029. >>> ind = pd.Index(['Mouse', 'dog', 'house and parrot', '23.0', np.NaN])
  1030. >>> ind.str.contains('23', regex=False)
  1031. Index([False, False, False, True, nan], dtype='object')
  1032. Specifying case sensitivity using `case`.
  1033. >>> s1.str.contains('oG', case=True, regex=True)
  1034. 0 False
  1035. 1 False
  1036. 2 False
  1037. 3 False
  1038. 4 NaN
  1039. dtype: object
  1040. Specifying `na` to be `False` instead of `NaN` replaces NaN values
  1041. with `False`. If Series or Index does not contain NaN values
  1042. the resultant dtype will be `bool`, otherwise, an `object` dtype.
  1043. >>> s1.str.contains('og', na=False, regex=True)
  1044. 0 False
  1045. 1 True
  1046. 2 False
  1047. 3 False
  1048. 4 False
  1049. dtype: bool
  1050. Returning 'house' or 'dog' when either expression occurs in a string.
  1051. >>> s1.str.contains('house|dog', regex=True)
  1052. 0 False
  1053. 1 True
  1054. 2 True
  1055. 3 False
  1056. 4 NaN
  1057. dtype: object
  1058. Ignoring case sensitivity using `flags` with regex.
  1059. >>> import re
  1060. >>> s1.str.contains('PARROT', flags=re.IGNORECASE, regex=True)
  1061. 0 False
  1062. 1 False
  1063. 2 True
  1064. 3 False
  1065. 4 NaN
  1066. dtype: object
  1067. Returning any digit using regular expression.
  1068. >>> s1.str.contains('\\d', regex=True)
  1069. 0 False
  1070. 1 False
  1071. 2 False
  1072. 3 True
  1073. 4 NaN
  1074. dtype: object
  1075. Ensure `pat` is a not a literal pattern when `regex` is set to True.
  1076. Note in the following example one might expect only `s2[1]` and `s2[3]` to
  1077. return `True`. However, '.0' as a regex matches any character
  1078. followed by a 0.
  1079. >>> s2 = pd.Series(['40', '40.0', '41', '41.0', '35'])
  1080. >>> s2.str.contains('.0', regex=True)
  1081. 0 True
  1082. 1 True
  1083. 2 False
  1084. 3 True
  1085. 4 False
  1086. dtype: bool
  1087. """
  1088. if regex and re.compile(pat).groups:
  1089. warnings.warn(
  1090. "This pattern is interpreted as a regular expression, and has "
  1091. "match groups. To actually get the groups, use str.extract.",
  1092. UserWarning,
  1093. stacklevel=find_stack_level(),
  1094. )
  1095. result = self._data.array._str_contains(pat, case, flags, na, regex)
  1096. return self._wrap_result(result, fill_value=na, returns_string=False)
  1097. @forbid_nonstring_types(["bytes"])
  1098. def match(self, pat, case: bool = True, flags: int = 0, na=None):
  1099. """
  1100. Determine if each string starts with a match of a regular expression.
  1101. Parameters
  1102. ----------
  1103. pat : str
  1104. Character sequence or regular expression.
  1105. case : bool, default True
  1106. If True, case sensitive.
  1107. flags : int, default 0 (no flags)
  1108. Regex module flags, e.g. re.IGNORECASE.
  1109. na : scalar, optional
  1110. Fill value for missing values. The default depends on dtype of the
  1111. array. For object-dtype, ``numpy.nan`` is used. For ``StringDtype``,
  1112. ``pandas.NA`` is used.
  1113. Returns
  1114. -------
  1115. Series/Index/array of boolean values
  1116. See Also
  1117. --------
  1118. fullmatch : Stricter matching that requires the entire string to match.
  1119. contains : Analogous, but less strict, relying on re.search instead of
  1120. re.match.
  1121. extract : Extract matched groups.
  1122. """
  1123. result = self._data.array._str_match(pat, case=case, flags=flags, na=na)
  1124. return self._wrap_result(result, fill_value=na, returns_string=False)
  1125. @forbid_nonstring_types(["bytes"])
  1126. def fullmatch(self, pat, case: bool = True, flags: int = 0, na=None):
  1127. """
  1128. Determine if each string entirely matches a regular expression.
  1129. .. versionadded:: 1.1.0
  1130. Parameters
  1131. ----------
  1132. pat : str
  1133. Character sequence or regular expression.
  1134. case : bool, default True
  1135. If True, case sensitive.
  1136. flags : int, default 0 (no flags)
  1137. Regex module flags, e.g. re.IGNORECASE.
  1138. na : scalar, optional
  1139. Fill value for missing values. The default depends on dtype of the
  1140. array. For object-dtype, ``numpy.nan`` is used. For ``StringDtype``,
  1141. ``pandas.NA`` is used.
  1142. Returns
  1143. -------
  1144. Series/Index/array of boolean values
  1145. See Also
  1146. --------
  1147. match : Similar, but also returns `True` when only a *prefix* of the string
  1148. matches the regular expression.
  1149. extract : Extract matched groups.
  1150. """
  1151. result = self._data.array._str_fullmatch(pat, case=case, flags=flags, na=na)
  1152. return self._wrap_result(result, fill_value=na, returns_string=False)
  1153. @forbid_nonstring_types(["bytes"])
  1154. def replace(
  1155. self,
  1156. pat: str | re.Pattern,
  1157. repl: str | Callable,
  1158. n: int = -1,
  1159. case: bool | None = None,
  1160. flags: int = 0,
  1161. regex: bool = False,
  1162. ):
  1163. r"""
  1164. Replace each occurrence of pattern/regex in the Series/Index.
  1165. Equivalent to :meth:`str.replace` or :func:`re.sub`, depending on
  1166. the regex value.
  1167. Parameters
  1168. ----------
  1169. pat : str or compiled regex
  1170. String can be a character sequence or regular expression.
  1171. repl : str or callable
  1172. Replacement string or a callable. The callable is passed the regex
  1173. match object and must return a replacement string to be used.
  1174. See :func:`re.sub`.
  1175. n : int, default -1 (all)
  1176. Number of replacements to make from start.
  1177. case : bool, default None
  1178. Determines if replace is case sensitive:
  1179. - If True, case sensitive (the default if `pat` is a string)
  1180. - Set to False for case insensitive
  1181. - Cannot be set if `pat` is a compiled regex.
  1182. flags : int, default 0 (no flags)
  1183. Regex module flags, e.g. re.IGNORECASE. Cannot be set if `pat` is a compiled
  1184. regex.
  1185. regex : bool, default False
  1186. Determines if the passed-in pattern is a regular expression:
  1187. - If True, assumes the passed-in pattern is a regular expression.
  1188. - If False, treats the pattern as a literal string
  1189. - Cannot be set to False if `pat` is a compiled regex or `repl` is
  1190. a callable.
  1191. Returns
  1192. -------
  1193. Series or Index of object
  1194. A copy of the object with all matching occurrences of `pat` replaced by
  1195. `repl`.
  1196. Raises
  1197. ------
  1198. ValueError
  1199. * if `regex` is False and `repl` is a callable or `pat` is a compiled
  1200. regex
  1201. * if `pat` is a compiled regex and `case` or `flags` is set
  1202. Notes
  1203. -----
  1204. When `pat` is a compiled regex, all flags should be included in the
  1205. compiled regex. Use of `case`, `flags`, or `regex=False` with a compiled
  1206. regex will raise an error.
  1207. Examples
  1208. --------
  1209. When `pat` is a string and `regex` is True (the default), the given `pat`
  1210. is compiled as a regex. When `repl` is a string, it replaces matching
  1211. regex patterns as with :meth:`re.sub`. NaN value(s) in the Series are
  1212. left as is:
  1213. >>> pd.Series(['foo', 'fuz', np.nan]).str.replace('f.', 'ba', regex=True)
  1214. 0 bao
  1215. 1 baz
  1216. 2 NaN
  1217. dtype: object
  1218. When `pat` is a string and `regex` is False, every `pat` is replaced with
  1219. `repl` as with :meth:`str.replace`:
  1220. >>> pd.Series(['f.o', 'fuz', np.nan]).str.replace('f.', 'ba', regex=False)
  1221. 0 bao
  1222. 1 fuz
  1223. 2 NaN
  1224. dtype: object
  1225. When `repl` is a callable, it is called on every `pat` using
  1226. :func:`re.sub`. The callable should expect one positional argument
  1227. (a regex object) and return a string.
  1228. To get the idea:
  1229. >>> pd.Series(['foo', 'fuz', np.nan]).str.replace('f', repr, regex=True)
  1230. 0 <re.Match object; span=(0, 1), match='f'>oo
  1231. 1 <re.Match object; span=(0, 1), match='f'>uz
  1232. 2 NaN
  1233. dtype: object
  1234. Reverse every lowercase alphabetic word:
  1235. >>> repl = lambda m: m.group(0)[::-1]
  1236. >>> ser = pd.Series(['foo 123', 'bar baz', np.nan])
  1237. >>> ser.str.replace(r'[a-z]+', repl, regex=True)
  1238. 0 oof 123
  1239. 1 rab zab
  1240. 2 NaN
  1241. dtype: object
  1242. Using regex groups (extract second group and swap case):
  1243. >>> pat = r"(?P<one>\w+) (?P<two>\w+) (?P<three>\w+)"
  1244. >>> repl = lambda m: m.group('two').swapcase()
  1245. >>> ser = pd.Series(['One Two Three', 'Foo Bar Baz'])
  1246. >>> ser.str.replace(pat, repl, regex=True)
  1247. 0 tWO
  1248. 1 bAR
  1249. dtype: object
  1250. Using a compiled regex with flags
  1251. >>> import re
  1252. >>> regex_pat = re.compile(r'FUZ', flags=re.IGNORECASE)
  1253. >>> pd.Series(['foo', 'fuz', np.nan]).str.replace(regex_pat, 'bar', regex=True)
  1254. 0 foo
  1255. 1 bar
  1256. 2 NaN
  1257. dtype: object
  1258. """
  1259. # Check whether repl is valid (GH 13438, GH 15055)
  1260. if not (isinstance(repl, str) or callable(repl)):
  1261. raise TypeError("repl must be a string or callable")
  1262. is_compiled_re = is_re(pat)
  1263. if regex or regex is None:
  1264. if is_compiled_re and (case is not None or flags != 0):
  1265. raise ValueError(
  1266. "case and flags cannot be set when pat is a compiled regex"
  1267. )
  1268. elif is_compiled_re:
  1269. raise ValueError(
  1270. "Cannot use a compiled regex as replacement pattern with regex=False"
  1271. )
  1272. elif callable(repl):
  1273. raise ValueError("Cannot use a callable replacement when regex=False")
  1274. if case is None:
  1275. case = True
  1276. result = self._data.array._str_replace(
  1277. pat, repl, n=n, case=case, flags=flags, regex=regex
  1278. )
  1279. return self._wrap_result(result)
  1280. @forbid_nonstring_types(["bytes"])
  1281. def repeat(self, repeats):
  1282. """
  1283. Duplicate each string in the Series or Index.
  1284. Parameters
  1285. ----------
  1286. repeats : int or sequence of int
  1287. Same value for all (int) or different value per (sequence).
  1288. Returns
  1289. -------
  1290. Series or pandas.Index
  1291. Series or Index of repeated string objects specified by
  1292. input parameter repeats.
  1293. Examples
  1294. --------
  1295. >>> s = pd.Series(['a', 'b', 'c'])
  1296. >>> s
  1297. 0 a
  1298. 1 b
  1299. 2 c
  1300. dtype: object
  1301. Single int repeats string in Series
  1302. >>> s.str.repeat(repeats=2)
  1303. 0 aa
  1304. 1 bb
  1305. 2 cc
  1306. dtype: object
  1307. Sequence of int repeats corresponding string in Series
  1308. >>> s.str.repeat(repeats=[1, 2, 3])
  1309. 0 a
  1310. 1 bb
  1311. 2 ccc
  1312. dtype: object
  1313. """
  1314. result = self._data.array._str_repeat(repeats)
  1315. return self._wrap_result(result)
  1316. @forbid_nonstring_types(["bytes"])
  1317. def pad(
  1318. self,
  1319. width,
  1320. side: Literal["left", "right", "both"] = "left",
  1321. fillchar: str = " ",
  1322. ):
  1323. """
  1324. Pad strings in the Series/Index up to width.
  1325. Parameters
  1326. ----------
  1327. width : int
  1328. Minimum width of resulting string; additional characters will be filled
  1329. with character defined in `fillchar`.
  1330. side : {'left', 'right', 'both'}, default 'left'
  1331. Side from which to fill resulting string.
  1332. fillchar : str, default ' '
  1333. Additional character for filling, default is whitespace.
  1334. Returns
  1335. -------
  1336. Series or Index of object
  1337. Returns Series or Index with minimum number of char in object.
  1338. See Also
  1339. --------
  1340. Series.str.rjust : Fills the left side of strings with an arbitrary
  1341. character. Equivalent to ``Series.str.pad(side='left')``.
  1342. Series.str.ljust : Fills the right side of strings with an arbitrary
  1343. character. Equivalent to ``Series.str.pad(side='right')``.
  1344. Series.str.center : Fills both sides of strings with an arbitrary
  1345. character. Equivalent to ``Series.str.pad(side='both')``.
  1346. Series.str.zfill : Pad strings in the Series/Index by prepending '0'
  1347. character. Equivalent to ``Series.str.pad(side='left', fillchar='0')``.
  1348. Examples
  1349. --------
  1350. >>> s = pd.Series(["caribou", "tiger"])
  1351. >>> s
  1352. 0 caribou
  1353. 1 tiger
  1354. dtype: object
  1355. >>> s.str.pad(width=10)
  1356. 0 caribou
  1357. 1 tiger
  1358. dtype: object
  1359. >>> s.str.pad(width=10, side='right', fillchar='-')
  1360. 0 caribou---
  1361. 1 tiger-----
  1362. dtype: object
  1363. >>> s.str.pad(width=10, side='both', fillchar='-')
  1364. 0 -caribou--
  1365. 1 --tiger---
  1366. dtype: object
  1367. """
  1368. if not isinstance(fillchar, str):
  1369. msg = f"fillchar must be a character, not {type(fillchar).__name__}"
  1370. raise TypeError(msg)
  1371. if len(fillchar) != 1:
  1372. raise TypeError("fillchar must be a character, not str")
  1373. if not is_integer(width):
  1374. msg = f"width must be of integer type, not {type(width).__name__}"
  1375. raise TypeError(msg)
  1376. result = self._data.array._str_pad(width, side=side, fillchar=fillchar)
  1377. return self._wrap_result(result)
# Shared docstring template for str.center / str.ljust / str.rjust.  The
# %(side)s and %(method)s fields are substituted by the @Appender decorators
# on those methods below.
_shared_docs[
    "str_pad"
] = """
Pad %(side)s side of strings in the Series/Index.
Equivalent to :meth:`str.%(method)s`.
Parameters
----------
width : int
Minimum width of resulting string; additional characters will be filled
with ``fillchar``.
fillchar : str
Additional character for filling, default is whitespace.
Returns
-------
Series/Index of objects.
"""
  1394. @Appender(_shared_docs["str_pad"] % {"side": "left and right", "method": "center"})
  1395. @forbid_nonstring_types(["bytes"])
  1396. def center(self, width, fillchar: str = " "):
  1397. return self.pad(width, side="both", fillchar=fillchar)
  1398. @Appender(_shared_docs["str_pad"] % {"side": "right", "method": "ljust"})
  1399. @forbid_nonstring_types(["bytes"])
  1400. def ljust(self, width, fillchar: str = " "):
  1401. return self.pad(width, side="right", fillchar=fillchar)
  1402. @Appender(_shared_docs["str_pad"] % {"side": "left", "method": "rjust"})
  1403. @forbid_nonstring_types(["bytes"])
  1404. def rjust(self, width, fillchar: str = " "):
  1405. return self.pad(width, side="left", fillchar=fillchar)
  1406. @forbid_nonstring_types(["bytes"])
  1407. def zfill(self, width):
  1408. """
  1409. Pad strings in the Series/Index by prepending '0' characters.
  1410. Strings in the Series/Index are padded with '0' characters on the
  1411. left of the string to reach a total string length `width`. Strings
  1412. in the Series/Index with length greater or equal to `width` are
  1413. unchanged.
  1414. Parameters
  1415. ----------
  1416. width : int
  1417. Minimum length of resulting string; strings with length less
  1418. than `width` be prepended with '0' characters.
  1419. Returns
  1420. -------
  1421. Series/Index of objects.
  1422. See Also
  1423. --------
  1424. Series.str.rjust : Fills the left side of strings with an arbitrary
  1425. character.
  1426. Series.str.ljust : Fills the right side of strings with an arbitrary
  1427. character.
  1428. Series.str.pad : Fills the specified sides of strings with an arbitrary
  1429. character.
  1430. Series.str.center : Fills both sides of strings with an arbitrary
  1431. character.
  1432. Notes
  1433. -----
  1434. Differs from :meth:`str.zfill` which has special handling
  1435. for '+'/'-' in the string.
  1436. Examples
  1437. --------
  1438. >>> s = pd.Series(['-1', '1', '1000', 10, np.nan])
  1439. >>> s
  1440. 0 -1
  1441. 1 1
  1442. 2 1000
  1443. 3 10
  1444. 4 NaN
  1445. dtype: object
  1446. Note that ``10`` and ``NaN`` are not strings, therefore they are
  1447. converted to ``NaN``. The minus sign in ``'-1'`` is treated as a
  1448. special character and the zero is added to the right of it
  1449. (:meth:`str.zfill` would have moved it to the left). ``1000``
  1450. remains unchanged as it is longer than `width`.
  1451. >>> s.str.zfill(3)
  1452. 0 -01
  1453. 1 001
  1454. 2 1000
  1455. 3 NaN
  1456. 4 NaN
  1457. dtype: object
  1458. """
  1459. if not is_integer(width):
  1460. msg = f"width must be of integer type, not {type(width).__name__}"
  1461. raise TypeError(msg)
  1462. f = lambda x: x.zfill(width)
  1463. result = self._data.array._str_map(f)
  1464. return self._wrap_result(result)
  1465. def slice(self, start=None, stop=None, step=None):
  1466. """
  1467. Slice substrings from each element in the Series or Index.
  1468. Parameters
  1469. ----------
  1470. start : int, optional
  1471. Start position for slice operation.
  1472. stop : int, optional
  1473. Stop position for slice operation.
  1474. step : int, optional
  1475. Step size for slice operation.
  1476. Returns
  1477. -------
  1478. Series or Index of object
  1479. Series or Index from sliced substring from original string object.
  1480. See Also
  1481. --------
  1482. Series.str.slice_replace : Replace a slice with a string.
  1483. Series.str.get : Return element at position.
  1484. Equivalent to `Series.str.slice(start=i, stop=i+1)` with `i`
  1485. being the position.
  1486. Examples
  1487. --------
  1488. >>> s = pd.Series(["koala", "dog", "chameleon"])
  1489. >>> s
  1490. 0 koala
  1491. 1 dog
  1492. 2 chameleon
  1493. dtype: object
  1494. >>> s.str.slice(start=1)
  1495. 0 oala
  1496. 1 og
  1497. 2 hameleon
  1498. dtype: object
  1499. >>> s.str.slice(start=-1)
  1500. 0 a
  1501. 1 g
  1502. 2 n
  1503. dtype: object
  1504. >>> s.str.slice(stop=2)
  1505. 0 ko
  1506. 1 do
  1507. 2 ch
  1508. dtype: object
  1509. >>> s.str.slice(step=2)
  1510. 0 kaa
  1511. 1 dg
  1512. 2 caeen
  1513. dtype: object
  1514. >>> s.str.slice(start=0, stop=5, step=3)
  1515. 0 kl
  1516. 1 d
  1517. 2 cm
  1518. dtype: object
  1519. Equivalent behaviour to:
  1520. >>> s.str[0:5:3]
  1521. 0 kl
  1522. 1 d
  1523. 2 cm
  1524. dtype: object
  1525. """
  1526. result = self._data.array._str_slice(start, stop, step)
  1527. return self._wrap_result(result)
  1528. @forbid_nonstring_types(["bytes"])
  1529. def slice_replace(self, start=None, stop=None, repl=None):
  1530. """
  1531. Replace a positional slice of a string with another value.
  1532. Parameters
  1533. ----------
  1534. start : int, optional
  1535. Left index position to use for the slice. If not specified (None),
  1536. the slice is unbounded on the left, i.e. slice from the start
  1537. of the string.
  1538. stop : int, optional
  1539. Right index position to use for the slice. If not specified (None),
  1540. the slice is unbounded on the right, i.e. slice until the
  1541. end of the string.
  1542. repl : str, optional
  1543. String for replacement. If not specified (None), the sliced region
  1544. is replaced with an empty string.
  1545. Returns
  1546. -------
  1547. Series or Index
  1548. Same type as the original object.
  1549. See Also
  1550. --------
  1551. Series.str.slice : Just slicing without replacement.
  1552. Examples
  1553. --------
  1554. >>> s = pd.Series(['a', 'ab', 'abc', 'abdc', 'abcde'])
  1555. >>> s
  1556. 0 a
  1557. 1 ab
  1558. 2 abc
  1559. 3 abdc
  1560. 4 abcde
  1561. dtype: object
  1562. Specify just `start`, meaning replace `start` until the end of the
  1563. string with `repl`.
  1564. >>> s.str.slice_replace(1, repl='X')
  1565. 0 aX
  1566. 1 aX
  1567. 2 aX
  1568. 3 aX
  1569. 4 aX
  1570. dtype: object
  1571. Specify just `stop`, meaning the start of the string to `stop` is replaced
  1572. with `repl`, and the rest of the string is included.
  1573. >>> s.str.slice_replace(stop=2, repl='X')
  1574. 0 X
  1575. 1 X
  1576. 2 Xc
  1577. 3 Xdc
  1578. 4 Xcde
  1579. dtype: object
  1580. Specify `start` and `stop`, meaning the slice from `start` to `stop` is
  1581. replaced with `repl`. Everything before or after `start` and `stop` is
  1582. included as is.
  1583. >>> s.str.slice_replace(start=1, stop=3, repl='X')
  1584. 0 aX
  1585. 1 aX
  1586. 2 aX
  1587. 3 aXc
  1588. 4 aXde
  1589. dtype: object
  1590. """
  1591. result = self._data.array._str_slice_replace(start, stop, repl)
  1592. return self._wrap_result(result)
  1593. def decode(self, encoding, errors: str = "strict"):
  1594. """
  1595. Decode character string in the Series/Index using indicated encoding.
  1596. Equivalent to :meth:`str.decode` in python2 and :meth:`bytes.decode` in
  1597. python3.
  1598. Parameters
  1599. ----------
  1600. encoding : str
  1601. errors : str, optional
  1602. Returns
  1603. -------
  1604. Series or Index
  1605. """
  1606. # TODO: Add a similar _bytes interface.
  1607. if encoding in _cpython_optimized_decoders:
  1608. # CPython optimized implementation
  1609. f = lambda x: x.decode(encoding, errors)
  1610. else:
  1611. decoder = codecs.getdecoder(encoding)
  1612. f = lambda x: decoder(x, errors)[0]
  1613. arr = self._data.array
  1614. # assert isinstance(arr, (StringArray,))
  1615. result = arr._str_map(f)
  1616. return self._wrap_result(result)
  1617. @forbid_nonstring_types(["bytes"])
  1618. def encode(self, encoding, errors: str = "strict"):
  1619. """
  1620. Encode character string in the Series/Index using indicated encoding.
  1621. Equivalent to :meth:`str.encode`.
  1622. Parameters
  1623. ----------
  1624. encoding : str
  1625. errors : str, optional
  1626. Returns
  1627. -------
  1628. Series/Index of objects
  1629. """
  1630. result = self._data.array._str_encode(encoding, errors)
  1631. return self._wrap_result(result, returns_string=False)
# Shared docstring template for strip/lstrip/rstrip; the %(position)s,
# %(side)s and %(method)s placeholders are filled per-method and the result
# is attached to each wrapper via @Appender. Raw string: examples contain \n.
_shared_docs[
    "str_strip"
] = r"""
    Remove %(position)s characters.

    Strip whitespaces (including newlines) or a set of specified characters
    from each string in the Series/Index from %(side)s.
    Replaces any non-strings in Series with NaNs.
    Equivalent to :meth:`str.%(method)s`.

    Parameters
    ----------
    to_strip : str or None, default None
        Specifying the set of characters to be removed.
        All combinations of this set of characters will be stripped.
        If None then whitespaces are removed.

    Returns
    -------
    Series or Index of object

    See Also
    --------
    Series.str.strip : Remove leading and trailing characters in Series/Index.
    Series.str.lstrip : Remove leading characters in Series/Index.
    Series.str.rstrip : Remove trailing characters in Series/Index.

    Examples
    --------
    >>> s = pd.Series(['1. Ant.  ', '2. Bee!\n', '3. Cat?\t', np.nan, 10, True])
    >>> s
    0    1. Ant.
    1    2. Bee!\n
    2    3. Cat?\t
    3          NaN
    4           10
    5         True
    dtype: object

    >>> s.str.strip()
    0    1. Ant.
    1    2. Bee!
    2    3. Cat?
    3        NaN
    4        NaN
    5        NaN
    dtype: object

    >>> s.str.lstrip('123.')
    0    Ant.
    1    Bee!\n
    2    Cat?\t
    3       NaN
    4       NaN
    5       NaN
    dtype: object

    >>> s.str.rstrip('.!? \n\t')
    0    1. Ant
    1    2. Bee
    2    3. Cat
    3       NaN
    4       NaN
    5       NaN
    dtype: object

    >>> s.str.strip('123.!? \n\t')
    0    Ant
    1    Bee
    2    Cat
    3    NaN
    4    NaN
    5    NaN
    dtype: object
    """
  1698. @Appender(
  1699. _shared_docs["str_strip"]
  1700. % {
  1701. "side": "left and right sides",
  1702. "method": "strip",
  1703. "position": "leading and trailing",
  1704. }
  1705. )
  1706. @forbid_nonstring_types(["bytes"])
  1707. def strip(self, to_strip=None):
  1708. result = self._data.array._str_strip(to_strip)
  1709. return self._wrap_result(result)
  1710. @Appender(
  1711. _shared_docs["str_strip"]
  1712. % {"side": "left side", "method": "lstrip", "position": "leading"}
  1713. )
  1714. @forbid_nonstring_types(["bytes"])
  1715. def lstrip(self, to_strip=None):
  1716. result = self._data.array._str_lstrip(to_strip)
  1717. return self._wrap_result(result)
  1718. @Appender(
  1719. _shared_docs["str_strip"]
  1720. % {"side": "right side", "method": "rstrip", "position": "trailing"}
  1721. )
  1722. @forbid_nonstring_types(["bytes"])
  1723. def rstrip(self, to_strip=None):
  1724. result = self._data.array._str_rstrip(to_strip)
  1725. return self._wrap_result(result)
# Shared docstring template for removeprefix/removesuffix; %(side)s and
# %(other_side)s are filled per-method and attached via @Appender.
_shared_docs[
    "str_removefix"
] = r"""
    Remove a %(side)s from an object series.

    If the %(side)s is not present, the original string will be returned.

    Parameters
    ----------
    %(side)s : str
        Remove the %(side)s of the string.

    Returns
    -------
    Series/Index: object
        The Series or Index with given %(side)s removed.

    See Also
    --------
    Series.str.remove%(other_side)s : Remove a %(other_side)s from an object series.

    Examples
    --------
    >>> s = pd.Series(["str_foo", "str_bar", "no_prefix"])
    >>> s
    0    str_foo
    1    str_bar
    2    no_prefix
    dtype: object
    >>> s.str.removeprefix("str_")
    0    foo
    1    bar
    2    no_prefix
    dtype: object

    >>> s = pd.Series(["foo_str", "bar_str", "no_suffix"])
    >>> s
    0    foo_str
    1    bar_str
    2    no_suffix
    dtype: object
    >>> s.str.removesuffix("_str")
    0    foo
    1    bar
    2    no_suffix
    dtype: object
    """
  1767. @Appender(
  1768. _shared_docs["str_removefix"] % {"side": "prefix", "other_side": "suffix"}
  1769. )
  1770. @forbid_nonstring_types(["bytes"])
  1771. def removeprefix(self, prefix):
  1772. result = self._data.array._str_removeprefix(prefix)
  1773. return self._wrap_result(result)
  1774. @Appender(
  1775. _shared_docs["str_removefix"] % {"side": "suffix", "other_side": "prefix"}
  1776. )
  1777. @forbid_nonstring_types(["bytes"])
  1778. def removesuffix(self, suffix):
  1779. result = self._data.array._str_removesuffix(suffix)
  1780. return self._wrap_result(result)
  1781. @forbid_nonstring_types(["bytes"])
  1782. def wrap(self, width, **kwargs):
  1783. r"""
  1784. Wrap strings in Series/Index at specified line width.
  1785. This method has the same keyword parameters and defaults as
  1786. :class:`textwrap.TextWrapper`.
  1787. Parameters
  1788. ----------
  1789. width : int
  1790. Maximum line width.
  1791. expand_tabs : bool, optional
  1792. If True, tab characters will be expanded to spaces (default: True).
  1793. replace_whitespace : bool, optional
  1794. If True, each whitespace character (as defined by string.whitespace)
  1795. remaining after tab expansion will be replaced by a single space
  1796. (default: True).
  1797. drop_whitespace : bool, optional
  1798. If True, whitespace that, after wrapping, happens to end up at the
  1799. beginning or end of a line is dropped (default: True).
  1800. break_long_words : bool, optional
  1801. If True, then words longer than width will be broken in order to ensure
  1802. that no lines are longer than width. If it is false, long words will
  1803. not be broken, and some lines may be longer than width (default: True).
  1804. break_on_hyphens : bool, optional
  1805. If True, wrapping will occur preferably on whitespace and right after
  1806. hyphens in compound words, as it is customary in English. If false,
  1807. only whitespaces will be considered as potentially good places for line
  1808. breaks, but you need to set break_long_words to false if you want truly
  1809. insecable words (default: True).
  1810. Returns
  1811. -------
  1812. Series or Index
  1813. Notes
  1814. -----
  1815. Internally, this method uses a :class:`textwrap.TextWrapper` instance with
  1816. default settings. To achieve behavior matching R's stringr library str_wrap
  1817. function, use the arguments:
  1818. - expand_tabs = False
  1819. - replace_whitespace = True
  1820. - drop_whitespace = True
  1821. - break_long_words = False
  1822. - break_on_hyphens = False
  1823. Examples
  1824. --------
  1825. >>> s = pd.Series(['line to be wrapped', 'another line to be wrapped'])
  1826. >>> s.str.wrap(12)
  1827. 0 line to be\nwrapped
  1828. 1 another line\nto be\nwrapped
  1829. dtype: object
  1830. """
  1831. result = self._data.array._str_wrap(width, **kwargs)
  1832. return self._wrap_result(result)
  1833. @forbid_nonstring_types(["bytes"])
  1834. def get_dummies(self, sep: str = "|"):
  1835. """
  1836. Return DataFrame of dummy/indicator variables for Series.
  1837. Each string in Series is split by sep and returned as a DataFrame
  1838. of dummy/indicator variables.
  1839. Parameters
  1840. ----------
  1841. sep : str, default "|"
  1842. String to split on.
  1843. Returns
  1844. -------
  1845. DataFrame
  1846. Dummy variables corresponding to values of the Series.
  1847. See Also
  1848. --------
  1849. get_dummies : Convert categorical variable into dummy/indicator
  1850. variables.
  1851. Examples
  1852. --------
  1853. >>> pd.Series(['a|b', 'a', 'a|c']).str.get_dummies()
  1854. a b c
  1855. 0 1 1 0
  1856. 1 1 0 0
  1857. 2 1 0 1
  1858. >>> pd.Series(['a|b', np.nan, 'a|c']).str.get_dummies()
  1859. a b c
  1860. 0 1 1 0
  1861. 1 0 0 0
  1862. 2 1 0 1
  1863. """
  1864. # we need to cast to Series of strings as only that has all
  1865. # methods available for making the dummies...
  1866. result, name = self._data.array._str_get_dummies(sep)
  1867. return self._wrap_result(
  1868. result,
  1869. name=name,
  1870. expand=True,
  1871. returns_string=False,
  1872. )
  1873. @forbid_nonstring_types(["bytes"])
  1874. def translate(self, table):
  1875. """
  1876. Map all characters in the string through the given mapping table.
  1877. Equivalent to standard :meth:`str.translate`.
  1878. Parameters
  1879. ----------
  1880. table : dict
  1881. Table is a mapping of Unicode ordinals to Unicode ordinals, strings, or
  1882. None. Unmapped characters are left untouched.
  1883. Characters mapped to None are deleted. :meth:`str.maketrans` is a
  1884. helper function for making translation tables.
  1885. Returns
  1886. -------
  1887. Series or Index
  1888. """
  1889. result = self._data.array._str_translate(table)
  1890. return self._wrap_result(result)
  1891. @forbid_nonstring_types(["bytes"])
  1892. def count(self, pat, flags: int = 0):
  1893. r"""
  1894. Count occurrences of pattern in each string of the Series/Index.
  1895. This function is used to count the number of times a particular regex
  1896. pattern is repeated in each of the string elements of the
  1897. :class:`~pandas.Series`.
  1898. Parameters
  1899. ----------
  1900. pat : str
  1901. Valid regular expression.
  1902. flags : int, default 0, meaning no flags
  1903. Flags for the `re` module. For a complete list, `see here
  1904. <https://docs.python.org/3/howto/regex.html#compilation-flags>`_.
  1905. **kwargs
  1906. For compatibility with other string methods. Not used.
  1907. Returns
  1908. -------
  1909. Series or Index
  1910. Same type as the calling object containing the integer counts.
  1911. See Also
  1912. --------
  1913. re : Standard library module for regular expressions.
  1914. str.count : Standard library version, without regular expression support.
  1915. Notes
  1916. -----
  1917. Some characters need to be escaped when passing in `pat`.
  1918. eg. ``'$'`` has a special meaning in regex and must be escaped when
  1919. finding this literal character.
  1920. Examples
  1921. --------
  1922. >>> s = pd.Series(['A', 'B', 'Aaba', 'Baca', np.nan, 'CABA', 'cat'])
  1923. >>> s.str.count('a')
  1924. 0 0.0
  1925. 1 0.0
  1926. 2 2.0
  1927. 3 2.0
  1928. 4 NaN
  1929. 5 0.0
  1930. 6 1.0
  1931. dtype: float64
  1932. Escape ``'$'`` to find the literal dollar sign.
  1933. >>> s = pd.Series(['$', 'B', 'Aab$', '$$ca', 'C$B$', 'cat'])
  1934. >>> s.str.count('\\$')
  1935. 0 1
  1936. 1 0
  1937. 2 1
  1938. 3 2
  1939. 4 2
  1940. 5 0
  1941. dtype: int64
  1942. This is also available on Index
  1943. >>> pd.Index(['A', 'A', 'Aaba', 'cat']).str.count('a')
  1944. Index([0, 0, 2, 1], dtype='int64')
  1945. """
  1946. result = self._data.array._str_count(pat, flags)
  1947. return self._wrap_result(result, returns_string=False)
  1948. @forbid_nonstring_types(["bytes"])
  1949. def startswith(
  1950. self, pat: str | tuple[str, ...], na: Scalar | None = None
  1951. ) -> Series | Index:
  1952. """
  1953. Test if the start of each string element matches a pattern.
  1954. Equivalent to :meth:`str.startswith`.
  1955. Parameters
  1956. ----------
  1957. pat : str or tuple[str, ...]
  1958. Character sequence or tuple of strings. Regular expressions are not
  1959. accepted.
  1960. na : object, default NaN
  1961. Object shown if element tested is not a string. The default depends
  1962. on dtype of the array. For object-dtype, ``numpy.nan`` is used.
  1963. For ``StringDtype``, ``pandas.NA`` is used.
  1964. Returns
  1965. -------
  1966. Series or Index of bool
  1967. A Series of booleans indicating whether the given pattern matches
  1968. the start of each string element.
  1969. See Also
  1970. --------
  1971. str.startswith : Python standard library string method.
  1972. Series.str.endswith : Same as startswith, but tests the end of string.
  1973. Series.str.contains : Tests if string element contains a pattern.
  1974. Examples
  1975. --------
  1976. >>> s = pd.Series(['bat', 'Bear', 'cat', np.nan])
  1977. >>> s
  1978. 0 bat
  1979. 1 Bear
  1980. 2 cat
  1981. 3 NaN
  1982. dtype: object
  1983. >>> s.str.startswith('b')
  1984. 0 True
  1985. 1 False
  1986. 2 False
  1987. 3 NaN
  1988. dtype: object
  1989. >>> s.str.startswith(('b', 'B'))
  1990. 0 True
  1991. 1 True
  1992. 2 False
  1993. 3 NaN
  1994. dtype: object
  1995. Specifying `na` to be `False` instead of `NaN`.
  1996. >>> s.str.startswith('b', na=False)
  1997. 0 True
  1998. 1 False
  1999. 2 False
  2000. 3 False
  2001. dtype: bool
  2002. """
  2003. if not isinstance(pat, (str, tuple)):
  2004. msg = f"expected a string or tuple, not {type(pat).__name__}"
  2005. raise TypeError(msg)
  2006. result = self._data.array._str_startswith(pat, na=na)
  2007. return self._wrap_result(result, returns_string=False)
  2008. @forbid_nonstring_types(["bytes"])
  2009. def endswith(
  2010. self, pat: str | tuple[str, ...], na: Scalar | None = None
  2011. ) -> Series | Index:
  2012. """
  2013. Test if the end of each string element matches a pattern.
  2014. Equivalent to :meth:`str.endswith`.
  2015. Parameters
  2016. ----------
  2017. pat : str or tuple[str, ...]
  2018. Character sequence or tuple of strings. Regular expressions are not
  2019. accepted.
  2020. na : object, default NaN
  2021. Object shown if element tested is not a string. The default depends
  2022. on dtype of the array. For object-dtype, ``numpy.nan`` is used.
  2023. For ``StringDtype``, ``pandas.NA`` is used.
  2024. Returns
  2025. -------
  2026. Series or Index of bool
  2027. A Series of booleans indicating whether the given pattern matches
  2028. the end of each string element.
  2029. See Also
  2030. --------
  2031. str.endswith : Python standard library string method.
  2032. Series.str.startswith : Same as endswith, but tests the start of string.
  2033. Series.str.contains : Tests if string element contains a pattern.
  2034. Examples
  2035. --------
  2036. >>> s = pd.Series(['bat', 'bear', 'caT', np.nan])
  2037. >>> s
  2038. 0 bat
  2039. 1 bear
  2040. 2 caT
  2041. 3 NaN
  2042. dtype: object
  2043. >>> s.str.endswith('t')
  2044. 0 True
  2045. 1 False
  2046. 2 False
  2047. 3 NaN
  2048. dtype: object
  2049. >>> s.str.endswith(('t', 'T'))
  2050. 0 True
  2051. 1 False
  2052. 2 True
  2053. 3 NaN
  2054. dtype: object
  2055. Specifying `na` to be `False` instead of `NaN`.
  2056. >>> s.str.endswith('t', na=False)
  2057. 0 True
  2058. 1 False
  2059. 2 False
  2060. 3 False
  2061. dtype: bool
  2062. """
  2063. if not isinstance(pat, (str, tuple)):
  2064. msg = f"expected a string or tuple, not {type(pat).__name__}"
  2065. raise TypeError(msg)
  2066. result = self._data.array._str_endswith(pat, na=na)
  2067. return self._wrap_result(result, returns_string=False)
  2068. @forbid_nonstring_types(["bytes"])
  2069. def findall(self, pat, flags: int = 0):
  2070. """
  2071. Find all occurrences of pattern or regular expression in the Series/Index.
  2072. Equivalent to applying :func:`re.findall` to all the elements in the
  2073. Series/Index.
  2074. Parameters
  2075. ----------
  2076. pat : str
  2077. Pattern or regular expression.
  2078. flags : int, default 0
  2079. Flags from ``re`` module, e.g. `re.IGNORECASE` (default is 0, which
  2080. means no flags).
  2081. Returns
  2082. -------
  2083. Series/Index of lists of strings
  2084. All non-overlapping matches of pattern or regular expression in each
  2085. string of this Series/Index.
  2086. See Also
  2087. --------
  2088. count : Count occurrences of pattern or regular expression in each string
  2089. of the Series/Index.
  2090. extractall : For each string in the Series, extract groups from all matches
  2091. of regular expression and return a DataFrame with one row for each
  2092. match and one column for each group.
  2093. re.findall : The equivalent ``re`` function to all non-overlapping matches
  2094. of pattern or regular expression in string, as a list of strings.
  2095. Examples
  2096. --------
  2097. >>> s = pd.Series(['Lion', 'Monkey', 'Rabbit'])
  2098. The search for the pattern 'Monkey' returns one match:
  2099. >>> s.str.findall('Monkey')
  2100. 0 []
  2101. 1 [Monkey]
  2102. 2 []
  2103. dtype: object
  2104. On the other hand, the search for the pattern 'MONKEY' doesn't return any
  2105. match:
  2106. >>> s.str.findall('MONKEY')
  2107. 0 []
  2108. 1 []
  2109. 2 []
  2110. dtype: object
  2111. Flags can be added to the pattern or regular expression. For instance,
  2112. to find the pattern 'MONKEY' ignoring the case:
  2113. >>> import re
  2114. >>> s.str.findall('MONKEY', flags=re.IGNORECASE)
  2115. 0 []
  2116. 1 [Monkey]
  2117. 2 []
  2118. dtype: object
  2119. When the pattern matches more than one string in the Series, all matches
  2120. are returned:
  2121. >>> s.str.findall('on')
  2122. 0 [on]
  2123. 1 [on]
  2124. 2 []
  2125. dtype: object
  2126. Regular expressions are supported too. For instance, the search for all the
  2127. strings ending with the word 'on' is shown next:
  2128. >>> s.str.findall('on$')
  2129. 0 [on]
  2130. 1 []
  2131. 2 []
  2132. dtype: object
  2133. If the pattern is found more than once in the same string, then a list of
  2134. multiple strings is returned:
  2135. >>> s.str.findall('b')
  2136. 0 []
  2137. 1 []
  2138. 2 [b, b]
  2139. dtype: object
  2140. """
  2141. result = self._data.array._str_findall(pat, flags)
  2142. return self._wrap_result(result, returns_string=False)
@forbid_nonstring_types(["bytes"])
def extract(
    self, pat: str, flags: int = 0, expand: bool = True
) -> DataFrame | Series | Index:
    r"""
    Extract capture groups in the regex `pat` as columns in a DataFrame.

    For each subject string in the Series, extract groups from the
    first match of regular expression `pat`.

    Parameters
    ----------
    pat : str
        Regular expression pattern with capturing groups.
    flags : int, default 0 (no flags)
        Flags from the ``re`` module, e.g. ``re.IGNORECASE``, that
        modify regular expression matching for things like case,
        spaces, etc. For more details, see :mod:`re`.
    expand : bool, default True
        If True, return DataFrame with one column per capture group.
        If False, return a Series/Index if there is one capture group
        or DataFrame if there are multiple capture groups.

    Returns
    -------
    DataFrame or Series or Index
        A DataFrame with one row for each subject string, and one
        column for each group. Any capture group names in regular
        expression pat will be used for column names; otherwise
        capture group numbers will be used. The dtype of each result
        column is always object, even when no match is found. If
        ``expand=False`` and pat has only one capture group, then
        return a Series (if subject is a Series) or Index (if subject
        is an Index).

    See Also
    --------
    extractall : Returns all matches (not just the first match).

    Examples
    --------
    A pattern with two groups will return a DataFrame with two columns.
    Non-matches will be NaN.

    >>> s = pd.Series(['a1', 'b2', 'c3'])
    >>> s.str.extract(r'([ab])(\d)')
         0    1
    0    a    1
    1    b    2
    2  NaN  NaN

    A pattern may contain optional groups.

    >>> s.str.extract(r'([ab])?(\d)')
         0  1
    0    a  1
    1    b  2
    2  NaN  3

    Named groups will become column names in the result.

    >>> s.str.extract(r'(?P<letter>[ab])(?P<digit>\d)')
      letter digit
    0      a     1
    1      b     2
    2    NaN   NaN

    A pattern with one group will return a DataFrame with one column
    if expand=True.

    >>> s.str.extract(r'[ab](\d)', expand=True)
         0
    0    1
    1    2
    2  NaN

    A pattern with one group will return a Series if expand=False.

    >>> s.str.extract(r'[ab](\d)', expand=False)
    0      1
    1      2
    2    NaN
    dtype: object
    """
    from pandas import DataFrame

    if not isinstance(expand, bool):
        raise ValueError("expand must be True or False")

    # The pattern is compiled here only to inspect its group count/names;
    # the backing array recompiles from `pat` itself.
    regex = re.compile(pat, flags=flags)
    if regex.groups == 0:
        raise ValueError("pattern contains no capture groups")

    # An Index cannot hold a 2D result, so multi-group + expand=False is
    # only meaningful for Series input.
    if not expand and regex.groups > 1 and isinstance(self._data, ABCIndex):
        raise ValueError("only one regex group is supported with Index")

    obj = self._data
    result_dtype = _result_dtype(obj)

    # More than one capture group always yields a DataFrame, regardless
    # of `expand`.
    returns_df = regex.groups > 1 or expand

    if returns_df:
        name = None
        columns = _get_group_names(regex)
        if obj.array.size == 0:
            # Empty input: skip the array dispatch and build an empty frame
            # with the right columns/dtype directly.
            result = DataFrame(columns=columns, dtype=result_dtype)
        else:
            result_list = self._data.array._str_extract(
                pat, flags=flags, expand=returns_df
            )

            result_index: Index | None
            if isinstance(obj, ABCSeries):
                # Preserve the Series' index on the resulting frame.
                result_index = obj.index
            else:
                result_index = None
            result = DataFrame(
                result_list, columns=columns, index=result_index, dtype=result_dtype
            )

    else:
        # Single group, expand=False: 1D result named after the group if any.
        name = _get_single_group_name(regex)
        result = self._data.array._str_extract(pat, flags=flags, expand=returns_df)
    return self._wrap_result(result, name=name)
  2245. @forbid_nonstring_types(["bytes"])
  2246. def extractall(self, pat, flags: int = 0):
  2247. r"""
  2248. Extract capture groups in the regex `pat` as columns in DataFrame.
  2249. For each subject string in the Series, extract groups from all
  2250. matches of regular expression pat. When each subject string in the
  2251. Series has exactly one match, extractall(pat).xs(0, level='match')
  2252. is the same as extract(pat).
  2253. Parameters
  2254. ----------
  2255. pat : str
  2256. Regular expression pattern with capturing groups.
  2257. flags : int, default 0 (no flags)
  2258. A ``re`` module flag, for example ``re.IGNORECASE``. These allow
  2259. to modify regular expression matching for things like case, spaces,
  2260. etc. Multiple flags can be combined with the bitwise OR operator,
  2261. for example ``re.IGNORECASE | re.MULTILINE``.
  2262. Returns
  2263. -------
  2264. DataFrame
  2265. A ``DataFrame`` with one row for each match, and one column for each
  2266. group. Its rows have a ``MultiIndex`` with first levels that come from
  2267. the subject ``Series``. The last level is named 'match' and indexes the
  2268. matches in each item of the ``Series``. Any capture group names in
  2269. regular expression pat will be used for column names; otherwise capture
  2270. group numbers will be used.
  2271. See Also
  2272. --------
  2273. extract : Returns first match only (not all matches).
  2274. Examples
  2275. --------
  2276. A pattern with one group will return a DataFrame with one column.
  2277. Indices with no matches will not appear in the result.
  2278. >>> s = pd.Series(["a1a2", "b1", "c1"], index=["A", "B", "C"])
  2279. >>> s.str.extractall(r"[ab](\d)")
  2280. 0
  2281. match
  2282. A 0 1
  2283. 1 2
  2284. B 0 1
  2285. Capture group names are used for column names of the result.
  2286. >>> s.str.extractall(r"[ab](?P<digit>\d)")
  2287. digit
  2288. match
  2289. A 0 1
  2290. 1 2
  2291. B 0 1
  2292. A pattern with two groups will return a DataFrame with two columns.
  2293. >>> s.str.extractall(r"(?P<letter>[ab])(?P<digit>\d)")
  2294. letter digit
  2295. match
  2296. A 0 a 1
  2297. 1 a 2
  2298. B 0 b 1
  2299. Optional groups that do not match are NaN in the result.
  2300. >>> s.str.extractall(r"(?P<letter>[ab])?(?P<digit>\d)")
  2301. letter digit
  2302. match
  2303. A 0 a 1
  2304. 1 a 2
  2305. B 0 b 1
  2306. C 0 NaN 1
  2307. """
  2308. # TODO: dispatch
  2309. return str_extractall(self._orig, pat, flags)
# Shared docstring template for find/rfind. The %(side)s, %(method)s and
# %(also)s placeholders are filled in by the @Appender decorators on the
# two methods below.
_shared_docs[
    "find"
] = """
Return %(side)s indexes in each strings in the Series/Index.
Each of returned indexes corresponds to the position where the
substring is fully contained between [start:end]. Return -1 on
failure. Equivalent to standard :meth:`str.%(method)s`.
Parameters
----------
sub : str
Substring being searched.
start : int
Left edge index.
end : int
Right edge index.
Returns
-------
Series or Index of int.
See Also
--------
%(also)s
"""
  2332. @Appender(
  2333. _shared_docs["find"]
  2334. % {
  2335. "side": "lowest",
  2336. "method": "find",
  2337. "also": "rfind : Return highest indexes in each strings.",
  2338. }
  2339. )
  2340. @forbid_nonstring_types(["bytes"])
  2341. def find(self, sub, start: int = 0, end=None):
  2342. if not isinstance(sub, str):
  2343. msg = f"expected a string object, not {type(sub).__name__}"
  2344. raise TypeError(msg)
  2345. result = self._data.array._str_find(sub, start, end)
  2346. return self._wrap_result(result, returns_string=False)
  2347. @Appender(
  2348. _shared_docs["find"]
  2349. % {
  2350. "side": "highest",
  2351. "method": "rfind",
  2352. "also": "find : Return lowest indexes in each strings.",
  2353. }
  2354. )
  2355. @forbid_nonstring_types(["bytes"])
  2356. def rfind(self, sub, start: int = 0, end=None):
  2357. if not isinstance(sub, str):
  2358. msg = f"expected a string object, not {type(sub).__name__}"
  2359. raise TypeError(msg)
  2360. result = self._data.array._str_rfind(sub, start=start, end=end)
  2361. return self._wrap_result(result, returns_string=False)
  2362. @forbid_nonstring_types(["bytes"])
  2363. def normalize(self, form):
  2364. """
  2365. Return the Unicode normal form for the strings in the Series/Index.
  2366. For more information on the forms, see the
  2367. :func:`unicodedata.normalize`.
  2368. Parameters
  2369. ----------
  2370. form : {'NFC', 'NFKC', 'NFD', 'NFKD'}
  2371. Unicode form.
  2372. Returns
  2373. -------
  2374. Series/Index of objects
  2375. """
  2376. result = self._data.array._str_normalize(form)
  2377. return self._wrap_result(result)
# Shared docstring template for index/rindex (the raising counterparts of
# find/rfind). The %(side)s, %(similar)s, %(method)s and %(also)s
# placeholders are filled in by the @Appender decorators below.
_shared_docs[
    "index"
] = """
Return %(side)s indexes in each string in Series/Index.
Each of the returned indexes corresponds to the position where the
substring is fully contained between [start:end]. This is the same
as ``str.%(similar)s`` except instead of returning -1, it raises a
ValueError when the substring is not found. Equivalent to standard
``str.%(method)s``.
Parameters
----------
sub : str
Substring being searched.
start : int
Left edge index.
end : int
Right edge index.
Returns
-------
Series or Index of object
See Also
--------
%(also)s
"""
  2402. @Appender(
  2403. _shared_docs["index"]
  2404. % {
  2405. "side": "lowest",
  2406. "similar": "find",
  2407. "method": "index",
  2408. "also": "rindex : Return highest indexes in each strings.",
  2409. }
  2410. )
  2411. @forbid_nonstring_types(["bytes"])
  2412. def index(self, sub, start: int = 0, end=None):
  2413. if not isinstance(sub, str):
  2414. msg = f"expected a string object, not {type(sub).__name__}"
  2415. raise TypeError(msg)
  2416. result = self._data.array._str_index(sub, start=start, end=end)
  2417. return self._wrap_result(result, returns_string=False)
  2418. @Appender(
  2419. _shared_docs["index"]
  2420. % {
  2421. "side": "highest",
  2422. "similar": "rfind",
  2423. "method": "rindex",
  2424. "also": "index : Return lowest indexes in each strings.",
  2425. }
  2426. )
  2427. @forbid_nonstring_types(["bytes"])
  2428. def rindex(self, sub, start: int = 0, end=None):
  2429. if not isinstance(sub, str):
  2430. msg = f"expected a string object, not {type(sub).__name__}"
  2431. raise TypeError(msg)
  2432. result = self._data.array._str_rindex(sub, start=start, end=end)
  2433. return self._wrap_result(result, returns_string=False)
  2434. def len(self):
  2435. """
  2436. Compute the length of each element in the Series/Index.
  2437. The element may be a sequence (such as a string, tuple or list) or a collection
  2438. (such as a dictionary).
  2439. Returns
  2440. -------
  2441. Series or Index of int
  2442. A Series or Index of integer values indicating the length of each
  2443. element in the Series or Index.
  2444. See Also
  2445. --------
  2446. str.len : Python built-in function returning the length of an object.
  2447. Series.size : Returns the length of the Series.
  2448. Examples
  2449. --------
  2450. Returns the length (number of characters) in a string. Returns the
  2451. number of entries for dictionaries, lists or tuples.
  2452. >>> s = pd.Series(['dog',
  2453. ... '',
  2454. ... 5,
  2455. ... {'foo' : 'bar'},
  2456. ... [2, 3, 5, 7],
  2457. ... ('one', 'two', 'three')])
  2458. >>> s
  2459. 0 dog
  2460. 1
  2461. 2 5
  2462. 3 {'foo': 'bar'}
  2463. 4 [2, 3, 5, 7]
  2464. 5 (one, two, three)
  2465. dtype: object
  2466. >>> s.str.len()
  2467. 0 3.0
  2468. 1 0.0
  2469. 2 NaN
  2470. 3 1.0
  2471. 4 4.0
  2472. 5 3.0
  2473. dtype: float64
  2474. """
  2475. result = self._data.array._str_len()
  2476. return self._wrap_result(result, returns_string=False)
# Shared docstring template for the case-conversion methods
# (lower/upper/title/capitalize/swapcase/casefold). The %(type)s,
# %(version)s and %(method)s placeholders are filled from _doc_args by the
# @Appender decorators on each method below.
_shared_docs[
    "casemethods"
] = """
Convert strings in the Series/Index to %(type)s.
%(version)s
Equivalent to :meth:`str.%(method)s`.
Returns
-------
Series or Index of object
See Also
--------
Series.str.lower : Converts all characters to lowercase.
Series.str.upper : Converts all characters to uppercase.
Series.str.title : Converts first character of each word to uppercase and
remaining to lowercase.
Series.str.capitalize : Converts first character to uppercase and
remaining to lowercase.
Series.str.swapcase : Converts uppercase to lowercase and lowercase to
uppercase.
Series.str.casefold: Removes all case distinctions in the string.
Examples
--------
>>> s = pd.Series(['lower', 'CAPITALS', 'this is a sentence', 'SwApCaSe'])
>>> s
0 lower
1 CAPITALS
2 this is a sentence
3 SwApCaSe
dtype: object
>>> s.str.lower()
0 lower
1 capitals
2 this is a sentence
3 swapcase
dtype: object
>>> s.str.upper()
0 LOWER
1 CAPITALS
2 THIS IS A SENTENCE
3 SWAPCASE
dtype: object
>>> s.str.title()
0 Lower
1 Capitals
2 This Is A Sentence
3 Swapcase
dtype: object
>>> s.str.capitalize()
0 Lower
1 Capitals
2 This is a sentence
3 Swapcase
dtype: object
>>> s.str.swapcase()
0 LOWER
1 capitals
2 THIS IS A SENTENCE
3 sWaPcAsE
dtype: object
"""
# Types:
# cases:
# upper, lower, title, capitalize, swapcase, casefold
# boolean:
# isalpha, isnumeric isalnum isdigit isdecimal isspace islower isupper istitle
# _doc_args holds dict of strings to use in substituting casemethod docs
# Each entry supplies %(type)s, %(method)s and %(version)s for the
# "casemethods" shared-doc template above.
_doc_args: dict[str, dict[str, str]] = {}
_doc_args["lower"] = {"type": "lowercase", "method": "lower", "version": ""}
_doc_args["upper"] = {"type": "uppercase", "method": "upper", "version": ""}
_doc_args["title"] = {"type": "titlecase", "method": "title", "version": ""}
_doc_args["capitalize"] = {
    "type": "be capitalized",
    "method": "capitalize",
    "version": "",
}
_doc_args["swapcase"] = {
    "type": "be swapcased",
    "method": "swapcase",
    "version": "",
}
_doc_args["casefold"] = {
    "type": "be casefolded",
    "method": "casefold",
    "version": "",
}
  2562. @Appender(_shared_docs["casemethods"] % _doc_args["lower"])
  2563. @forbid_nonstring_types(["bytes"])
  2564. def lower(self):
  2565. result = self._data.array._str_lower()
  2566. return self._wrap_result(result)
  2567. @Appender(_shared_docs["casemethods"] % _doc_args["upper"])
  2568. @forbid_nonstring_types(["bytes"])
  2569. def upper(self):
  2570. result = self._data.array._str_upper()
  2571. return self._wrap_result(result)
  2572. @Appender(_shared_docs["casemethods"] % _doc_args["title"])
  2573. @forbid_nonstring_types(["bytes"])
  2574. def title(self):
  2575. result = self._data.array._str_title()
  2576. return self._wrap_result(result)
  2577. @Appender(_shared_docs["casemethods"] % _doc_args["capitalize"])
  2578. @forbid_nonstring_types(["bytes"])
  2579. def capitalize(self):
  2580. result = self._data.array._str_capitalize()
  2581. return self._wrap_result(result)
  2582. @Appender(_shared_docs["casemethods"] % _doc_args["swapcase"])
  2583. @forbid_nonstring_types(["bytes"])
  2584. def swapcase(self):
  2585. result = self._data.array._str_swapcase()
  2586. return self._wrap_result(result)
  2587. @Appender(_shared_docs["casemethods"] % _doc_args["casefold"])
  2588. @forbid_nonstring_types(["bytes"])
  2589. def casefold(self):
  2590. result = self._data.array._str_casefold()
  2591. return self._wrap_result(result)
# Shared docstring template for the character-class predicates
# (isalnum/isalpha/isdigit/...). The %(type)s and %(method)s placeholders
# are filled from _doc_args by the _map_and_wrap calls below.
_shared_docs[
    "ismethods"
] = """
Check whether all characters in each string are %(type)s.
This is equivalent to running the Python string method
:meth:`str.%(method)s` for each element of the Series/Index. If a string
has zero characters, ``False`` is returned for that check.
Returns
-------
Series or Index of bool
Series or Index of boolean values with the same length as the original
Series/Index.
See Also
--------
Series.str.isalpha : Check whether all characters are alphabetic.
Series.str.isnumeric : Check whether all characters are numeric.
Series.str.isalnum : Check whether all characters are alphanumeric.
Series.str.isdigit : Check whether all characters are digits.
Series.str.isdecimal : Check whether all characters are decimal.
Series.str.isspace : Check whether all characters are whitespace.
Series.str.islower : Check whether all characters are lowercase.
Series.str.isupper : Check whether all characters are uppercase.
Series.str.istitle : Check whether all characters are titlecase.
Examples
--------
**Checks for Alphabetic and Numeric Characters**
>>> s1 = pd.Series(['one', 'one1', '1', ''])
>>> s1.str.isalpha()
0 True
1 False
2 False
3 False
dtype: bool
>>> s1.str.isnumeric()
0 False
1 False
2 True
3 False
dtype: bool
>>> s1.str.isalnum()
0 True
1 True
2 True
3 False
dtype: bool
Note that checks against characters mixed with any additional punctuation
or whitespace will evaluate to false for an alphanumeric check.
>>> s2 = pd.Series(['A B', '1.5', '3,000'])
>>> s2.str.isalnum()
0 False
1 False
2 False
dtype: bool
**More Detailed Checks for Numeric Characters**
There are several different but overlapping sets of numeric characters that
can be checked for.
>>> s3 = pd.Series(['23', '³', '⅕', ''])
The ``s3.str.isdecimal`` method checks for characters used to form numbers
in base 10.
>>> s3.str.isdecimal()
0 True
1 False
2 False
3 False
dtype: bool
The ``s.str.isdigit`` method is the same as ``s3.str.isdecimal`` but also
includes special digits, like superscripted and subscripted digits in
unicode.
>>> s3.str.isdigit()
0 True
1 True
2 False
3 False
dtype: bool
The ``s.str.isnumeric`` method is the same as ``s3.str.isdigit`` but also
includes other characters that can represent quantities such as unicode
fractions.
>>> s3.str.isnumeric()
0 True
1 True
2 True
3 False
dtype: bool
**Checks for Whitespace**
>>> s4 = pd.Series([' ', '\\t\\r\\n ', ''])
>>> s4.str.isspace()
0 True
1 True
2 False
dtype: bool
**Checks for Character Case**
>>> s5 = pd.Series(['leopard', 'Golden Eagle', 'SNAKE', ''])
>>> s5.str.islower()
0 True
1 False
2 False
3 False
dtype: bool
>>> s5.str.isupper()
0 False
1 False
2 True
3 False
dtype: bool
The ``s5.str.istitle`` method checks for whether all words are in title
case (whether only the first letter of each word is capitalized). Words are
assumed to be as any sequence of non-numeric characters separated by
whitespace characters.
>>> s5.str.istitle()
0 False
1 True
2 False
3 False
dtype: bool
"""
# Substitution values for the "ismethods" shared-doc template above; these
# predicates take no %(version)s placeholder.
_doc_args["isalnum"] = {"type": "alphanumeric", "method": "isalnum"}
_doc_args["isalpha"] = {"type": "alphabetic", "method": "isalpha"}
_doc_args["isdigit"] = {"type": "digits", "method": "isdigit"}
_doc_args["isspace"] = {"type": "whitespace", "method": "isspace"}
_doc_args["islower"] = {"type": "lowercase", "method": "islower"}
_doc_args["isupper"] = {"type": "uppercase", "method": "isupper"}
_doc_args["istitle"] = {"type": "titlecase", "method": "istitle"}
_doc_args["isnumeric"] = {"type": "numeric", "method": "isnumeric"}
_doc_args["isdecimal"] = {"type": "decimal", "method": "isdecimal"}
# force _noarg_wrapper return type with dtype=np.dtype(bool) (GH 29624)
# Each predicate below is generated by _map_and_wrap (defined elsewhere in
# this file), which maps the named _str_* array method over the data and
# attaches the rendered docstring.
isalnum = _map_and_wrap(
    "isalnum", docstring=_shared_docs["ismethods"] % _doc_args["isalnum"]
)
isalpha = _map_and_wrap(
    "isalpha", docstring=_shared_docs["ismethods"] % _doc_args["isalpha"]
)
isdigit = _map_and_wrap(
    "isdigit", docstring=_shared_docs["ismethods"] % _doc_args["isdigit"]
)
isspace = _map_and_wrap(
    "isspace", docstring=_shared_docs["ismethods"] % _doc_args["isspace"]
)
islower = _map_and_wrap(
    "islower", docstring=_shared_docs["ismethods"] % _doc_args["islower"]
)
isupper = _map_and_wrap(
    "isupper", docstring=_shared_docs["ismethods"] % _doc_args["isupper"]
)
istitle = _map_and_wrap(
    "istitle", docstring=_shared_docs["ismethods"] % _doc_args["istitle"]
)
isnumeric = _map_and_wrap(
    "isnumeric", docstring=_shared_docs["ismethods"] % _doc_args["isnumeric"]
)
isdecimal = _map_and_wrap(
    "isdecimal", docstring=_shared_docs["ismethods"] % _doc_args["isdecimal"]
)
  2744. def cat_safe(list_of_columns: list, sep: str):
  2745. """
  2746. Auxiliary function for :meth:`str.cat`.
  2747. Same signature as cat_core, but handles TypeErrors in concatenation, which
  2748. happen if the arrays in list_of columns have the wrong dtypes or content.
  2749. Parameters
  2750. ----------
  2751. list_of_columns : list of numpy arrays
  2752. List of arrays to be concatenated with sep;
  2753. these arrays may not contain NaNs!
  2754. sep : string
  2755. The separator string for concatenating the columns.
  2756. Returns
  2757. -------
  2758. nd.array
  2759. The concatenation of list_of_columns with sep.
  2760. """
  2761. try:
  2762. result = cat_core(list_of_columns, sep)
  2763. except TypeError:
  2764. # if there are any non-string values (wrong dtype or hidden behind
  2765. # object dtype), np.sum will fail; catch and return with better message
  2766. for column in list_of_columns:
  2767. dtype = lib.infer_dtype(column, skipna=True)
  2768. if dtype not in ["string", "empty"]:
  2769. raise TypeError(
  2770. "Concatenation requires list-likes containing only "
  2771. "strings (or missing values). Offending values found in "
  2772. f"column {dtype}"
  2773. ) from None
  2774. return result
  2775. def cat_core(list_of_columns: list, sep: str):
  2776. """
  2777. Auxiliary function for :meth:`str.cat`
  2778. Parameters
  2779. ----------
  2780. list_of_columns : list of numpy arrays
  2781. List of arrays to be concatenated with sep;
  2782. these arrays may not contain NaNs!
  2783. sep : string
  2784. The separator string for concatenating the columns.
  2785. Returns
  2786. -------
  2787. nd.array
  2788. The concatenation of list_of_columns with sep.
  2789. """
  2790. if sep == "":
  2791. # no need to interleave sep if it is empty
  2792. arr_of_cols = np.asarray(list_of_columns, dtype=object)
  2793. return np.sum(arr_of_cols, axis=0)
  2794. list_with_sep = [sep] * (2 * len(list_of_columns) - 1)
  2795. list_with_sep[::2] = list_of_columns
  2796. arr_with_sep = np.asarray(list_with_sep, dtype=object)
  2797. return np.sum(arr_with_sep, axis=0)
  2798. def _result_dtype(arr):
  2799. # workaround #27953
  2800. # ideally we just pass `dtype=arr.dtype` unconditionally, but this fails
  2801. # when the list of values is empty.
  2802. from pandas.core.arrays.string_ import StringDtype
  2803. if isinstance(arr.dtype, StringDtype):
  2804. return arr.dtype
  2805. else:
  2806. return object
  2807. def _get_single_group_name(regex: re.Pattern) -> Hashable:
  2808. if regex.groupindex:
  2809. return next(iter(regex.groupindex))
  2810. else:
  2811. return None
  2812. def _get_group_names(regex: re.Pattern) -> list[Hashable]:
  2813. """
  2814. Get named groups from compiled regex.
  2815. Unnamed groups are numbered.
  2816. Parameters
  2817. ----------
  2818. regex : compiled regex
  2819. Returns
  2820. -------
  2821. list of column labels
  2822. """
  2823. names = {v: k for k, v in regex.groupindex.items()}
  2824. return [names.get(1 + i, i) for i in range(regex.groups)]
  2825. def str_extractall(arr, pat, flags: int = 0):
  2826. regex = re.compile(pat, flags=flags)
  2827. # the regex must contain capture groups.
  2828. if regex.groups == 0:
  2829. raise ValueError("pattern contains no capture groups")
  2830. if isinstance(arr, ABCIndex):
  2831. arr = arr.to_series().reset_index(drop=True)
  2832. columns = _get_group_names(regex)
  2833. match_list = []
  2834. index_list = []
  2835. is_mi = arr.index.nlevels > 1
  2836. for subject_key, subject in arr.items():
  2837. if isinstance(subject, str):
  2838. if not is_mi:
  2839. subject_key = (subject_key,)
  2840. for match_i, match_tuple in enumerate(regex.findall(subject)):
  2841. if isinstance(match_tuple, str):
  2842. match_tuple = (match_tuple,)
  2843. na_tuple = [np.NaN if group == "" else group for group in match_tuple]
  2844. match_list.append(na_tuple)
  2845. result_key = tuple(subject_key + (match_i,))
  2846. index_list.append(result_key)
  2847. from pandas import MultiIndex
  2848. index = MultiIndex.from_tuples(index_list, names=arr.index.names + ["match"])
  2849. dtype = _result_dtype(arr)
  2850. result = arr._constructor_expanddim(
  2851. match_list, index=index, columns=columns, dtype=dtype
  2852. )
  2853. return result