# Tests for the ``.str.cat`` method of Series and Index.
- import re
- import numpy as np
- import pytest
- from pandas import (
- DataFrame,
- Index,
- MultiIndex,
- Series,
- _testing as tm,
- concat,
- )
def assert_series_or_index_equal(left, right):
    """Assert that ``left`` equals ``right`` with the matching pandas helper.

    Dispatches on the type of ``left``: a ``Series`` is compared with
    ``tm.assert_series_equal``; anything else is treated as an ``Index`` and
    compared with ``tm.assert_index_equal``.

    Raises
    ------
    AssertionError
        Propagated from the underlying ``tm`` helper when the objects differ.
    """
    if isinstance(left, Series):
        tm.assert_series_equal(left, right)
    else:  # Index
        tm.assert_index_equal(left, right)
@pytest.mark.parametrize("other", [None, Series, Index])
def test_str_cat_name(index_or_series, other):
    """``str.cat`` keeps the caller's ``name`` for list, Series and Index others."""
    # GH 21053
    box = index_or_series
    values = ["a", "b"]
    # ``None`` means "pass the plain list"; otherwise wrap it in the given class.
    others = values if other is None else other(values)

    result = box(values, name="name").str.cat(others, sep=",")
    assert result.name == "name"
def test_str_cat(index_or_series):
    """Exercise ``str.cat`` on a Series/Index against no others, ndarray and list.

    Covers the reduce-to-scalar form (``others`` omitted) with and without
    ``sep`` / ``na_rep``, element-wise concatenation with an object ndarray
    and with a list, and the ``ValueError`` raised for others of the wrong
    length.
    """
    box = index_or_series
    # testing "str.cat" from Series/Index to ndarray/list
    s = box(["a", "a", "b", "b", "c", np.nan])

    # single array
    result = s.str.cat()
    expected = "aabbc"
    assert result == expected

    result = s.str.cat(na_rep="-")
    expected = "aabbc-"
    assert result == expected

    result = s.str.cat(sep="_", na_rep="NA")
    expected = "a_a_b_b_c_NA"
    assert result == expected

    t = np.array(["a", np.nan, "b", "d", "foo", np.nan], dtype=object)
    expected = box(["aa", "a-", "bb", "bd", "cfoo", "--"])

    # Series/Index with array
    result = s.str.cat(t, na_rep="-")
    assert_series_or_index_equal(result, expected)

    # Series/Index with list
    result = s.str.cat(list(t), na_rep="-")
    assert_series_or_index_equal(result, expected)

    # errors for incorrect lengths
    rgx = r"If `others` contains arrays or lists \(or other list-likes.*"
    z = Series(["1", "2", "3"])

    with pytest.raises(ValueError, match=rgx):
        s.str.cat(z.values)

    with pytest.raises(ValueError, match=rgx):
        s.str.cat(list(z))
def test_str_cat_raises_intuitive_error(index_or_series):
    """Passing a separator positionally as ``others`` raises a helpful error."""
    # GH 11334
    box = index_or_series
    s = box(["a", "b", "c", "d"])
    # ``match`` is applied as a regular expression; escape the text so the
    # trailing "?" is matched literally rather than making the "d" optional.
    message = re.escape("Did you mean to supply a `sep` keyword?")

    with pytest.raises(ValueError, match=message):
        s.str.cat("|")

    with pytest.raises(ValueError, match=message):
        s.str.cat(" ")
@pytest.mark.parametrize("sep", ["", None])
@pytest.mark.parametrize("dtype_target", ["object", "category"])
@pytest.mark.parametrize("dtype_caller", ["object", "category"])
def test_str_cat_categorical(index_or_series, dtype_caller, dtype_target, sep):
    """``str.cat`` works for object/category callers and object/category others.

    The caller is built from an Index with ``dtype_caller`` and the other
    operand from an Index with ``dtype_target``; both an empty-string and a
    ``None`` separator are exercised.
    """
    box = index_or_series

    s = Index(["a", "a", "b", "a"], dtype=dtype_caller)
    s = s if box is Index else Series(s, index=s)
    t = Index(["b", "a", "b", "c"], dtype=dtype_target)

    expected = Index(["ab", "aa", "bb", "ac"])
    expected = expected if box is Index else Series(expected, index=s)

    # Series/Index with unaligned Index -> t.values
    result = s.str.cat(t.values, sep=sep)
    assert_series_or_index_equal(result, expected)

    # Series/Index with Series having matching Index
    t = Series(t.values, index=s)
    result = s.str.cat(t, sep=sep)
    assert_series_or_index_equal(result, expected)

    # Series/Index with Series.values
    result = s.str.cat(t.values, sep=sep)
    assert_series_or_index_equal(result, expected)

    # Series/Index with Series having different Index
    t = Series(t.values, index=t.values)
    expected = Index(["aa", "aa", "aa", "bb", "bb"])
    expected = expected if box is Index else Series(expected, index=expected.str[:1])

    result = s.str.cat(t, sep=sep)
    assert_series_or_index_equal(result, expected)
@pytest.mark.parametrize(
    "data",
    [[1, 2, 3], [0.1, 0.2, 0.3], [1, 2, "b"]],
    ids=["integers", "floats", "mixed"],
)
# without dtype=object, np.array would cast [1, 2, 'b'] to ['1', '2', 'b']
@pytest.mark.parametrize(
    "box",
    [Series, Index, list, lambda x: np.array(x, dtype=object)],
    ids=["Series", "Index", "list", "np.array"],
)
def test_str_cat_wrong_dtype_raises(box, data):
    """Non-string data in ``others`` raises ``TypeError`` for every container."""
    # GH 22722
    s = Series(["a", "b", "c"])
    t = box(data)

    msg = "Concatenation requires list-likes containing only strings.*"
    with pytest.raises(TypeError, match=msg):
        # need to use outer and na_rep, as otherwise Index would not raise
        s.str.cat(t, join="outer", na_rep="-")
def test_str_cat_mixed_inputs(index_or_series):
    """``str.cat`` with DataFrame, 2-D ndarray and list-of-list-likes others.

    The first half checks successful concatenation for matching and for
    differently ordered indexes. The second half checks the ``ValueError``
    raised for length mismatches and the ``TypeError`` raised for unsupported
    ``others`` (strings, nested containers, sets, scalars).
    """
    box = index_or_series
    s = Index(["a", "b", "c", "d"])
    s = s if box is Index else Series(s, index=s)

    t = Series(["A", "B", "C", "D"], index=s.values)
    d = concat([t, Series(s, index=s)], axis=1)

    expected = Index(["aAa", "bBb", "cCc", "dDd"])
    expected = expected if box is Index else Series(expected.values, index=s.values)

    # Series/Index with DataFrame
    result = s.str.cat(d)
    assert_series_or_index_equal(result, expected)

    # Series/Index with two-dimensional ndarray
    result = s.str.cat(d.values)
    assert_series_or_index_equal(result, expected)

    # Series/Index with list of Series
    result = s.str.cat([t, s])
    assert_series_or_index_equal(result, expected)

    # Series/Index with mixed list of Series/array
    result = s.str.cat([t, s.values])
    assert_series_or_index_equal(result, expected)

    # Series/Index with list of Series; different indexes
    t.index = ["b", "c", "d", "a"]
    expected = box(["aDa", "bAb", "cBc", "dCd"])
    expected = expected if box is Index else Series(expected.values, index=s.values)
    result = s.str.cat([t, s])
    assert_series_or_index_equal(result, expected)

    # Series/Index with mixed list; different index
    result = s.str.cat([t, s.values])
    assert_series_or_index_equal(result, expected)

    # Series/Index with DataFrame; different indexes
    d.index = ["b", "c", "d", "a"]
    expected = box(["aDd", "bAa", "cBb", "dCc"])
    expected = expected if box is Index else Series(expected.values, index=s.values)
    result = s.str.cat(d)
    assert_series_or_index_equal(result, expected)

    # errors for incorrect lengths
    rgx = r"If `others` contains arrays or lists \(or other list-likes.*"
    z = Series(["1", "2", "3"])
    e = concat([z, z], axis=1)

    # two-dimensional ndarray
    with pytest.raises(ValueError, match=rgx):
        s.str.cat(e.values)

    # list of list-likes
    with pytest.raises(ValueError, match=rgx):
        s.str.cat([z.values, s.values])

    # mixed list of Series/list-like
    with pytest.raises(ValueError, match=rgx):
        s.str.cat([z.values, s])

    # errors for incorrect arguments in list-like
    rgx = "others must be Series, Index, DataFrame,.*"
    # make sure None/NaN do not crash checks in _get_series_list
    u = Series(["a", np.nan, "c", None])

    # mix of string and Series
    with pytest.raises(TypeError, match=rgx):
        s.str.cat([u, "u"])

    # DataFrame in list
    with pytest.raises(TypeError, match=rgx):
        s.str.cat([u, d])

    # 2-dim ndarray in list
    with pytest.raises(TypeError, match=rgx):
        s.str.cat([u, d.values])

    # nested lists
    with pytest.raises(TypeError, match=rgx):
        s.str.cat([u, [u, d]])

    # forbidden input type: set
    # GH 23009
    with pytest.raises(TypeError, match=rgx):
        s.str.cat(set(u))

    # forbidden input type: set in list
    # GH 23009
    with pytest.raises(TypeError, match=rgx):
        s.str.cat([u, set(u)])

    # other forbidden input type, e.g. int
    with pytest.raises(TypeError, match=rgx):
        s.str.cat(1)

    # nested list-likes
    with pytest.raises(TypeError, match=rgx):
        s.str.cat(iter([t.values, list(s)]))
@pytest.mark.parametrize("join", ["left", "outer", "inner", "right"])
def test_str_cat_align_indexed(index_or_series, join):
    """``str.cat(..., join=...)`` matches concatenation after manual ``align``."""
    # https://github.com/pandas-dev/pandas/issues/18657
    box = index_or_series

    s = Series(["a", "b", "c", "d"], index=["a", "b", "c", "d"])
    t = Series(["D", "A", "E", "B"], index=["d", "a", "e", "b"])
    sa, ta = s.align(t, join=join)
    # result after manual alignment of inputs
    expected = sa.str.cat(ta, na_rep="-")

    if box is Index:
        # For the Index case both the caller and the expectation are converted.
        s = Index(s)
        expected = Index(expected)

    result = s.str.cat(t, join=join, na_rep="-")
    assert_series_or_index_equal(result, expected)
@pytest.mark.parametrize("join", ["left", "outer", "inner", "right"])
def test_str_cat_align_mixed_inputs(join):
    """Index alignment in ``str.cat`` for lists, DataFrames and mixed others.

    Expected results are derived by selecting, from the full outer-join
    result, the labels produced by joining the caller's index with the
    index of the right-hand side using ``join``.
    """
    s = Series(["a", "b", "c", "d"])
    t = Series(["d", "a", "e", "b"], index=[3, 0, 4, 1])
    d = concat([t, t], axis=1)

    expected_outer = Series(["aaa", "bbb", "c--", "ddd", "-ee"])
    expected = expected_outer.loc[s.index.join(t.index, how=join)]

    # list of Series
    result = s.str.cat([t, t], join=join, na_rep="-")
    tm.assert_series_equal(result, expected)

    # DataFrame
    result = s.str.cat(d, join=join, na_rep="-")
    tm.assert_series_equal(result, expected)

    # mixed list of indexed/unindexed
    u = np.array(["A", "B", "C", "D"])
    expected_outer = Series(["aaA", "bbB", "c-C", "ddD", "-e-"])
    # joint index of rhs [t, u]; u will be forced have index of s
    if join == "inner":
        rhs_idx = t.index.intersection(s.index)
    elif join == "outer":
        rhs_idx = t.index.union(s.index)
    else:
        rhs_idx = t.index.append(s.index.difference(t.index))

    expected = expected_outer.loc[s.index.join(rhs_idx, how=join)]
    result = s.str.cat([t, u], join=join, na_rep="-")
    tm.assert_series_equal(result, expected)

    with pytest.raises(TypeError, match="others must be Series,.*"):
        # nested lists are forbidden
        s.str.cat([t, list(u)], join=join)

    # errors for incorrect lengths
    rgx = r"If `others` contains arrays or lists \(or other list-likes.*"
    z = Series(["1", "2", "3"]).values

    # unindexed object of wrong length
    with pytest.raises(ValueError, match=rgx):
        s.str.cat(z, join=join)

    # unindexed object of wrong length in list
    with pytest.raises(ValueError, match=rgx):
        s.str.cat([t, z], join=join)
def test_str_cat_all_na(index_or_series, index_or_series2):
    """``str.cat`` yields all-NaN output when the caller or the other is all-NaN."""
    # GH 24044
    box = index_or_series
    other = index_or_series2

    # check that all NaNs in caller / target work
    s = Index(["a", "b", "c", "d"])
    s = s if box is Index else Series(s, index=s)
    t = other([np.nan] * 4, dtype=object)
    # add index of s for alignment
    t = t if other is Index else Series(t, index=s)

    # all-NA target
    if box is Series:
        expected = Series([np.nan] * 4, index=s.index, dtype=object)
    else:  # box is Index
        expected = Index([np.nan] * 4, dtype=object)
    result = s.str.cat(t, join="left")
    assert_series_or_index_equal(result, expected)

    # all-NA caller (only for Series)
    if other is Series:
        expected = Series([np.nan] * 4, dtype=object, index=t.index)
        result = t.str.cat(s, join="left")
        tm.assert_series_equal(result, expected)
def test_str_cat_special_cases():
    """``str.cat`` with an iterator of mixed others and with a right join."""
    s = Series(["a", "b", "c", "d"])
    t = Series(["d", "a", "e", "b"], index=[3, 0, 4, 1])

    # iterator of elements with different types
    expected = Series(["aaa", "bbb", "c-c", "ddd", "-e-"])
    result = s.str.cat(iter([t, s.values]), join="outer", na_rep="-")
    tm.assert_series_equal(result, expected)

    # right-align with different indexes in others
    expected = Series(["aa-", "d-d"], index=[0, 3])
    result = s.str.cat([t.loc[[0]], t.loc[[3]]], join="right", na_rep="-")
    tm.assert_series_equal(result, expected)
def test_cat_on_filtered_index():
    """``str.cat`` aligns correctly on a non-contiguous index left by filtering."""
    df = DataFrame(
        index=MultiIndex.from_product(
            [[2011, 2012], [1, 2, 3]], names=["year", "month"]
        )
    )

    df = df.reset_index()
    # Filtering drops rows, so the remaining integer index has gaps.
    df = df[df.month > 1]

    str_year = df.year.astype("str")
    str_month = df.month.astype("str")
    str_both = str_year.str.cat(str_month, sep=" ")

    assert str_both.loc[1] == "2011 2"

    str_multiple = str_year.str.cat([str_month, str_month], sep=" ")

    assert str_multiple.loc[1] == "2011 2 2"
@pytest.mark.parametrize("klass", [tuple, list, np.array, Series, Index])
def test_cat_different_classes(klass):
    """``str.cat`` accepts tuple, list, ndarray, Series and Index as ``others``."""
    # https://github.com/pandas-dev/pandas/issues/33425
    s = Series(["a", "b", "c"])
    result = s.str.cat(klass(["x", "y", "z"]))
    expected = Series(["ax", "by", "cz"])
    tm.assert_series_equal(result, expected)
def test_cat_on_series_dot_str():
    """Passing the ``.str`` accessor itself as ``others`` raises ``TypeError``."""
    # GH 28277
    ps = Series(["AbC", "de", "FGHI", "j", "kLLLm"])

    # The message contains regex metacharacters, hence ``re.escape``.
    message = re.escape(
        "others must be Series, Index, DataFrame, np.ndarray "
        "or list-like (either containing only strings or "
        "containing only objects of type Series/Index/"
        "np.ndarray[1-dim])"
    )
    with pytest.raises(TypeError, match=message):
        ps.str.cat(others=ps.str)
|