test_crosstab.py 30 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850
  1. import numpy as np
  2. import pytest
  3. from pandas.core.dtypes.common import is_categorical_dtype
  4. import pandas as pd
  5. from pandas import (
  6. CategoricalIndex,
  7. DataFrame,
  8. Index,
  9. MultiIndex,
  10. Series,
  11. crosstab,
  12. )
  13. import pandas._testing as tm
  14. @pytest.fixture
  15. def df():
  16. df = DataFrame(
  17. {
  18. "A": [
  19. "foo",
  20. "foo",
  21. "foo",
  22. "foo",
  23. "bar",
  24. "bar",
  25. "bar",
  26. "bar",
  27. "foo",
  28. "foo",
  29. "foo",
  30. ],
  31. "B": [
  32. "one",
  33. "one",
  34. "one",
  35. "two",
  36. "one",
  37. "one",
  38. "one",
  39. "two",
  40. "two",
  41. "two",
  42. "one",
  43. ],
  44. "C": [
  45. "dull",
  46. "dull",
  47. "shiny",
  48. "dull",
  49. "dull",
  50. "shiny",
  51. "shiny",
  52. "dull",
  53. "shiny",
  54. "shiny",
  55. "shiny",
  56. ],
  57. "D": np.random.randn(11),
  58. "E": np.random.randn(11),
  59. "F": np.random.randn(11),
  60. }
  61. )
  62. return pd.concat([df, df], ignore_index=True)
  63. class TestCrosstab:
  64. def test_crosstab_single(self, df):
  65. result = crosstab(df["A"], df["C"])
  66. expected = df.groupby(["A", "C"]).size().unstack()
  67. tm.assert_frame_equal(result, expected.fillna(0).astype(np.int64))
  68. def test_crosstab_multiple(self, df):
  69. result = crosstab(df["A"], [df["B"], df["C"]])
  70. expected = df.groupby(["A", "B", "C"]).size()
  71. expected = expected.unstack("B").unstack("C").fillna(0).astype(np.int64)
  72. tm.assert_frame_equal(result, expected)
  73. result = crosstab([df["B"], df["C"]], df["A"])
  74. expected = df.groupby(["B", "C", "A"]).size()
  75. expected = expected.unstack("A").fillna(0).astype(np.int64)
  76. tm.assert_frame_equal(result, expected)
  77. @pytest.mark.parametrize("box", [np.array, list, tuple])
  78. def test_crosstab_ndarray(self, box):
  79. # GH 44076
  80. a = box(np.random.randint(0, 5, size=100))
  81. b = box(np.random.randint(0, 3, size=100))
  82. c = box(np.random.randint(0, 10, size=100))
  83. df = DataFrame({"a": a, "b": b, "c": c})
  84. result = crosstab(a, [b, c], rownames=["a"], colnames=("b", "c"))
  85. expected = crosstab(df["a"], [df["b"], df["c"]])
  86. tm.assert_frame_equal(result, expected)
  87. result = crosstab([b, c], a, colnames=["a"], rownames=("b", "c"))
  88. expected = crosstab([df["b"], df["c"]], df["a"])
  89. tm.assert_frame_equal(result, expected)
  90. # assign arbitrary names
  91. result = crosstab(a, c)
  92. expected = crosstab(df["a"], df["c"])
  93. expected.index.names = ["row_0"]
  94. expected.columns.names = ["col_0"]
  95. tm.assert_frame_equal(result, expected)
  96. def test_crosstab_non_aligned(self):
  97. # GH 17005
  98. a = Series([0, 1, 1], index=["a", "b", "c"])
  99. b = Series([3, 4, 3, 4, 3], index=["a", "b", "c", "d", "f"])
  100. c = np.array([3, 4, 3], dtype=np.int64)
  101. expected = DataFrame(
  102. [[1, 0], [1, 1]],
  103. index=Index([0, 1], name="row_0"),
  104. columns=Index([3, 4], name="col_0"),
  105. )
  106. result = crosstab(a, b)
  107. tm.assert_frame_equal(result, expected)
  108. result = crosstab(a, c)
  109. tm.assert_frame_equal(result, expected)
  110. def test_crosstab_margins(self):
  111. a = np.random.randint(0, 7, size=100)
  112. b = np.random.randint(0, 3, size=100)
  113. c = np.random.randint(0, 5, size=100)
  114. df = DataFrame({"a": a, "b": b, "c": c})
  115. result = crosstab(a, [b, c], rownames=["a"], colnames=("b", "c"), margins=True)
  116. assert result.index.names == ("a",)
  117. assert result.columns.names == ["b", "c"]
  118. all_cols = result["All", ""]
  119. exp_cols = df.groupby(["a"]).size().astype("i8")
  120. # to keep index.name
  121. exp_margin = Series([len(df)], index=Index(["All"], name="a"))
  122. exp_cols = pd.concat([exp_cols, exp_margin])
  123. exp_cols.name = ("All", "")
  124. tm.assert_series_equal(all_cols, exp_cols)
  125. all_rows = result.loc["All"]
  126. exp_rows = df.groupby(["b", "c"]).size().astype("i8")
  127. exp_rows = pd.concat([exp_rows, Series([len(df)], index=[("All", "")])])
  128. exp_rows.name = "All"
  129. exp_rows = exp_rows.reindex(all_rows.index)
  130. exp_rows = exp_rows.fillna(0).astype(np.int64)
  131. tm.assert_series_equal(all_rows, exp_rows)
  132. def test_crosstab_margins_set_margin_name(self):
  133. # GH 15972
  134. a = np.random.randint(0, 7, size=100)
  135. b = np.random.randint(0, 3, size=100)
  136. c = np.random.randint(0, 5, size=100)
  137. df = DataFrame({"a": a, "b": b, "c": c})
  138. result = crosstab(
  139. a,
  140. [b, c],
  141. rownames=["a"],
  142. colnames=("b", "c"),
  143. margins=True,
  144. margins_name="TOTAL",
  145. )
  146. assert result.index.names == ("a",)
  147. assert result.columns.names == ["b", "c"]
  148. all_cols = result["TOTAL", ""]
  149. exp_cols = df.groupby(["a"]).size().astype("i8")
  150. # to keep index.name
  151. exp_margin = Series([len(df)], index=Index(["TOTAL"], name="a"))
  152. exp_cols = pd.concat([exp_cols, exp_margin])
  153. exp_cols.name = ("TOTAL", "")
  154. tm.assert_series_equal(all_cols, exp_cols)
  155. all_rows = result.loc["TOTAL"]
  156. exp_rows = df.groupby(["b", "c"]).size().astype("i8")
  157. exp_rows = pd.concat([exp_rows, Series([len(df)], index=[("TOTAL", "")])])
  158. exp_rows.name = "TOTAL"
  159. exp_rows = exp_rows.reindex(all_rows.index)
  160. exp_rows = exp_rows.fillna(0).astype(np.int64)
  161. tm.assert_series_equal(all_rows, exp_rows)
  162. msg = "margins_name argument must be a string"
  163. for margins_name in [666, None, ["a", "b"]]:
  164. with pytest.raises(ValueError, match=msg):
  165. crosstab(
  166. a,
  167. [b, c],
  168. rownames=["a"],
  169. colnames=("b", "c"),
  170. margins=True,
  171. margins_name=margins_name,
  172. )
  173. def test_crosstab_pass_values(self):
  174. a = np.random.randint(0, 7, size=100)
  175. b = np.random.randint(0, 3, size=100)
  176. c = np.random.randint(0, 5, size=100)
  177. values = np.random.randn(100)
  178. table = crosstab(
  179. [a, b], c, values, aggfunc=np.sum, rownames=["foo", "bar"], colnames=["baz"]
  180. )
  181. df = DataFrame({"foo": a, "bar": b, "baz": c, "values": values})
  182. expected = df.pivot_table(
  183. "values", index=["foo", "bar"], columns="baz", aggfunc=np.sum
  184. )
  185. tm.assert_frame_equal(table, expected)
  186. def test_crosstab_dropna(self):
  187. # GH 3820
  188. a = np.array(["foo", "foo", "foo", "bar", "bar", "foo", "foo"], dtype=object)
  189. b = np.array(["one", "one", "two", "one", "two", "two", "two"], dtype=object)
  190. c = np.array(
  191. ["dull", "dull", "dull", "dull", "dull", "shiny", "shiny"], dtype=object
  192. )
  193. res = crosstab(a, [b, c], rownames=["a"], colnames=["b", "c"], dropna=False)
  194. m = MultiIndex.from_tuples(
  195. [("one", "dull"), ("one", "shiny"), ("two", "dull"), ("two", "shiny")],
  196. names=["b", "c"],
  197. )
  198. tm.assert_index_equal(res.columns, m)
  199. def test_crosstab_no_overlap(self):
  200. # GS 10291
  201. s1 = Series([1, 2, 3], index=[1, 2, 3])
  202. s2 = Series([4, 5, 6], index=[4, 5, 6])
  203. actual = crosstab(s1, s2)
  204. expected = DataFrame(
  205. index=Index([], dtype="int64", name="row_0"),
  206. columns=Index([], dtype="int64", name="col_0"),
  207. )
  208. tm.assert_frame_equal(actual, expected)
  209. def test_margin_dropna(self):
  210. # GH 12577
  211. # pivot_table counts null into margin ('All')
  212. # when margins=true and dropna=true
  213. df = DataFrame({"a": [1, 2, 2, 2, 2, np.nan], "b": [3, 3, 4, 4, 4, 4]})
  214. actual = crosstab(df.a, df.b, margins=True, dropna=True)
  215. expected = DataFrame([[1, 0, 1], [1, 3, 4], [2, 3, 5]])
  216. expected.index = Index([1.0, 2.0, "All"], name="a")
  217. expected.columns = Index([3, 4, "All"], name="b")
  218. tm.assert_frame_equal(actual, expected)
  219. def test_margin_dropna2(self):
  220. df = DataFrame(
  221. {"a": [1, np.nan, np.nan, np.nan, 2, np.nan], "b": [3, np.nan, 4, 4, 4, 4]}
  222. )
  223. actual = crosstab(df.a, df.b, margins=True, dropna=True)
  224. expected = DataFrame([[1, 0, 1], [0, 1, 1], [1, 1, 2]])
  225. expected.index = Index([1.0, 2.0, "All"], name="a")
  226. expected.columns = Index([3.0, 4.0, "All"], name="b")
  227. tm.assert_frame_equal(actual, expected)
  228. def test_margin_dropna3(self):
  229. df = DataFrame(
  230. {"a": [1, np.nan, np.nan, np.nan, np.nan, 2], "b": [3, 3, 4, 4, 4, 4]}
  231. )
  232. actual = crosstab(df.a, df.b, margins=True, dropna=True)
  233. expected = DataFrame([[1, 0, 1], [0, 1, 1], [1, 1, 2]])
  234. expected.index = Index([1.0, 2.0, "All"], name="a")
  235. expected.columns = Index([3, 4, "All"], name="b")
  236. tm.assert_frame_equal(actual, expected)
  237. def test_margin_dropna4(self):
  238. # GH 12642
  239. # _add_margins raises KeyError: Level None not found
  240. # when margins=True and dropna=False
  241. df = DataFrame({"a": [1, 2, 2, 2, 2, np.nan], "b": [3, 3, 4, 4, 4, 4]})
  242. actual = crosstab(df.a, df.b, margins=True, dropna=False)
  243. expected = DataFrame([[1, 0, 1], [1, 3, 4], [2, 4, 6]])
  244. expected.index = Index([1.0, 2.0, "All"], name="a")
  245. expected.columns = Index([3, 4, "All"], name="b")
  246. tm.assert_frame_equal(actual, expected)
  247. def test_margin_dropna5(self):
  248. df = DataFrame(
  249. {"a": [1, np.nan, np.nan, np.nan, 2, np.nan], "b": [3, np.nan, 4, 4, 4, 4]}
  250. )
  251. actual = crosstab(df.a, df.b, margins=True, dropna=False)
  252. expected = DataFrame([[1, 0, 1], [0, 1, 1], [1, 4, 6]])
  253. expected.index = Index([1.0, 2.0, "All"], name="a")
  254. expected.columns = Index([3.0, 4.0, "All"], name="b")
  255. tm.assert_frame_equal(actual, expected)
  256. def test_margin_dropna6(self):
  257. a = np.array(["foo", "foo", "foo", "bar", "bar", "foo", "foo"], dtype=object)
  258. b = np.array(["one", "one", "two", "one", "two", np.nan, "two"], dtype=object)
  259. c = np.array(
  260. ["dull", "dull", "dull", "dull", "dull", "shiny", "shiny"], dtype=object
  261. )
  262. actual = crosstab(
  263. a, [b, c], rownames=["a"], colnames=["b", "c"], margins=True, dropna=False
  264. )
  265. m = MultiIndex.from_arrays(
  266. [
  267. ["one", "one", "two", "two", "All"],
  268. ["dull", "shiny", "dull", "shiny", ""],
  269. ],
  270. names=["b", "c"],
  271. )
  272. expected = DataFrame(
  273. [[1, 0, 1, 0, 2], [2, 0, 1, 1, 5], [3, 0, 2, 1, 7]], columns=m
  274. )
  275. expected.index = Index(["bar", "foo", "All"], name="a")
  276. tm.assert_frame_equal(actual, expected)
  277. actual = crosstab(
  278. [a, b], c, rownames=["a", "b"], colnames=["c"], margins=True, dropna=False
  279. )
  280. m = MultiIndex.from_arrays(
  281. [["bar", "bar", "foo", "foo", "All"], ["one", "two", "one", "two", ""]],
  282. names=["a", "b"],
  283. )
  284. expected = DataFrame(
  285. [[1, 0, 1], [1, 0, 1], [2, 0, 2], [1, 1, 2], [5, 2, 7]], index=m
  286. )
  287. expected.columns = Index(["dull", "shiny", "All"], name="c")
  288. tm.assert_frame_equal(actual, expected)
  289. actual = crosstab(
  290. [a, b], c, rownames=["a", "b"], colnames=["c"], margins=True, dropna=True
  291. )
  292. m = MultiIndex.from_arrays(
  293. [["bar", "bar", "foo", "foo", "All"], ["one", "two", "one", "two", ""]],
  294. names=["a", "b"],
  295. )
  296. expected = DataFrame(
  297. [[1, 0, 1], [1, 0, 1], [2, 0, 2], [1, 1, 2], [5, 1, 6]], index=m
  298. )
  299. expected.columns = Index(["dull", "shiny", "All"], name="c")
  300. tm.assert_frame_equal(actual, expected)
  301. def test_crosstab_normalize(self):
  302. # Issue 12578
  303. df = DataFrame(
  304. {"a": [1, 2, 2, 2, 2], "b": [3, 3, 4, 4, 4], "c": [1, 1, np.nan, 1, 1]}
  305. )
  306. rindex = Index([1, 2], name="a")
  307. cindex = Index([3, 4], name="b")
  308. full_normal = DataFrame([[0.2, 0], [0.2, 0.6]], index=rindex, columns=cindex)
  309. row_normal = DataFrame([[1.0, 0], [0.25, 0.75]], index=rindex, columns=cindex)
  310. col_normal = DataFrame([[0.5, 0], [0.5, 1.0]], index=rindex, columns=cindex)
  311. # Check all normalize args
  312. tm.assert_frame_equal(crosstab(df.a, df.b, normalize="all"), full_normal)
  313. tm.assert_frame_equal(crosstab(df.a, df.b, normalize=True), full_normal)
  314. tm.assert_frame_equal(crosstab(df.a, df.b, normalize="index"), row_normal)
  315. tm.assert_frame_equal(crosstab(df.a, df.b, normalize="columns"), col_normal)
  316. tm.assert_frame_equal(
  317. crosstab(df.a, df.b, normalize=1),
  318. crosstab(df.a, df.b, normalize="columns"),
  319. )
  320. tm.assert_frame_equal(
  321. crosstab(df.a, df.b, normalize=0), crosstab(df.a, df.b, normalize="index")
  322. )
  323. row_normal_margins = DataFrame(
  324. [[1.0, 0], [0.25, 0.75], [0.4, 0.6]],
  325. index=Index([1, 2, "All"], name="a", dtype="object"),
  326. columns=Index([3, 4], name="b", dtype="object"),
  327. )
  328. col_normal_margins = DataFrame(
  329. [[0.5, 0, 0.2], [0.5, 1.0, 0.8]],
  330. index=Index([1, 2], name="a", dtype="object"),
  331. columns=Index([3, 4, "All"], name="b", dtype="object"),
  332. )
  333. all_normal_margins = DataFrame(
  334. [[0.2, 0, 0.2], [0.2, 0.6, 0.8], [0.4, 0.6, 1]],
  335. index=Index([1, 2, "All"], name="a", dtype="object"),
  336. columns=Index([3, 4, "All"], name="b", dtype="object"),
  337. )
  338. tm.assert_frame_equal(
  339. crosstab(df.a, df.b, normalize="index", margins=True), row_normal_margins
  340. )
  341. tm.assert_frame_equal(
  342. crosstab(df.a, df.b, normalize="columns", margins=True), col_normal_margins
  343. )
  344. tm.assert_frame_equal(
  345. crosstab(df.a, df.b, normalize=True, margins=True), all_normal_margins
  346. )
  347. def test_crosstab_normalize_arrays(self):
  348. # GH#12578
  349. df = DataFrame(
  350. {"a": [1, 2, 2, 2, 2], "b": [3, 3, 4, 4, 4], "c": [1, 1, np.nan, 1, 1]}
  351. )
  352. # Test arrays
  353. crosstab(
  354. [np.array([1, 1, 2, 2]), np.array([1, 2, 1, 2])], np.array([1, 2, 1, 2])
  355. )
  356. # Test with aggfunc
  357. norm_counts = DataFrame(
  358. [[0.25, 0, 0.25], [0.25, 0.5, 0.75], [0.5, 0.5, 1]],
  359. index=Index([1, 2, "All"], name="a", dtype="object"),
  360. columns=Index([3, 4, "All"], name="b"),
  361. )
  362. test_case = crosstab(
  363. df.a, df.b, df.c, aggfunc="count", normalize="all", margins=True
  364. )
  365. tm.assert_frame_equal(test_case, norm_counts)
  366. df = DataFrame(
  367. {"a": [1, 2, 2, 2, 2], "b": [3, 3, 4, 4, 4], "c": [0, 4, np.nan, 3, 3]}
  368. )
  369. norm_sum = DataFrame(
  370. [[0, 0, 0.0], [0.4, 0.6, 1], [0.4, 0.6, 1]],
  371. index=Index([1, 2, "All"], name="a", dtype="object"),
  372. columns=Index([3, 4, "All"], name="b", dtype="object"),
  373. )
  374. test_case = crosstab(
  375. df.a, df.b, df.c, aggfunc=np.sum, normalize="all", margins=True
  376. )
  377. tm.assert_frame_equal(test_case, norm_sum)
  378. def test_crosstab_with_empties(self, using_array_manager):
  379. # Check handling of empties
  380. df = DataFrame(
  381. {
  382. "a": [1, 2, 2, 2, 2],
  383. "b": [3, 3, 4, 4, 4],
  384. "c": [np.nan, np.nan, np.nan, np.nan, np.nan],
  385. }
  386. )
  387. empty = DataFrame(
  388. [[0.0, 0.0], [0.0, 0.0]],
  389. index=Index([1, 2], name="a", dtype="int64"),
  390. columns=Index([3, 4], name="b"),
  391. )
  392. for i in [True, "index", "columns"]:
  393. calculated = crosstab(df.a, df.b, values=df.c, aggfunc="count", normalize=i)
  394. tm.assert_frame_equal(empty, calculated)
  395. nans = DataFrame(
  396. [[0.0, np.nan], [0.0, 0.0]],
  397. index=Index([1, 2], name="a", dtype="int64"),
  398. columns=Index([3, 4], name="b"),
  399. )
  400. if using_array_manager:
  401. # INFO(ArrayManager) column without NaNs can preserve int dtype
  402. nans[3] = nans[3].astype("int64")
  403. calculated = crosstab(df.a, df.b, values=df.c, aggfunc="count", normalize=False)
  404. tm.assert_frame_equal(nans, calculated)
  405. def test_crosstab_errors(self):
  406. # Issue 12578
  407. df = DataFrame(
  408. {"a": [1, 2, 2, 2, 2], "b": [3, 3, 4, 4, 4], "c": [1, 1, np.nan, 1, 1]}
  409. )
  410. error = "values cannot be used without an aggfunc."
  411. with pytest.raises(ValueError, match=error):
  412. crosstab(df.a, df.b, values=df.c)
  413. error = "aggfunc cannot be used without values"
  414. with pytest.raises(ValueError, match=error):
  415. crosstab(df.a, df.b, aggfunc=np.mean)
  416. error = "Not a valid normalize argument"
  417. with pytest.raises(ValueError, match=error):
  418. crosstab(df.a, df.b, normalize="42")
  419. with pytest.raises(ValueError, match=error):
  420. crosstab(df.a, df.b, normalize=42)
  421. error = "Not a valid margins argument"
  422. with pytest.raises(ValueError, match=error):
  423. crosstab(df.a, df.b, normalize="all", margins=42)
  424. def test_crosstab_with_categorial_columns(self):
  425. # GH 8860
  426. df = DataFrame(
  427. {
  428. "MAKE": ["Honda", "Acura", "Tesla", "Honda", "Honda", "Acura"],
  429. "MODEL": ["Sedan", "Sedan", "Electric", "Pickup", "Sedan", "Sedan"],
  430. }
  431. )
  432. categories = ["Sedan", "Electric", "Pickup"]
  433. df["MODEL"] = df["MODEL"].astype("category").cat.set_categories(categories)
  434. result = crosstab(df["MAKE"], df["MODEL"])
  435. expected_index = Index(["Acura", "Honda", "Tesla"], name="MAKE")
  436. expected_columns = CategoricalIndex(
  437. categories, categories=categories, ordered=False, name="MODEL"
  438. )
  439. expected_data = [[2, 0, 0], [2, 0, 1], [0, 1, 0]]
  440. expected = DataFrame(
  441. expected_data, index=expected_index, columns=expected_columns
  442. )
  443. tm.assert_frame_equal(result, expected)
  444. def test_crosstab_with_numpy_size(self):
  445. # GH 4003
  446. df = DataFrame(
  447. {
  448. "A": ["one", "one", "two", "three"] * 6,
  449. "B": ["A", "B", "C"] * 8,
  450. "C": ["foo", "foo", "foo", "bar", "bar", "bar"] * 4,
  451. "D": np.random.randn(24),
  452. "E": np.random.randn(24),
  453. }
  454. )
  455. result = crosstab(
  456. index=[df["A"], df["B"]],
  457. columns=[df["C"]],
  458. margins=True,
  459. aggfunc=np.size,
  460. values=df["D"],
  461. )
  462. expected_index = MultiIndex(
  463. levels=[["All", "one", "three", "two"], ["", "A", "B", "C"]],
  464. codes=[[1, 1, 1, 2, 2, 2, 3, 3, 3, 0], [1, 2, 3, 1, 2, 3, 1, 2, 3, 0]],
  465. names=["A", "B"],
  466. )
  467. expected_column = Index(["bar", "foo", "All"], dtype="object", name="C")
  468. expected_data = np.array(
  469. [
  470. [2.0, 2.0, 4.0],
  471. [2.0, 2.0, 4.0],
  472. [2.0, 2.0, 4.0],
  473. [2.0, np.nan, 2.0],
  474. [np.nan, 2.0, 2.0],
  475. [2.0, np.nan, 2.0],
  476. [np.nan, 2.0, 2.0],
  477. [2.0, np.nan, 2.0],
  478. [np.nan, 2.0, 2.0],
  479. [12.0, 12.0, 24.0],
  480. ]
  481. )
  482. expected = DataFrame(
  483. expected_data, index=expected_index, columns=expected_column
  484. )
  485. # aggfunc is np.size, resulting in integers
  486. expected["All"] = expected["All"].astype("int64")
  487. tm.assert_frame_equal(result, expected)
  488. def test_crosstab_duplicate_names(self):
  489. # GH 13279 / 22529
  490. s1 = Series(range(3), name="foo")
  491. s2_foo = Series(range(1, 4), name="foo")
  492. s2_bar = Series(range(1, 4), name="bar")
  493. s3 = Series(range(3), name="waldo")
  494. # check result computed with duplicate labels against
  495. # result computed with unique labels, then relabelled
  496. mapper = {"bar": "foo"}
  497. # duplicate row, column labels
  498. result = crosstab(s1, s2_foo)
  499. expected = crosstab(s1, s2_bar).rename_axis(columns=mapper, axis=1)
  500. tm.assert_frame_equal(result, expected)
  501. # duplicate row, unique column labels
  502. result = crosstab([s1, s2_foo], s3)
  503. expected = crosstab([s1, s2_bar], s3).rename_axis(index=mapper, axis=0)
  504. tm.assert_frame_equal(result, expected)
  505. # unique row, duplicate column labels
  506. result = crosstab(s3, [s1, s2_foo])
  507. expected = crosstab(s3, [s1, s2_bar]).rename_axis(columns=mapper, axis=1)
  508. tm.assert_frame_equal(result, expected)
  509. @pytest.mark.parametrize("names", [["a", ("b", "c")], [("a", "b"), "c"]])
  510. def test_crosstab_tuple_name(self, names):
  511. s1 = Series(range(3), name=names[0])
  512. s2 = Series(range(1, 4), name=names[1])
  513. mi = MultiIndex.from_arrays([range(3), range(1, 4)], names=names)
  514. expected = Series(1, index=mi).unstack(1, fill_value=0)
  515. result = crosstab(s1, s2)
  516. tm.assert_frame_equal(result, expected)
  517. def test_crosstab_both_tuple_names(self):
  518. # GH 18321
  519. s1 = Series(range(3), name=("a", "b"))
  520. s2 = Series(range(3), name=("c", "d"))
  521. expected = DataFrame(
  522. np.eye(3, dtype="int64"),
  523. index=Index(range(3), name=("a", "b")),
  524. columns=Index(range(3), name=("c", "d")),
  525. )
  526. result = crosstab(s1, s2)
  527. tm.assert_frame_equal(result, expected)
  528. def test_crosstab_unsorted_order(self):
  529. df = DataFrame({"b": [3, 1, 2], "a": [5, 4, 6]}, index=["C", "A", "B"])
  530. result = crosstab(df.index, [df.b, df.a])
  531. e_idx = Index(["A", "B", "C"], name="row_0")
  532. e_columns = MultiIndex.from_tuples([(1, 4), (2, 6), (3, 5)], names=["b", "a"])
  533. expected = DataFrame(
  534. [[1, 0, 0], [0, 1, 0], [0, 0, 1]], index=e_idx, columns=e_columns
  535. )
  536. tm.assert_frame_equal(result, expected)
  537. def test_crosstab_normalize_multiple_columns(self):
  538. # GH 15150
  539. df = DataFrame(
  540. {
  541. "A": ["one", "one", "two", "three"] * 6,
  542. "B": ["A", "B", "C"] * 8,
  543. "C": ["foo", "foo", "foo", "bar", "bar", "bar"] * 4,
  544. "D": [0] * 24,
  545. "E": [0] * 24,
  546. }
  547. )
  548. result = crosstab(
  549. [df.A, df.B],
  550. df.C,
  551. values=df.D,
  552. aggfunc=np.sum,
  553. normalize=True,
  554. margins=True,
  555. )
  556. expected = DataFrame(
  557. np.array([0] * 29 + [1], dtype=float).reshape(10, 3),
  558. columns=Index(["bar", "foo", "All"], dtype="object", name="C"),
  559. index=MultiIndex.from_tuples(
  560. [
  561. ("one", "A"),
  562. ("one", "B"),
  563. ("one", "C"),
  564. ("three", "A"),
  565. ("three", "B"),
  566. ("three", "C"),
  567. ("two", "A"),
  568. ("two", "B"),
  569. ("two", "C"),
  570. ("All", ""),
  571. ],
  572. names=["A", "B"],
  573. ),
  574. )
  575. tm.assert_frame_equal(result, expected)
  576. def test_margin_normalize(self):
  577. # GH 27500
  578. df = DataFrame(
  579. {
  580. "A": ["foo", "foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar"],
  581. "B": ["one", "one", "one", "two", "two", "one", "one", "two", "two"],
  582. "C": [
  583. "small",
  584. "large",
  585. "large",
  586. "small",
  587. "small",
  588. "large",
  589. "small",
  590. "small",
  591. "large",
  592. ],
  593. "D": [1, 2, 2, 3, 3, 4, 5, 6, 7],
  594. "E": [2, 4, 5, 5, 6, 6, 8, 9, 9],
  595. }
  596. )
  597. # normalize on index
  598. result = crosstab(
  599. [df.A, df.B], df.C, margins=True, margins_name="Sub-Total", normalize=0
  600. )
  601. expected = DataFrame(
  602. [[0.5, 0.5], [0.5, 0.5], [0.666667, 0.333333], [0, 1], [0.444444, 0.555556]]
  603. )
  604. expected.index = MultiIndex(
  605. levels=[["Sub-Total", "bar", "foo"], ["", "one", "two"]],
  606. codes=[[1, 1, 2, 2, 0], [1, 2, 1, 2, 0]],
  607. names=["A", "B"],
  608. )
  609. expected.columns = Index(["large", "small"], dtype="object", name="C")
  610. tm.assert_frame_equal(result, expected)
  611. # normalize on columns
  612. result = crosstab(
  613. [df.A, df.B], df.C, margins=True, margins_name="Sub-Total", normalize=1
  614. )
  615. expected = DataFrame(
  616. [
  617. [0.25, 0.2, 0.222222],
  618. [0.25, 0.2, 0.222222],
  619. [0.5, 0.2, 0.333333],
  620. [0, 0.4, 0.222222],
  621. ]
  622. )
  623. expected.columns = Index(
  624. ["large", "small", "Sub-Total"], dtype="object", name="C"
  625. )
  626. expected.index = MultiIndex(
  627. levels=[["bar", "foo"], ["one", "two"]],
  628. codes=[[0, 0, 1, 1], [0, 1, 0, 1]],
  629. names=["A", "B"],
  630. )
  631. tm.assert_frame_equal(result, expected)
  632. # normalize on both index and column
  633. result = crosstab(
  634. [df.A, df.B], df.C, margins=True, margins_name="Sub-Total", normalize=True
  635. )
  636. expected = DataFrame(
  637. [
  638. [0.111111, 0.111111, 0.222222],
  639. [0.111111, 0.111111, 0.222222],
  640. [0.222222, 0.111111, 0.333333],
  641. [0.000000, 0.222222, 0.222222],
  642. [0.444444, 0.555555, 1],
  643. ]
  644. )
  645. expected.columns = Index(
  646. ["large", "small", "Sub-Total"], dtype="object", name="C"
  647. )
  648. expected.index = MultiIndex(
  649. levels=[["Sub-Total", "bar", "foo"], ["", "one", "two"]],
  650. codes=[[1, 1, 2, 2, 0], [1, 2, 1, 2, 0]],
  651. names=["A", "B"],
  652. )
  653. tm.assert_frame_equal(result, expected)
  654. def test_margin_normalize_multiple_columns(self):
  655. # GH 35144
  656. # use multiple columns with margins and normalization
  657. df = DataFrame(
  658. {
  659. "A": ["foo", "foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar"],
  660. "B": ["one", "one", "one", "two", "two", "one", "one", "two", "two"],
  661. "C": [
  662. "small",
  663. "large",
  664. "large",
  665. "small",
  666. "small",
  667. "large",
  668. "small",
  669. "small",
  670. "large",
  671. ],
  672. "D": [1, 2, 2, 3, 3, 4, 5, 6, 7],
  673. "E": [2, 4, 5, 5, 6, 6, 8, 9, 9],
  674. }
  675. )
  676. result = crosstab(
  677. index=df.C,
  678. columns=[df.A, df.B],
  679. margins=True,
  680. margins_name="margin",
  681. normalize=True,
  682. )
  683. expected = DataFrame(
  684. [
  685. [0.111111, 0.111111, 0.222222, 0.000000, 0.444444],
  686. [0.111111, 0.111111, 0.111111, 0.222222, 0.555556],
  687. [0.222222, 0.222222, 0.333333, 0.222222, 1.0],
  688. ],
  689. index=["large", "small", "margin"],
  690. )
  691. expected.columns = MultiIndex(
  692. levels=[["bar", "foo", "margin"], ["", "one", "two"]],
  693. codes=[[0, 0, 1, 1, 2], [1, 2, 1, 2, 0]],
  694. names=["A", "B"],
  695. )
  696. expected.index.name = "C"
  697. tm.assert_frame_equal(result, expected)
  698. def test_margin_support_Float(self):
  699. # GH 50313
  700. # use Float64 formats and function aggfunc with margins
  701. df = DataFrame(
  702. {"A": [1, 2, 2, 1], "B": [3, 3, 4, 5], "C": [-1.0, 10.0, 1.0, 10.0]},
  703. dtype="Float64",
  704. )
  705. result = crosstab(
  706. df["A"],
  707. df["B"],
  708. values=df["C"],
  709. aggfunc="sum",
  710. margins=True,
  711. )
  712. expected = DataFrame(
  713. [
  714. [-1.0, pd.NA, 10.0, 9.0],
  715. [10.0, 1.0, pd.NA, 11.0],
  716. [9.0, 1.0, 10.0, 20.0],
  717. ],
  718. index=Index([1.0, 2.0, "All"], dtype="object", name="A"),
  719. columns=Index([3.0, 4.0, 5.0, "All"], dtype="object", name="B"),
  720. dtype="Float64",
  721. )
  722. tm.assert_frame_equal(result, expected)
  723. @pytest.mark.parametrize("a_dtype", ["category", "int64"])
  724. @pytest.mark.parametrize("b_dtype", ["category", "int64"])
  725. def test_categoricals(a_dtype, b_dtype):
  726. # https://github.com/pandas-dev/pandas/issues/37465
  727. g = np.random.RandomState(25982704)
  728. a = Series(g.randint(0, 3, size=100)).astype(a_dtype)
  729. b = Series(g.randint(0, 2, size=100)).astype(b_dtype)
  730. result = crosstab(a, b, margins=True, dropna=False)
  731. columns = Index([0, 1, "All"], dtype="object", name="col_0")
  732. index = Index([0, 1, 2, "All"], dtype="object", name="row_0")
  733. values = [[18, 16, 34], [18, 16, 34], [16, 16, 32], [52, 48, 100]]
  734. expected = DataFrame(values, index, columns)
  735. tm.assert_frame_equal(result, expected)
  736. # Verify when categorical does not have all values present
  737. a.loc[a == 1] = 2
  738. a_is_cat = is_categorical_dtype(a.dtype)
  739. assert not a_is_cat or a.value_counts().loc[1] == 0
  740. result = crosstab(a, b, margins=True, dropna=False)
  741. values = [[18, 16, 34], [0, 0, 0], [34, 32, 66], [52, 48, 100]]
  742. expected = DataFrame(values, index, columns)
  743. if not a_is_cat:
  744. expected = expected.loc[[0, 2, "All"]]
  745. expected["All"] = expected["All"].astype("int64")
  746. repr(result)
  747. repr(expected)
  748. repr(expected.loc[[0, 2, "All"]])
  749. tm.assert_frame_equal(result, expected)