  1. """
  2. these are systematically testing all of the args to value_counts
  3. with different size combinations. This is to ensure stability of the sorting
  4. and proper parameter handling
  5. """
from itertools import product

import numpy as np
import pytest

from pandas import (
    Categorical,
    CategoricalIndex,
    DataFrame,
    Grouper,
    Index,
    MultiIndex,
    Series,
    date_range,
    to_datetime,
)
import pandas._testing as tm
from pandas.util.version import Version

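# A quick orientation for the tests below (an illustrative sketch, not itself
# a test): groupby value_counts returns a Series indexed by a MultiIndex of
# the grouping key(s) plus the counted column(s), named "count" (or
# "proportion" when normalize=True) on recent pandas (2.x), e.g.
#
#   >>> df = DataFrame({"g": ["a", "a"], "v": [1, 2]})
#   >>> df.groupby("g")["v"].value_counts(sort=False)
#   g  v
#   a  1    1
#      2    1
#   Name: count, dtype: int64
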
def tests_value_counts_index_names_category_column():
    # GH44324 Missing name of index category column
    df = DataFrame(
        {
            "gender": ["female"],
            "country": ["US"],
        }
    )
    df["gender"] = df["gender"].astype("category")
    result = df.groupby("country")["gender"].value_counts()

    # Construct expected, very specific MultiIndex
    df_mi_expected = DataFrame([["US", "female"]], columns=["country", "gender"])
    df_mi_expected["gender"] = df_mi_expected["gender"].astype("category")
    mi_expected = MultiIndex.from_frame(df_mi_expected)
    expected = Series([1], index=mi_expected, name="count")

    tm.assert_series_equal(result, expected)


# our starting frame
def seed_df(seed_nans, n, m):
    np.random.seed(1234)
    days = date_range("2015-08-24", periods=10)

    frame = DataFrame(
        {
            "1st": np.random.choice(list("abcd"), n),
            "2nd": np.random.choice(days, n),
            "3rd": np.random.randint(1, m + 1, n),
        }
    )

    if seed_nans:
        # Explicitly cast to float to avoid implicit cast when setting nan
        frame["3rd"] = frame["3rd"].astype("float")
        frame.loc[1::11, "1st"] = np.nan
        frame.loc[3::17, "2nd"] = np.nan
        frame.loc[7::19, "3rd"] = np.nan
        frame.loc[8::19, "3rd"] = np.nan
        frame.loc[9::19, "3rd"] = np.nan
    return frame


# create input df, keys, and the bins
binned = []
ids = []
for seed_nans in [True, False]:
    for n, m in product((100, 1000), (5, 20)):
        df = seed_df(seed_nans, n, m)
        bins = None, np.arange(0, max(5, df["3rd"].max()) + 1, 2)
        keys = "1st", "2nd", ["1st", "2nd"]
        for k, b in product(keys, bins):
            binned.append((df, k, b, n, m))
            ids.append(f"{k}-{n}-{m}")


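# The grid above yields 2 (seed_nans) * 4 (n, m) * 3 (keys) * 2 (bins) = 48
# base cases; the parametrized flags below multiply these further.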
@pytest.mark.slow
@pytest.mark.parametrize("df, keys, bins, n, m", binned, ids=ids)
@pytest.mark.parametrize("isort", [True, False])
@pytest.mark.parametrize("normalize, name", [(True, "proportion"), (False, "count")])
@pytest.mark.parametrize("sort", [True, False])
@pytest.mark.parametrize("ascending", [True, False])
@pytest.mark.parametrize("dropna", [True, False])
def test_series_groupby_value_counts(
    df, keys, bins, n, m, isort, normalize, name, sort, ascending, dropna
):
    def rebuild_index(df):
        arr = list(map(df.index.get_level_values, range(df.index.nlevels)))
        df.index = MultiIndex.from_arrays(arr, names=df.index.names)
        return df

    kwargs = {
        "normalize": normalize,
        "sort": sort,
        "ascending": ascending,
        "dropna": dropna,
        "bins": bins,
    }

    gr = df.groupby(keys, sort=isort)
    left = gr["3rd"].value_counts(**kwargs)

    gr = df.groupby(keys, sort=isort)
    right = gr["3rd"].apply(Series.value_counts, **kwargs)
    right.index.names = right.index.names[:-1] + ["3rd"]
    # https://github.com/pandas-dev/pandas/issues/49909
    right = right.rename(name)

    # have to sort on index because of unstable sort on values
    left, right = map(rebuild_index, (left, right))  # xref GH9212
    tm.assert_series_equal(left.sort_index(), right.sort_index())


@pytest.mark.parametrize("utc", [True, False])
def test_series_groupby_value_counts_with_grouper(utc):
    # GH28479
    df = DataFrame(
        {
            "Timestamp": [
                1565083561,
                1565083561 + 86400,
                1565083561 + 86500,
                1565083561 + 86400 * 2,
                1565083561 + 86400 * 3,
                1565083561 + 86500 * 3,
                1565083561 + 86400 * 4,
            ],
            "Food": ["apple", "apple", "banana", "banana", "orange", "orange", "pear"],
        }
    ).drop([3])

    df["Datetime"] = to_datetime(df["Timestamp"], utc=utc, unit="s")
    dfg = df.groupby(Grouper(freq="1D", key="Datetime"))

    # have to sort on index because of unstable sort on values xref GH9212
    result = dfg["Food"].value_counts().sort_index()
    expected = dfg["Food"].apply(Series.value_counts).sort_index()
    expected.index.names = result.index.names
    # https://github.com/pandas-dev/pandas/issues/49909
    expected = expected.rename("count")

    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("columns", [["A", "B"], ["A", "B", "C"]])
def test_series_groupby_value_counts_empty(columns):
    # GH39172
    df = DataFrame(columns=columns)
    dfg = df.groupby(columns[:-1])

    result = dfg[columns[-1]].value_counts()
    expected = Series([], dtype=result.dtype, name="count")
    expected.index = MultiIndex.from_arrays([[]] * len(columns), names=columns)

    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("columns", [["A", "B"], ["A", "B", "C"]])
def test_series_groupby_value_counts_one_row(columns):
    # GH42618
    df = DataFrame(data=[range(len(columns))], columns=columns)
    dfg = df.groupby(columns[:-1])

    result = dfg[columns[-1]].value_counts()
    expected = df.value_counts()

    tm.assert_series_equal(result, expected)


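# Grouping over a categorical Series enumerates the full set of categories,
# so the unobserved category "b" below appears with a count of 0: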
def test_series_groupby_value_counts_on_categorical():
    # GH38672
    s = Series(Categorical(["a"], categories=["a", "b"]))
    result = s.groupby([0]).value_counts()

    expected = Series(
        data=[1, 0],
        index=MultiIndex.from_arrays(
            [
                np.array([0, 0]),
                CategoricalIndex(
                    ["a", "b"], categories=["a", "b"], ordered=False, dtype="category"
                ),
            ]
        ),
        name="count",
    )

    # Expected:
    # 0  a    1
    #    b    0
    # dtype: int64

    tm.assert_series_equal(result, expected)


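# With sort=False on both the groupby and value_counts, the result should
# keep the order in which each (group, value) combination first appears: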
def test_series_groupby_value_counts_no_sort():
    # GH#50482
    df = DataFrame(
        {
            "gender": ["male", "male", "female", "male", "female", "male"],
            "education": ["low", "medium", "high", "low", "high", "low"],
            "country": ["US", "FR", "US", "FR", "FR", "FR"],
        }
    )
    gb = df.groupby(["country", "gender"], sort=False)["education"]
    result = gb.value_counts(sort=False)
    index = MultiIndex(
        levels=[["US", "FR"], ["male", "female"], ["low", "medium", "high"]],
        codes=[[0, 1, 0, 1, 1], [0, 0, 1, 0, 1], [0, 1, 2, 0, 2]],
        names=["country", "gender", "education"],
    )
    expected = Series([1, 1, 1, 2, 1], index=index, name="count")
    tm.assert_series_equal(result, expected)


@pytest.fixture
def education_df():
    return DataFrame(
        {
            "gender": ["male", "male", "female", "male", "female", "male"],
            "education": ["low", "medium", "high", "low", "high", "low"],
            "country": ["US", "FR", "US", "FR", "FR", "FR"],
        }
    )


def test_axis(education_df):
    gp = education_df.groupby("country", axis=1)
    with pytest.raises(NotImplementedError, match="axis"):
        gp.value_counts()


def test_bad_subset(education_df):
    gp = education_df.groupby("country")
    with pytest.raises(ValueError, match="subset"):
        gp.value_counts(subset=["country"])


def test_basic(education_df, request):
    # gh43564
    if Version(np.__version__) >= Version("1.25"):
        request.node.add_marker(
            pytest.mark.xfail(
                reason=(
                    "pandas default unstable sorting of duplicates "
                    "issue with numpy>=1.25 with AVX instructions"
                ),
                strict=False,
            )
        )
    result = education_df.groupby("country")[["gender", "education"]].value_counts(
        normalize=True
    )
    expected = Series(
        data=[0.5, 0.25, 0.25, 0.5, 0.5],
        index=MultiIndex.from_tuples(
            [
                ("FR", "male", "low"),
                ("FR", "female", "high"),
                ("FR", "male", "medium"),
                ("US", "female", "high"),
                ("US", "male", "low"),
            ],
            names=["country", "gender", "education"],
        ),
        name="proportion",
    )
    tm.assert_series_equal(result, expected)


def _frame_value_counts(df, keys, normalize, sort, ascending):
    return df[keys].value_counts(normalize=normalize, sort=sort, ascending=ascending)


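# The helper above is passed to gp.apply(...) in the frame=True branch below,
# computing DataFrame.value_counts per group as one leg of the 3-way compare.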
@pytest.mark.parametrize("groupby", ["column", "array", "function"])
@pytest.mark.parametrize("normalize, name", [(True, "proportion"), (False, "count")])
@pytest.mark.parametrize(
    "sort, ascending",
    [
        (False, None),
        (True, True),
        (True, False),
    ],
)
@pytest.mark.parametrize("as_index", [True, False])
@pytest.mark.parametrize("frame", [True, False])
def test_against_frame_and_seriesgroupby(
    education_df, groupby, normalize, name, sort, ascending, as_index, frame, request
):
    # test all parameters:
    # - Use column, array or function as by= parameter
    # - Whether or not to normalize
    # - Whether or not to sort and how
    # - Whether or not to use the groupby as an index
    # - 3-way compare against:
    #   - apply with :meth:`~DataFrame.value_counts`
    #   - `~SeriesGroupBy.value_counts`
    if Version(np.__version__) >= Version("1.25") and frame and sort and normalize:
        request.node.add_marker(
            pytest.mark.xfail(
                reason=(
                    "pandas default unstable sorting of duplicates "
                    "issue with numpy>=1.25 with AVX instructions"
                ),
                strict=False,
            )
        )
    by = {
        "column": "country",
        "array": education_df["country"].values,
        "function": lambda x: education_df["country"][x] == "US",
    }[groupby]

    gp = education_df.groupby(by=by, as_index=as_index)
    result = gp[["gender", "education"]].value_counts(
        normalize=normalize, sort=sort, ascending=ascending
    )
    if frame:
        # compare against apply with DataFrame value_counts
        expected = gp.apply(
            _frame_value_counts, ["gender", "education"], normalize, sort, ascending
        )

        if as_index:
            tm.assert_series_equal(result, expected)
        else:
            name = "proportion" if normalize else "count"
            expected = expected.reset_index().rename({0: name}, axis=1)
            if groupby == "column":
                expected = expected.rename({"level_0": "country"}, axis=1)
                expected["country"] = np.where(expected["country"], "US", "FR")
            elif groupby == "function":
                expected["level_0"] = expected["level_0"] == 1
            else:
                expected["level_0"] = np.where(expected["level_0"], "US", "FR")
            tm.assert_frame_equal(result, expected)
    else:
        # compare against SeriesGroupBy value_counts
        education_df["both"] = education_df["gender"] + "-" + education_df["education"]
        expected = gp["both"].value_counts(
            normalize=normalize, sort=sort, ascending=ascending
        )
        expected.name = name
        if as_index:
            index_frame = expected.index.to_frame(index=False)
            index_frame["gender"] = index_frame["both"].str.split("-").str.get(0)
            index_frame["education"] = index_frame["both"].str.split("-").str.get(1)
            del index_frame["both"]
            index_frame = index_frame.rename({0: None}, axis=1)
            expected.index = MultiIndex.from_frame(index_frame)
            tm.assert_series_equal(result, expected)
        else:
            expected.insert(1, "gender", expected["both"].str.split("-").str.get(0))
            expected.insert(2, "education", expected["both"].str.split("-").str.get(1))
            del expected["both"]
            tm.assert_frame_equal(result, expected)


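# For the compound test below, each proportion is the row's count divided by
# the size of its (country, gender) group; e.g. the duplicated
# ("FR", "male", "low") row counts 2 out of a group of 3, giving 2/3.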
@pytest.mark.parametrize("normalize", [True, False])
@pytest.mark.parametrize(
    "sort, ascending, expected_rows, expected_count, expected_group_size",
    [
        (False, None, [0, 1, 2, 3, 4], [1, 1, 1, 2, 1], [1, 3, 1, 3, 1]),
        (True, False, [4, 3, 1, 2, 0], [1, 2, 1, 1, 1], [1, 3, 3, 1, 1]),
        (True, True, [4, 1, 3, 2, 0], [1, 1, 2, 1, 1], [1, 3, 3, 1, 1]),
    ],
)
def test_compound(
    education_df,
    normalize,
    sort,
    ascending,
    expected_rows,
    expected_count,
    expected_group_size,
):
    # Multiple groupby keys and as_index=False
    gp = education_df.groupby(["country", "gender"], as_index=False, sort=False)
    result = gp["education"].value_counts(
        normalize=normalize, sort=sort, ascending=ascending
    )
    expected = DataFrame()
    for column in ["country", "gender", "education"]:
        expected[column] = [education_df[column][row] for row in expected_rows]
    if normalize:
        expected["proportion"] = expected_count
        expected["proportion"] /= expected_group_size
    else:
        expected["count"] = expected_count
    tm.assert_frame_equal(result, expected)


@pytest.fixture
def animals_df():
    return DataFrame(
        {"key": [1, 1, 1, 1], "num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]},
        index=["falcon", "dog", "cat", "ant"],
    )


@pytest.mark.parametrize(
    "sort, ascending, normalize, name, expected_data, expected_index",
    [
        (False, None, False, "count", [1, 2, 1], [(1, 1, 1), (2, 4, 6), (2, 0, 0)]),
        (True, True, False, "count", [1, 1, 2], [(1, 1, 1), (2, 6, 4), (2, 0, 0)]),
        (True, False, False, "count", [2, 1, 1], [(1, 1, 1), (4, 2, 6), (0, 2, 0)]),
        (
            True,
            False,
            True,
            "proportion",
            [0.5, 0.25, 0.25],
            [(1, 1, 1), (4, 2, 6), (0, 2, 0)],
        ),
    ],
)
def test_data_frame_value_counts(
    animals_df, sort, ascending, normalize, name, expected_data, expected_index
):
    # 3-way compare with :meth:`~DataFrame.value_counts`
    # Tests from frame/methods/test_value_counts.py
    result_frame = animals_df.value_counts(
        sort=sort, ascending=ascending, normalize=normalize
    )
    expected = Series(
        data=expected_data,
        index=MultiIndex.from_arrays(
            expected_index, names=["key", "num_legs", "num_wings"]
        ),
        name=name,
    )
    tm.assert_series_equal(result_frame, expected)

    result_frame_groupby = animals_df.groupby("key").value_counts(
        sort=sort, ascending=ascending, normalize=normalize
    )
    tm.assert_series_equal(result_frame_groupby, expected)


@pytest.fixture
def nulls_df():
    n = np.nan
    return DataFrame(
        {
            "A": [1, 1, n, 4, n, 6, 6, 6, 6],
            "B": [1, 1, 3, n, n, 6, 6, 6, 6],
            "C": [1, 2, 3, 4, 5, 6, n, 8, n],
            "D": [1, 2, 3, 4, 5, 6, 7, n, n],
        }
    )


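# In the dropna test below, group_dropna controls NaNs in the grouping keys
# ("A", "B") while count_dropna controls NaNs in the counted columns; a group
# or row with NaN is dropped only when the corresponding flag is True.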
@pytest.mark.parametrize(
    "group_dropna, count_dropna, expected_rows, expected_values",
    [
        (
            False,
            False,
            [0, 1, 3, 5, 7, 6, 8, 2, 4],
            [0.5, 0.5, 1.0, 0.25, 0.25, 0.25, 0.25, 1.0, 1.0],
        ),
        (False, True, [0, 1, 3, 5, 2, 4], [0.5, 0.5, 1.0, 1.0, 1.0, 1.0]),
        (True, False, [0, 1, 5, 7, 6, 8], [0.5, 0.5, 0.25, 0.25, 0.25, 0.25]),
        (True, True, [0, 1, 5], [0.5, 0.5, 1.0]),
    ],
)
def test_dropna_combinations(
    nulls_df, group_dropna, count_dropna, expected_rows, expected_values, request
):
    if Version(np.__version__) >= Version("1.25") and not group_dropna:
        request.node.add_marker(
            pytest.mark.xfail(
                reason=(
                    "pandas default unstable sorting of duplicates "
                    "issue with numpy>=1.25 with AVX instructions"
                ),
                strict=False,
            )
        )
    gp = nulls_df.groupby(["A", "B"], dropna=group_dropna)
    result = gp.value_counts(normalize=True, sort=True, dropna=count_dropna)
    columns = DataFrame()
    for column in nulls_df.columns:
        columns[column] = [nulls_df[column][row] for row in expected_rows]
    index = MultiIndex.from_frame(columns)
    expected = Series(data=expected_values, index=index, name="proportion")
    tm.assert_series_equal(result, expected)


@pytest.fixture
def names_with_nulls_df(nulls_fixture):
    return DataFrame(
        {
            "key": [1, 1, 1, 1],
            "first_name": ["John", "Anne", "John", "Beth"],
            "middle_name": ["Smith", nulls_fixture, nulls_fixture, "Louise"],
        },
    )


@pytest.mark.parametrize(
    "dropna, expected_data, expected_index",
    [
        (
            True,
            [1, 1],
            MultiIndex.from_arrays(
                [(1, 1), ("Beth", "John"), ("Louise", "Smith")],
                names=["key", "first_name", "middle_name"],
            ),
        ),
        (
            False,
            [1, 1, 1, 1],
            MultiIndex(
                levels=[
                    Index([1]),
                    Index(["Anne", "Beth", "John"]),
                    Index(["Louise", "Smith", np.nan]),
                ],
                codes=[[0, 0, 0, 0], [0, 1, 2, 2], [2, 0, 1, 2]],
                names=["key", "first_name", "middle_name"],
            ),
        ),
    ],
)
@pytest.mark.parametrize("normalize, name", [(False, "count"), (True, "proportion")])
def test_data_frame_value_counts_dropna(
    names_with_nulls_df, dropna, normalize, name, expected_data, expected_index
):
    # GH 41334
    # 3-way compare with :meth:`~DataFrame.value_counts`
    # Tests with nulls from frame/methods/test_value_counts.py
    result_frame = names_with_nulls_df.value_counts(dropna=dropna, normalize=normalize)
    expected = Series(
        data=expected_data,
        index=expected_index,
        name=name,
    )
    if normalize:
        expected /= float(len(expected_data))

    tm.assert_series_equal(result_frame, expected)

    result_frame_groupby = names_with_nulls_df.groupby("key").value_counts(
        dropna=dropna, normalize=normalize
    )
    tm.assert_series_equal(result_frame_groupby, expected)


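# The tests below exercise a single categorical grouper: every (gender,
# education) category combination is emitted per country, zero-filled where
# unobserved, and with normalize=True the proportions within each country
# sum to 1.0 (e.g. 0.5 + 0.25 + 0.25 for FR).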
@pytest.mark.parametrize("as_index", [False, True])
@pytest.mark.parametrize("observed", [False, True])
@pytest.mark.parametrize(
    "normalize, name, expected_data",
    [
        (
            False,
            "count",
            np.array([2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0], dtype=np.int64),
        ),
        (
            True,
            "proportion",
            np.array([0.5, 0.25, 0.25, 0.0, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0]),
        ),
    ],
)
def test_categorical_single_grouper_with_only_observed_categories(
    education_df, as_index, observed, normalize, name, expected_data, request
):
    # Test single categorical grouper with only observed grouping categories
    # when non-groupers are also categorical
    if Version(np.__version__) >= Version("1.25"):
        request.node.add_marker(
            pytest.mark.xfail(
                reason=(
                    "pandas default unstable sorting of duplicates "
                    "issue with numpy>=1.25 with AVX instructions"
                ),
                strict=False,
            )
        )

    gp = education_df.astype("category").groupby(
        "country", as_index=as_index, observed=observed
    )
    result = gp.value_counts(normalize=normalize)

    expected_index = MultiIndex.from_tuples(
        [
            ("FR", "male", "low"),
            ("FR", "female", "high"),
            ("FR", "male", "medium"),
            ("FR", "female", "low"),
            ("FR", "female", "medium"),
            ("FR", "male", "high"),
            ("US", "female", "high"),
            ("US", "male", "low"),
            ("US", "female", "low"),
            ("US", "female", "medium"),
            ("US", "male", "high"),
            ("US", "male", "medium"),
        ],
        names=["country", "gender", "education"],
    )
    expected_series = Series(
        data=expected_data,
        index=expected_index,
        name=name,
    )
    for i in range(3):
        expected_series.index = expected_series.index.set_levels(
            CategoricalIndex(expected_series.index.levels[i]), level=i
        )

    if as_index:
        tm.assert_series_equal(result, expected_series)
    else:
        expected = expected_series.reset_index(
            name="proportion" if normalize else "count"
        )
        tm.assert_frame_equal(result, expected)


def assert_categorical_single_grouper(
    education_df, as_index, observed, expected_index, normalize, name, expected_data
):
    # Test single categorical grouper when non-groupers are also categorical
    education_df = education_df.copy().astype("category")

    # Add non-observed grouping categories
    education_df["country"] = education_df["country"].cat.add_categories(["ASIA"])

    gp = education_df.groupby("country", as_index=as_index, observed=observed)
    result = gp.value_counts(normalize=normalize)

    expected_series = Series(
        data=expected_data,
        index=MultiIndex.from_tuples(
            expected_index,
            names=["country", "gender", "education"],
        ),
        name=name,
    )
    for i in range(3):
        index_level = CategoricalIndex(expected_series.index.levels[i])
        if i == 0:
            index_level = index_level.set_categories(
                education_df["country"].cat.categories
            )
        expected_series.index = expected_series.index.set_levels(index_level, level=i)

    if as_index:
        tm.assert_series_equal(result, expected_series)
    else:
        expected = expected_series.reset_index(name=name)
        tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("as_index", [True, False])
@pytest.mark.parametrize(
    "normalize, name, expected_data",
    [
        (
            False,
            "count",
            np.array([2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0], dtype=np.int64),
        ),
        (
            True,
            "proportion",
            np.array([0.5, 0.25, 0.25, 0.0, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0]),
        ),
    ],
)
def test_categorical_single_grouper_observed_true(
    education_df, as_index, normalize, name, expected_data, request
):
    # GH#46357
    if Version(np.__version__) >= Version("1.25"):
        request.node.add_marker(
            pytest.mark.xfail(
                reason=(
                    "pandas default unstable sorting of duplicates "
                    "issue with numpy>=1.25 with AVX instructions"
                ),
                strict=False,
            )
        )

    expected_index = [
        ("FR", "male", "low"),
        ("FR", "female", "high"),
        ("FR", "male", "medium"),
        ("FR", "female", "low"),
        ("FR", "female", "medium"),
        ("FR", "male", "high"),
        ("US", "female", "high"),
        ("US", "male", "low"),
        ("US", "female", "low"),
        ("US", "female", "medium"),
        ("US", "male", "high"),
        ("US", "male", "medium"),
    ]

    assert_categorical_single_grouper(
        education_df=education_df,
        as_index=as_index,
        observed=True,
        expected_index=expected_index,
        normalize=normalize,
        name=name,
        expected_data=expected_data,
    )


@pytest.mark.parametrize("as_index", [True, False])
@pytest.mark.parametrize(
    "normalize, name, expected_data",
    [
        (
            False,
            "count",
            np.array(
                [2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=np.int64
            ),
        ),
        (
            True,
            "proportion",
            np.array(
                [
                    0.5,
                    0.25,
                    0.25,
                    0.0,
                    0.0,
                    0.0,
                    0.5,
                    0.5,
                    0.0,
                    0.0,
                    0.0,
                    0.0,
                    0.0,
                    0.0,
                    0.0,
                    0.0,
                    0.0,
                    0.0,
                ]
            ),
        ),
    ],
)
def test_categorical_single_grouper_observed_false(
    education_df, as_index, normalize, name, expected_data, request
):
    # GH#46357
    if Version(np.__version__) >= Version("1.25"):
        request.node.add_marker(
            pytest.mark.xfail(
                reason=(
                    "pandas default unstable sorting of duplicates "
                    "issue with numpy>=1.25 with AVX instructions"
                ),
                strict=False,
            )
        )

    expected_index = [
        ("FR", "male", "low"),
        ("FR", "female", "high"),
        ("FR", "male", "medium"),
        ("FR", "female", "low"),
        ("FR", "male", "high"),
        ("FR", "female", "medium"),
        ("US", "female", "high"),
        ("US", "male", "low"),
        ("US", "male", "medium"),
        ("US", "male", "high"),
        ("US", "female", "medium"),
        ("US", "female", "low"),
        ("ASIA", "male", "low"),
        ("ASIA", "male", "high"),
        ("ASIA", "female", "medium"),
        ("ASIA", "female", "low"),
        ("ASIA", "female", "high"),
        ("ASIA", "male", "medium"),
    ]

    assert_categorical_single_grouper(
        education_df=education_df,
        as_index=as_index,
        observed=False,
        expected_index=expected_index,
        normalize=normalize,
        name=name,
        expected_data=expected_data,
    )


@pytest.mark.parametrize("as_index", [True, False])
@pytest.mark.parametrize(
    "observed, expected_index",
    [
        (
            False,
            [
                ("FR", "high", "female"),
                ("FR", "high", "male"),
                ("FR", "low", "male"),
                ("FR", "low", "female"),
                ("FR", "medium", "male"),
                ("FR", "medium", "female"),
                ("US", "high", "female"),
                ("US", "high", "male"),
                ("US", "low", "male"),
                ("US", "low", "female"),
                ("US", "medium", "female"),
                ("US", "medium", "male"),
            ],
        ),
        (
            True,
            [
                ("FR", "high", "female"),
                ("FR", "low", "male"),
                ("FR", "medium", "male"),
                ("US", "high", "female"),
                ("US", "low", "male"),
            ],
        ),
    ],
)
@pytest.mark.parametrize(
    "normalize, name, expected_data",
    [
        (
            False,
            "count",
            np.array([1, 0, 2, 0, 1, 0, 1, 0, 1, 0, 0, 0], dtype=np.int64),
        ),
        (
            True,
            "proportion",
            # Zero values correspond to non-observed groups
            np.array([1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0]),
        ),
    ],
)
def test_categorical_multiple_groupers(
    education_df, as_index, observed, expected_index, normalize, name, expected_data
):
    # GH#46357
    # Test multiple categorical groupers when non-groupers are non-categorical
    education_df = education_df.copy()
    education_df["country"] = education_df["country"].astype("category")
    education_df["education"] = education_df["education"].astype("category")

    gp = education_df.groupby(
        ["country", "education"], as_index=as_index, observed=observed
    )
    result = gp.value_counts(normalize=normalize)

    expected_series = Series(
        data=expected_data[expected_data > 0.0] if observed else expected_data,
        index=MultiIndex.from_tuples(
            expected_index,
            names=["country", "education", "gender"],
        ),
        name=name,
    )
    for i in range(2):
        expected_series.index = expected_series.index.set_levels(
            CategoricalIndex(expected_series.index.levels[i]), level=i
        )

    if as_index:
        tm.assert_series_equal(result, expected_series)
    else:
        expected = expected_series.reset_index(
            name="proportion" if normalize else "count"
        )
        tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("as_index", [False, True])
@pytest.mark.parametrize("observed", [False, True])
@pytest.mark.parametrize(
    "normalize, name, expected_data",
    [
        (
            False,
            "count",
            np.array([2, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0], dtype=np.int64),
        ),
        (
            True,
            "proportion",
            # Zero values correspond to non-observed groups
            np.array([0.5, 0.25, 0.25, 0.0, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0]),
        ),
    ],
)
def test_categorical_non_groupers(
    education_df, as_index, observed, normalize, name, expected_data, request
):
    # GH#46357 Test non-observed categories are included in the result,
    # regardless of `observed`
    if Version(np.__version__) >= Version("1.25"):
        request.node.add_marker(
            pytest.mark.xfail(
                reason=(
                    "pandas default unstable sorting of duplicates "
                    "issue with numpy>=1.25 with AVX instructions"
                ),
                strict=False,
            )
        )

    education_df = education_df.copy()
    education_df["gender"] = education_df["gender"].astype("category")
    education_df["education"] = education_df["education"].astype("category")

    gp = education_df.groupby("country", as_index=as_index, observed=observed)
    result = gp.value_counts(normalize=normalize)

    expected_index = [
        ("FR", "male", "low"),
        ("FR", "female", "high"),
        ("FR", "male", "medium"),
        ("FR", "female", "low"),
        ("FR", "female", "medium"),
        ("FR", "male", "high"),
        ("US", "female", "high"),
        ("US", "male", "low"),
        ("US", "female", "low"),
        ("US", "female", "medium"),
        ("US", "male", "high"),
        ("US", "male", "medium"),
    ]
    expected_series = Series(
        data=expected_data,
        index=MultiIndex.from_tuples(
            expected_index,
            names=["country", "gender", "education"],
        ),
        name=name,
    )
    for i in range(1, 3):
        expected_series.index = expected_series.index.set_levels(
            CategoricalIndex(expected_series.index.levels[i]), level=i
        )

    if as_index:
        tm.assert_series_equal(result, expected_series)
    else:
        expected = expected_series.reset_index(
            name="proportion" if normalize else "count"
        )
        tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "normalize, expected_label, expected_values",
    [
        (False, "count", [1, 1, 1]),
        (True, "proportion", [0.5, 0.5, 1.0]),
    ],
)
def test_mixed_groupings(normalize, expected_label, expected_values):
    # Test multiple groupings
    df = DataFrame({"A": [1, 2, 1], "B": [1, 2, 3]})
    gp = df.groupby([[4, 5, 4], "A", lambda i: 7 if i == 1 else 8], as_index=False)
    result = gp.value_counts(sort=True, normalize=normalize)
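    # Unnamed groupers (the list and the lambda above) carry no column label,
    # so with as_index=False they come back as positional "level_0" and
    # "level_2" columns in the result: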
    expected = DataFrame(
        {
            "level_0": np.array([4, 4, 5], dtype=np.int_),
            "A": [1, 1, 2],
            "level_2": [8, 8, 7],
            "B": [1, 3, 2],
            expected_label: expected_values,
        }
    )
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "test, columns, expected_names",
    [
        ("repeat", list("abbde"), ["a", None, "d", "b", "b", "e"]),
        ("level", list("abcd") + ["level_1"], ["a", None, "d", "b", "c", "level_1"]),
    ],
)
@pytest.mark.parametrize("as_index", [False, True])
def test_column_label_duplicates(test, columns, expected_names, as_index):
    # GH 44992
    # Test for duplicate input column labels and generated duplicate labels
    df = DataFrame([[1, 3, 5, 7, 9], [2, 4, 6, 8, 10]], columns=columns)
    expected_data = [(1, 0, 7, 3, 5, 9), (2, 1, 8, 4, 6, 10)]
    keys = ["a", np.array([0, 1], dtype=np.int64), "d"]
    result = df.groupby(keys, as_index=as_index).value_counts()
    if as_index:
        expected = Series(
            data=(1, 1),
            index=MultiIndex.from_tuples(
                expected_data,
                names=expected_names,
            ),
            name="count",
        )
        tm.assert_series_equal(result, expected)
    else:
        expected_data = [list(row) + [1] for row in expected_data]
        expected_columns = list(expected_names)
        expected_columns[1] = "level_1"
        expected_columns.append("count")
        expected = DataFrame(expected_data, columns=expected_columns)
        tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "normalize, expected_label",
    [
        (False, "count"),
        (True, "proportion"),
    ],
)
def test_result_label_duplicates(normalize, expected_label):
    # Test for result column label duplicating an input column label
    gb = DataFrame([[1, 2, 3]], columns=["a", "b", expected_label]).groupby(
        "a", as_index=False
    )
    msg = f"Column label '{expected_label}' is duplicate of result column"
    with pytest.raises(ValueError, match=msg):
        gb.value_counts(normalize=normalize)


def test_ambiguous_grouping():
    # Test that groupby is not confused by groupings length equal to row count
    df = DataFrame({"a": [1, 1]})
    gb = df.groupby(np.array([1, 1], dtype=np.int64))
    result = gb.value_counts()
    expected = Series(
        [2], index=MultiIndex.from_tuples([[1, 1]], names=[None, "a"]), name="count"
    )
    tm.assert_series_equal(result, expected)


def test_subset_overlaps_gb_key_raises():
    # GH 46383
    df = DataFrame({"c1": ["a", "b", "c"], "c2": ["x", "y", "y"]}, index=[0, 1, 1])
    msg = "Keys {'c1'} in subset cannot be in the groupby column keys."
    with pytest.raises(ValueError, match=msg):
        df.groupby("c1").value_counts(subset=["c1"])


def test_subset_doesnt_exist_in_frame():
    # GH 46383
    df = DataFrame({"c1": ["a", "b", "c"], "c2": ["x", "y", "y"]}, index=[0, 1, 1])
    msg = "Keys {'c3'} in subset do not exist in the DataFrame."
    with pytest.raises(ValueError, match=msg):
        df.groupby("c1").value_counts(subset=["c3"])


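# With subset=["c2"], only c2 is tallied within each level-0 group: index 0
# contributes a single "x" and index 1 two "y" rows, hence counts [1, 2]: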
def test_subset():
    # GH 46383
    df = DataFrame({"c1": ["a", "b", "c"], "c2": ["x", "y", "y"]}, index=[0, 1, 1])
    result = df.groupby(level=0).value_counts(subset=["c2"])
    expected = Series(
        [1, 2],
        index=MultiIndex.from_arrays([[0, 1], ["x", "y"]], names=[None, "c2"]),
        name="count",
    )
    tm.assert_series_equal(result, expected)


def test_subset_duplicate_columns():
    # GH 46383
    df = DataFrame(
        [["a", "x", "x"], ["b", "y", "y"], ["b", "y", "y"]],
        index=[0, 1, 1],
        columns=["c1", "c2", "c2"],
    )
    result = df.groupby(level=0).value_counts(subset=["c2"])
    expected = Series(
        [1, 2],
        index=MultiIndex.from_arrays(
            [[0, 1], ["x", "y"], ["x", "y"]], names=[None, "c2", "c2"]
        ),
        name="count",
    )
    tm.assert_series_equal(result, expected)


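# Grouper(freq="1D") below buckets rows by calendar day; dropping row 3
# removes the only 2019-08-08 observation, and that empty day does not
# appear in the expected index: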
@pytest.mark.parametrize("utc", [True, False])
def test_value_counts_time_grouper(utc):
    # GH#50486
    df = DataFrame(
        {
            "Timestamp": [
                1565083561,
                1565083561 + 86400,
                1565083561 + 86500,
                1565083561 + 86400 * 2,
                1565083561 + 86400 * 3,
                1565083561 + 86500 * 3,
                1565083561 + 86400 * 4,
            ],
            "Food": ["apple", "apple", "banana", "banana", "orange", "orange", "pear"],
        }
    ).drop([3])

    df["Datetime"] = to_datetime(df["Timestamp"], utc=utc, unit="s")
    gb = df.groupby(Grouper(freq="1D", key="Datetime"))
    result = gb.value_counts()
    dates = to_datetime(
        ["2019-08-06", "2019-08-07", "2019-08-09", "2019-08-10"], utc=utc
    )
    timestamps = df["Timestamp"].unique()
    index = MultiIndex(
        levels=[dates, timestamps, ["apple", "banana", "orange", "pear"]],
        codes=[[0, 1, 1, 2, 2, 3], range(6), [0, 0, 1, 2, 2, 3]],
        names=["Datetime", "Timestamp", "Food"],
    )
    expected = Series(1, index=index, name="count")
    tm.assert_series_equal(result, expected)