123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252 |
- import numpy as np
- import pytest
- import pandas as pd
- from pandas import (
- Categorical,
- CategoricalIndex,
- Index,
- Series,
- )
- import pandas._testing as tm
class TestSeriesValueCounts:
    """Tests for ``Series.value_counts`` and the matching ``Index.value_counts``.

    Each test builds the expected result by hand: counts are named ``"count"``
    (``"proportion"`` when ``normalize=True``) and, where the input is named,
    the expected index carries that name.
    """

    def test_value_counts_datetime(self):
        """Count tz-naive timestamps through both Series and DatetimeIndex."""
        # most dtypes are tested in tests/base
        values = [
            pd.Timestamp("2011-01-01 09:00"),
            pd.Timestamp("2011-01-01 10:00"),
            pd.Timestamp("2011-01-01 11:00"),
            pd.Timestamp("2011-01-01 09:00"),
            pd.Timestamp("2011-01-01 09:00"),
            pd.Timestamp("2011-01-01 11:00"),
        ]

        exp_idx = pd.DatetimeIndex(
            ["2011-01-01 09:00", "2011-01-01 11:00", "2011-01-01 10:00"],
            name="xxx",
        )
        exp = Series([3, 2, 1], index=exp_idx, name="count")

        ser = Series(values, name="xxx")
        tm.assert_series_equal(ser.value_counts(), exp)
        # check DatetimeIndex outputs the same result
        idx = pd.DatetimeIndex(values, name="xxx")
        tm.assert_series_equal(idx.value_counts(), exp)

        # normalize
        exp = Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="proportion")
        tm.assert_series_equal(ser.value_counts(normalize=True), exp)
        tm.assert_series_equal(idx.value_counts(normalize=True), exp)

    def test_value_counts_datetime_tz(self):
        """Count tz-aware (US/Eastern) timestamps via Series and DatetimeIndex."""
        values = [
            pd.Timestamp("2011-01-01 09:00", tz="US/Eastern"),
            pd.Timestamp("2011-01-01 10:00", tz="US/Eastern"),
            pd.Timestamp("2011-01-01 11:00", tz="US/Eastern"),
            pd.Timestamp("2011-01-01 09:00", tz="US/Eastern"),
            pd.Timestamp("2011-01-01 09:00", tz="US/Eastern"),
            pd.Timestamp("2011-01-01 11:00", tz="US/Eastern"),
        ]

        exp_idx = pd.DatetimeIndex(
            ["2011-01-01 09:00", "2011-01-01 11:00", "2011-01-01 10:00"],
            tz="US/Eastern",
            name="xxx",
        )
        exp = Series([3, 2, 1], index=exp_idx, name="count")

        ser = Series(values, name="xxx")
        tm.assert_series_equal(ser.value_counts(), exp)

        # check DatetimeIndex outputs the same result
        idx = pd.DatetimeIndex(values, name="xxx")
        tm.assert_series_equal(idx.value_counts(), exp)

        # normalize
        exp = Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="proportion")
        tm.assert_series_equal(ser.value_counts(normalize=True), exp)
        tm.assert_series_equal(idx.value_counts(normalize=True), exp)

    def test_value_counts_period(self):
        """Count monthly periods through both Series and PeriodIndex."""
        values = [
            pd.Period("2011-01", freq="M"),
            pd.Period("2011-02", freq="M"),
            pd.Period("2011-03", freq="M"),
            pd.Period("2011-01", freq="M"),
            pd.Period("2011-01", freq="M"),
            pd.Period("2011-03", freq="M"),
        ]

        exp_idx = pd.PeriodIndex(
            ["2011-01", "2011-03", "2011-02"], freq="M", name="xxx"
        )
        exp = Series([3, 2, 1], index=exp_idx, name="count")

        ser = Series(values, name="xxx")
        tm.assert_series_equal(ser.value_counts(), exp)

        # check PeriodIndex outputs the same result
        idx = pd.PeriodIndex(values, name="xxx")
        tm.assert_series_equal(idx.value_counts(), exp)

        # normalize
        exp = Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="proportion")
        tm.assert_series_equal(ser.value_counts(normalize=True), exp)
        tm.assert_series_equal(idx.value_counts(normalize=True), exp)

    def test_value_counts_categorical_ordered(self):
        """Count an ordered Categorical via Series and CategoricalIndex."""
        # most dtypes are tested in tests/base
        values = Categorical([1, 2, 3, 1, 1, 3], ordered=True)

        exp_idx = CategoricalIndex(
            [1, 3, 2], categories=[1, 2, 3], ordered=True, name="xxx"
        )
        exp = Series([3, 2, 1], index=exp_idx, name="count")

        ser = Series(values, name="xxx")
        tm.assert_series_equal(ser.value_counts(), exp)

        # check CategoricalIndex outputs the same result
        idx = CategoricalIndex(values, name="xxx")
        tm.assert_series_equal(idx.value_counts(), exp)

        # normalize
        exp = Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="proportion")
        tm.assert_series_equal(ser.value_counts(normalize=True), exp)
        tm.assert_series_equal(idx.value_counts(normalize=True), exp)

    def test_value_counts_categorical_not_ordered(self):
        """Count an unordered Categorical via Series and CategoricalIndex."""
        values = Categorical([1, 2, 3, 1, 1, 3], ordered=False)

        exp_idx = CategoricalIndex(
            [1, 3, 2], categories=[1, 2, 3], ordered=False, name="xxx"
        )
        exp = Series([3, 2, 1], index=exp_idx, name="count")

        ser = Series(values, name="xxx")
        tm.assert_series_equal(ser.value_counts(), exp)

        # check CategoricalIndex outputs the same result
        idx = CategoricalIndex(values, name="xxx")
        tm.assert_series_equal(idx.value_counts(), exp)

        # normalize
        exp = Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="proportion")
        tm.assert_series_equal(ser.value_counts(normalize=True), exp)
        tm.assert_series_equal(idx.value_counts(normalize=True), exp)

    def test_value_counts_categorical(self):
        """Unused categories are reported with a zero count, sorted or not."""
        # GH#12835
        cats = Categorical(list("abcccb"), categories=list("cabd"))
        ser = Series(cats, name="xxx")

        # sort=False: expected index follows the declared category order
        res = ser.value_counts(sort=False)
        exp_index = CategoricalIndex(
            list("cabd"), categories=cats.categories, name="xxx"
        )
        exp = Series([3, 1, 2, 0], name="count", index=exp_index)
        tm.assert_series_equal(res, exp)

        # sort=True: expected index is ordered by descending count
        res = ser.value_counts(sort=True)
        exp_index = CategoricalIndex(
            list("cbad"), categories=cats.categories, name="xxx"
        )
        exp = Series([3, 2, 1, 0], name="count", index=exp_index)
        tm.assert_series_equal(res, exp)

        # check object dtype handles the Series.name as the same
        # (tested in tests/base)
        ser = Series(["a", "b", "c", "c", "c", "b"], name="xxx")
        res = ser.value_counts()
        exp = Series([3, 2, 1], name="count", index=Index(["c", "b", "a"], name="xxx"))
        tm.assert_series_equal(res, exp)

    def test_value_counts_categorical_with_nan(self):
        """Check ``dropna`` and ``sort`` handling of missing categorical values."""
        # see GH#9443

        # sanity check
        ser = Series(["a", "b", "a"], dtype="category")
        exp = Series([2, 1], index=CategoricalIndex(["a", "b"]), name="count")

        res = ser.value_counts(dropna=True)
        tm.assert_series_equal(res, exp)

        # the data holds no missing values, so keeping them must not change
        # the result
        res = ser.value_counts(dropna=False)
        tm.assert_series_equal(res, exp)

        # same Series via two different constructions --> same behaviour
        series = [
            Series(["a", "b", None, "a", None, None], dtype="category"),
            Series(
                Categorical(["a", "b", None, "a", None, None], categories=["a", "b"])
            ),
        ]

        for ser in series:
            # None is a NaN value, so we exclude its count here
            exp = Series([2, 1], index=CategoricalIndex(["a", "b"]), name="count")
            res = ser.value_counts(dropna=True)
            tm.assert_series_equal(res, exp)

            # we don't exclude the count of None and sort by counts
            exp = Series(
                [3, 2, 1], index=CategoricalIndex([np.nan, "a", "b"]), name="count"
            )
            res = ser.value_counts(dropna=False)
            tm.assert_series_equal(res, exp)

            # When we aren't sorting by counts, and np.nan isn't a
            # category, it should be last.
            exp = Series(
                [2, 1, 3], index=CategoricalIndex(["a", "b", np.nan]), name="count"
            )
            res = ser.value_counts(dropna=False, sort=False)
            tm.assert_series_equal(res, exp)

    @pytest.mark.parametrize(
        "ser, dropna, exp",
        [
            (
                Series([False, True, True, pd.NA]),
                False,
                Series([2, 1, 1], index=[True, False, pd.NA], name="count"),
            ),
            (
                Series([False, True, True, pd.NA]),
                True,
                Series([2, 1], index=Index([True, False], dtype=object), name="count"),
            ),
            (
                # NOTE: despite the parameter name, this case passes an Index
                Series(range(3), index=[True, False, np.nan]).index,
                False,
                Series([1, 1, 1], index=[True, False, np.nan], name="count"),
            ),
        ],
    )
    def test_value_counts_bool_with_nan(self, ser, dropna, exp):
        """Count booleans mixed with a missing value, for each ``dropna`` setting."""
        # GH32146
        out = ser.value_counts(dropna=dropna)
        tm.assert_series_equal(out, exp)

    @pytest.mark.parametrize(
        "input_array,expected",
        [
            (
                [1 + 1j, 1 + 1j, 1, 3j, 3j, 3j],
                Series(
                    [3, 2, 1],
                    index=Index([3j, 1 + 1j, 1], dtype=np.complex128),
                    name="count",
                ),
            ),
            (
                np.array([1 + 1j, 1 + 1j, 1, 3j, 3j, 3j], dtype=np.complex64),
                Series(
                    [3, 2, 1],
                    index=Index([3j, 1 + 1j, 1], dtype=np.complex64),
                    name="count",
                ),
            ),
        ],
    )
    def test_value_counts_complex_numbers(self, input_array, expected):
        """Count complex values; the expected index keeps the complex dtype."""
        # GH 17927
        result = Series(input_array).value_counts()
        tm.assert_series_equal(result, expected)
|