test_unique.py 4.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121
  1. import numpy as np
  2. import pytest
  3. from pandas.core.dtypes.common import is_datetime64tz_dtype
  4. import pandas as pd
  5. import pandas._testing as tm
  6. from pandas.tests.base.common import allow_na_ops
  7. def test_unique(index_or_series_obj):
  8. obj = index_or_series_obj
  9. obj = np.repeat(obj, range(1, len(obj) + 1))
  10. result = obj.unique()
  11. # dict.fromkeys preserves the order
  12. unique_values = list(dict.fromkeys(obj.values))
  13. if isinstance(obj, pd.MultiIndex):
  14. expected = pd.MultiIndex.from_tuples(unique_values)
  15. expected.names = obj.names
  16. tm.assert_index_equal(result, expected, exact=True)
  17. elif isinstance(obj, pd.Index):
  18. expected = pd.Index(unique_values, dtype=obj.dtype)
  19. if is_datetime64tz_dtype(obj.dtype):
  20. expected = expected.normalize()
  21. tm.assert_index_equal(result, expected, exact=True)
  22. else:
  23. expected = np.array(unique_values)
  24. tm.assert_numpy_array_equal(result, expected)
  25. @pytest.mark.parametrize("null_obj", [np.nan, None])
  26. def test_unique_null(null_obj, index_or_series_obj):
  27. obj = index_or_series_obj
  28. if not allow_na_ops(obj):
  29. pytest.skip("type doesn't allow for NA operations")
  30. elif len(obj) < 1:
  31. pytest.skip("Test doesn't make sense on empty data")
  32. elif isinstance(obj, pd.MultiIndex):
  33. pytest.skip(f"MultiIndex can't hold '{null_obj}'")
  34. values = obj._values
  35. values[0:2] = null_obj
  36. klass = type(obj)
  37. repeated_values = np.repeat(values, range(1, len(values) + 1))
  38. obj = klass(repeated_values, dtype=obj.dtype)
  39. result = obj.unique()
  40. unique_values_raw = dict.fromkeys(obj.values)
  41. # because np.nan == np.nan is False, but None == None is True
  42. # np.nan would be duplicated, whereas None wouldn't
  43. unique_values_not_null = [val for val in unique_values_raw if not pd.isnull(val)]
  44. unique_values = [null_obj] + unique_values_not_null
  45. if isinstance(obj, pd.Index):
  46. expected = pd.Index(unique_values, dtype=obj.dtype)
  47. if is_datetime64tz_dtype(obj.dtype):
  48. result = result.normalize()
  49. expected = expected.normalize()
  50. tm.assert_index_equal(result, expected, exact=True)
  51. else:
  52. expected = np.array(unique_values, dtype=obj.dtype)
  53. tm.assert_numpy_array_equal(result, expected)
  54. def test_nunique(index_or_series_obj):
  55. obj = index_or_series_obj
  56. obj = np.repeat(obj, range(1, len(obj) + 1))
  57. expected = len(obj.unique())
  58. assert obj.nunique(dropna=False) == expected
  59. @pytest.mark.parametrize("null_obj", [np.nan, None])
  60. def test_nunique_null(null_obj, index_or_series_obj):
  61. obj = index_or_series_obj
  62. if not allow_na_ops(obj):
  63. pytest.skip("type doesn't allow for NA operations")
  64. elif isinstance(obj, pd.MultiIndex):
  65. pytest.skip(f"MultiIndex can't hold '{null_obj}'")
  66. values = obj._values
  67. values[0:2] = null_obj
  68. klass = type(obj)
  69. repeated_values = np.repeat(values, range(1, len(values) + 1))
  70. obj = klass(repeated_values, dtype=obj.dtype)
  71. if isinstance(obj, pd.CategoricalIndex):
  72. assert obj.nunique() == len(obj.categories)
  73. assert obj.nunique(dropna=False) == len(obj.categories) + 1
  74. else:
  75. num_unique_values = len(obj.unique())
  76. assert obj.nunique() == max(0, num_unique_values - 1)
  77. assert obj.nunique(dropna=False) == max(0, num_unique_values)
  78. @pytest.mark.single_cpu
  79. def test_unique_bad_unicode(index_or_series):
  80. # regression test for #34550
  81. uval = "\ud83d" # smiley emoji
  82. obj = index_or_series([uval] * 2)
  83. result = obj.unique()
  84. if isinstance(obj, pd.Index):
  85. expected = pd.Index(["\ud83d"], dtype=object)
  86. tm.assert_index_equal(result, expected, exact=True)
  87. else:
  88. expected = np.array(["\ud83d"], dtype=object)
  89. tm.assert_numpy_array_equal(result, expected)
  90. @pytest.mark.parametrize("dropna", [True, False])
  91. def test_nunique_dropna(dropna):
  92. # GH37566
  93. ser = pd.Series(["yes", "yes", pd.NA, np.nan, None, pd.NaT])
  94. res = ser.nunique(dropna)
  95. assert res == 1 if dropna else 5