# test_categorical.py (6.8 KB)
  1. import numpy as np
  2. import pytest
  3. from pandas import (
  4. Categorical,
  5. DataFrame,
  6. Series,
  7. _testing as tm,
  8. concat,
  9. read_hdf,
  10. )
  11. from pandas.tests.io.pytables.common import (
  12. _maybe_remove,
  13. ensure_clean_store,
  14. )
  15. pytestmark = [
  16. pytest.mark.single_cpu,
  17. ]
  18. def test_categorical(setup_path):
  19. with ensure_clean_store(setup_path) as store:
  20. # Basic
  21. _maybe_remove(store, "s")
  22. s = Series(
  23. Categorical(
  24. ["a", "b", "b", "a", "a", "c"],
  25. categories=["a", "b", "c", "d"],
  26. ordered=False,
  27. )
  28. )
  29. store.append("s", s, format="table")
  30. result = store.select("s")
  31. tm.assert_series_equal(s, result)
  32. _maybe_remove(store, "s_ordered")
  33. s = Series(
  34. Categorical(
  35. ["a", "b", "b", "a", "a", "c"],
  36. categories=["a", "b", "c", "d"],
  37. ordered=True,
  38. )
  39. )
  40. store.append("s_ordered", s, format="table")
  41. result = store.select("s_ordered")
  42. tm.assert_series_equal(s, result)
  43. _maybe_remove(store, "df")
  44. df = DataFrame({"s": s, "vals": [1, 2, 3, 4, 5, 6]})
  45. store.append("df", df, format="table")
  46. result = store.select("df")
  47. tm.assert_frame_equal(result, df)
  48. # Dtypes
  49. _maybe_remove(store, "si")
  50. s = Series([1, 1, 2, 2, 3, 4, 5]).astype("category")
  51. store.append("si", s)
  52. result = store.select("si")
  53. tm.assert_series_equal(result, s)
  54. _maybe_remove(store, "si2")
  55. s = Series([1, 1, np.nan, 2, 3, 4, 5]).astype("category")
  56. store.append("si2", s)
  57. result = store.select("si2")
  58. tm.assert_series_equal(result, s)
  59. # Multiple
  60. _maybe_remove(store, "df2")
  61. df2 = df.copy()
  62. df2["s2"] = Series(list("abcdefg")).astype("category")
  63. store.append("df2", df2)
  64. result = store.select("df2")
  65. tm.assert_frame_equal(result, df2)
  66. # Make sure the metadata is OK
  67. info = store.info()
  68. assert "/df2 " in info
  69. # df2._mgr.blocks[0] and df2._mgr.blocks[2] are Categorical
  70. assert "/df2/meta/values_block_0/meta" in info
  71. assert "/df2/meta/values_block_2/meta" in info
  72. # unordered
  73. _maybe_remove(store, "s2")
  74. s = Series(
  75. Categorical(
  76. ["a", "b", "b", "a", "a", "c"],
  77. categories=["a", "b", "c", "d"],
  78. ordered=False,
  79. )
  80. )
  81. store.append("s2", s, format="table")
  82. result = store.select("s2")
  83. tm.assert_series_equal(result, s)
  84. # Query
  85. _maybe_remove(store, "df3")
  86. store.append("df3", df, data_columns=["s"])
  87. expected = df[df.s.isin(["b", "c"])]
  88. result = store.select("df3", where=['s in ["b","c"]'])
  89. tm.assert_frame_equal(result, expected)
  90. expected = df[df.s.isin(["b", "c"])]
  91. result = store.select("df3", where=['s = ["b","c"]'])
  92. tm.assert_frame_equal(result, expected)
  93. expected = df[df.s.isin(["d"])]
  94. result = store.select("df3", where=['s in ["d"]'])
  95. tm.assert_frame_equal(result, expected)
  96. expected = df[df.s.isin(["f"])]
  97. result = store.select("df3", where=['s in ["f"]'])
  98. tm.assert_frame_equal(result, expected)
  99. # Appending with same categories is ok
  100. store.append("df3", df)
  101. df = concat([df, df])
  102. expected = df[df.s.isin(["b", "c"])]
  103. result = store.select("df3", where=['s in ["b","c"]'])
  104. tm.assert_frame_equal(result, expected)
  105. # Appending must have the same categories
  106. df3 = df.copy()
  107. df3["s"] = df3["s"].cat.remove_unused_categories()
  108. msg = "cannot append a categorical with different categories to the existing"
  109. with pytest.raises(ValueError, match=msg):
  110. store.append("df3", df3)
  111. # Remove, and make sure meta data is removed (its a recursive
  112. # removal so should be).
  113. result = store.select("df3/meta/s/meta")
  114. assert result is not None
  115. store.remove("df3")
  116. with pytest.raises(
  117. KeyError, match="'No object named df3/meta/s/meta in the file'"
  118. ):
  119. store.select("df3/meta/s/meta")
  120. def test_categorical_conversion(tmp_path, setup_path):
  121. # GH13322
  122. # Check that read_hdf with categorical columns doesn't return rows if
  123. # where criteria isn't met.
  124. obsids = ["ESP_012345_6789", "ESP_987654_3210"]
  125. imgids = ["APF00006np", "APF0001imm"]
  126. data = [4.3, 9.8]
  127. # Test without categories
  128. df = DataFrame({"obsids": obsids, "imgids": imgids, "data": data})
  129. # We are expecting an empty DataFrame matching types of df
  130. expected = df.iloc[[], :]
  131. path = tmp_path / setup_path
  132. df.to_hdf(path, "df", format="table", data_columns=True)
  133. result = read_hdf(path, "df", where="obsids=B")
  134. tm.assert_frame_equal(result, expected)
  135. # Test with categories
  136. df.obsids = df.obsids.astype("category")
  137. df.imgids = df.imgids.astype("category")
  138. # We are expecting an empty DataFrame matching types of df
  139. expected = df.iloc[[], :]
  140. path = tmp_path / setup_path
  141. df.to_hdf(path, "df", format="table", data_columns=True)
  142. result = read_hdf(path, "df", where="obsids=B")
  143. tm.assert_frame_equal(result, expected)
  144. def test_categorical_nan_only_columns(tmp_path, setup_path):
  145. # GH18413
  146. # Check that read_hdf with categorical columns with NaN-only values can
  147. # be read back.
  148. df = DataFrame(
  149. {
  150. "a": ["a", "b", "c", np.nan],
  151. "b": [np.nan, np.nan, np.nan, np.nan],
  152. "c": [1, 2, 3, 4],
  153. "d": Series([None] * 4, dtype=object),
  154. }
  155. )
  156. df["a"] = df.a.astype("category")
  157. df["b"] = df.b.astype("category")
  158. df["d"] = df.b.astype("category")
  159. expected = df
  160. path = tmp_path / setup_path
  161. df.to_hdf(path, "df", format="table", data_columns=True)
  162. result = read_hdf(path, "df")
  163. tm.assert_frame_equal(result, expected)
  164. @pytest.mark.parametrize(
  165. "where, df, expected",
  166. [
  167. ('col=="q"', DataFrame({"col": ["a", "b", "s"]}), DataFrame({"col": []})),
  168. ('col=="a"', DataFrame({"col": ["a", "b", "s"]}), DataFrame({"col": ["a"]})),
  169. ],
  170. )
  171. def test_convert_value(
  172. tmp_path, setup_path, where: str, df: DataFrame, expected: DataFrame
  173. ):
  174. # GH39420
  175. # Check that read_hdf with categorical columns can filter by where condition.
  176. df.col = df.col.astype("category")
  177. max_widths = {"col": 1}
  178. categorical_values = sorted(df.col.unique())
  179. expected.col = expected.col.astype("category")
  180. expected.col = expected.col.cat.set_categories(categorical_values)
  181. path = tmp_path / setup_path
  182. df.to_hdf(path, "df", format="table", min_itemsize=max_widths)
  183. result = read_hdf(path, where=where)
  184. tm.assert_frame_equal(result, expected)