# test_downstream.py

  1. """
  2. Testing that we work in the downstream packages
  3. """
  4. import importlib
  5. import subprocess
  6. import sys
  7. import numpy as np
  8. import pytest
  9. from pandas.errors import IntCastingNaNError
  10. import pandas.util._test_decorators as td
  11. import pandas as pd
  12. from pandas import (
  13. DataFrame,
  14. Series,
  15. )
  16. import pandas._testing as tm
  17. def import_module(name):
  18. # we *only* want to skip if the module is truly not available
  19. # and NOT just an actual import error because of pandas changes
  20. try:
  21. return importlib.import_module(name)
  22. except ModuleNotFoundError:
  23. pytest.skip(f"skipping as {name} not available")
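

# Typical usage of the helper above (an illustrative sketch of the pattern used
# throughout this module, not an additional test): a test body imports its
# optional dependency up front, e.g.
#     dask = import_module("dask")
# so that the whole test is skipped when the downstream package is not installed.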


@pytest.fixture
def df():
    return DataFrame({"A": [1, 2, 3]})


def test_dask(df):
    # dask sets "compute.use_numexpr" to False, so catch the current value
    # and ensure to reset it afterwards to avoid impacting other tests
    olduse = pd.get_option("compute.use_numexpr")

    try:
        toolz = import_module("toolz")  # noqa:F841
        dask = import_module("dask")  # noqa:F841

        import dask.dataframe as dd

        ddf = dd.from_pandas(df, npartitions=3)
        assert ddf.A is not None
        assert ddf.compute() is not None
    finally:
        pd.set_option("compute.use_numexpr", olduse)


def test_dask_ufunc():
    # dask sets "compute.use_numexpr" to False, so catch the current value
    # and ensure to reset it afterwards to avoid impacting other tests
    olduse = pd.get_option("compute.use_numexpr")

    try:
        dask = import_module("dask")  # noqa:F841

        import dask.array as da
        import dask.dataframe as dd

        s = Series([1.5, 2.3, 3.7, 4.0])
        ds = dd.from_pandas(s, npartitions=2)

        result = da.fix(ds).compute()
        expected = np.fix(s)
        tm.assert_series_equal(result, expected)
    finally:
        pd.set_option("compute.use_numexpr", olduse)


@td.skip_if_no("dask")
def test_construct_dask_float_array_int_dtype_match_ndarray():
    # GH#40110 make sure we treat a float-dtype dask array with the same
    # rules we would for an ndarray
    import dask.dataframe as dd

    arr = np.array([1, 2.5, 3])
    darr = dd.from_array(arr)

    res = Series(darr)
    expected = Series(arr)
    tm.assert_series_equal(res, expected)

    # GH#49599 in 2.0 we raise instead of silently ignoring the dtype
    msg = "Trying to coerce float values to integers"
    with pytest.raises(ValueError, match=msg):
        Series(darr, dtype="i8")

    msg = r"Cannot convert non-finite values \(NA or inf\) to integer"
    arr[2] = np.nan
    with pytest.raises(IntCastingNaNError, match=msg):
        Series(darr, dtype="i8")

    # which is the same as we get with a numpy input
    with pytest.raises(IntCastingNaNError, match=msg):
        Series(arr, dtype="i8")


def test_xarray(df):
    xarray = import_module("xarray")  # noqa:F841

    assert df.to_xarray() is not None


@td.skip_if_no("cftime")
@td.skip_if_no("xarray", "0.21.0")
def test_xarray_cftimeindex_nearest():
    # https://github.com/pydata/xarray/issues/3751
    import cftime
    import xarray

    times = xarray.cftime_range("0001", periods=2)
    key = cftime.DatetimeGregorian(2000, 1, 1)
    result = times.get_indexer([key], method="nearest")
    expected = 1
    assert result == expected


def test_oo_optimizable():
    # GH 21071
    subprocess.check_call([sys.executable, "-OO", "-c", "import pandas"])


def test_oo_optimized_datetime_index_unpickle():
    # GH 42866
    subprocess.check_call(
        [
            sys.executable,
            "-OO",
            "-c",
            (
                "import pandas as pd, pickle; "
                "pickle.loads(pickle.dumps(pd.date_range('2021-01-01', periods=1)))"
            ),
        ]
    )


@pytest.mark.network
@tm.network
def test_statsmodels():
    statsmodels = import_module("statsmodels")  # noqa:F841
    import statsmodels.api as sm
    import statsmodels.formula.api as smf

    df = sm.datasets.get_rdataset("Guerry", "HistData").data
    smf.ols("Lottery ~ Literacy + np.log(Pop1831)", data=df).fit()


def test_scikit_learn():
    sklearn = import_module("sklearn")  # noqa:F841
    from sklearn import (
        datasets,
        svm,
    )

    digits = datasets.load_digits()
    clf = svm.SVC(gamma=0.001, C=100.0)
    clf.fit(digits.data[:-1], digits.target[:-1])
    clf.predict(digits.data[-1:])


@pytest.mark.network
@tm.network
def test_seaborn():
    seaborn = import_module("seaborn")
    tips = seaborn.load_dataset("tips")
    seaborn.stripplot(x="day", y="total_bill", data=tips)


def test_pandas_gbq():
    # Older versions import from non-public, non-existent pandas funcs
    pytest.importorskip("pandas_gbq", minversion="0.10.0")
    pandas_gbq = import_module("pandas_gbq")  # noqa:F841


@pytest.mark.network
@tm.network
@pytest.mark.xfail(
    raises=ValueError,
    reason="The Quandl API key must be provided either through the api_key "
    "variable or through the environmental variable QUANDL_API_KEY",
)
def test_pandas_datareader():
    pandas_datareader = import_module("pandas_datareader")
    pandas_datareader.DataReader("F", "quandl", "2017-01-01", "2017-02-01")


def test_pyarrow(df):
    pyarrow = import_module("pyarrow")
    table = pyarrow.Table.from_pandas(df)
    result = table.to_pandas()
    tm.assert_frame_equal(result, df)


def test_yaml_dump(df):
    # GH#42748
    yaml = import_module("yaml")

    dumped = yaml.dump(df)

    loaded = yaml.load(dumped, Loader=yaml.Loader)
    tm.assert_frame_equal(df, loaded)

    loaded2 = yaml.load(dumped, Loader=yaml.UnsafeLoader)
    tm.assert_frame_equal(df, loaded2)


def test_missing_required_dependency():
    # GH 23868
    # To ensure proper isolation, we pass these flags
    # -S : disable site-packages
    # -s : disable user site-packages
    # -E : disable PYTHON* env vars, especially PYTHONPATH
    # https://github.com/MacPython/pandas-wheels/pull/50
    pyexe = sys.executable.replace("\\", "/")

    # We skip this test if pandas is installed as a site package. We first
    # import the package normally and check the path to the module before
    # executing the test which imports pandas with site packages disabled.
    call = [pyexe, "-c", "import pandas;print(pandas.__file__)"]
    output = subprocess.check_output(call).decode()
    if "site-packages" in output:
        pytest.skip("pandas installed as site package")

    # This test will fail if pandas is installed as a site package. The flags
    # prevent pandas being imported and the test will report Failed: DID NOT
    # RAISE <class 'subprocess.CalledProcessError'>
    call = [pyexe, "-sSE", "-c", "import pandas"]

    msg = (
        rf"Command '\['{pyexe}', '-sSE', '-c', 'import pandas'\]' "
        "returned non-zero exit status 1."
    )
    with pytest.raises(subprocess.CalledProcessError, match=msg) as exc:
        subprocess.check_output(call, stderr=subprocess.STDOUT)

    output = exc.value.stdout.decode()
    for name in ["numpy", "pytz", "dateutil"]:
        assert name in output


def test_frame_setitem_dask_array_into_new_col():
    # GH#47128

    # dask sets "compute.use_numexpr" to False, so catch the current value
    # and ensure to reset it afterwards to avoid impacting other tests
    olduse = pd.get_option("compute.use_numexpr")

    try:
        dask = import_module("dask")  # noqa:F841
        import dask.array as da

        dda = da.array([1, 2])
        df = DataFrame({"a": ["a", "b"]})
        df["b"] = dda
        df["c"] = dda
        df.loc[[False, True], "b"] = 100
        result = df.loc[[1], :]
        expected = DataFrame({"a": ["b"], "b": [100], "c": [2]}, index=[1])
        tm.assert_frame_equal(result, expected)
    finally:
        pd.set_option("compute.use_numexpr", olduse)