# test_orc.py (12 KB)
  1. """ test orc compat """
  2. import datetime
  3. from decimal import Decimal
  4. from io import BytesIO
  5. import os
  6. import numpy as np
  7. import pytest
  8. import pandas.util._test_decorators as td
  9. import pandas as pd
  10. from pandas import read_orc
  11. import pandas._testing as tm
  12. from pandas.core.arrays import StringArray
  13. pytest.importorskip("pyarrow.orc")
  14. import pyarrow as pa
  15. @pytest.fixture
  16. def dirpath(datapath):
  17. return datapath("io", "data", "orc")
  18. # Examples of dataframes with dtypes for which conversion to ORC
  19. # hasn't been implemented yet, that is, Category, unsigned integers,
  20. # interval, period and sparse.
  21. orc_writer_dtypes_not_supported = [
  22. pd.DataFrame({"unimpl": np.array([1, 20], dtype="uint64")}),
  23. pd.DataFrame({"unimpl": pd.Series(["a", "b", "a"], dtype="category")}),
  24. pd.DataFrame(
  25. {"unimpl": [pd.Interval(left=0, right=2), pd.Interval(left=0, right=5)]}
  26. ),
  27. pd.DataFrame(
  28. {
  29. "unimpl": [
  30. pd.Period("2022-01-03", freq="D"),
  31. pd.Period("2022-01-04", freq="D"),
  32. ]
  33. }
  34. ),
  35. pd.DataFrame({"unimpl": [np.nan] * 50}).astype(pd.SparseDtype("float", np.nan)),
  36. ]
  37. def test_orc_reader_empty(dirpath):
  38. columns = [
  39. "boolean1",
  40. "byte1",
  41. "short1",
  42. "int1",
  43. "long1",
  44. "float1",
  45. "double1",
  46. "bytes1",
  47. "string1",
  48. ]
  49. dtypes = [
  50. "bool",
  51. "int8",
  52. "int16",
  53. "int32",
  54. "int64",
  55. "float32",
  56. "float64",
  57. "object",
  58. "object",
  59. ]
  60. expected = pd.DataFrame(index=pd.RangeIndex(0))
  61. for colname, dtype in zip(columns, dtypes):
  62. expected[colname] = pd.Series(dtype=dtype)
  63. inputfile = os.path.join(dirpath, "TestOrcFile.emptyFile.orc")
  64. got = read_orc(inputfile, columns=columns)
  65. tm.assert_equal(expected, got)
  66. def test_orc_reader_basic(dirpath):
  67. data = {
  68. "boolean1": np.array([False, True], dtype="bool"),
  69. "byte1": np.array([1, 100], dtype="int8"),
  70. "short1": np.array([1024, 2048], dtype="int16"),
  71. "int1": np.array([65536, 65536], dtype="int32"),
  72. "long1": np.array([9223372036854775807, 9223372036854775807], dtype="int64"),
  73. "float1": np.array([1.0, 2.0], dtype="float32"),
  74. "double1": np.array([-15.0, -5.0], dtype="float64"),
  75. "bytes1": np.array([b"\x00\x01\x02\x03\x04", b""], dtype="object"),
  76. "string1": np.array(["hi", "bye"], dtype="object"),
  77. }
  78. expected = pd.DataFrame.from_dict(data)
  79. inputfile = os.path.join(dirpath, "TestOrcFile.test1.orc")
  80. got = read_orc(inputfile, columns=data.keys())
  81. tm.assert_equal(expected, got)
  82. def test_orc_reader_decimal(dirpath):
  83. # Only testing the first 10 rows of data
  84. data = {
  85. "_col0": np.array(
  86. [
  87. Decimal("-1000.50000"),
  88. Decimal("-999.60000"),
  89. Decimal("-998.70000"),
  90. Decimal("-997.80000"),
  91. Decimal("-996.90000"),
  92. Decimal("-995.10000"),
  93. Decimal("-994.11000"),
  94. Decimal("-993.12000"),
  95. Decimal("-992.13000"),
  96. Decimal("-991.14000"),
  97. ],
  98. dtype="object",
  99. )
  100. }
  101. expected = pd.DataFrame.from_dict(data)
  102. inputfile = os.path.join(dirpath, "TestOrcFile.decimal.orc")
  103. got = read_orc(inputfile).iloc[:10]
  104. tm.assert_equal(expected, got)
  105. def test_orc_reader_date_low(dirpath):
  106. data = {
  107. "time": np.array(
  108. [
  109. "1900-05-05 12:34:56.100000",
  110. "1900-05-05 12:34:56.100100",
  111. "1900-05-05 12:34:56.100200",
  112. "1900-05-05 12:34:56.100300",
  113. "1900-05-05 12:34:56.100400",
  114. "1900-05-05 12:34:56.100500",
  115. "1900-05-05 12:34:56.100600",
  116. "1900-05-05 12:34:56.100700",
  117. "1900-05-05 12:34:56.100800",
  118. "1900-05-05 12:34:56.100900",
  119. ],
  120. dtype="datetime64[ns]",
  121. ),
  122. "date": np.array(
  123. [
  124. datetime.date(1900, 12, 25),
  125. datetime.date(1900, 12, 25),
  126. datetime.date(1900, 12, 25),
  127. datetime.date(1900, 12, 25),
  128. datetime.date(1900, 12, 25),
  129. datetime.date(1900, 12, 25),
  130. datetime.date(1900, 12, 25),
  131. datetime.date(1900, 12, 25),
  132. datetime.date(1900, 12, 25),
  133. datetime.date(1900, 12, 25),
  134. ],
  135. dtype="object",
  136. ),
  137. }
  138. expected = pd.DataFrame.from_dict(data)
  139. inputfile = os.path.join(dirpath, "TestOrcFile.testDate1900.orc")
  140. got = read_orc(inputfile).iloc[:10]
  141. tm.assert_equal(expected, got)
  142. def test_orc_reader_date_high(dirpath):
  143. data = {
  144. "time": np.array(
  145. [
  146. "2038-05-05 12:34:56.100000",
  147. "2038-05-05 12:34:56.100100",
  148. "2038-05-05 12:34:56.100200",
  149. "2038-05-05 12:34:56.100300",
  150. "2038-05-05 12:34:56.100400",
  151. "2038-05-05 12:34:56.100500",
  152. "2038-05-05 12:34:56.100600",
  153. "2038-05-05 12:34:56.100700",
  154. "2038-05-05 12:34:56.100800",
  155. "2038-05-05 12:34:56.100900",
  156. ],
  157. dtype="datetime64[ns]",
  158. ),
  159. "date": np.array(
  160. [
  161. datetime.date(2038, 12, 25),
  162. datetime.date(2038, 12, 25),
  163. datetime.date(2038, 12, 25),
  164. datetime.date(2038, 12, 25),
  165. datetime.date(2038, 12, 25),
  166. datetime.date(2038, 12, 25),
  167. datetime.date(2038, 12, 25),
  168. datetime.date(2038, 12, 25),
  169. datetime.date(2038, 12, 25),
  170. datetime.date(2038, 12, 25),
  171. ],
  172. dtype="object",
  173. ),
  174. }
  175. expected = pd.DataFrame.from_dict(data)
  176. inputfile = os.path.join(dirpath, "TestOrcFile.testDate2038.orc")
  177. got = read_orc(inputfile).iloc[:10]
  178. tm.assert_equal(expected, got)
  179. def test_orc_reader_snappy_compressed(dirpath):
  180. data = {
  181. "int1": np.array(
  182. [
  183. -1160101563,
  184. 1181413113,
  185. 2065821249,
  186. -267157795,
  187. 172111193,
  188. 1752363137,
  189. 1406072123,
  190. 1911809390,
  191. -1308542224,
  192. -467100286,
  193. ],
  194. dtype="int32",
  195. ),
  196. "string1": np.array(
  197. [
  198. "f50dcb8",
  199. "382fdaaa",
  200. "90758c6",
  201. "9e8caf3f",
  202. "ee97332b",
  203. "d634da1",
  204. "2bea4396",
  205. "d67d89e8",
  206. "ad71007e",
  207. "e8c82066",
  208. ],
  209. dtype="object",
  210. ),
  211. }
  212. expected = pd.DataFrame.from_dict(data)
  213. inputfile = os.path.join(dirpath, "TestOrcFile.testSnappy.orc")
  214. got = read_orc(inputfile).iloc[:10]
  215. tm.assert_equal(expected, got)
  216. @td.skip_if_no("pyarrow", min_version="7.0.0")
  217. def test_orc_roundtrip_file(dirpath):
  218. # GH44554
  219. # PyArrow gained ORC write support with the current argument order
  220. data = {
  221. "boolean1": np.array([False, True], dtype="bool"),
  222. "byte1": np.array([1, 100], dtype="int8"),
  223. "short1": np.array([1024, 2048], dtype="int16"),
  224. "int1": np.array([65536, 65536], dtype="int32"),
  225. "long1": np.array([9223372036854775807, 9223372036854775807], dtype="int64"),
  226. "float1": np.array([1.0, 2.0], dtype="float32"),
  227. "double1": np.array([-15.0, -5.0], dtype="float64"),
  228. "bytes1": np.array([b"\x00\x01\x02\x03\x04", b""], dtype="object"),
  229. "string1": np.array(["hi", "bye"], dtype="object"),
  230. }
  231. expected = pd.DataFrame.from_dict(data)
  232. with tm.ensure_clean() as path:
  233. expected.to_orc(path)
  234. got = read_orc(path)
  235. tm.assert_equal(expected, got)
  236. @td.skip_if_no("pyarrow", min_version="7.0.0")
  237. def test_orc_roundtrip_bytesio():
  238. # GH44554
  239. # PyArrow gained ORC write support with the current argument order
  240. data = {
  241. "boolean1": np.array([False, True], dtype="bool"),
  242. "byte1": np.array([1, 100], dtype="int8"),
  243. "short1": np.array([1024, 2048], dtype="int16"),
  244. "int1": np.array([65536, 65536], dtype="int32"),
  245. "long1": np.array([9223372036854775807, 9223372036854775807], dtype="int64"),
  246. "float1": np.array([1.0, 2.0], dtype="float32"),
  247. "double1": np.array([-15.0, -5.0], dtype="float64"),
  248. "bytes1": np.array([b"\x00\x01\x02\x03\x04", b""], dtype="object"),
  249. "string1": np.array(["hi", "bye"], dtype="object"),
  250. }
  251. expected = pd.DataFrame.from_dict(data)
  252. bytes = expected.to_orc()
  253. got = read_orc(BytesIO(bytes))
  254. tm.assert_equal(expected, got)
  255. @td.skip_if_no("pyarrow", min_version="7.0.0")
  256. @pytest.mark.parametrize("df_not_supported", orc_writer_dtypes_not_supported)
  257. def test_orc_writer_dtypes_not_supported(df_not_supported):
  258. # GH44554
  259. # PyArrow gained ORC write support with the current argument order
  260. msg = "The dtype of one or more columns is not supported yet."
  261. with pytest.raises(NotImplementedError, match=msg):
  262. df_not_supported.to_orc()
  263. @td.skip_if_no("pyarrow", min_version="7.0.0")
  264. def test_orc_dtype_backend_pyarrow():
  265. df = pd.DataFrame(
  266. {
  267. "string": list("abc"),
  268. "string_with_nan": ["a", np.nan, "c"],
  269. "string_with_none": ["a", None, "c"],
  270. "bytes": [b"foo", b"bar", None],
  271. "int": list(range(1, 4)),
  272. "float": np.arange(4.0, 7.0, dtype="float64"),
  273. "float_with_nan": [2.0, np.nan, 3.0],
  274. "bool": [True, False, True],
  275. "bool_with_na": [True, False, None],
  276. "datetime": pd.date_range("20130101", periods=3),
  277. "datetime_with_nat": [
  278. pd.Timestamp("20130101"),
  279. pd.NaT,
  280. pd.Timestamp("20130103"),
  281. ],
  282. }
  283. )
  284. bytes_data = df.copy().to_orc()
  285. result = read_orc(BytesIO(bytes_data), dtype_backend="pyarrow")
  286. expected = pd.DataFrame(
  287. {
  288. col: pd.arrays.ArrowExtensionArray(pa.array(df[col], from_pandas=True))
  289. for col in df.columns
  290. }
  291. )
  292. tm.assert_frame_equal(result, expected)
  293. @td.skip_if_no("pyarrow", min_version="7.0.0")
  294. def test_orc_dtype_backend_numpy_nullable():
  295. # GH#50503
  296. df = pd.DataFrame(
  297. {
  298. "string": list("abc"),
  299. "string_with_nan": ["a", np.nan, "c"],
  300. "string_with_none": ["a", None, "c"],
  301. "int": list(range(1, 4)),
  302. "int_with_nan": pd.Series([1, pd.NA, 3], dtype="Int64"),
  303. "na_only": pd.Series([pd.NA, pd.NA, pd.NA], dtype="Int64"),
  304. "float": np.arange(4.0, 7.0, dtype="float64"),
  305. "float_with_nan": [2.0, np.nan, 3.0],
  306. "bool": [True, False, True],
  307. "bool_with_na": [True, False, None],
  308. }
  309. )
  310. bytes_data = df.copy().to_orc()
  311. result = read_orc(BytesIO(bytes_data), dtype_backend="numpy_nullable")
  312. expected = pd.DataFrame(
  313. {
  314. "string": StringArray(np.array(["a", "b", "c"], dtype=np.object_)),
  315. "string_with_nan": StringArray(
  316. np.array(["a", pd.NA, "c"], dtype=np.object_)
  317. ),
  318. "string_with_none": StringArray(
  319. np.array(["a", pd.NA, "c"], dtype=np.object_)
  320. ),
  321. "int": pd.Series([1, 2, 3], dtype="Int64"),
  322. "int_with_nan": pd.Series([1, pd.NA, 3], dtype="Int64"),
  323. "na_only": pd.Series([pd.NA, pd.NA, pd.NA], dtype="Int64"),
  324. "float": pd.Series([4.0, 5.0, 6.0], dtype="Float64"),
  325. "float_with_nan": pd.Series([2.0, pd.NA, 3.0], dtype="Float64"),
  326. "bool": pd.Series([True, False, True], dtype="boolean"),
  327. "bool_with_na": pd.Series([True, False, pd.NA], dtype="boolean"),
  328. }
  329. )
  330. tm.assert_frame_equal(result, expected)
  331. def test_invalid_dtype_backend():
  332. msg = (
  333. "dtype_backend numpy is invalid, only 'numpy_nullable' and "
  334. "'pyarrow' are allowed."
  335. )
  336. df = pd.DataFrame({"int": list(range(1, 4))})
  337. with tm.ensure_clean("tmp.orc") as path:
  338. df.to_orc(path)
  339. with pytest.raises(ValueError, match=msg):
  340. read_orc(path, dtype_backend="numpy")