test_xport.py 5.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167
  1. import numpy as np
  2. import pytest
  3. import pandas as pd
  4. import pandas._testing as tm
  5. from pandas.io.sas.sasreader import read_sas
  6. # CSV versions of test xpt files were obtained using the R foreign library
  7. # Numbers in a SAS xport file are always float64, so need to convert
  8. # before making comparisons.
  9. def numeric_as_float(data):
  10. for v in data.columns:
  11. if data[v].dtype is np.dtype("int64"):
  12. data[v] = data[v].astype(np.float64)
  13. class TestXport:
  14. @pytest.fixture
  15. def file01(self, datapath):
  16. return datapath("io", "sas", "data", "DEMO_G.xpt")
  17. @pytest.fixture
  18. def file02(self, datapath):
  19. return datapath("io", "sas", "data", "SSHSV1_A.xpt")
  20. @pytest.fixture
  21. def file03(self, datapath):
  22. return datapath("io", "sas", "data", "DRXFCD_G.xpt")
  23. @pytest.fixture
  24. def file04(self, datapath):
  25. return datapath("io", "sas", "data", "paxraw_d_short.xpt")
  26. @pytest.fixture
  27. def file05(self, datapath):
  28. return datapath("io", "sas", "data", "DEMO_PUF.cpt")
  29. @pytest.mark.slow
  30. def test1_basic(self, file01):
  31. # Tests with DEMO_G.xpt (all numeric file)
  32. # Compare to this
  33. data_csv = pd.read_csv(file01.replace(".xpt", ".csv"))
  34. numeric_as_float(data_csv)
  35. # Read full file
  36. data = read_sas(file01, format="xport")
  37. tm.assert_frame_equal(data, data_csv)
  38. num_rows = data.shape[0]
  39. # Test reading beyond end of file
  40. with read_sas(file01, format="xport", iterator=True) as reader:
  41. data = reader.read(num_rows + 100)
  42. assert data.shape[0] == num_rows
  43. # Test incremental read with `read` method.
  44. with read_sas(file01, format="xport", iterator=True) as reader:
  45. data = reader.read(10)
  46. tm.assert_frame_equal(data, data_csv.iloc[0:10, :])
  47. # Test incremental read with `get_chunk` method.
  48. with read_sas(file01, format="xport", chunksize=10) as reader:
  49. data = reader.get_chunk()
  50. tm.assert_frame_equal(data, data_csv.iloc[0:10, :])
  51. # Test read in loop
  52. m = 0
  53. with read_sas(file01, format="xport", chunksize=100) as reader:
  54. for x in reader:
  55. m += x.shape[0]
  56. assert m == num_rows
  57. # Read full file with `read_sas` method
  58. data = read_sas(file01)
  59. tm.assert_frame_equal(data, data_csv)
  60. def test1_index(self, file01):
  61. # Tests with DEMO_G.xpt using index (all numeric file)
  62. # Compare to this
  63. data_csv = pd.read_csv(file01.replace(".xpt", ".csv"))
  64. data_csv = data_csv.set_index("SEQN")
  65. numeric_as_float(data_csv)
  66. # Read full file
  67. data = read_sas(file01, index="SEQN", format="xport")
  68. tm.assert_frame_equal(data, data_csv, check_index_type=False)
  69. # Test incremental read with `read` method.
  70. with read_sas(file01, index="SEQN", format="xport", iterator=True) as reader:
  71. data = reader.read(10)
  72. tm.assert_frame_equal(data, data_csv.iloc[0:10, :], check_index_type=False)
  73. # Test incremental read with `get_chunk` method.
  74. with read_sas(file01, index="SEQN", format="xport", chunksize=10) as reader:
  75. data = reader.get_chunk()
  76. tm.assert_frame_equal(data, data_csv.iloc[0:10, :], check_index_type=False)
  77. def test1_incremental(self, file01):
  78. # Test with DEMO_G.xpt, reading full file incrementally
  79. data_csv = pd.read_csv(file01.replace(".xpt", ".csv"))
  80. data_csv = data_csv.set_index("SEQN")
  81. numeric_as_float(data_csv)
  82. with read_sas(file01, index="SEQN", chunksize=1000) as reader:
  83. all_data = list(reader)
  84. data = pd.concat(all_data, axis=0)
  85. tm.assert_frame_equal(data, data_csv, check_index_type=False)
  86. def test2(self, file02):
  87. # Test with SSHSV1_A.xpt
  88. # Compare to this
  89. data_csv = pd.read_csv(file02.replace(".xpt", ".csv"))
  90. numeric_as_float(data_csv)
  91. data = read_sas(file02)
  92. tm.assert_frame_equal(data, data_csv)
  93. def test2_binary(self, file02):
  94. # Test with SSHSV1_A.xpt, read as a binary file
  95. # Compare to this
  96. data_csv = pd.read_csv(file02.replace(".xpt", ".csv"))
  97. numeric_as_float(data_csv)
  98. with open(file02, "rb") as fd:
  99. # GH#35693 ensure that if we pass an open file, we
  100. # dont incorrectly close it in read_sas
  101. data = read_sas(fd, format="xport")
  102. tm.assert_frame_equal(data, data_csv)
  103. def test_multiple_types(self, file03):
  104. # Test with DRXFCD_G.xpt (contains text and numeric variables)
  105. # Compare to this
  106. data_csv = pd.read_csv(file03.replace(".xpt", ".csv"))
  107. data = read_sas(file03, encoding="utf-8")
  108. tm.assert_frame_equal(data, data_csv)
  109. def test_truncated_float_support(self, file04):
  110. # Test with paxraw_d_short.xpt, a shortened version of:
  111. # http://wwwn.cdc.gov/Nchs/Nhanes/2005-2006/PAXRAW_D.ZIP
  112. # This file has truncated floats (5 bytes in this case).
  113. # GH 11713
  114. data_csv = pd.read_csv(file04.replace(".xpt", ".csv"))
  115. data = read_sas(file04, format="xport")
  116. tm.assert_frame_equal(data.astype("int64"), data_csv)
  117. def test_cport_header_found_raises(self, file05):
  118. # Test with DEMO_PUF.cpt, the beginning of puf2019_1_fall.xpt
  119. # from https://www.cms.gov/files/zip/puf2019.zip
  120. # (despite the extension, it's a cpt file)
  121. msg = "Header record indicates a CPORT file, which is not readable."
  122. with pytest.raises(ValueError, match=msg):
  123. read_sas(file05, format="xport")