  1. """
  2. Read SAS7BDAT files
  3. Based on code written by Jared Hobbs:
  4. https://bitbucket.org/jaredhobbs/sas7bdat
  5. See also:
  6. https://github.com/BioStatMatt/sas7bdat
  7. Partial documentation of the file format:
  8. https://cran.r-project.org/package=sas7bdat/vignettes/sas7bdat.pdf
  9. Reference for binary data compression:
  10. http://collaboration.cmc.ec.gc.ca/science/rpn/biblio/ddj/Website/articles/CUJ/1992/9210/ross/ross.htm
  11. """

from __future__ import annotations

from collections import abc
from datetime import (
    datetime,
    timedelta,
)
import sys
from typing import cast

import numpy as np

from pandas._typing import (
    CompressionOptions,
    FilePath,
    ReadBuffer,
)
from pandas.errors import (
    EmptyDataError,
    OutOfBoundsDatetime,
)

import pandas as pd
from pandas import (
    DataFrame,
    isna,
)

from pandas.io.common import get_handle
from pandas.io.sas._byteswap import (
    read_double_with_byteswap,
    read_float_with_byteswap,
    read_uint16_with_byteswap,
    read_uint32_with_byteswap,
    read_uint64_with_byteswap,
)
from pandas.io.sas._sas import (
    Parser,
    get_subheader_index,
)
import pandas.io.sas.sas_constants as const
from pandas.io.sas.sasreader import ReaderBase


def _parse_datetime(sas_datetime: float, unit: str):
    if isna(sas_datetime):
        return pd.NaT

    if unit == "s":
        return datetime(1960, 1, 1) + timedelta(seconds=sas_datetime)

    elif unit == "d":
        return datetime(1960, 1, 1) + timedelta(days=sas_datetime)

    else:
        raise ValueError("unit must be 'd' or 's'")
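

# Illustrative only: SAS stores datetimes as seconds (and dates as days)
# elapsed since the 1960-01-01 epoch, so, for example,
# _parse_datetime(86400.0, "s") gives datetime(1960, 1, 2, 0, 0).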


def _convert_datetimes(sas_datetimes: pd.Series, unit: str) -> pd.Series:
    """
    Convert to Timestamp if possible, otherwise to datetime.datetime.

    SAS float64 lacks precision for more than ms resolution so the fit
    to datetime.datetime is ok.

    Parameters
    ----------
    sas_datetimes : {Series, Sequence[float]}
        Dates or datetimes in SAS
    unit : {str}
        "d" if the floats represent dates, "s" for datetimes

    Returns
    -------
    Series
        Series of datetime64 dtype or datetime.datetime.
    """
    try:
        return pd.to_datetime(sas_datetimes, unit=unit, origin="1960-01-01")
    except OutOfBoundsDatetime:
        s_series = sas_datetimes.apply(_parse_datetime, unit=unit)
        s_series = cast(pd.Series, s_series)
        return s_series
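

# A sketch of both paths, with illustrative values:
#   _convert_datetimes(pd.Series([0.0, 365.0]), "d") returns datetime64
#   Timestamps (1960-01-01 and 1960-12-31), while values beyond the
#   Timestamp range fall back to elementwise datetime.datetime objects.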


class _Column:
    col_id: int
    name: str | bytes
    label: str | bytes
    format: str | bytes
    ctype: bytes
    length: int

    def __init__(
        self,
        col_id: int,
        # These can be bytes when convert_header_text is False
        name: str | bytes,
        label: str | bytes,
        format: str | bytes,
        ctype: bytes,
        length: int,
    ) -> None:
        self.col_id = col_id
        self.name = name
        self.label = label
        self.format = format
        self.ctype = ctype
        self.length = length


# SAS7BDAT represents a SAS data file in SAS7BDAT format.
class SAS7BDATReader(ReaderBase, abc.Iterator):
    """
    Read SAS files in SAS7BDAT format.

    Parameters
    ----------
    path_or_buf : path name or buffer
        Name of SAS file or file-like object pointing to SAS file
        contents.
    index : column identifier, defaults to None
        Column to use as index.
    convert_dates : bool, defaults to True
        Attempt to convert dates to pandas datetime values. Note that
        some rarely used SAS date formats may be unsupported.
    blank_missing : bool, defaults to True
        Convert empty strings to missing values (SAS uses blanks to
        indicate missing character variables).
    chunksize : int, defaults to None
        Return SAS7BDATReader object for iteration; returns chunks
        with the given number of lines.
    encoding : str, 'infer', defaults to None
        String encoding according to the Python standard encodings.
        encoding='infer' tries to detect the encoding from the file
        header; encoding=None leaves the data in binary format.
    convert_text : bool, defaults to True
        If False, text variables are left as raw bytes.
    convert_header_text : bool, defaults to True
        If False, header text, including column names, is left as raw
        bytes.
    """

    _int_length: int
    _cached_page: bytes | None

    def __init__(
        self,
        path_or_buf: FilePath | ReadBuffer[bytes],
        index=None,
        convert_dates: bool = True,
        blank_missing: bool = True,
        chunksize: int | None = None,
        encoding: str | None = None,
        convert_text: bool = True,
        convert_header_text: bool = True,
        compression: CompressionOptions = "infer",
    ) -> None:
        self.index = index
        self.convert_dates = convert_dates
        self.blank_missing = blank_missing
        self.chunksize = chunksize
        self.encoding = encoding
        self.convert_text = convert_text
        self.convert_header_text = convert_header_text

        self.default_encoding = "latin-1"
        self.compression = b""
        self.column_names_raw: list[bytes] = []
        self.column_names: list[str | bytes] = []
        self.column_formats: list[str | bytes] = []
        self.columns: list[_Column] = []

        self._current_page_data_subheader_pointers: list[tuple[int, int]] = []
        self._cached_page = None
        self._column_data_lengths: list[int] = []
        self._column_data_offsets: list[int] = []
        self._column_types: list[bytes] = []

        self._current_row_in_file_index = 0
        self._current_row_on_page_index = 0

        self.handles = get_handle(
            path_or_buf, "rb", is_text=False, compression=compression
        )

        self._path_or_buf = self.handles.handle

        # Same order as const.SASIndex
        self._subheader_processors = [
            self._process_rowsize_subheader,
            self._process_columnsize_subheader,
            self._process_subheader_counts,
            self._process_columntext_subheader,
            self._process_columnname_subheader,
            self._process_columnattributes_subheader,
            self._process_format_subheader,
            self._process_columnlist_subheader,
            None,  # Data
        ]

        try:
            self._get_properties()
            self._parse_metadata()
        except Exception:
            self.close()
            raise

    def column_data_lengths(self) -> np.ndarray:
        """Return a numpy int64 array of the column data lengths"""
        return np.asarray(self._column_data_lengths, dtype=np.int64)

    def column_data_offsets(self) -> np.ndarray:
        """Return a numpy int64 array of the column offsets"""
        return np.asarray(self._column_data_offsets, dtype=np.int64)

    def column_types(self) -> np.ndarray:
        """
        Returns a numpy character array of the column types:
        s (string) or d (double)
        """
        return np.asarray(self._column_types, dtype=np.dtype("S1"))

    def close(self) -> None:
        self.handles.close()

    def _get_properties(self) -> None:
        # Check magic number
        self._path_or_buf.seek(0)
        self._cached_page = self._path_or_buf.read(288)
        if self._cached_page[0 : len(const.magic)] != const.magic:
            raise ValueError("magic number mismatch (not a SAS file?)")

        # Get alignment information
        buf = self._read_bytes(const.align_1_offset, const.align_1_length)
        if buf == const.u64_byte_checker_value:
            self.U64 = True
            self._int_length = 8
            self._page_bit_offset = const.page_bit_offset_x64
            self._subheader_pointer_length = const.subheader_pointer_length_x64
        else:
            self.U64 = False
            self._page_bit_offset = const.page_bit_offset_x86
            self._subheader_pointer_length = const.subheader_pointer_length_x86
            self._int_length = 4
        buf = self._read_bytes(const.align_2_offset, const.align_2_length)
        if buf == const.align_1_checker_value:
            align1 = const.align_2_value
        else:
            align1 = 0

        # Get endianness information
        buf = self._read_bytes(const.endianness_offset, const.endianness_length)
        if buf == b"\x01":
            self.byte_order = "<"
            self.need_byteswap = sys.byteorder == "big"
        else:
            self.byte_order = ">"
            self.need_byteswap = sys.byteorder == "little"
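        # Descriptive note: a byteswap is needed only when the file's byte
        # order differs from the host's, e.g. a little-endian ("<") file
        # read on a big-endian machine.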

        # Get encoding information
        buf = self._read_bytes(const.encoding_offset, const.encoding_length)[0]
        if buf in const.encoding_names:
            self.inferred_encoding = const.encoding_names[buf]
            if self.encoding == "infer":
                self.encoding = self.inferred_encoding
        else:
            self.inferred_encoding = f"unknown (code={buf})"

        # Timestamp is epoch 01/01/1960
        epoch = datetime(1960, 1, 1)
        x = self._read_float(
            const.date_created_offset + align1, const.date_created_length
        )
        self.date_created = epoch + pd.to_timedelta(x, unit="s")
        x = self._read_float(
            const.date_modified_offset + align1, const.date_modified_length
        )
        self.date_modified = epoch + pd.to_timedelta(x, unit="s")

        self.header_length = self._read_uint(
            const.header_size_offset + align1, const.header_size_length
        )

        # Read the rest of the header into cached_page.
        buf = self._path_or_buf.read(self.header_length - 288)
        self._cached_page += buf
        # error: Argument 1 to "len" has incompatible type "Optional[bytes]";
        # expected "Sized"
        if len(self._cached_page) != self.header_length:  # type: ignore[arg-type]
            raise ValueError("The SAS7BDAT file appears to be truncated.")

        self._page_length = self._read_uint(
            const.page_size_offset + align1, const.page_size_length
        )

    def __next__(self) -> DataFrame:
        da = self.read(nrows=self.chunksize or 1)
        if da.empty:
            self.close()
            raise StopIteration
        return da

    # Read a single float of the given width (4 or 8).
    def _read_float(self, offset: int, width: int):
        assert self._cached_page is not None
        if width == 4:
            return read_float_with_byteswap(
                self._cached_page, offset, self.need_byteswap
            )
        elif width == 8:
            return read_double_with_byteswap(
                self._cached_page, offset, self.need_byteswap
            )
        else:
            self.close()
            raise ValueError("invalid float width")

    # Read a single unsigned integer of the given width (1, 2, 4 or 8).
    def _read_uint(self, offset: int, width: int) -> int:
        assert self._cached_page is not None
        if width == 1:
            return self._read_bytes(offset, 1)[0]
        elif width == 2:
            return read_uint16_with_byteswap(
                self._cached_page, offset, self.need_byteswap
            )
        elif width == 4:
            return read_uint32_with_byteswap(
                self._cached_page, offset, self.need_byteswap
            )
        elif width == 8:
            return read_uint64_with_byteswap(
                self._cached_page, offset, self.need_byteswap
            )
        else:
            self.close()
            raise ValueError("invalid int width")

    def _read_bytes(self, offset: int, length: int):
        assert self._cached_page is not None
        if offset + length > len(self._cached_page):
            self.close()
            raise ValueError("The cached page is too small.")
        return self._cached_page[offset : offset + length]

    def _read_and_convert_header_text(self, offset: int, length: int) -> str | bytes:
        return self._convert_header_text(
            self._read_bytes(offset, length).rstrip(b"\x00 ")
        )

    def _parse_metadata(self) -> None:
        done = False
        while not done:
            self._cached_page = self._path_or_buf.read(self._page_length)
            if len(self._cached_page) <= 0:
                break
            if len(self._cached_page) != self._page_length:
                raise ValueError("Failed to read a meta data page from the SAS file.")
            done = self._process_page_meta()

    def _process_page_meta(self) -> bool:
        self._read_page_header()
        pt = const.page_meta_types + [const.page_amd_type, const.page_mix_type]
        if self._current_page_type in pt:
            self._process_page_metadata()
        is_data_page = self._current_page_type == const.page_data_type
        is_mix_page = self._current_page_type == const.page_mix_type
        return bool(
            is_data_page
            or is_mix_page
            or self._current_page_data_subheader_pointers != []
        )

    def _read_page_header(self) -> None:
        bit_offset = self._page_bit_offset
        tx = const.page_type_offset + bit_offset
        self._current_page_type = (
            self._read_uint(tx, const.page_type_length) & const.page_type_mask2
        )
        tx = const.block_count_offset + bit_offset
        self._current_page_block_count = self._read_uint(tx, const.block_count_length)
        tx = const.subheader_count_offset + bit_offset
        self._current_page_subheaders_count = self._read_uint(
            tx, const.subheader_count_length
        )

    def _process_page_metadata(self) -> None:
        bit_offset = self._page_bit_offset

        for i in range(self._current_page_subheaders_count):
            offset = const.subheader_pointers_offset + bit_offset
            total_offset = offset + self._subheader_pointer_length * i

            subheader_offset = self._read_uint(total_offset, self._int_length)
            total_offset += self._int_length

            subheader_length = self._read_uint(total_offset, self._int_length)
            total_offset += self._int_length

            subheader_compression = self._read_uint(total_offset, 1)
            total_offset += 1

            subheader_type = self._read_uint(total_offset, 1)

            if (
                subheader_length == 0
                or subheader_compression == const.truncated_subheader_id
            ):
                continue

            subheader_signature = self._read_bytes(subheader_offset, self._int_length)
            subheader_index = get_subheader_index(subheader_signature)
            subheader_processor = self._subheader_processors[subheader_index]

            if subheader_processor is None:
                f1 = subheader_compression in (const.compressed_subheader_id, 0)
                f2 = subheader_type == const.compressed_subheader_type
                if self.compression and f1 and f2:
                    self._current_page_data_subheader_pointers.append(
                        (subheader_offset, subheader_length)
                    )
                else:
                    self.close()
                    raise ValueError(
                        f"Unknown subheader signature {subheader_signature}"
                    )
            else:
                subheader_processor(subheader_offset, subheader_length)

    def _process_rowsize_subheader(self, offset: int, length: int) -> None:
        int_len = self._int_length
        lcs_offset = offset
        lcp_offset = offset
        if self.U64:
            lcs_offset += 682
            lcp_offset += 706
        else:
            lcs_offset += 354
            lcp_offset += 378

        self.row_length = self._read_uint(
            offset + const.row_length_offset_multiplier * int_len,
            int_len,
        )
        self.row_count = self._read_uint(
            offset + const.row_count_offset_multiplier * int_len,
            int_len,
        )
        self.col_count_p1 = self._read_uint(
            offset + const.col_count_p1_multiplier * int_len, int_len
        )
        self.col_count_p2 = self._read_uint(
            offset + const.col_count_p2_multiplier * int_len, int_len
        )
        mx = const.row_count_on_mix_page_offset_multiplier * int_len
        self._mix_page_row_count = self._read_uint(offset + mx, int_len)
        self._lcs = self._read_uint(lcs_offset, 2)
        self._lcp = self._read_uint(lcp_offset, 2)
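        # Descriptive note (per the community file-format documentation cited
        # in the module docstring): _lcs and _lcp appear to hold the lengths
        # of the "creator software" and "creator proc" header fields, which
        # _process_columntext_subheader uses below.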

    def _process_columnsize_subheader(self, offset: int, length: int) -> None:
        int_len = self._int_length
        offset += int_len
        self.column_count = self._read_uint(offset, int_len)
        if self.col_count_p1 + self.col_count_p2 != self.column_count:
            print(
                f"Warning: column count mismatch ({self.col_count_p1} + "
                f"{self.col_count_p2} != {self.column_count})\n"
            )

    # Unknown purpose
    def _process_subheader_counts(self, offset: int, length: int) -> None:
        pass

    def _process_columntext_subheader(self, offset: int, length: int) -> None:
        offset += self._int_length
        text_block_size = self._read_uint(offset, const.text_block_size_length)
        buf = self._read_bytes(offset, text_block_size)
        cname_raw = buf[0:text_block_size].rstrip(b"\x00 ")
        self.column_names_raw.append(cname_raw)

        if len(self.column_names_raw) == 1:
            compression_literal = b""
            for cl in const.compression_literals:
                if cl in cname_raw:
                    compression_literal = cl
            self.compression = compression_literal
            offset -= self._int_length

            offset1 = offset + 16
            if self.U64:
                offset1 += 4

            buf = self._read_bytes(offset1, self._lcp)
            compression_literal = buf.rstrip(b"\x00")
            if compression_literal == b"":
                self._lcs = 0
                offset1 = offset + 32
                if self.U64:
                    offset1 += 4
                buf = self._read_bytes(offset1, self._lcp)
                self.creator_proc = buf[0 : self._lcp]
            elif compression_literal == const.rle_compression:
                offset1 = offset + 40
                if self.U64:
                    offset1 += 4
                buf = self._read_bytes(offset1, self._lcp)
                self.creator_proc = buf[0 : self._lcp]
            elif self._lcs > 0:
                self._lcp = 0
                offset1 = offset + 16
                if self.U64:
                    offset1 += 4
                buf = self._read_bytes(offset1, self._lcs)
                self.creator_proc = buf[0 : self._lcp]
            if hasattr(self, "creator_proc"):
                self.creator_proc = self._convert_header_text(self.creator_proc)

    def _process_columnname_subheader(self, offset: int, length: int) -> None:
        int_len = self._int_length
        offset += int_len
        column_name_pointers_count = (length - 2 * int_len - 12) // 8
        for i in range(column_name_pointers_count):
            text_subheader = (
                offset
                + const.column_name_pointer_length * (i + 1)
                + const.column_name_text_subheader_offset
            )
            col_name_offset = (
                offset
                + const.column_name_pointer_length * (i + 1)
                + const.column_name_offset_offset
            )
            col_name_length = (
                offset
                + const.column_name_pointer_length * (i + 1)
                + const.column_name_length_offset
            )

            idx = self._read_uint(
                text_subheader, const.column_name_text_subheader_length
            )
            col_offset = self._read_uint(
                col_name_offset, const.column_name_offset_length
            )
            col_len = self._read_uint(col_name_length, const.column_name_length_length)

            name_raw = self.column_names_raw[idx]
            cname = name_raw[col_offset : col_offset + col_len]
            self.column_names.append(self._convert_header_text(cname))

    def _process_columnattributes_subheader(self, offset: int, length: int) -> None:
        int_len = self._int_length
        column_attributes_vectors_count = (length - 2 * int_len - 12) // (int_len + 8)
        for i in range(column_attributes_vectors_count):
            col_data_offset = (
                offset + int_len + const.column_data_offset_offset + i * (int_len + 8)
            )
            col_data_len = (
                offset
                + 2 * int_len
                + const.column_data_length_offset
                + i * (int_len + 8)
            )
            col_types = (
                offset + 2 * int_len + const.column_type_offset + i * (int_len + 8)
            )

            x = self._read_uint(col_data_offset, int_len)
            self._column_data_offsets.append(x)

            x = self._read_uint(col_data_len, const.column_data_length_length)
            self._column_data_lengths.append(x)

            x = self._read_uint(col_types, const.column_type_length)
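            # Descriptive note: a stored column type of 1 marks numeric
            # (double) data; anything else is treated as character data.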
            self._column_types.append(b"d" if x == 1 else b"s")

    def _process_columnlist_subheader(self, offset: int, length: int) -> None:
        # unknown purpose
        pass

    def _process_format_subheader(self, offset: int, length: int) -> None:
        int_len = self._int_length
        text_subheader_format = (
            offset + const.column_format_text_subheader_index_offset + 3 * int_len
        )
        col_format_offset = offset + const.column_format_offset_offset + 3 * int_len
        col_format_len = offset + const.column_format_length_offset + 3 * int_len
        text_subheader_label = (
            offset + const.column_label_text_subheader_index_offset + 3 * int_len
        )
        col_label_offset = offset + const.column_label_offset_offset + 3 * int_len
        col_label_len = offset + const.column_label_length_offset + 3 * int_len

        x = self._read_uint(
            text_subheader_format, const.column_format_text_subheader_index_length
        )
        format_idx = min(x, len(self.column_names_raw) - 1)

        format_start = self._read_uint(
            col_format_offset, const.column_format_offset_length
        )
        format_len = self._read_uint(col_format_len, const.column_format_length_length)

        label_idx = self._read_uint(
            text_subheader_label, const.column_label_text_subheader_index_length
        )
        label_idx = min(label_idx, len(self.column_names_raw) - 1)

        label_start = self._read_uint(
            col_label_offset, const.column_label_offset_length
        )
        label_len = self._read_uint(col_label_len, const.column_label_length_length)

        label_names = self.column_names_raw[label_idx]
        column_label = self._convert_header_text(
            label_names[label_start : label_start + label_len]
        )
        format_names = self.column_names_raw[format_idx]
        column_format = self._convert_header_text(
            format_names[format_start : format_start + format_len]
        )
        current_column_number = len(self.columns)

        col = _Column(
            current_column_number,
            self.column_names[current_column_number],
            column_label,
            column_format,
            self._column_types[current_column_number],
            self._column_data_lengths[current_column_number],
        )
        self.column_formats.append(column_format)
        self.columns.append(col)

    def read(self, nrows: int | None = None) -> DataFrame:
        if (nrows is None) and (self.chunksize is not None):
            nrows = self.chunksize
        elif nrows is None:
            nrows = self.row_count

        if len(self._column_types) == 0:
            self.close()
            raise EmptyDataError("No columns to parse from file")

        if nrows > 0 and self._current_row_in_file_index >= self.row_count:
            return DataFrame()

        nrows = min(nrows, self.row_count - self._current_row_in_file_index)

        nd = self._column_types.count(b"d")
        ns = self._column_types.count(b"s")

        self._string_chunk = np.empty((ns, nrows), dtype=object)
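        # Descriptive note: each numeric value is staged as 8 raw bytes and
        # reinterpreted as float64 later, hence 8 * nrows bytes per "d" column.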
        self._byte_chunk = np.zeros((nd, 8 * nrows), dtype=np.uint8)

        self._current_row_in_chunk_index = 0
        p = Parser(self)
        p.read(nrows)

        rslt = self._chunk_to_dataframe()
        if self.index is not None:
            rslt = rslt.set_index(self.index)

        return rslt

    def _read_next_page(self):
        self._current_page_data_subheader_pointers = []
        self._cached_page = self._path_or_buf.read(self._page_length)
        if len(self._cached_page) <= 0:
            return True
        elif len(self._cached_page) != self._page_length:
            self.close()
            msg = (
                "failed to read complete page from file (read "
                f"{len(self._cached_page):d} of {self._page_length:d} bytes)"
            )
            raise ValueError(msg)

        self._read_page_header()
        if self._current_page_type in const.page_meta_types:
            self._process_page_metadata()

        if self._current_page_type not in const.page_meta_types + [
            const.page_data_type,
            const.page_mix_type,
        ]:
            return self._read_next_page()

        return False

    def _chunk_to_dataframe(self) -> DataFrame:
        n = self._current_row_in_chunk_index
        m = self._current_row_in_file_index
        ix = range(m - n, m)
        rslt = {}

        js, jb = 0, 0
        for j in range(self.column_count):
            name = self.column_names[j]

            if self._column_types[j] == b"d":
                col_arr = self._byte_chunk[jb, :].view(dtype=self.byte_order + "d")
                rslt[name] = pd.Series(col_arr, dtype=np.float64, index=ix)
                if self.convert_dates:
                    if self.column_formats[j] in const.sas_date_formats:
                        rslt[name] = _convert_datetimes(rslt[name], "d")
                    elif self.column_formats[j] in const.sas_datetime_formats:
                        rslt[name] = _convert_datetimes(rslt[name], "s")
                jb += 1
            elif self._column_types[j] == b"s":
                rslt[name] = pd.Series(self._string_chunk[js, :], index=ix)
                if self.convert_text and (self.encoding is not None):
                    rslt[name] = self._decode_string(rslt[name].str)
                js += 1
            else:
                self.close()
                raise ValueError(f"unknown column type {repr(self._column_types[j])}")

        df = DataFrame(rslt, columns=self.column_names, index=ix, copy=False)
        return df
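
    # The uint8 -> float64 reinterpretation above is a zero-copy view; as a
    # standalone sketch:
    #
    #     np.zeros(16, dtype=np.uint8).view("<d")   # two little-endian doubles
    #
    # backed by the same 16-byte buffer, which is why the byte chunk is
    # allocated as 8 bytes per numeric cell.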

    def _decode_string(self, b):
        return b.decode(self.encoding or self.default_encoding)

    def _convert_header_text(self, b: bytes) -> str | bytes:
        if self.convert_header_text:
            return self._decode_string(b)
        else:
            return b