# generate_legacy_storage_files.py
"""
Self-contained script to write legacy storage (pickle) files.

To use this script, create an environment in which you want to
generate pickles, say it is for 0.20.3, with your pandas clone
in ~/pandas:

. activate pandas_0.20.3
cd ~/pandas/pandas

$ python -m tests.io.generate_legacy_storage_files \
    tests/io/data/legacy_pickle/0.20.3/ pickle

This script generates a storage file for the current arch, system,
and python version
  pandas version: 0.20.3
  output dir    : pandas/pandas/tests/io/data/legacy_pickle/0.20.3/
  storage format: pickle
created pickle file: 0.20.3_x86_64_darwin_3.5.2.pickle

The idea here is you are using the *current* version of
generate_legacy_storage_files with an *older* version of pandas to
generate a pickle file. We will then check this file into a current
branch, and test using test_pickle.py. This will load the *older*
pickles and test versus the current data that is generated
(with main). These are then compared.

If we have cases where we changed the signature (e.g. we renamed
offset -> freq in Timestamp), then we have to conditionally execute
in generate_legacy_storage_files.py to make it
run under the older AND the newer version.
"""
  27. from datetime import timedelta
  28. import os
  29. import pickle
  30. import platform as pl
  31. import sys
  32. import numpy as np
  33. import pandas
  34. from pandas import (
  35. Categorical,
  36. DataFrame,
  37. Index,
  38. MultiIndex,
  39. NaT,
  40. Period,
  41. RangeIndex,
  42. Series,
  43. Timestamp,
  44. bdate_range,
  45. date_range,
  46. interval_range,
  47. period_range,
  48. timedelta_range,
  49. )
  50. from pandas.arrays import SparseArray
  51. from pandas.tseries.offsets import (
  52. FY5253,
  53. BusinessDay,
  54. BusinessHour,
  55. CustomBusinessDay,
  56. DateOffset,
  57. Day,
  58. Easter,
  59. Hour,
  60. LastWeekOfMonth,
  61. Minute,
  62. MonthBegin,
  63. MonthEnd,
  64. QuarterBegin,
  65. QuarterEnd,
  66. SemiMonthBegin,
  67. SemiMonthEnd,
  68. Week,
  69. WeekOfMonth,
  70. YearBegin,
  71. YearEnd,
  72. )
  73. def _create_sp_series():
  74. nan = np.nan
  75. # nan-based
  76. arr = np.arange(15, dtype=np.float64)
  77. arr[7:12] = nan
  78. arr[-1:] = nan
  79. bseries = Series(SparseArray(arr, kind="block"))
  80. bseries.name = "bseries"
  81. return bseries
  82. def _create_sp_tsseries():
  83. nan = np.nan
  84. # nan-based
  85. arr = np.arange(15, dtype=np.float64)
  86. arr[7:12] = nan
  87. arr[-1:] = nan
  88. date_index = bdate_range("1/1/2011", periods=len(arr))
  89. bseries = Series(SparseArray(arr, kind="block"), index=date_index)
  90. bseries.name = "btsseries"
  91. return bseries
  92. def _create_sp_frame():
  93. nan = np.nan
  94. data = {
  95. "A": [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6],
  96. "B": [0, 1, 2, nan, nan, nan, 3, 4, 5, 6],
  97. "C": np.arange(10).astype(np.int64),
  98. "D": [0, 1, 2, 3, 4, 5, nan, nan, nan, nan],
  99. }
  100. dates = bdate_range("1/1/2011", periods=10)
  101. return DataFrame(data, index=dates).apply(SparseArray)
def create_data():
    """
    Create the pickle fixture data.

    Returns
    -------
    dict
        Maps fixture-category name ("series", "frame", "index", "scalars",
        "mi", "sp_series", "sp_frame", "cat", "timestamp", "offsets") to a
        dict of concrete pandas objects to be pickled.

    Notes
    -----
    The exact literal values below define the on-disk legacy fixtures; do
    not alter them, or old pickle files will no longer compare equal.
    """
    # Raw column data reused by several Series/DataFrame fixtures below.
    # "E" is deliberately mixed-dtype (float, int, Timestamp, str).
    data = {
        "A": [0.0, 1.0, 2.0, 3.0, np.nan],
        "B": [0, 1, 0, 1, 0],
        "C": ["foo1", "foo2", "foo3", "foo4", "foo5"],
        "D": date_range("1/1/2009", periods=5),
        "E": [0.0, 1, Timestamp("20100101"), "foo", 2.0],
    }

    # Scalar fixtures.
    scalars = {"timestamp": Timestamp("20130101"), "period": Period("2012", "M")}

    # One representative of each Index flavor.
    index = {
        "int": Index(np.arange(10)),
        "date": date_range("20130101", periods=10),
        "period": period_range("2013-01-01", freq="M", periods=10),
        "float": Index(np.arange(10, dtype=np.float64)),
        "uint": Index(np.arange(10, dtype=np.uint64)),
        "timedelta": timedelta_range("00:00:00", freq="30T", periods=10),
    }

    index["range"] = RangeIndex(10)
    index["interval"] = interval_range(0, periods=10)

    # MultiIndex fixture with named levels.
    mi = {
        "reg2": MultiIndex.from_tuples(
            tuple(
                zip(
                    *[
                        ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
                        ["one", "two", "one", "two", "one", "two", "one", "two"],
                    ]
                )
            ),
            names=["first", "second"],
        )
    }

    # Series fixtures: plain dtypes, a MultiIndex, duplicate labels ("dup"),
    # categorical, naive and tz-aware datetimes, and period dtype.
    series = {
        "float": Series(data["A"]),
        "int": Series(data["B"]),
        "mixed": Series(data["E"]),
        "ts": Series(
            np.arange(10).astype(np.int64), index=date_range("20130101", periods=10)
        ),
        "mi": Series(
            np.arange(5).astype(np.float64),
            index=MultiIndex.from_tuples(
                tuple(zip(*[[1, 1, 2, 2, 2], [3, 4, 3, 4, 5]])), names=["one", "two"]
            ),
        ),
        "dup": Series(np.arange(5).astype(np.float64), index=["A", "B", "C", "D", "A"]),
        "cat": Series(Categorical(["foo", "bar", "baz"])),
        "dt": Series(date_range("20130101", periods=5)),
        "dt_tz": Series(date_range("20130101", periods=5, tz="US/Eastern")),
        "period": Series([Period("2000Q1")] * 5),
    }

    # Frame with duplicate column labels (note the repeated "A").
    mixed_dup_df = DataFrame(data)
    mixed_dup_df.columns = list("ABCDA")

    # DataFrame fixtures, mirroring the Series variety above plus
    # mixed-timezone datetime columns.
    frame = {
        "float": DataFrame({"A": series["float"], "B": series["float"] + 1}),
        "int": DataFrame({"A": series["int"], "B": series["int"] + 1}),
        "mixed": DataFrame({k: data[k] for k in ["A", "B", "C", "D"]}),
        "mi": DataFrame(
            {"A": np.arange(5).astype(np.float64), "B": np.arange(5).astype(np.int64)},
            index=MultiIndex.from_tuples(
                tuple(
                    zip(
                        *[
                            ["bar", "bar", "baz", "baz", "baz"],
                            ["one", "two", "one", "two", "three"],
                        ]
                    )
                ),
                names=["first", "second"],
            ),
        ),
        "dup": DataFrame(
            np.arange(15).reshape(5, 3).astype(np.float64), columns=["A", "B", "A"]
        ),
        "cat_onecol": DataFrame({"A": Categorical(["foo", "bar"])}),
        "cat_and_float": DataFrame(
            {
                "A": Categorical(["foo", "bar", "baz"]),
                "B": np.arange(3).astype(np.int64),
            }
        ),
        "mixed_dup": mixed_dup_df,
        "dt_mixed_tzs": DataFrame(
            {
                "A": Timestamp("20130102", tz="US/Eastern"),
                "B": Timestamp("20130603", tz="CET"),
            },
            index=range(5),
        ),
        "dt_mixed2_tzs": DataFrame(
            {
                "A": Timestamp("20130102", tz="US/Eastern"),
                "B": Timestamp("20130603", tz="CET"),
                "C": Timestamp("20130603", tz="UTC"),
            },
            index=range(5),
        ),
    }

    # Categoricals sized to force different category-code widths
    # (names refer to the expected code dtype).
    cat = {
        "int8": Categorical(list("abcdefg")),
        "int16": Categorical(np.arange(1000)),
        "int32": Categorical(np.arange(10000)),
    }

    # Timestamp scalars: naive, NaT, and tz-aware.
    timestamp = {
        "normal": Timestamp("2011-01-01"),
        "nat": NaT,
        "tz": Timestamp("2011-01-01", tz="US/Eastern"),
    }

    # One instance of each DateOffset subclass, with assorted kwargs so
    # non-default attributes round-trip through pickle as well.
    off = {
        "DateOffset": DateOffset(years=1),
        "DateOffset_h_ns": DateOffset(hour=6, nanoseconds=5824),
        "BusinessDay": BusinessDay(offset=timedelta(seconds=9)),
        "BusinessHour": BusinessHour(normalize=True, n=6, end="15:14"),
        "CustomBusinessDay": CustomBusinessDay(weekmask="Mon Fri"),
        "SemiMonthBegin": SemiMonthBegin(day_of_month=9),
        "SemiMonthEnd": SemiMonthEnd(day_of_month=24),
        "MonthBegin": MonthBegin(1),
        "MonthEnd": MonthEnd(1),
        "QuarterBegin": QuarterBegin(1),
        "QuarterEnd": QuarterEnd(1),
        "Day": Day(1),
        "YearBegin": YearBegin(1),
        "YearEnd": YearEnd(1),
        "Week": Week(1),
        "Week_Tues": Week(2, normalize=False, weekday=1),
        "WeekOfMonth": WeekOfMonth(week=3, weekday=4),
        "LastWeekOfMonth": LastWeekOfMonth(n=1, weekday=3),
        "FY5253": FY5253(n=2, weekday=6, startingMonth=7, variation="last"),
        "Easter": Easter(),
        "Hour": Hour(1),
        "Minute": Minute(1),
    }

    return {
        "series": series,
        "frame": frame,
        "index": index,
        "scalars": scalars,
        "mi": mi,
        "sp_series": {"float": _create_sp_series(), "ts": _create_sp_tsseries()},
        "sp_frame": {"float": _create_sp_frame()},
        "cat": cat,
        "timestamp": timestamp,
        "offsets": off,
    }
  247. def create_pickle_data():
  248. data = create_data()
  249. return data
  250. def platform_name():
  251. return "_".join(
  252. [
  253. str(pandas.__version__),
  254. str(pl.machine()),
  255. str(pl.system().lower()),
  256. str(pl.python_version()),
  257. ]
  258. )
  259. def write_legacy_pickles(output_dir):
  260. version = pandas.__version__
  261. print(
  262. "This script generates a storage file for the current arch, system, "
  263. "and python version"
  264. )
  265. print(f" pandas version: {version}")
  266. print(f" output dir : {output_dir}")
  267. print(" storage format: pickle")
  268. pth = f"{platform_name()}.pickle"
  269. with open(os.path.join(output_dir, pth), "wb") as fh:
  270. pickle.dump(create_pickle_data(), fh, pickle.DEFAULT_PROTOCOL)
  271. print(f"created pickle file: {pth}")
  272. def write_legacy_file():
  273. # force our cwd to be the first searched
  274. sys.path.insert(0, ".")
  275. if not 3 <= len(sys.argv) <= 4:
  276. sys.exit(
  277. "Specify output directory and storage type: generate_legacy_"
  278. "storage_files.py <output_dir> <storage_type> "
  279. )
  280. output_dir = str(sys.argv[1])
  281. storage_type = str(sys.argv[2])
  282. if storage_type == "pickle":
  283. write_legacy_pickles(output_dir=output_dir)
  284. else:
  285. sys.exit("storage_type must be one of {'pickle'}")
# Script entry point: generate the legacy pickle fixture for the current env.
if __name__ == "__main__":
    write_legacy_file()