123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341 |
- """
- Self-contained script to write legacy storage pickle files.
- To use this script, create an environment for the pandas version you want
- to generate pickles with (say 0.20.3), with your pandas clone
- in ~/pandas
- . activate pandas_0.20.3
- cd ~/pandas/pandas
- $ python -m tests.io.generate_legacy_storage_files \
- tests/io/data/legacy_pickle/0.20.3/ pickle
- This script generates a storage file for the current arch, system,
- and python version
- pandas version: 0.20.3
- output dir : pandas/pandas/tests/io/data/legacy_pickle/0.20.3/
- storage format: pickle
- created pickle file: 0.20.3_x86_64_darwin_3.5.2.pickle
- The idea here is you are using the *current* version of the
- generate_legacy_storage_files with an *older* version of pandas to
- generate a pickle file. We will then check this file into a current
- branch, and test using test_pickle.py. This will load the *older*
- pickles and test versus the current data that is generated
- (with main). These are then compared.
- If we have cases where we changed the signature (e.g. we renamed
- offset -> freq in Timestamp), then we have to conditionally execute
- code in generate_legacy_storage_files.py to make it
- run under the older AND the newer version.
- """
- from datetime import timedelta
- import os
- import pickle
- import platform as pl
- import sys
- import numpy as np
- import pandas
- from pandas import (
- Categorical,
- DataFrame,
- Index,
- MultiIndex,
- NaT,
- Period,
- RangeIndex,
- Series,
- Timestamp,
- bdate_range,
- date_range,
- interval_range,
- period_range,
- timedelta_range,
- )
- from pandas.arrays import SparseArray
- from pandas.tseries.offsets import (
- FY5253,
- BusinessDay,
- BusinessHour,
- CustomBusinessDay,
- DateOffset,
- Day,
- Easter,
- Hour,
- LastWeekOfMonth,
- Minute,
- MonthBegin,
- MonthEnd,
- QuarterBegin,
- QuarterEnd,
- SemiMonthBegin,
- SemiMonthEnd,
- Week,
- WeekOfMonth,
- YearBegin,
- YearEnd,
- )
def _create_sp_series():
    """Return a block-sparse float Series named "bseries" with NaN gaps."""
    values = np.arange(15, dtype=np.float64)
    # nan-based: positions 7..11 and the final element are missing
    values[[7, 8, 9, 10, 11, 14]] = np.nan
    return Series(SparseArray(values, kind="block"), name="bseries")
def _create_sp_tsseries():
    """Return a block-sparse float Series named "btsseries" on a business-day index."""
    values = np.arange(15, dtype=np.float64)
    # nan-based: positions 7..11 and the final element are missing
    values[[7, 8, 9, 10, 11, 14]] = np.nan
    business_days = bdate_range("1/1/2011", periods=values.size)
    return Series(
        SparseArray(values, kind="block"), index=business_days, name="btsseries"
    )
def _create_sp_frame():
    """
    Return a four-column frame whose columns were each run through SparseArray.

    The dense frame is built on a 10-period business-day range and every
    column is then converted with ``DataFrame.apply(SparseArray)``.
    """
    missing = np.nan
    columns = {
        "A": [missing] * 3 + [0, 1, 2, 3, 4, 5, 6],
        "B": [0, 1, 2] + [missing] * 3 + [3, 4, 5, 6],
        "C": np.arange(10).astype(np.int64),
        "D": [0, 1, 2, 3, 4, 5] + [missing] * 4,
    }
    dense = DataFrame(columns, index=bdate_range("1/1/2011", periods=10))
    return dense.apply(SparseArray)
def create_data():
    """
    Create the pickle data.

    Returns
    -------
    dict
        Maps a category name ("series", "frame", "index", "scalars", "mi",
        "sp_series", "sp_frame", "cat", "timestamp", "offsets") to a dict of
        labelled pandas objects of that category.
    """
    data = {
        "A": [0.0, 1.0, 2.0, 3.0, np.nan],
        "B": [0, 1, 0, 1, 0],
        "C": ["foo1", "foo2", "foo3", "foo4", "foo5"],
        "D": date_range("1/1/2009", periods=5),
        "E": [0.0, 1, Timestamp("20100101"), "foo", 2.0],
    }

    scalars = {"timestamp": Timestamp("20130101"), "period": Period("2012", "M")}

    index = {
        "int": Index(np.arange(10)),
        "date": date_range("20130101", periods=10),
        "period": period_range("2013-01-01", freq="M", periods=10),
        "float": Index(np.arange(10, dtype=np.float64)),
        "uint": Index(np.arange(10, dtype=np.uint64)),
        # "min" instead of the legacy "T" alias: "T" is deprecated in
        # pandas >= 2.2, while "min" is accepted by old and new versions
        # and produces the same 30-minute step.
        "timedelta": timedelta_range("00:00:00", freq="30min", periods=10),
        "range": RangeIndex(10),
        "interval": interval_range(0, periods=10),
    }

    mi = {
        "reg2": MultiIndex.from_tuples(
            tuple(
                zip(
                    ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
                    ["one", "two", "one", "two", "one", "two", "one", "two"],
                )
            ),
            names=["first", "second"],
        )
    }

    series = {
        "float": Series(data["A"]),
        "int": Series(data["B"]),
        "mixed": Series(data["E"]),
        "ts": Series(
            np.arange(10).astype(np.int64), index=date_range("20130101", periods=10)
        ),
        "mi": Series(
            np.arange(5).astype(np.float64),
            index=MultiIndex.from_tuples(
                tuple(zip([1, 1, 2, 2, 2], [3, 4, 3, 4, 5])), names=["one", "two"]
            ),
        ),
        "dup": Series(np.arange(5).astype(np.float64), index=["A", "B", "C", "D", "A"]),
        "cat": Series(Categorical(["foo", "bar", "baz"])),
        "dt": Series(date_range("20130101", periods=5)),
        "dt_tz": Series(date_range("20130101", periods=5, tz="US/Eastern")),
        "period": Series([Period("2000Q1")] * 5),
    }

    # Same content as ``data`` but with a deliberately duplicated column label.
    mixed_dup_df = DataFrame(data)
    mixed_dup_df.columns = list("ABCDA")

    frame = {
        "float": DataFrame({"A": series["float"], "B": series["float"] + 1}),
        "int": DataFrame({"A": series["int"], "B": series["int"] + 1}),
        "mixed": DataFrame({k: data[k] for k in ["A", "B", "C", "D"]}),
        "mi": DataFrame(
            {"A": np.arange(5).astype(np.float64), "B": np.arange(5).astype(np.int64)},
            index=MultiIndex.from_tuples(
                tuple(
                    zip(
                        ["bar", "bar", "baz", "baz", "baz"],
                        ["one", "two", "one", "two", "three"],
                    )
                ),
                names=["first", "second"],
            ),
        ),
        "dup": DataFrame(
            np.arange(15).reshape(5, 3).astype(np.float64), columns=["A", "B", "A"]
        ),
        "cat_onecol": DataFrame({"A": Categorical(["foo", "bar"])}),
        "cat_and_float": DataFrame(
            {
                "A": Categorical(["foo", "bar", "baz"]),
                "B": np.arange(3).astype(np.int64),
            }
        ),
        "mixed_dup": mixed_dup_df,
        "dt_mixed_tzs": DataFrame(
            {
                "A": Timestamp("20130102", tz="US/Eastern"),
                "B": Timestamp("20130603", tz="CET"),
            },
            index=range(5),
        ),
        "dt_mixed2_tzs": DataFrame(
            {
                "A": Timestamp("20130102", tz="US/Eastern"),
                "B": Timestamp("20130603", tz="CET"),
                "C": Timestamp("20130603", tz="UTC"),
            },
            index=range(5),
        ),
    }

    # Keys name the integer width expected for the category codes at each size.
    cat = {
        "int8": Categorical(list("abcdefg")),
        "int16": Categorical(np.arange(1000)),
        "int32": Categorical(np.arange(10000)),
    }

    timestamp = {
        "normal": Timestamp("2011-01-01"),
        "nat": NaT,
        "tz": Timestamp("2011-01-01", tz="US/Eastern"),
    }

    off = {
        "DateOffset": DateOffset(years=1),
        "DateOffset_h_ns": DateOffset(hour=6, nanoseconds=5824),
        "BusinessDay": BusinessDay(offset=timedelta(seconds=9)),
        "BusinessHour": BusinessHour(normalize=True, n=6, end="15:14"),
        "CustomBusinessDay": CustomBusinessDay(weekmask="Mon Fri"),
        "SemiMonthBegin": SemiMonthBegin(day_of_month=9),
        "SemiMonthEnd": SemiMonthEnd(day_of_month=24),
        "MonthBegin": MonthBegin(1),
        "MonthEnd": MonthEnd(1),
        "QuarterBegin": QuarterBegin(1),
        "QuarterEnd": QuarterEnd(1),
        "Day": Day(1),
        "YearBegin": YearBegin(1),
        "YearEnd": YearEnd(1),
        "Week": Week(1),
        "Week_Tues": Week(2, normalize=False, weekday=1),
        "WeekOfMonth": WeekOfMonth(week=3, weekday=4),
        "LastWeekOfMonth": LastWeekOfMonth(n=1, weekday=3),
        "FY5253": FY5253(n=2, weekday=6, startingMonth=7, variation="last"),
        "Easter": Easter(),
        "Hour": Hour(1),
        "Minute": Minute(1),
    }

    return {
        "series": series,
        "frame": frame,
        "index": index,
        "scalars": scalars,
        "mi": mi,
        "sp_series": {"float": _create_sp_series(), "ts": _create_sp_tsseries()},
        "sp_frame": {"float": _create_sp_frame()},
        "cat": cat,
        "timestamp": timestamp,
        "offsets": off,
    }
def create_pickle_data():
    """Return the object to be pickled; currently exactly what create_data() builds."""
    return create_data()
def platform_name():
    """
    Return the "<pandas version>_<machine>_<system>_<python version>" tag.

    The system name is lower-cased; the tag is used as the pickle file stem.
    """
    pandas_version = str(pandas.__version__)
    machine = str(pl.machine())
    system = str(pl.system().lower())
    python_version = str(pl.python_version())
    return f"{pandas_version}_{machine}_{system}_{python_version}"
def write_legacy_pickles(output_dir):
    """
    Pickle the generated reference data into ``output_dir``.

    The file is named ``<platform_name()>.pickle``. Progress information is
    printed to stdout before and after the file is written.
    """
    banner = (
        "This script generates a storage file for the current arch, system, "
        "and python version"
    )
    print(banner)
    print(f" pandas version: {pandas.__version__}")
    print(f" output dir : {output_dir}")
    print(" storage format: pickle")

    file_name = f"{platform_name()}.pickle"
    target = os.path.join(output_dir, file_name)
    # The data is built only after the file is opened, as in the original flow.
    with open(target, "wb") as handle:
        pickle.dump(create_pickle_data(), handle, protocol=pickle.DEFAULT_PROTOCOL)

    print(f"created pickle file: {file_name}")
def write_legacy_file():
    """
    Command-line entry point: validate ``sys.argv`` and write the storage file.

    Expects ``<output_dir> <storage_type>`` (one further argument is tolerated
    and ignored). Exits with a message when the argument count is wrong or the
    storage type is not "pickle".
    """
    # force our cwd to be the first searched
    sys.path.insert(0, ".")

    argc = len(sys.argv)
    if argc < 3 or argc > 4:
        sys.exit(
            "Specify output directory and storage type: generate_legacy_"
            "storage_files.py <output_dir> <storage_type> "
        )

    output_dir, storage_type = str(sys.argv[1]), str(sys.argv[2])

    if storage_type != "pickle":
        sys.exit("storage_type must be one of {'pickle'}")

    write_legacy_pickles(output_dir=output_dir)
if __name__ == "__main__":
    # Only write the storage file when run as a script, not on import.
    write_legacy_file()
|