# test_coercion.py
"""
Tests for values coercion in setitem-like operations on DataFrame.

For the most part, these should be multi-column DataFrames, otherwise
we would share the tests with Series.
"""
  6. import numpy as np
  7. import pytest
  8. import pandas as pd
  9. from pandas import (
  10. DataFrame,
  11. MultiIndex,
  12. NaT,
  13. Series,
  14. Timestamp,
  15. date_range,
  16. )
  17. import pandas._testing as tm
  18. class TestDataFrameSetitemCoercion:
  19. @pytest.mark.parametrize("consolidate", [True, False])
  20. def test_loc_setitem_multiindex_columns(self, consolidate):
  21. # GH#18415 Setting values in a single column preserves dtype,
  22. # while setting them in multiple columns did unwanted cast.
  23. # Note that A here has 2 blocks, below we do the same thing
  24. # with a consolidated frame.
  25. A = DataFrame(np.zeros((6, 5), dtype=np.float32))
  26. A = pd.concat([A, A], axis=1, keys=[1, 2])
  27. if consolidate:
  28. A = A._consolidate()
  29. A.loc[2:3, (1, slice(2, 3))] = np.ones((2, 2), dtype=np.float32)
  30. assert (A.dtypes == np.float32).all()
  31. A.loc[0:5, (1, slice(2, 3))] = np.ones((6, 2), dtype=np.float32)
  32. assert (A.dtypes == np.float32).all()
  33. A.loc[:, (1, slice(2, 3))] = np.ones((6, 2), dtype=np.float32)
  34. assert (A.dtypes == np.float32).all()
  35. # TODO: i think this isn't about MultiIndex and could be done with iloc?
  36. def test_37477():
  37. # fixed by GH#45121
  38. orig = DataFrame({"A": [1, 2, 3], "B": [3, 4, 5]})
  39. expected = DataFrame({"A": [1, 2, 3], "B": [3, 1.2, 5]})
  40. df = orig.copy()
  41. df.at[1, "B"] = 1.2
  42. tm.assert_frame_equal(df, expected)
  43. df = orig.copy()
  44. df.loc[1, "B"] = 1.2
  45. tm.assert_frame_equal(df, expected)
  46. df = orig.copy()
  47. df.iat[1, 1] = 1.2
  48. tm.assert_frame_equal(df, expected)
  49. df = orig.copy()
  50. df.iloc[1, 1] = 1.2
  51. tm.assert_frame_equal(df, expected)
  52. def test_6942(indexer_al):
  53. # check that the .at __setitem__ after setting "Live" actually sets the data
  54. start = Timestamp("2014-04-01")
  55. t1 = Timestamp("2014-04-23 12:42:38.883082")
  56. t2 = Timestamp("2014-04-24 01:33:30.040039")
  57. dti = date_range(start, periods=1)
  58. orig = DataFrame(index=dti, columns=["timenow", "Live"])
  59. df = orig.copy()
  60. indexer_al(df)[start, "timenow"] = t1
  61. df["Live"] = True
  62. df.at[start, "timenow"] = t2
  63. assert df.iloc[0, 0] == t2
  64. def test_26395(indexer_al):
  65. # .at case fixed by GH#45121 (best guess)
  66. df = DataFrame(index=["A", "B", "C"])
  67. df["D"] = 0
  68. indexer_al(df)["C", "D"] = 2
  69. expected = DataFrame({"D": [0, 0, 2]}, index=["A", "B", "C"], dtype=np.int64)
  70. tm.assert_frame_equal(df, expected)
  71. indexer_al(df)["C", "D"] = 44.5
  72. expected = DataFrame({"D": [0, 0, 44.5]}, index=["A", "B", "C"], dtype=np.float64)
  73. tm.assert_frame_equal(df, expected)
  74. indexer_al(df)["C", "D"] = "hello"
  75. expected = DataFrame({"D": [0, 0, "hello"]}, index=["A", "B", "C"], dtype=object)
  76. tm.assert_frame_equal(df, expected)
  77. @pytest.mark.xfail(reason="unwanted upcast")
  78. def test_15231():
  79. df = DataFrame([[1, 2], [3, 4]], columns=["a", "b"])
  80. df.loc[2] = Series({"a": 5, "b": 6})
  81. assert (df.dtypes == np.int64).all()
  82. df.loc[3] = Series({"a": 7})
  83. # df["a"] doesn't have any NaNs, should not have been cast
  84. exp_dtypes = Series([np.int64, np.float64], dtype=object, index=["a", "b"])
  85. tm.assert_series_equal(df.dtypes, exp_dtypes)
  86. def test_iloc_setitem_unnecesssary_float_upcasting():
  87. # GH#12255
  88. df = DataFrame(
  89. {
  90. 0: np.array([1, 3], dtype=np.float32),
  91. 1: np.array([2, 4], dtype=np.float32),
  92. 2: ["a", "b"],
  93. }
  94. )
  95. orig = df.copy()
  96. values = df[0].values.reshape(2, 1)
  97. df.iloc[:, 0:1] = values
  98. tm.assert_frame_equal(df, orig)
  99. @pytest.mark.xfail(reason="unwanted casting to dt64")
  100. def test_12499():
  101. # TODO: OP in GH#12499 used np.datetim64("NaT") instead of pd.NaT,
  102. # which has consequences for the expected df["two"] (though i think at
  103. # the time it might not have because of a separate bug). See if it makes
  104. # a difference which one we use here.
  105. ts = Timestamp("2016-03-01 03:13:22.98986", tz="UTC")
  106. data = [{"one": 0, "two": ts}]
  107. orig = DataFrame(data)
  108. df = orig.copy()
  109. df.loc[1] = [np.nan, NaT]
  110. expected = DataFrame(
  111. {"one": [0, np.nan], "two": Series([ts, NaT], dtype="datetime64[ns, UTC]")}
  112. )
  113. tm.assert_frame_equal(df, expected)
  114. data = [{"one": 0, "two": ts}]
  115. df = orig.copy()
  116. df.loc[1, :] = [np.nan, NaT]
  117. tm.assert_frame_equal(df, expected)
  118. def test_20476():
  119. mi = MultiIndex.from_product([["A", "B"], ["a", "b", "c"]])
  120. df = DataFrame(-1, index=range(3), columns=mi)
  121. filler = DataFrame([[1, 2, 3.0]] * 3, index=range(3), columns=["a", "b", "c"])
  122. df["A"] = filler
  123. expected = DataFrame(
  124. {
  125. 0: [1, 1, 1],
  126. 1: [2, 2, 2],
  127. 2: [3.0, 3.0, 3.0],
  128. 3: [-1, -1, -1],
  129. 4: [-1, -1, -1],
  130. 5: [-1, -1, -1],
  131. }
  132. )
  133. expected.columns = mi
  134. exp_dtypes = Series(
  135. [np.dtype(np.int64)] * 2 + [np.dtype(np.float64)] + [np.dtype(np.int64)] * 3,
  136. index=mi,
  137. )
  138. tm.assert_series_equal(df.dtypes, exp_dtypes)