test_libsparse.py 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552
  1. import operator
  2. import numpy as np
  3. import pytest
  4. import pandas._libs.sparse as splib
  5. import pandas.util._test_decorators as td
  6. from pandas import Series
  7. import pandas._testing as tm
  8. from pandas.core.arrays.sparse import (
  9. BlockIndex,
  10. IntIndex,
  11. make_sparse_index,
  12. )
  13. TEST_LENGTH = 20
  14. plain_case = [
  15. [0, 7, 15],
  16. [3, 5, 5],
  17. [2, 9, 14],
  18. [2, 3, 5],
  19. [2, 9, 15],
  20. [1, 3, 4],
  21. ]
  22. delete_blocks = [
  23. [0, 5],
  24. [4, 4],
  25. [1],
  26. [4],
  27. [1],
  28. [3],
  29. ]
  30. split_blocks = [
  31. [0],
  32. [10],
  33. [0, 5],
  34. [3, 7],
  35. [0, 5],
  36. [3, 5],
  37. ]
  38. skip_block = [
  39. [10],
  40. [5],
  41. [0, 12],
  42. [5, 3],
  43. [12],
  44. [3],
  45. ]
  46. no_intersect = [
  47. [0, 10],
  48. [4, 6],
  49. [5, 17],
  50. [4, 2],
  51. [],
  52. [],
  53. ]
  54. one_empty = [
  55. [0],
  56. [5],
  57. [],
  58. [],
  59. [],
  60. [],
  61. ]
  62. both_empty = [ # type: ignore[var-annotated]
  63. [],
  64. [],
  65. [],
  66. [],
  67. [],
  68. [],
  69. ]
  70. CASES = [plain_case, delete_blocks, split_blocks, skip_block, no_intersect, one_empty]
  71. IDS = [
  72. "plain_case",
  73. "delete_blocks",
  74. "split_blocks",
  75. "skip_block",
  76. "no_intersect",
  77. "one_empty",
  78. ]
  79. class TestSparseIndexUnion:
  80. @pytest.mark.parametrize(
  81. "xloc, xlen, yloc, ylen, eloc, elen",
  82. [
  83. [[0], [5], [5], [4], [0], [9]],
  84. [[0, 10], [5, 5], [2, 17], [5, 2], [0, 10, 17], [7, 5, 2]],
  85. [[1], [5], [3], [5], [1], [7]],
  86. [[2, 10], [4, 4], [4], [8], [2], [12]],
  87. [[0, 5], [3, 5], [0], [7], [0], [10]],
  88. [[2, 10], [4, 4], [4, 13], [8, 4], [2], [15]],
  89. [[2], [15], [4, 9, 14], [3, 2, 2], [2], [15]],
  90. [[0, 10], [3, 3], [5, 15], [2, 2], [0, 5, 10, 15], [3, 2, 3, 2]],
  91. ],
  92. )
  93. def test_index_make_union(self, xloc, xlen, yloc, ylen, eloc, elen):
  94. # Case 1
  95. # x: ----
  96. # y: ----
  97. # r: --------
  98. # Case 2
  99. # x: ----- -----
  100. # y: ----- --
  101. # Case 3
  102. # x: ------
  103. # y: -------
  104. # r: ----------
  105. # Case 4
  106. # x: ------ -----
  107. # y: -------
  108. # r: -------------
  109. # Case 5
  110. # x: --- -----
  111. # y: -------
  112. # r: -------------
  113. # Case 6
  114. # x: ------ -----
  115. # y: ------- ---
  116. # r: -------------
  117. # Case 7
  118. # x: ----------------------
  119. # y: ---- ---- ---
  120. # r: ----------------------
  121. # Case 8
  122. # x: ---- ---
  123. # y: --- ---
  124. xindex = BlockIndex(TEST_LENGTH, xloc, xlen)
  125. yindex = BlockIndex(TEST_LENGTH, yloc, ylen)
  126. bresult = xindex.make_union(yindex)
  127. assert isinstance(bresult, BlockIndex)
  128. tm.assert_numpy_array_equal(bresult.blocs, np.array(eloc, dtype=np.int32))
  129. tm.assert_numpy_array_equal(bresult.blengths, np.array(elen, dtype=np.int32))
  130. ixindex = xindex.to_int_index()
  131. iyindex = yindex.to_int_index()
  132. iresult = ixindex.make_union(iyindex)
  133. assert isinstance(iresult, IntIndex)
  134. tm.assert_numpy_array_equal(iresult.indices, bresult.to_int_index().indices)
  135. def test_int_index_make_union(self):
  136. a = IntIndex(5, np.array([0, 3, 4], dtype=np.int32))
  137. b = IntIndex(5, np.array([0, 2], dtype=np.int32))
  138. res = a.make_union(b)
  139. exp = IntIndex(5, np.array([0, 2, 3, 4], np.int32))
  140. assert res.equals(exp)
  141. a = IntIndex(5, np.array([], dtype=np.int32))
  142. b = IntIndex(5, np.array([0, 2], dtype=np.int32))
  143. res = a.make_union(b)
  144. exp = IntIndex(5, np.array([0, 2], np.int32))
  145. assert res.equals(exp)
  146. a = IntIndex(5, np.array([], dtype=np.int32))
  147. b = IntIndex(5, np.array([], dtype=np.int32))
  148. res = a.make_union(b)
  149. exp = IntIndex(5, np.array([], np.int32))
  150. assert res.equals(exp)
  151. a = IntIndex(5, np.array([0, 1, 2, 3, 4], dtype=np.int32))
  152. b = IntIndex(5, np.array([0, 1, 2, 3, 4], dtype=np.int32))
  153. res = a.make_union(b)
  154. exp = IntIndex(5, np.array([0, 1, 2, 3, 4], np.int32))
  155. assert res.equals(exp)
  156. a = IntIndex(5, np.array([0, 1], dtype=np.int32))
  157. b = IntIndex(4, np.array([0, 1], dtype=np.int32))
  158. msg = "Indices must reference same underlying length"
  159. with pytest.raises(ValueError, match=msg):
  160. a.make_union(b)
  161. class TestSparseIndexIntersect:
  162. @td.skip_if_windows
  163. @pytest.mark.parametrize("xloc, xlen, yloc, ylen, eloc, elen", CASES, ids=IDS)
  164. def test_intersect(self, xloc, xlen, yloc, ylen, eloc, elen):
  165. xindex = BlockIndex(TEST_LENGTH, xloc, xlen)
  166. yindex = BlockIndex(TEST_LENGTH, yloc, ylen)
  167. expected = BlockIndex(TEST_LENGTH, eloc, elen)
  168. longer_index = BlockIndex(TEST_LENGTH + 1, yloc, ylen)
  169. result = xindex.intersect(yindex)
  170. assert result.equals(expected)
  171. result = xindex.to_int_index().intersect(yindex.to_int_index())
  172. assert result.equals(expected.to_int_index())
  173. msg = "Indices must reference same underlying length"
  174. with pytest.raises(Exception, match=msg):
  175. xindex.intersect(longer_index)
  176. with pytest.raises(Exception, match=msg):
  177. xindex.to_int_index().intersect(longer_index.to_int_index())
  178. def test_intersect_empty(self):
  179. xindex = IntIndex(4, np.array([], dtype=np.int32))
  180. yindex = IntIndex(4, np.array([2, 3], dtype=np.int32))
  181. assert xindex.intersect(yindex).equals(xindex)
  182. assert yindex.intersect(xindex).equals(xindex)
  183. xindex = xindex.to_block_index()
  184. yindex = yindex.to_block_index()
  185. assert xindex.intersect(yindex).equals(xindex)
  186. assert yindex.intersect(xindex).equals(xindex)
  187. @pytest.mark.parametrize(
  188. "case",
  189. [
  190. IntIndex(5, np.array([1, 2], dtype=np.int32)), # type: ignore[arg-type]
  191. IntIndex(5, np.array([0, 2, 4], dtype=np.int32)), # type: ignore[arg-type]
  192. IntIndex(0, np.array([], dtype=np.int32)), # type: ignore[arg-type]
  193. IntIndex(5, np.array([], dtype=np.int32)), # type: ignore[arg-type]
  194. ],
  195. )
  196. def test_intersect_identical(self, case):
  197. assert case.intersect(case).equals(case)
  198. case = case.to_block_index()
  199. assert case.intersect(case).equals(case)
  200. class TestSparseIndexCommon:
  201. def test_int_internal(self):
  202. idx = make_sparse_index(4, np.array([2, 3], dtype=np.int32), kind="integer")
  203. assert isinstance(idx, IntIndex)
  204. assert idx.npoints == 2
  205. tm.assert_numpy_array_equal(idx.indices, np.array([2, 3], dtype=np.int32))
  206. idx = make_sparse_index(4, np.array([], dtype=np.int32), kind="integer")
  207. assert isinstance(idx, IntIndex)
  208. assert idx.npoints == 0
  209. tm.assert_numpy_array_equal(idx.indices, np.array([], dtype=np.int32))
  210. idx = make_sparse_index(
  211. 4, np.array([0, 1, 2, 3], dtype=np.int32), kind="integer"
  212. )
  213. assert isinstance(idx, IntIndex)
  214. assert idx.npoints == 4
  215. tm.assert_numpy_array_equal(idx.indices, np.array([0, 1, 2, 3], dtype=np.int32))
  216. def test_block_internal(self):
  217. idx = make_sparse_index(4, np.array([2, 3], dtype=np.int32), kind="block")
  218. assert isinstance(idx, BlockIndex)
  219. assert idx.npoints == 2
  220. tm.assert_numpy_array_equal(idx.blocs, np.array([2], dtype=np.int32))
  221. tm.assert_numpy_array_equal(idx.blengths, np.array([2], dtype=np.int32))
  222. idx = make_sparse_index(4, np.array([], dtype=np.int32), kind="block")
  223. assert isinstance(idx, BlockIndex)
  224. assert idx.npoints == 0
  225. tm.assert_numpy_array_equal(idx.blocs, np.array([], dtype=np.int32))
  226. tm.assert_numpy_array_equal(idx.blengths, np.array([], dtype=np.int32))
  227. idx = make_sparse_index(4, np.array([0, 1, 2, 3], dtype=np.int32), kind="block")
  228. assert isinstance(idx, BlockIndex)
  229. assert idx.npoints == 4
  230. tm.assert_numpy_array_equal(idx.blocs, np.array([0], dtype=np.int32))
  231. tm.assert_numpy_array_equal(idx.blengths, np.array([4], dtype=np.int32))
  232. idx = make_sparse_index(4, np.array([0, 2, 3], dtype=np.int32), kind="block")
  233. assert isinstance(idx, BlockIndex)
  234. assert idx.npoints == 3
  235. tm.assert_numpy_array_equal(idx.blocs, np.array([0, 2], dtype=np.int32))
  236. tm.assert_numpy_array_equal(idx.blengths, np.array([1, 2], dtype=np.int32))
  237. @pytest.mark.parametrize("kind", ["integer", "block"])
  238. def test_lookup(self, kind):
  239. idx = make_sparse_index(4, np.array([2, 3], dtype=np.int32), kind=kind)
  240. assert idx.lookup(-1) == -1
  241. assert idx.lookup(0) == -1
  242. assert idx.lookup(1) == -1
  243. assert idx.lookup(2) == 0
  244. assert idx.lookup(3) == 1
  245. assert idx.lookup(4) == -1
  246. idx = make_sparse_index(4, np.array([], dtype=np.int32), kind=kind)
  247. for i in range(-1, 5):
  248. assert idx.lookup(i) == -1
  249. idx = make_sparse_index(4, np.array([0, 1, 2, 3], dtype=np.int32), kind=kind)
  250. assert idx.lookup(-1) == -1
  251. assert idx.lookup(0) == 0
  252. assert idx.lookup(1) == 1
  253. assert idx.lookup(2) == 2
  254. assert idx.lookup(3) == 3
  255. assert idx.lookup(4) == -1
  256. idx = make_sparse_index(4, np.array([0, 2, 3], dtype=np.int32), kind=kind)
  257. assert idx.lookup(-1) == -1
  258. assert idx.lookup(0) == 0
  259. assert idx.lookup(1) == -1
  260. assert idx.lookup(2) == 1
  261. assert idx.lookup(3) == 2
  262. assert idx.lookup(4) == -1
  263. @pytest.mark.parametrize("kind", ["integer", "block"])
  264. def test_lookup_array(self, kind):
  265. idx = make_sparse_index(4, np.array([2, 3], dtype=np.int32), kind=kind)
  266. res = idx.lookup_array(np.array([-1, 0, 2], dtype=np.int32))
  267. exp = np.array([-1, -1, 0], dtype=np.int32)
  268. tm.assert_numpy_array_equal(res, exp)
  269. res = idx.lookup_array(np.array([4, 2, 1, 3], dtype=np.int32))
  270. exp = np.array([-1, 0, -1, 1], dtype=np.int32)
  271. tm.assert_numpy_array_equal(res, exp)
  272. idx = make_sparse_index(4, np.array([], dtype=np.int32), kind=kind)
  273. res = idx.lookup_array(np.array([-1, 0, 2, 4], dtype=np.int32))
  274. exp = np.array([-1, -1, -1, -1], dtype=np.int32)
  275. tm.assert_numpy_array_equal(res, exp)
  276. idx = make_sparse_index(4, np.array([0, 1, 2, 3], dtype=np.int32), kind=kind)
  277. res = idx.lookup_array(np.array([-1, 0, 2], dtype=np.int32))
  278. exp = np.array([-1, 0, 2], dtype=np.int32)
  279. tm.assert_numpy_array_equal(res, exp)
  280. res = idx.lookup_array(np.array([4, 2, 1, 3], dtype=np.int32))
  281. exp = np.array([-1, 2, 1, 3], dtype=np.int32)
  282. tm.assert_numpy_array_equal(res, exp)
  283. idx = make_sparse_index(4, np.array([0, 2, 3], dtype=np.int32), kind=kind)
  284. res = idx.lookup_array(np.array([2, 1, 3, 0], dtype=np.int32))
  285. exp = np.array([1, -1, 2, 0], dtype=np.int32)
  286. tm.assert_numpy_array_equal(res, exp)
  287. res = idx.lookup_array(np.array([1, 4, 2, 5], dtype=np.int32))
  288. exp = np.array([-1, -1, 1, -1], dtype=np.int32)
  289. tm.assert_numpy_array_equal(res, exp)
  290. @pytest.mark.parametrize(
  291. "idx, expected",
  292. [
  293. [0, -1],
  294. [5, 0],
  295. [7, 2],
  296. [8, -1],
  297. [9, -1],
  298. [10, -1],
  299. [11, -1],
  300. [12, 3],
  301. [17, 8],
  302. [18, -1],
  303. ],
  304. )
  305. def test_lookup_basics(self, idx, expected):
  306. bindex = BlockIndex(20, [5, 12], [3, 6])
  307. assert bindex.lookup(idx) == expected
  308. iindex = bindex.to_int_index()
  309. assert iindex.lookup(idx) == expected
  310. class TestBlockIndex:
  311. def test_block_internal(self):
  312. idx = make_sparse_index(4, np.array([2, 3], dtype=np.int32), kind="block")
  313. assert isinstance(idx, BlockIndex)
  314. assert idx.npoints == 2
  315. tm.assert_numpy_array_equal(idx.blocs, np.array([2], dtype=np.int32))
  316. tm.assert_numpy_array_equal(idx.blengths, np.array([2], dtype=np.int32))
  317. idx = make_sparse_index(4, np.array([], dtype=np.int32), kind="block")
  318. assert isinstance(idx, BlockIndex)
  319. assert idx.npoints == 0
  320. tm.assert_numpy_array_equal(idx.blocs, np.array([], dtype=np.int32))
  321. tm.assert_numpy_array_equal(idx.blengths, np.array([], dtype=np.int32))
  322. idx = make_sparse_index(4, np.array([0, 1, 2, 3], dtype=np.int32), kind="block")
  323. assert isinstance(idx, BlockIndex)
  324. assert idx.npoints == 4
  325. tm.assert_numpy_array_equal(idx.blocs, np.array([0], dtype=np.int32))
  326. tm.assert_numpy_array_equal(idx.blengths, np.array([4], dtype=np.int32))
  327. idx = make_sparse_index(4, np.array([0, 2, 3], dtype=np.int32), kind="block")
  328. assert isinstance(idx, BlockIndex)
  329. assert idx.npoints == 3
  330. tm.assert_numpy_array_equal(idx.blocs, np.array([0, 2], dtype=np.int32))
  331. tm.assert_numpy_array_equal(idx.blengths, np.array([1, 2], dtype=np.int32))
  332. @pytest.mark.parametrize("i", [5, 10, 100, 101])
  333. def test_make_block_boundary(self, i):
  334. idx = make_sparse_index(i, np.arange(0, i, 2, dtype=np.int32), kind="block")
  335. exp = np.arange(0, i, 2, dtype=np.int32)
  336. tm.assert_numpy_array_equal(idx.blocs, exp)
  337. tm.assert_numpy_array_equal(idx.blengths, np.ones(len(exp), dtype=np.int32))
  338. def test_equals(self):
  339. index = BlockIndex(10, [0, 4], [2, 5])
  340. assert index.equals(index)
  341. assert not index.equals(BlockIndex(10, [0, 4], [2, 6]))
  342. def test_check_integrity(self):
  343. locs = []
  344. lengths = []
  345. # 0-length OK
  346. BlockIndex(0, locs, lengths)
  347. # also OK even though empty
  348. BlockIndex(1, locs, lengths)
  349. msg = "Block 0 extends beyond end"
  350. with pytest.raises(ValueError, match=msg):
  351. BlockIndex(10, [5], [10])
  352. msg = "Block 0 overlaps"
  353. with pytest.raises(ValueError, match=msg):
  354. BlockIndex(10, [2, 5], [5, 3])
  355. def test_to_int_index(self):
  356. locs = [0, 10]
  357. lengths = [4, 6]
  358. exp_inds = [0, 1, 2, 3, 10, 11, 12, 13, 14, 15]
  359. block = BlockIndex(20, locs, lengths)
  360. dense = block.to_int_index()
  361. tm.assert_numpy_array_equal(dense.indices, np.array(exp_inds, dtype=np.int32))
  362. def test_to_block_index(self):
  363. index = BlockIndex(10, [0, 5], [4, 5])
  364. assert index.to_block_index() is index
  365. class TestIntIndex:
  366. def test_check_integrity(self):
  367. # Too many indices than specified in self.length
  368. msg = "Too many indices"
  369. with pytest.raises(ValueError, match=msg):
  370. IntIndex(length=1, indices=[1, 2, 3])
  371. # No index can be negative.
  372. msg = "No index can be less than zero"
  373. with pytest.raises(ValueError, match=msg):
  374. IntIndex(length=5, indices=[1, -2, 3])
  375. # No index can be negative.
  376. msg = "No index can be less than zero"
  377. with pytest.raises(ValueError, match=msg):
  378. IntIndex(length=5, indices=[1, -2, 3])
  379. # All indices must be less than the length.
  380. msg = "All indices must be less than the length"
  381. with pytest.raises(ValueError, match=msg):
  382. IntIndex(length=5, indices=[1, 2, 5])
  383. with pytest.raises(ValueError, match=msg):
  384. IntIndex(length=5, indices=[1, 2, 6])
  385. # Indices must be strictly ascending.
  386. msg = "Indices must be strictly increasing"
  387. with pytest.raises(ValueError, match=msg):
  388. IntIndex(length=5, indices=[1, 3, 2])
  389. with pytest.raises(ValueError, match=msg):
  390. IntIndex(length=5, indices=[1, 3, 3])
  391. def test_int_internal(self):
  392. idx = make_sparse_index(4, np.array([2, 3], dtype=np.int32), kind="integer")
  393. assert isinstance(idx, IntIndex)
  394. assert idx.npoints == 2
  395. tm.assert_numpy_array_equal(idx.indices, np.array([2, 3], dtype=np.int32))
  396. idx = make_sparse_index(4, np.array([], dtype=np.int32), kind="integer")
  397. assert isinstance(idx, IntIndex)
  398. assert idx.npoints == 0
  399. tm.assert_numpy_array_equal(idx.indices, np.array([], dtype=np.int32))
  400. idx = make_sparse_index(
  401. 4, np.array([0, 1, 2, 3], dtype=np.int32), kind="integer"
  402. )
  403. assert isinstance(idx, IntIndex)
  404. assert idx.npoints == 4
  405. tm.assert_numpy_array_equal(idx.indices, np.array([0, 1, 2, 3], dtype=np.int32))
  406. def test_equals(self):
  407. index = IntIndex(10, [0, 1, 2, 3, 4])
  408. assert index.equals(index)
  409. assert not index.equals(IntIndex(10, [0, 1, 2, 3]))
  410. @pytest.mark.parametrize("xloc, xlen, yloc, ylen, eloc, elen", CASES, ids=IDS)
  411. def test_to_block_index(self, xloc, xlen, yloc, ylen, eloc, elen):
  412. xindex = BlockIndex(TEST_LENGTH, xloc, xlen)
  413. yindex = BlockIndex(TEST_LENGTH, yloc, ylen)
  414. # see if survive the round trip
  415. xbindex = xindex.to_int_index().to_block_index()
  416. ybindex = yindex.to_int_index().to_block_index()
  417. assert isinstance(xbindex, BlockIndex)
  418. assert xbindex.equals(xindex)
  419. assert ybindex.equals(yindex)
  420. def test_to_int_index(self):
  421. index = IntIndex(10, [2, 3, 4, 5, 6])
  422. assert index.to_int_index() is index
  423. class TestSparseOperators:
  424. @pytest.mark.parametrize("opname", ["add", "sub", "mul", "truediv", "floordiv"])
  425. @pytest.mark.parametrize("xloc, xlen, yloc, ylen, eloc, elen", CASES, ids=IDS)
  426. def test_op(self, opname, xloc, xlen, yloc, ylen, eloc, elen):
  427. sparse_op = getattr(splib, f"sparse_{opname}_float64")
  428. python_op = getattr(operator, opname)
  429. xindex = BlockIndex(TEST_LENGTH, xloc, xlen)
  430. yindex = BlockIndex(TEST_LENGTH, yloc, ylen)
  431. xdindex = xindex.to_int_index()
  432. ydindex = yindex.to_int_index()
  433. x = np.arange(xindex.npoints) * 10.0 + 1
  434. y = np.arange(yindex.npoints) * 100.0 + 1
  435. xfill = 0
  436. yfill = 2
  437. result_block_vals, rb_index, bfill = sparse_op(
  438. x, xindex, xfill, y, yindex, yfill
  439. )
  440. result_int_vals, ri_index, ifill = sparse_op(
  441. x, xdindex, xfill, y, ydindex, yfill
  442. )
  443. assert rb_index.to_int_index().equals(ri_index)
  444. tm.assert_numpy_array_equal(result_block_vals, result_int_vals)
  445. assert bfill == ifill
  446. # check versus Series...
  447. xseries = Series(x, xdindex.indices)
  448. xseries = xseries.reindex(np.arange(TEST_LENGTH)).fillna(xfill)
  449. yseries = Series(y, ydindex.indices)
  450. yseries = yseries.reindex(np.arange(TEST_LENGTH)).fillna(yfill)
  451. series_result = python_op(xseries, yseries)
  452. series_result = series_result.reindex(ri_index.indices)
  453. tm.assert_numpy_array_equal(result_block_vals, series_result.values)
  454. tm.assert_numpy_array_equal(result_int_vals, series_result.values)