test_binned_statistic.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568
  1. import numpy as np
  2. from numpy.testing import assert_allclose
  3. import pytest
  4. from pytest import raises as assert_raises
  5. from scipy.stats import (binned_statistic, binned_statistic_2d,
  6. binned_statistic_dd)
  7. from scipy._lib._util import check_random_state
  8. from .common_tests import check_named_results
  9. class TestBinnedStatistic:
  10. @classmethod
  11. def setup_class(cls):
  12. rng = check_random_state(9865)
  13. cls.x = rng.uniform(size=100)
  14. cls.y = rng.uniform(size=100)
  15. cls.v = rng.uniform(size=100)
  16. cls.X = rng.uniform(size=(100, 3))
  17. cls.w = rng.uniform(size=100)
  18. cls.u = rng.uniform(size=100) + 1e6
  19. def test_1d_count(self):
  20. x = self.x
  21. v = self.v
  22. count1, edges1, bc = binned_statistic(x, v, 'count', bins=10)
  23. count2, edges2 = np.histogram(x, bins=10)
  24. assert_allclose(count1, count2)
  25. assert_allclose(edges1, edges2)
  26. def test_gh5927(self):
  27. # smoke test for gh5927 - binned_statistic was using `is` for string
  28. # comparison
  29. x = self.x
  30. v = self.v
  31. statistics = ['mean', 'median', 'count', 'sum']
  32. for statistic in statistics:
  33. binned_statistic(x, v, statistic, bins=10)
  34. def test_big_number_std(self):
  35. # tests for numerical stability of std calculation
  36. # see issue gh-10126 for more
  37. x = self.x
  38. u = self.u
  39. stat1, edges1, bc = binned_statistic(x, u, 'std', bins=10)
  40. stat2, edges2, bc = binned_statistic(x, u, np.std, bins=10)
  41. assert_allclose(stat1, stat2)
  42. def test_empty_bins_std(self):
  43. # tests that std returns gives nan for empty bins
  44. x = self.x
  45. u = self.u
  46. print(binned_statistic(x, u, 'count', bins=1000))
  47. stat1, edges1, bc = binned_statistic(x, u, 'std', bins=1000)
  48. stat2, edges2, bc = binned_statistic(x, u, np.std, bins=1000)
  49. assert_allclose(stat1, stat2)
  50. def test_non_finite_inputs_and_int_bins(self):
  51. # if either `values` or `sample` contain np.inf or np.nan throw
  52. # see issue gh-9010 for more
  53. x = self.x
  54. u = self.u
  55. orig = u[0]
  56. u[0] = np.inf
  57. assert_raises(ValueError, binned_statistic, u, x, 'std', bins=10)
  58. # need to test for non-python specific ints, e.g. np.int8, np.int64
  59. assert_raises(ValueError, binned_statistic, u, x, 'std',
  60. bins=np.int64(10))
  61. u[0] = np.nan
  62. assert_raises(ValueError, binned_statistic, u, x, 'count', bins=10)
  63. # replace original value, u belongs the class
  64. u[0] = orig
  65. def test_1d_result_attributes(self):
  66. x = self.x
  67. v = self.v
  68. res = binned_statistic(x, v, 'count', bins=10)
  69. attributes = ('statistic', 'bin_edges', 'binnumber')
  70. check_named_results(res, attributes)
  71. def test_1d_sum(self):
  72. x = self.x
  73. v = self.v
  74. sum1, edges1, bc = binned_statistic(x, v, 'sum', bins=10)
  75. sum2, edges2 = np.histogram(x, bins=10, weights=v)
  76. assert_allclose(sum1, sum2)
  77. assert_allclose(edges1, edges2)
  78. def test_1d_mean(self):
  79. x = self.x
  80. v = self.v
  81. stat1, edges1, bc = binned_statistic(x, v, 'mean', bins=10)
  82. stat2, edges2, bc = binned_statistic(x, v, np.mean, bins=10)
  83. assert_allclose(stat1, stat2)
  84. assert_allclose(edges1, edges2)
  85. def test_1d_std(self):
  86. x = self.x
  87. v = self.v
  88. stat1, edges1, bc = binned_statistic(x, v, 'std', bins=10)
  89. stat2, edges2, bc = binned_statistic(x, v, np.std, bins=10)
  90. assert_allclose(stat1, stat2)
  91. assert_allclose(edges1, edges2)
  92. def test_1d_min(self):
  93. x = self.x
  94. v = self.v
  95. stat1, edges1, bc = binned_statistic(x, v, 'min', bins=10)
  96. stat2, edges2, bc = binned_statistic(x, v, np.min, bins=10)
  97. assert_allclose(stat1, stat2)
  98. assert_allclose(edges1, edges2)
  99. def test_1d_max(self):
  100. x = self.x
  101. v = self.v
  102. stat1, edges1, bc = binned_statistic(x, v, 'max', bins=10)
  103. stat2, edges2, bc = binned_statistic(x, v, np.max, bins=10)
  104. assert_allclose(stat1, stat2)
  105. assert_allclose(edges1, edges2)
  106. def test_1d_median(self):
  107. x = self.x
  108. v = self.v
  109. stat1, edges1, bc = binned_statistic(x, v, 'median', bins=10)
  110. stat2, edges2, bc = binned_statistic(x, v, np.median, bins=10)
  111. assert_allclose(stat1, stat2)
  112. assert_allclose(edges1, edges2)
  113. def test_1d_bincode(self):
  114. x = self.x[:20]
  115. v = self.v[:20]
  116. count1, edges1, bc = binned_statistic(x, v, 'count', bins=3)
  117. bc2 = np.array([3, 2, 1, 3, 2, 3, 3, 3, 3, 1, 1, 3, 3, 1, 2, 3, 1,
  118. 1, 2, 1])
  119. bcount = [(bc == i).sum() for i in np.unique(bc)]
  120. assert_allclose(bc, bc2)
  121. assert_allclose(bcount, count1)
  122. def test_1d_range_keyword(self):
  123. # Regression test for gh-3063, range can be (min, max) or [(min, max)]
  124. np.random.seed(9865)
  125. x = np.arange(30)
  126. data = np.random.random(30)
  127. mean, bins, _ = binned_statistic(x[:15], data[:15])
  128. mean_range, bins_range, _ = binned_statistic(x, data, range=[(0, 14)])
  129. mean_range2, bins_range2, _ = binned_statistic(x, data, range=(0, 14))
  130. assert_allclose(mean, mean_range)
  131. assert_allclose(bins, bins_range)
  132. assert_allclose(mean, mean_range2)
  133. assert_allclose(bins, bins_range2)
  134. def test_1d_multi_values(self):
  135. x = self.x
  136. v = self.v
  137. w = self.w
  138. stat1v, edges1v, bc1v = binned_statistic(x, v, 'mean', bins=10)
  139. stat1w, edges1w, bc1w = binned_statistic(x, w, 'mean', bins=10)
  140. stat2, edges2, bc2 = binned_statistic(x, [v, w], 'mean', bins=10)
  141. assert_allclose(stat2[0], stat1v)
  142. assert_allclose(stat2[1], stat1w)
  143. assert_allclose(edges1v, edges2)
  144. assert_allclose(bc1v, bc2)
  145. def test_2d_count(self):
  146. x = self.x
  147. y = self.y
  148. v = self.v
  149. count1, binx1, biny1, bc = binned_statistic_2d(
  150. x, y, v, 'count', bins=5)
  151. count2, binx2, biny2 = np.histogram2d(x, y, bins=5)
  152. assert_allclose(count1, count2)
  153. assert_allclose(binx1, binx2)
  154. assert_allclose(biny1, biny2)
  155. def test_2d_result_attributes(self):
  156. x = self.x
  157. y = self.y
  158. v = self.v
  159. res = binned_statistic_2d(x, y, v, 'count', bins=5)
  160. attributes = ('statistic', 'x_edge', 'y_edge', 'binnumber')
  161. check_named_results(res, attributes)
  162. def test_2d_sum(self):
  163. x = self.x
  164. y = self.y
  165. v = self.v
  166. sum1, binx1, biny1, bc = binned_statistic_2d(x, y, v, 'sum', bins=5)
  167. sum2, binx2, biny2 = np.histogram2d(x, y, bins=5, weights=v)
  168. assert_allclose(sum1, sum2)
  169. assert_allclose(binx1, binx2)
  170. assert_allclose(biny1, biny2)
  171. def test_2d_mean(self):
  172. x = self.x
  173. y = self.y
  174. v = self.v
  175. stat1, binx1, biny1, bc = binned_statistic_2d(x, y, v, 'mean', bins=5)
  176. stat2, binx2, biny2, bc = binned_statistic_2d(x, y, v, np.mean, bins=5)
  177. assert_allclose(stat1, stat2)
  178. assert_allclose(binx1, binx2)
  179. assert_allclose(biny1, biny2)
  180. def test_2d_mean_unicode(self):
  181. x = self.x
  182. y = self.y
  183. v = self.v
  184. stat1, binx1, biny1, bc = binned_statistic_2d(
  185. x, y, v, 'mean', bins=5)
  186. stat2, binx2, biny2, bc = binned_statistic_2d(x, y, v, np.mean, bins=5)
  187. assert_allclose(stat1, stat2)
  188. assert_allclose(binx1, binx2)
  189. assert_allclose(biny1, biny2)
  190. def test_2d_std(self):
  191. x = self.x
  192. y = self.y
  193. v = self.v
  194. stat1, binx1, biny1, bc = binned_statistic_2d(x, y, v, 'std', bins=5)
  195. stat2, binx2, biny2, bc = binned_statistic_2d(x, y, v, np.std, bins=5)
  196. assert_allclose(stat1, stat2)
  197. assert_allclose(binx1, binx2)
  198. assert_allclose(biny1, biny2)
  199. def test_2d_min(self):
  200. x = self.x
  201. y = self.y
  202. v = self.v
  203. stat1, binx1, biny1, bc = binned_statistic_2d(x, y, v, 'min', bins=5)
  204. stat2, binx2, biny2, bc = binned_statistic_2d(x, y, v, np.min, bins=5)
  205. assert_allclose(stat1, stat2)
  206. assert_allclose(binx1, binx2)
  207. assert_allclose(biny1, biny2)
  208. def test_2d_max(self):
  209. x = self.x
  210. y = self.y
  211. v = self.v
  212. stat1, binx1, biny1, bc = binned_statistic_2d(x, y, v, 'max', bins=5)
  213. stat2, binx2, biny2, bc = binned_statistic_2d(x, y, v, np.max, bins=5)
  214. assert_allclose(stat1, stat2)
  215. assert_allclose(binx1, binx2)
  216. assert_allclose(biny1, biny2)
  217. def test_2d_median(self):
  218. x = self.x
  219. y = self.y
  220. v = self.v
  221. stat1, binx1, biny1, bc = binned_statistic_2d(
  222. x, y, v, 'median', bins=5)
  223. stat2, binx2, biny2, bc = binned_statistic_2d(
  224. x, y, v, np.median, bins=5)
  225. assert_allclose(stat1, stat2)
  226. assert_allclose(binx1, binx2)
  227. assert_allclose(biny1, biny2)
  228. def test_2d_bincode(self):
  229. x = self.x[:20]
  230. y = self.y[:20]
  231. v = self.v[:20]
  232. count1, binx1, biny1, bc = binned_statistic_2d(
  233. x, y, v, 'count', bins=3)
  234. bc2 = np.array([17, 11, 6, 16, 11, 17, 18, 17, 17, 7, 6, 18, 16,
  235. 6, 11, 16, 6, 6, 11, 8])
  236. bcount = [(bc == i).sum() for i in np.unique(bc)]
  237. assert_allclose(bc, bc2)
  238. count1adj = count1[count1.nonzero()]
  239. assert_allclose(bcount, count1adj)
  240. def test_2d_multi_values(self):
  241. x = self.x
  242. y = self.y
  243. v = self.v
  244. w = self.w
  245. stat1v, binx1v, biny1v, bc1v = binned_statistic_2d(
  246. x, y, v, 'mean', bins=8)
  247. stat1w, binx1w, biny1w, bc1w = binned_statistic_2d(
  248. x, y, w, 'mean', bins=8)
  249. stat2, binx2, biny2, bc2 = binned_statistic_2d(
  250. x, y, [v, w], 'mean', bins=8)
  251. assert_allclose(stat2[0], stat1v)
  252. assert_allclose(stat2[1], stat1w)
  253. assert_allclose(binx1v, binx2)
  254. assert_allclose(biny1w, biny2)
  255. assert_allclose(bc1v, bc2)
  256. def test_2d_binnumbers_unraveled(self):
  257. x = self.x
  258. y = self.y
  259. v = self.v
  260. stat, edgesx, bcx = binned_statistic(x, v, 'mean', bins=20)
  261. stat, edgesy, bcy = binned_statistic(y, v, 'mean', bins=10)
  262. stat2, edgesx2, edgesy2, bc2 = binned_statistic_2d(
  263. x, y, v, 'mean', bins=(20, 10), expand_binnumbers=True)
  264. bcx3 = np.searchsorted(edgesx, x, side='right')
  265. bcy3 = np.searchsorted(edgesy, y, side='right')
  266. # `numpy.searchsorted` is non-inclusive on right-edge, compensate
  267. bcx3[x == x.max()] -= 1
  268. bcy3[y == y.max()] -= 1
  269. assert_allclose(bcx, bc2[0])
  270. assert_allclose(bcy, bc2[1])
  271. assert_allclose(bcx3, bc2[0])
  272. assert_allclose(bcy3, bc2[1])
  273. def test_dd_count(self):
  274. X = self.X
  275. v = self.v
  276. count1, edges1, bc = binned_statistic_dd(X, v, 'count', bins=3)
  277. count2, edges2 = np.histogramdd(X, bins=3)
  278. assert_allclose(count1, count2)
  279. assert_allclose(edges1, edges2)
  280. def test_dd_result_attributes(self):
  281. X = self.X
  282. v = self.v
  283. res = binned_statistic_dd(X, v, 'count', bins=3)
  284. attributes = ('statistic', 'bin_edges', 'binnumber')
  285. check_named_results(res, attributes)
  286. def test_dd_sum(self):
  287. X = self.X
  288. v = self.v
  289. sum1, edges1, bc = binned_statistic_dd(X, v, 'sum', bins=3)
  290. sum2, edges2 = np.histogramdd(X, bins=3, weights=v)
  291. sum3, edges3, bc = binned_statistic_dd(X, v, np.sum, bins=3)
  292. assert_allclose(sum1, sum2)
  293. assert_allclose(edges1, edges2)
  294. assert_allclose(sum1, sum3)
  295. assert_allclose(edges1, edges3)
  296. def test_dd_mean(self):
  297. X = self.X
  298. v = self.v
  299. stat1, edges1, bc = binned_statistic_dd(X, v, 'mean', bins=3)
  300. stat2, edges2, bc = binned_statistic_dd(X, v, np.mean, bins=3)
  301. assert_allclose(stat1, stat2)
  302. assert_allclose(edges1, edges2)
  303. def test_dd_std(self):
  304. X = self.X
  305. v = self.v
  306. stat1, edges1, bc = binned_statistic_dd(X, v, 'std', bins=3)
  307. stat2, edges2, bc = binned_statistic_dd(X, v, np.std, bins=3)
  308. assert_allclose(stat1, stat2)
  309. assert_allclose(edges1, edges2)
  310. def test_dd_min(self):
  311. X = self.X
  312. v = self.v
  313. stat1, edges1, bc = binned_statistic_dd(X, v, 'min', bins=3)
  314. stat2, edges2, bc = binned_statistic_dd(X, v, np.min, bins=3)
  315. assert_allclose(stat1, stat2)
  316. assert_allclose(edges1, edges2)
  317. def test_dd_max(self):
  318. X = self.X
  319. v = self.v
  320. stat1, edges1, bc = binned_statistic_dd(X, v, 'max', bins=3)
  321. stat2, edges2, bc = binned_statistic_dd(X, v, np.max, bins=3)
  322. assert_allclose(stat1, stat2)
  323. assert_allclose(edges1, edges2)
  324. def test_dd_median(self):
  325. X = self.X
  326. v = self.v
  327. stat1, edges1, bc = binned_statistic_dd(X, v, 'median', bins=3)
  328. stat2, edges2, bc = binned_statistic_dd(X, v, np.median, bins=3)
  329. assert_allclose(stat1, stat2)
  330. assert_allclose(edges1, edges2)
  331. def test_dd_bincode(self):
  332. X = self.X[:20]
  333. v = self.v[:20]
  334. count1, edges1, bc = binned_statistic_dd(X, v, 'count', bins=3)
  335. bc2 = np.array([63, 33, 86, 83, 88, 67, 57, 33, 42, 41, 82, 83, 92,
  336. 32, 36, 91, 43, 87, 81, 81])
  337. bcount = [(bc == i).sum() for i in np.unique(bc)]
  338. assert_allclose(bc, bc2)
  339. count1adj = count1[count1.nonzero()]
  340. assert_allclose(bcount, count1adj)
  341. def test_dd_multi_values(self):
  342. X = self.X
  343. v = self.v
  344. w = self.w
  345. for stat in ["count", "sum", "mean", "std", "min", "max", "median",
  346. np.std]:
  347. stat1v, edges1v, bc1v = binned_statistic_dd(X, v, stat, bins=8)
  348. stat1w, edges1w, bc1w = binned_statistic_dd(X, w, stat, bins=8)
  349. stat2, edges2, bc2 = binned_statistic_dd(X, [v, w], stat, bins=8)
  350. assert_allclose(stat2[0], stat1v)
  351. assert_allclose(stat2[1], stat1w)
  352. assert_allclose(edges1v, edges2)
  353. assert_allclose(edges1w, edges2)
  354. assert_allclose(bc1v, bc2)
  355. def test_dd_binnumbers_unraveled(self):
  356. X = self.X
  357. v = self.v
  358. stat, edgesx, bcx = binned_statistic(X[:, 0], v, 'mean', bins=15)
  359. stat, edgesy, bcy = binned_statistic(X[:, 1], v, 'mean', bins=20)
  360. stat, edgesz, bcz = binned_statistic(X[:, 2], v, 'mean', bins=10)
  361. stat2, edges2, bc2 = binned_statistic_dd(
  362. X, v, 'mean', bins=(15, 20, 10), expand_binnumbers=True)
  363. assert_allclose(bcx, bc2[0])
  364. assert_allclose(bcy, bc2[1])
  365. assert_allclose(bcz, bc2[2])
  366. def test_dd_binned_statistic_result(self):
  367. # NOTE: tests the reuse of bin_edges from previous call
  368. x = np.random.random((10000, 3))
  369. v = np.random.random((10000))
  370. bins = np.linspace(0, 1, 10)
  371. bins = (bins, bins, bins)
  372. result = binned_statistic_dd(x, v, 'mean', bins=bins)
  373. stat = result.statistic
  374. result = binned_statistic_dd(x, v, 'mean',
  375. binned_statistic_result=result)
  376. stat2 = result.statistic
  377. assert_allclose(stat, stat2)
  378. def test_dd_zero_dedges(self):
  379. x = np.random.random((10000, 3))
  380. v = np.random.random((10000))
  381. bins = np.linspace(0, 1, 10)
  382. bins = np.append(bins, 1)
  383. bins = (bins, bins, bins)
  384. with assert_raises(ValueError, match='difference is numerically 0'):
  385. binned_statistic_dd(x, v, 'mean', bins=bins)
  386. def test_dd_range_errors(self):
  387. # Test that descriptive exceptions are raised as appropriate for bad
  388. # values of the `range` argument. (See gh-12996)
  389. with assert_raises(ValueError,
  390. match='In range, start must be <= stop'):
  391. binned_statistic_dd([self.y], self.v,
  392. range=[[1, 0]])
  393. with assert_raises(
  394. ValueError,
  395. match='In dimension 1 of range, start must be <= stop'):
  396. binned_statistic_dd([self.x, self.y], self.v,
  397. range=[[1, 0], [0, 1]])
  398. with assert_raises(
  399. ValueError,
  400. match='In dimension 2 of range, start must be <= stop'):
  401. binned_statistic_dd([self.x, self.y], self.v,
  402. range=[[0, 1], [1, 0]])
  403. with assert_raises(
  404. ValueError,
  405. match='range given for 1 dimensions; 2 required'):
  406. binned_statistic_dd([self.x, self.y], self.v,
  407. range=[[0, 1]])
  408. def test_binned_statistic_float32(self):
  409. X = np.array([0, 0.42358226], dtype=np.float32)
  410. stat, _, _ = binned_statistic(X, None, 'count', bins=5)
  411. assert_allclose(stat, np.array([1, 0, 0, 0, 1], dtype=np.float64))
  412. def test_gh14332(self):
  413. # Test the wrong output when the `sample` is close to bin edge
  414. x = []
  415. size = 20
  416. for i in range(size):
  417. x += [1-0.1**i]
  418. bins = np.linspace(0,1,11)
  419. sum1, edges1, bc = binned_statistic_dd(x, np.ones(len(x)),
  420. bins=[bins], statistic='sum')
  421. sum2, edges2 = np.histogram(x, bins=bins)
  422. assert_allclose(sum1, sum2)
  423. assert_allclose(edges1[0], edges2)
  424. @pytest.mark.parametrize("dtype", [np.float64, np.complex128])
  425. @pytest.mark.parametrize("statistic", [np.mean, np.median, np.sum, np.std,
  426. np.min, np.max, 'count',
  427. lambda x: (x**2).sum(),
  428. lambda x: (x**2).sum() * 1j])
  429. def test_dd_all(self, dtype, statistic):
  430. def ref_statistic(x):
  431. return len(x) if statistic == 'count' else statistic(x)
  432. rng = np.random.default_rng(3704743126639371)
  433. n = 10
  434. x = rng.random(size=n)
  435. i = x >= 0.5
  436. v = rng.random(size=n)
  437. if dtype is np.complex128:
  438. v = v + rng.random(size=n)*1j
  439. stat, _, _ = binned_statistic_dd(x, v, statistic, bins=2)
  440. ref = np.array([ref_statistic(v[~i]), ref_statistic(v[i])])
  441. assert_allclose(stat, ref)
  442. assert stat.dtype == np.result_type(ref.dtype, np.float64)