univariate_statistics.hpp 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429
  1. // (C) Copyright Nick Thompson 2018.
  2. // Use, modification and distribution are subject to the
  3. // Boost Software License, Version 1.0. (See accompanying file
  4. // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
  5. #ifndef BOOST_MATH_TOOLS_UNIVARIATE_STATISTICS_HPP
  6. #define BOOST_MATH_TOOLS_UNIVARIATE_STATISTICS_HPP
  7. #include <algorithm>
  8. #include <iterator>
  9. #include <tuple>
  10. #include <boost/assert.hpp>
  11. #include <boost/config/header_deprecated.hpp>
  12. BOOST_HEADER_DEPRECATED("<boost/math/statistics/univariate_statistics.hpp>");
  13. namespace boost::math::tools {
  14. template<class ForwardIterator>
  15. auto mean(ForwardIterator first, ForwardIterator last)
  16. {
  17. using Real = typename std::iterator_traits<ForwardIterator>::value_type;
  18. BOOST_ASSERT_MSG(first != last, "At least one sample is required to compute the mean.");
  19. if constexpr (std::is_integral<Real>::value)
  20. {
  21. double mu = 0;
  22. double i = 1;
  23. for(auto it = first; it != last; ++it) {
  24. mu = mu + (*it - mu)/i;
  25. i += 1;
  26. }
  27. return mu;
  28. }
  29. else if constexpr (std::is_same_v<typename std::iterator_traits<ForwardIterator>::iterator_category, std::random_access_iterator_tag>)
  30. {
  31. size_t elements = std::distance(first, last);
  32. Real mu0 = 0;
  33. Real mu1 = 0;
  34. Real mu2 = 0;
  35. Real mu3 = 0;
  36. Real i = 1;
  37. auto end = last - (elements % 4);
  38. for(auto it = first; it != end; it += 4) {
  39. Real inv = Real(1)/i;
  40. Real tmp0 = (*it - mu0);
  41. Real tmp1 = (*(it+1) - mu1);
  42. Real tmp2 = (*(it+2) - mu2);
  43. Real tmp3 = (*(it+3) - mu3);
  44. // please generate a vectorized fma here
  45. mu0 += tmp0*inv;
  46. mu1 += tmp1*inv;
  47. mu2 += tmp2*inv;
  48. mu3 += tmp3*inv;
  49. i += 1;
  50. }
  51. Real num1 = Real(elements - (elements %4))/Real(4);
  52. Real num2 = num1 + Real(elements % 4);
  53. for (auto it = end; it != last; ++it)
  54. {
  55. mu3 += (*it-mu3)/i;
  56. i += 1;
  57. }
  58. return (num1*(mu0+mu1+mu2) + num2*mu3)/Real(elements);
  59. }
  60. else
  61. {
  62. auto it = first;
  63. Real mu = *it;
  64. Real i = 2;
  65. while(++it != last)
  66. {
  67. mu += (*it - mu)/i;
  68. i += 1;
  69. }
  70. return mu;
  71. }
  72. }
  73. template<class Container>
  74. inline auto mean(Container const & v)
  75. {
  76. return mean(v.cbegin(), v.cend());
  77. }
  78. template<class ForwardIterator>
  79. auto variance(ForwardIterator first, ForwardIterator last)
  80. {
  81. using Real = typename std::iterator_traits<ForwardIterator>::value_type;
  82. BOOST_ASSERT_MSG(first != last, "At least one sample is required to compute mean and variance.");
  83. // Higham, Accuracy and Stability, equation 1.6a and 1.6b:
  84. if constexpr (std::is_integral<Real>::value)
  85. {
  86. double M = *first;
  87. double Q = 0;
  88. double k = 2;
  89. for (auto it = std::next(first); it != last; ++it)
  90. {
  91. double tmp = *it - M;
  92. Q = Q + ((k-1)*tmp*tmp)/k;
  93. M = M + tmp/k;
  94. k += 1;
  95. }
  96. return Q/(k-1);
  97. }
  98. else
  99. {
  100. Real M = *first;
  101. Real Q = 0;
  102. Real k = 2;
  103. for (auto it = std::next(first); it != last; ++it)
  104. {
  105. Real tmp = (*it - M)/k;
  106. Q += k*(k-1)*tmp*tmp;
  107. M += tmp;
  108. k += 1;
  109. }
  110. return Q/(k-1);
  111. }
  112. }
  113. template<class Container>
  114. inline auto variance(Container const & v)
  115. {
  116. return variance(v.cbegin(), v.cend());
  117. }
  118. template<class ForwardIterator>
  119. auto sample_variance(ForwardIterator first, ForwardIterator last)
  120. {
  121. size_t n = std::distance(first, last);
  122. BOOST_ASSERT_MSG(n > 1, "At least two samples are required to compute the sample variance.");
  123. return n*variance(first, last)/(n-1);
  124. }
  125. template<class Container>
  126. inline auto sample_variance(Container const & v)
  127. {
  128. return sample_variance(v.cbegin(), v.cend());
  129. }
  130. // Follows equation 1.5 of:
  131. // https://prod.sandia.gov/techlib-noauth/access-control.cgi/2008/086212.pdf
  132. template<class ForwardIterator>
  133. auto skewness(ForwardIterator first, ForwardIterator last)
  134. {
  135. using Real = typename std::iterator_traits<ForwardIterator>::value_type;
  136. BOOST_ASSERT_MSG(first != last, "At least one sample is required to compute skewness.");
  137. if constexpr (std::is_integral<Real>::value)
  138. {
  139. double M1 = *first;
  140. double M2 = 0;
  141. double M3 = 0;
  142. double n = 2;
  143. for (auto it = std::next(first); it != last; ++it)
  144. {
  145. double delta21 = *it - M1;
  146. double tmp = delta21/n;
  147. M3 = M3 + tmp*((n-1)*(n-2)*delta21*tmp - 3*M2);
  148. M2 = M2 + tmp*(n-1)*delta21;
  149. M1 = M1 + tmp;
  150. n += 1;
  151. }
  152. double var = M2/(n-1);
  153. if (var == 0)
  154. {
  155. // The limit is technically undefined, but the interpretation here is clear:
  156. // A constant dataset has no skewness.
  157. return double(0);
  158. }
  159. double skew = M3/(M2*sqrt(var));
  160. return skew;
  161. }
  162. else
  163. {
  164. Real M1 = *first;
  165. Real M2 = 0;
  166. Real M3 = 0;
  167. Real n = 2;
  168. for (auto it = std::next(first); it != last; ++it)
  169. {
  170. Real delta21 = *it - M1;
  171. Real tmp = delta21/n;
  172. M3 += tmp*((n-1)*(n-2)*delta21*tmp - 3*M2);
  173. M2 += tmp*(n-1)*delta21;
  174. M1 += tmp;
  175. n += 1;
  176. }
  177. Real var = M2/(n-1);
  178. if (var == 0)
  179. {
  180. // The limit is technically undefined, but the interpretation here is clear:
  181. // A constant dataset has no skewness.
  182. return Real(0);
  183. }
  184. Real skew = M3/(M2*sqrt(var));
  185. return skew;
  186. }
  187. }
  188. template<class Container>
  189. inline auto skewness(Container const & v)
  190. {
  191. return skewness(v.cbegin(), v.cend());
  192. }
  193. // Follows equation 1.5/1.6 of:
  194. // https://prod.sandia.gov/techlib-noauth/access-control.cgi/2008/086212.pdf
  195. template<class ForwardIterator>
  196. auto first_four_moments(ForwardIterator first, ForwardIterator last)
  197. {
  198. using Real = typename std::iterator_traits<ForwardIterator>::value_type;
  199. BOOST_ASSERT_MSG(first != last, "At least one sample is required to compute the first four moments.");
  200. if constexpr (std::is_integral<Real>::value)
  201. {
  202. double M1 = *first;
  203. double M2 = 0;
  204. double M3 = 0;
  205. double M4 = 0;
  206. double n = 2;
  207. for (auto it = std::next(first); it != last; ++it)
  208. {
  209. double delta21 = *it - M1;
  210. double tmp = delta21/n;
  211. M4 = M4 + tmp*(tmp*tmp*delta21*((n-1)*(n*n-3*n+3)) + 6*tmp*M2 - 4*M3);
  212. M3 = M3 + tmp*((n-1)*(n-2)*delta21*tmp - 3*M2);
  213. M2 = M2 + tmp*(n-1)*delta21;
  214. M1 = M1 + tmp;
  215. n += 1;
  216. }
  217. return std::make_tuple(M1, M2/(n-1), M3/(n-1), M4/(n-1));
  218. }
  219. else
  220. {
  221. Real M1 = *first;
  222. Real M2 = 0;
  223. Real M3 = 0;
  224. Real M4 = 0;
  225. Real n = 2;
  226. for (auto it = std::next(first); it != last; ++it)
  227. {
  228. Real delta21 = *it - M1;
  229. Real tmp = delta21/n;
  230. M4 = M4 + tmp*(tmp*tmp*delta21*((n-1)*(n*n-3*n+3)) + 6*tmp*M2 - 4*M3);
  231. M3 = M3 + tmp*((n-1)*(n-2)*delta21*tmp - 3*M2);
  232. M2 = M2 + tmp*(n-1)*delta21;
  233. M1 = M1 + tmp;
  234. n += 1;
  235. }
  236. return std::make_tuple(M1, M2/(n-1), M3/(n-1), M4/(n-1));
  237. }
  238. }
  239. template<class Container>
  240. inline auto first_four_moments(Container const & v)
  241. {
  242. return first_four_moments(v.cbegin(), v.cend());
  243. }
  244. // Follows equation 1.6 of:
  245. // https://prod.sandia.gov/techlib-noauth/access-control.cgi/2008/086212.pdf
  246. template<class ForwardIterator>
  247. auto kurtosis(ForwardIterator first, ForwardIterator last)
  248. {
  249. auto [M1, M2, M3, M4] = first_four_moments(first, last);
  250. if (M2 == 0)
  251. {
  252. return M2;
  253. }
  254. return M4/(M2*M2);
  255. }
  256. template<class Container>
  257. inline auto kurtosis(Container const & v)
  258. {
  259. return kurtosis(v.cbegin(), v.cend());
  260. }
  261. template<class ForwardIterator>
  262. auto excess_kurtosis(ForwardIterator first, ForwardIterator last)
  263. {
  264. return kurtosis(first, last) - 3;
  265. }
  266. template<class Container>
  267. inline auto excess_kurtosis(Container const & v)
  268. {
  269. return excess_kurtosis(v.cbegin(), v.cend());
  270. }
  271. template<class RandomAccessIterator>
  272. auto median(RandomAccessIterator first, RandomAccessIterator last)
  273. {
  274. size_t num_elems = std::distance(first, last);
  275. BOOST_ASSERT_MSG(num_elems > 0, "The median of a zero length vector is undefined.");
  276. if (num_elems & 1)
  277. {
  278. auto middle = first + (num_elems - 1)/2;
  279. std::nth_element(first, middle, last);
  280. return *middle;
  281. }
  282. else
  283. {
  284. auto middle = first + num_elems/2 - 1;
  285. std::nth_element(first, middle, last);
  286. std::nth_element(middle, middle+1, last);
  287. return (*middle + *(middle+1))/2;
  288. }
  289. }
  290. template<class RandomAccessContainer>
  291. inline auto median(RandomAccessContainer & v)
  292. {
  293. return median(v.begin(), v.end());
  294. }
  295. template<class RandomAccessIterator>
  296. auto gini_coefficient(RandomAccessIterator first, RandomAccessIterator last)
  297. {
  298. using Real = typename std::iterator_traits<RandomAccessIterator>::value_type;
  299. BOOST_ASSERT_MSG(first != last && std::next(first) != last, "Computation of the Gini coefficient requires at least two samples.");
  300. std::sort(first, last);
  301. if constexpr (std::is_integral<Real>::value)
  302. {
  303. double i = 1;
  304. double num = 0;
  305. double denom = 0;
  306. for (auto it = first; it != last; ++it)
  307. {
  308. num += *it*i;
  309. denom += *it;
  310. ++i;
  311. }
  312. // If the l1 norm is zero, all elements are zero, so every element is the same.
  313. if (denom == 0)
  314. {
  315. return double(0);
  316. }
  317. return ((2*num)/denom - i)/(i-1);
  318. }
  319. else
  320. {
  321. Real i = 1;
  322. Real num = 0;
  323. Real denom = 0;
  324. for (auto it = first; it != last; ++it)
  325. {
  326. num += *it*i;
  327. denom += *it;
  328. ++i;
  329. }
  330. // If the l1 norm is zero, all elements are zero, so every element is the same.
  331. if (denom == 0)
  332. {
  333. return Real(0);
  334. }
  335. return ((2*num)/denom - i)/(i-1);
  336. }
  337. }
  338. template<class RandomAccessContainer>
  339. inline auto gini_coefficient(RandomAccessContainer & v)
  340. {
  341. return gini_coefficient(v.begin(), v.end());
  342. }
  343. template<class RandomAccessIterator>
  344. inline auto sample_gini_coefficient(RandomAccessIterator first, RandomAccessIterator last)
  345. {
  346. size_t n = std::distance(first, last);
  347. return n*gini_coefficient(first, last)/(n-1);
  348. }
  349. template<class RandomAccessContainer>
  350. inline auto sample_gini_coefficient(RandomAccessContainer & v)
  351. {
  352. return sample_gini_coefficient(v.begin(), v.end());
  353. }
  354. template<class RandomAccessIterator>
  355. auto median_absolute_deviation(RandomAccessIterator first, RandomAccessIterator last, typename std::iterator_traits<RandomAccessIterator>::value_type center=std::numeric_limits<typename std::iterator_traits<RandomAccessIterator>::value_type>::quiet_NaN())
  356. {
  357. using std::abs;
  358. using Real = typename std::iterator_traits<RandomAccessIterator>::value_type;
  359. using std::isnan;
  360. if (isnan(center))
  361. {
  362. center = boost::math::tools::median(first, last);
  363. }
  364. size_t num_elems = std::distance(first, last);
  365. BOOST_ASSERT_MSG(num_elems > 0, "The median of a zero-length vector is undefined.");
  366. auto comparator = [&center](Real a, Real b) { return abs(a-center) < abs(b-center);};
  367. if (num_elems & 1)
  368. {
  369. auto middle = first + (num_elems - 1)/2;
  370. std::nth_element(first, middle, last, comparator);
  371. return abs(*middle);
  372. }
  373. else
  374. {
  375. auto middle = first + num_elems/2 - 1;
  376. std::nth_element(first, middle, last, comparator);
  377. std::nth_element(middle, middle+1, last, comparator);
  378. return (abs(*middle) + abs(*(middle+1)))/abs(static_cast<Real>(2));
  379. }
  380. }
  381. template<class RandomAccessContainer>
  382. inline auto median_absolute_deviation(RandomAccessContainer & v, typename RandomAccessContainer::value_type center=std::numeric_limits<typename RandomAccessContainer::value_type>::quiet_NaN())
  383. {
  384. return median_absolute_deviation(v.begin(), v.end(), center);
  385. }
  386. }
  387. #endif