_variation.py 8.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224
  1. import numpy as np
  2. from numpy.core.multiarray import normalize_axis_index
  3. from scipy._lib._util import _nan_allsame, _contains_nan
  4. from ._stats_py import _chk_asarray
  5. def _nanvariation(a, *, axis=0, ddof=0, keepdims=False):
  6. """
  7. Private version of `variation` that ignores nan.
  8. `a` must be a numpy array.
  9. `axis` is assumed to be normalized, i.e. 0 <= axis < a.ndim.
  10. """
  11. #
  12. # In theory, this should be as simple as something like
  13. # nanstd(a, ddof=ddof, axis=axis, keepdims=keepdims) /
  14. # nanmean(a, axis=axis, keepdims=keepdims)
  15. # In practice, annoying issues arise. Specifically, numpy
  16. # generates warnings in certain edge cases that we don't want
  17. # to propagate to the user. Unfortunately, there does not
  18. # appear to be a thread-safe way to filter out the warnings,
  19. # so we have to do the calculation in a way that doesn't
  20. # generate numpy warnings.
  21. #
  22. # Let N be the number of non-nan inputs in a slice.
  23. # Conditions that generate nan:
  24. # * empty input (i.e. N = 0)
  25. # * All non-nan values 0
  26. # * N < ddof
  27. # * N == ddof and the input is constant
  28. # Conditions that generate inf:
  29. # * non-constant input and either
  30. # * the mean is 0, or
  31. # * N == ddof
  32. #
  33. a_isnan = np.isnan(a)
  34. all_nan = a_isnan.all(axis=axis, keepdims=True)
  35. all_nan_full = np.broadcast_to(all_nan, a.shape)
  36. all_zero = (a_isnan | (a == 0)).all(axis=axis, keepdims=True) & ~all_nan
  37. # ngood is the number of non-nan values in each slice.
  38. ngood = (a.shape[axis] -
  39. np.expand_dims(np.count_nonzero(a_isnan, axis=axis), axis))
  40. # The return value is nan where ddof > ngood.
  41. ddof_too_big = ddof > ngood
  42. # If ddof == ngood, the return value is nan where the input is constant and
  43. # inf otherwise.
  44. ddof_equal_n = ddof == ngood
  45. is_const = _nan_allsame(a, axis=axis, keepdims=True)
  46. a2 = a.copy()
  47. # If an entire slice is nan, `np.nanmean` will generate a warning,
  48. # so we replace those nan's with 1.0 before computing the mean.
  49. # We'll fix the corresponding output later.
  50. a2[all_nan_full] = 1.0
  51. mean_a = np.nanmean(a2, axis=axis, keepdims=True)
  52. # If ddof >= ngood (the number of non-nan values in the slice), `np.nanstd`
  53. # will generate a warning, so set all the values in such a slice to 1.0.
  54. # We'll fix the corresponding output later.
  55. a2[np.broadcast_to(ddof_too_big, a2.shape) | ddof_equal_n] = 1.0
  56. with np.errstate(invalid='ignore'):
  57. std_a = np.nanstd(a2, axis=axis, ddof=ddof, keepdims=True)
  58. del a2
  59. sum_zero = np.nansum(a, axis=axis, keepdims=True) == 0
  60. # Where the sum along the axis is 0, replace mean_a with 1. This avoids
  61. # division by zero. We'll fix the corresponding output later.
  62. mean_a[sum_zero] = 1.0
  63. # Here--finally!--is the calculation of the variation.
  64. result = std_a / mean_a
  65. # Now fix the values that were given fake data to avoid warnings.
  66. result[~is_const & sum_zero] = np.inf
  67. signed_inf_mask = ~is_const & ddof_equal_n
  68. result[signed_inf_mask] = np.sign(mean_a[signed_inf_mask]) * np.inf
  69. nan_mask = all_zero | all_nan | ddof_too_big | (ddof_equal_n & is_const)
  70. result[nan_mask] = np.nan
  71. if not keepdims:
  72. result = np.squeeze(result, axis=axis)
  73. if result.shape == ():
  74. result = result[()]
  75. return result
  76. def variation(a, axis=0, nan_policy='propagate', ddof=0, *, keepdims=False):
  77. """
  78. Compute the coefficient of variation.
  79. The coefficient of variation is the standard deviation divided by the
  80. mean. This function is equivalent to::
  81. np.std(x, axis=axis, ddof=ddof) / np.mean(x)
  82. The default for ``ddof`` is 0, but many definitions of the coefficient
  83. of variation use the square root of the unbiased sample variance
  84. for the sample standard deviation, which corresponds to ``ddof=1``.
  85. The function does not take the absolute value of the mean of the data,
  86. so the return value is negative if the mean is negative.
  87. Parameters
  88. ----------
  89. a : array_like
  90. Input array.
  91. axis : int or None, optional
  92. Axis along which to calculate the coefficient of variation.
  93. Default is 0. If None, compute over the whole array `a`.
  94. nan_policy : {'propagate', 'raise', 'omit'}, optional
  95. Defines how to handle when input contains ``nan``.
  96. The following options are available:
  97. * 'propagate': return ``nan``
  98. * 'raise': raise an exception
  99. * 'omit': perform the calculation with ``nan`` values omitted
  100. The default is 'propagate'.
  101. ddof : int, optional
  102. Gives the "Delta Degrees Of Freedom" used when computing the
  103. standard deviation. The divisor used in the calculation of the
  104. standard deviation is ``N - ddof``, where ``N`` is the number of
  105. elements. `ddof` must be less than ``N``; if it isn't, the result
  106. will be ``nan`` or ``inf``, depending on ``N`` and the values in
  107. the array. By default `ddof` is zero for backwards compatibility,
  108. but it is recommended to use ``ddof=1`` to ensure that the sample
  109. standard deviation is computed as the square root of the unbiased
  110. sample variance.
  111. keepdims : bool, optional
  112. If this is set to True, the axes which are reduced are left in the
  113. result as dimensions with size one. With this option, the result
  114. will broadcast correctly against the input array.
  115. Returns
  116. -------
  117. variation : ndarray
  118. The calculated variation along the requested axis.
  119. Notes
  120. -----
  121. There are several edge cases that are handled without generating a
  122. warning:
  123. * If both the mean and the standard deviation are zero, ``nan``
  124. is returned.
  125. * If the mean is zero and the standard deviation is nonzero, ``inf``
  126. is returned.
  127. * If the input has length zero (either because the array has zero
  128. length, or all the input values are ``nan`` and ``nan_policy`` is
  129. ``'omit'``), ``nan`` is returned.
  130. * If the input contains ``inf``, ``nan`` is returned.
  131. References
  132. ----------
  133. .. [1] Zwillinger, D. and Kokoska, S. (2000). CRC Standard
  134. Probability and Statistics Tables and Formulae. Chapman & Hall: New
  135. York. 2000.
  136. Examples
  137. --------
  138. >>> import numpy as np
  139. >>> from scipy.stats import variation
  140. >>> variation([1, 2, 3, 4, 5], ddof=1)
  141. 0.5270462766947299
  142. Compute the variation along a given dimension of an array that contains
  143. a few ``nan`` values:
  144. >>> x = np.array([[ 10.0, np.nan, 11.0, 19.0, 23.0, 29.0, 98.0],
  145. ... [ 29.0, 30.0, 32.0, 33.0, 35.0, 56.0, 57.0],
  146. ... [np.nan, np.nan, 12.0, 13.0, 16.0, 16.0, 17.0]])
  147. >>> variation(x, axis=1, ddof=1, nan_policy='omit')
  148. array([1.05109361, 0.31428986, 0.146483 ])
  149. """
  150. a, axis = _chk_asarray(a, axis)
  151. axis = normalize_axis_index(axis, ndim=a.ndim)
  152. n = a.shape[axis]
  153. contains_nan, nan_policy = _contains_nan(a, nan_policy)
  154. if contains_nan and nan_policy == 'omit':
  155. return _nanvariation(a, axis=axis, ddof=ddof, keepdims=keepdims)
  156. if a.size == 0 or ddof > n:
  157. # Handle as a special case to avoid spurious warnings.
  158. # The return values, if any, are all nan.
  159. shp = list(a.shape)
  160. if keepdims:
  161. shp[axis] = 1
  162. else:
  163. del shp[axis]
  164. if len(shp) == 0:
  165. result = np.nan
  166. else:
  167. result = np.full(shp, fill_value=np.nan)
  168. return result
  169. mean_a = a.mean(axis, keepdims=True)
  170. if ddof == n:
  171. # Another special case. Result is either inf or nan.
  172. std_a = a.std(axis=axis, ddof=0, keepdims=True)
  173. result = np.full_like(std_a, fill_value=np.nan)
  174. result.flat[std_a.flat > 0] = (np.sign(mean_a) * np.inf).flat
  175. if result.shape == ():
  176. result = result[()]
  177. return result
  178. with np.errstate(divide='ignore', invalid='ignore'):
  179. std_a = a.std(axis, ddof=ddof, keepdims=True)
  180. result = std_a / mean_a
  181. if not keepdims:
  182. result = np.squeeze(result, axis=axis)
  183. if result.shape == ():
  184. result = result[()]
  185. return result