  1. """
  2. An extension of scipy.stats._stats_py to support masked arrays
  3. """
  4. # Original author (2007): Pierre GF Gerard-Marchant
  5. __all__ = ['argstoarray',
  6. 'count_tied_groups',
  7. 'describe',
  8. 'f_oneway', 'find_repeats','friedmanchisquare',
  9. 'kendalltau','kendalltau_seasonal','kruskal','kruskalwallis',
  10. 'ks_twosamp', 'ks_2samp', 'kurtosis', 'kurtosistest',
  11. 'ks_1samp', 'kstest',
  12. 'linregress',
  13. 'mannwhitneyu', 'meppf','mode','moment','mquantiles','msign',
  14. 'normaltest',
  15. 'obrientransform',
  16. 'pearsonr','plotting_positions','pointbiserialr',
  17. 'rankdata',
  18. 'scoreatpercentile','sem',
  19. 'sen_seasonal_slopes','skew','skewtest','spearmanr',
  20. 'siegelslopes', 'theilslopes',
  21. 'tmax','tmean','tmin','trim','trimboth',
  22. 'trimtail','trima','trimr','trimmed_mean','trimmed_std',
  23. 'trimmed_stde','trimmed_var','tsem','ttest_1samp','ttest_onesamp',
  24. 'ttest_ind','ttest_rel','tvar',
  25. 'variation',
  26. 'winsorize',
  27. 'brunnermunzel',
  28. ]
  29. import numpy as np
  30. from numpy import ndarray
  31. import numpy.ma as ma
  32. from numpy.ma import masked, nomask
  33. import math
  34. import itertools
  35. import warnings
  36. from collections import namedtuple
  37. from . import distributions
  38. from scipy._lib._util import _rename_parameter, _contains_nan
  39. from scipy._lib._bunch import _make_tuple_bunch
  40. import scipy.special as special
  41. import scipy.stats._stats_py
  42. from ._stats_mstats_common import (
  43. _find_repeats,
  44. linregress as stats_linregress,
  45. LinregressResult as stats_LinregressResult,
  46. theilslopes as stats_theilslopes,
  47. siegelslopes as stats_siegelslopes
  48. )
def _chk_asarray(a, axis):
    # Always returns a masked array, raveled for axis=None
    a = ma.asanyarray(a)
    if axis is None:
        a = ma.ravel(a)
        outaxis = 0
    else:
        outaxis = axis
    return a, outaxis


def _chk2_asarray(a, b, axis):
    a = ma.asanyarray(a)
    b = ma.asanyarray(b)
    if axis is None:
        a = ma.ravel(a)
        b = ma.ravel(b)
        outaxis = 0
    else:
        outaxis = axis
    return a, b, outaxis


def _chk_size(a, b):
    a = ma.asanyarray(a)
    b = ma.asanyarray(b)
    (na, nb) = (a.size, b.size)
    if na != nb:
        raise ValueError("The size of the input array should match!"
                         " (%s <> %s)" % (na, nb))
    return (a, b, na)
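
# Illustrative sketch (not part of the original module): the private `_chk*`
# helpers normalize arbitrary input into masked arrays and resolve
# ``axis=None`` to a raveled array operated on along axis 0:
#
#     >>> a, axis = _chk_asarray([[1, 2], [3, 4]], axis=None)
#     >>> a.shape, axis
#     ((4,), 0)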

def argstoarray(*args):
    """
    Constructs a 2D array from a group of sequences.

    Sequences are filled with missing values to match the length of the
    longest sequence.

    Parameters
    ----------
    *args : sequences
        Group of sequences.

    Returns
    -------
    argstoarray : MaskedArray
        A ( `m` x `n` ) masked array, where `m` is the number of arguments and
        `n` the length of the longest argument.

    Notes
    -----
    `numpy.ma.row_stack` has identical behavior, but is called with a sequence
    of sequences.

    Examples
    --------
    A 2D masked array constructed from a group of sequences is returned.

    >>> from scipy.stats.mstats import argstoarray
    >>> argstoarray([1, 2, 3], [4, 5, 6])
    masked_array(
     data=[[1.0, 2.0, 3.0],
           [4.0, 5.0, 6.0]],
     mask=[[False, False, False],
           [False, False, False]],
     fill_value=1e+20)

    The returned masked array is filled with missing values when the lengths
    of the sequences differ.

    >>> argstoarray([1, 3], [4, 5, 6])
    masked_array(
     data=[[1.0, 3.0, --],
           [4.0, 5.0, 6.0]],
     mask=[[False, False, True],
           [False, False, False]],
     fill_value=1e+20)

    """
    if len(args) == 1 and not isinstance(args[0], ndarray):
        output = ma.asarray(args[0])
        if output.ndim != 2:
            raise ValueError("The input should be 2D")
    else:
        n = len(args)
        m = max([len(k) for k in args])
        output = ma.array(np.empty((n,m), dtype=float), mask=True)
        for (k,v) in enumerate(args):
            output[k,:len(v)] = v

    output[np.logical_not(np.isfinite(output._data))] = masked
    return output

def find_repeats(arr):
    """Find repeats in arr and return a tuple (repeats, repeat_count).

    The input is cast to float64. Masked values are discarded.

    Parameters
    ----------
    arr : sequence
        Input array. The array is flattened if it is not 1D.

    Returns
    -------
    repeats : ndarray
        Array of repeated values.
    counts : ndarray
        Array of counts.

    Examples
    --------
    >>> from scipy.stats import mstats
    >>> mstats.find_repeats([2, 1, 2, 3, 2, 2, 5])
    (array([2.]), array([4]))

    In the above example, 2 repeats 4 times.

    >>> mstats.find_repeats([[10, 20, 1, 2], [5, 5, 4, 4]])
    (array([4., 5.]), array([2, 2]))

    In the above example, both 4 and 5 repeat 2 times.

    """
    # Make sure we get a copy. ma.compressed promises a "new array", but can
    # actually return a reference.
    compr = np.asarray(ma.compressed(arr), dtype=np.float64)
    try:
        need_copy = np.may_share_memory(compr, arr)
    except AttributeError:
        # numpy < 1.8.2 bug: np.may_share_memory([], []) raises,
        # while in numpy 1.8.2 and above it just (correctly) returns False.
        need_copy = False
    if need_copy:
        compr = compr.copy()
    return _find_repeats(compr)

def count_tied_groups(x, use_missing=False):
    """
    Counts the number of tied values.

    Parameters
    ----------
    x : sequence
        Sequence of data on which to count the ties.
    use_missing : bool, optional
        Whether to consider missing values as tied.

    Returns
    -------
    count_tied_groups : dict
        Returns a dictionary (nb of ties: nb of groups).

    Examples
    --------
    >>> from scipy.stats import mstats
    >>> import numpy as np
    >>> z = [0, 0, 0, 2, 2, 2, 3, 3, 4, 5, 6]
    >>> mstats.count_tied_groups(z)
    {2: 1, 3: 2}

    In the above example, the ties were 0 (3x), 2 (3x) and 3 (2x).

    >>> z = np.ma.array([0, 0, 1, 2, 2, 2, 3, 3, 4, 5, 6])
    >>> mstats.count_tied_groups(z)
    {2: 2, 3: 1}
    >>> z[[1,-1]] = np.ma.masked
    >>> mstats.count_tied_groups(z, use_missing=True)
    {2: 2, 3: 1}

    """
    nmasked = ma.getmask(x).sum()
    # We need the copy as find_repeats will overwrite the initial data
    data = ma.compressed(x).copy()
    (ties, counts) = find_repeats(data)
    nties = {}
    if len(ties):
        nties = dict(zip(np.unique(counts), itertools.repeat(1)))
        nties.update(dict(zip(*find_repeats(counts))))

    if nmasked and use_missing:
        try:
            nties[nmasked] += 1
        except KeyError:
            nties[nmasked] = 1

    return nties

def rankdata(data, axis=None, use_missing=False):
    """Returns the rank (also known as order statistics) of each data point
    along the given axis.

    If some values are tied, their rank is averaged.
    If some values are masked, their rank is set to 0 if use_missing is False,
    or set to the average rank of the unmasked values if use_missing is True.

    Parameters
    ----------
    data : sequence
        Input data. The data is transformed to a masked array
    axis : {None,int}, optional
        Axis along which to perform the ranking.
        If None, the array is first flattened. An exception is raised if
        the axis is specified for arrays with a dimension larger than 2
    use_missing : bool, optional
        Whether the masked values have a rank of 0 (False) or equal to the
        average rank of the unmasked values (True).

    """
    def _rank1d(data, use_missing=False):
        n = data.count()
        rk = np.empty(data.size, dtype=float)
        idx = data.argsort()
        rk[idx[:n]] = np.arange(1,n+1)

        if use_missing:
            rk[idx[n:]] = (n+1)/2.
        else:
            rk[idx[n:]] = 0

        repeats = find_repeats(data.copy())
        for r in repeats[0]:
            condition = (data == r).filled(False)
            rk[condition] = rk[condition].mean()
        return rk

    data = ma.array(data, copy=False)
    if axis is None:
        if data.ndim > 1:
            return _rank1d(data.ravel(), use_missing).reshape(data.shape)
        else:
            return _rank1d(data, use_missing)
    else:
        return ma.apply_along_axis(_rank1d,axis,data,use_missing).view(ndarray)
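
# Illustrative sketch (not part of the original module): how masked entries
# are ranked depending on `use_missing` (ties get the average rank):
#
#     >>> import numpy.ma as ma
#     >>> z = ma.array([10, 20, 20, 30], mask=[False, False, False, True])
#     >>> rankdata(z)                    # masked value gets rank 0
#     array([1. , 2.5, 2.5, 0. ])
#     >>> rankdata(z, use_missing=True)  # masked value gets (n+1)/2 = 2.0
#     array([1. , 2.5, 2.5, 2. ])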

ModeResult = namedtuple('ModeResult', ('mode', 'count'))


def mode(a, axis=0):
    """
    Returns an array of the modal (most common) value in the passed array.

    Parameters
    ----------
    a : array_like
        n-dimensional array of which to find mode(s).
    axis : int or None, optional
        Axis along which to operate. Default is 0. If None, compute over
        the whole array `a`.

    Returns
    -------
    mode : ndarray
        Array of modal values.
    count : ndarray
        Array of counts for each mode.

    Notes
    -----
    For more details, see `scipy.stats.mode`.

    Examples
    --------
    >>> import numpy as np
    >>> from scipy import stats
    >>> from scipy.stats import mstats
    >>> m_arr = np.ma.array([1, 1, 0, 0, 0, 0], mask=[0, 0, 1, 1, 1, 0])
    >>> mstats.mode(m_arr)  # note that most zeros are masked
    ModeResult(mode=array([1.]), count=array([2.]))

    """
    return _mode(a, axis=axis, keepdims=True)


def _mode(a, axis=0, keepdims=True):
    # Don't want to expose `keepdims` from the public `mstats.mode`
    a, axis = _chk_asarray(a, axis)

    def _mode1D(a):
        (rep,cnt) = find_repeats(a)
        if not cnt.ndim:
            return (0, 0)
        elif cnt.size:
            return (rep[cnt.argmax()], cnt.max())
        else:
            return (a.min(), 1)

    if axis is None:
        output = _mode1D(ma.ravel(a))
        output = (ma.array(output[0]), ma.array(output[1]))
    else:
        output = ma.apply_along_axis(_mode1D, axis, a)
        if keepdims is None or keepdims:
            newshape = list(a.shape)
            newshape[axis] = 1
            slices = [slice(None)] * output.ndim
            slices[axis] = 0
            modes = output[tuple(slices)].reshape(newshape)
            slices[axis] = 1
            counts = output[tuple(slices)].reshape(newshape)
            output = (modes, counts)
        else:
            output = np.moveaxis(output, axis, 0)

    return ModeResult(*output)

def _betai(a, b, x):
    x = np.asanyarray(x)
    x = ma.where(x < 1.0, x, 1.0)  # if x > 1 then return 1.0
    return special.betainc(a, b, x)


def msign(x):
    """Returns the sign of x, or 0 if x is masked."""
    return ma.filled(np.sign(x), 0)
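
# Illustrative sketch (not part of the original module): `msign` behaves like
# `np.sign` but maps masked entries to 0, so sums over masked data stay finite:
#
#     >>> import numpy.ma as ma
#     >>> msign(ma.array([-3, 0, 5, 7], mask=[False, False, False, True]))
#     array([-1,  0,  1,  0])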

def pearsonr(x, y):
    r"""
    Pearson correlation coefficient and p-value for testing non-correlation.

    The Pearson correlation coefficient [1]_ measures the linear relationship
    between two datasets. The calculation of the p-value relies on the
    assumption that each dataset is normally distributed. (See Kowalski [3]_
    for a discussion of the effects of non-normality of the input on the
    distribution of the correlation coefficient.) Like other correlation
    coefficients, this one varies between -1 and +1 with 0 implying no
    correlation. Correlations of -1 or +1 imply an exact linear relationship.

    Parameters
    ----------
    x : (N,) array_like
        Input array.
    y : (N,) array_like
        Input array.

    Returns
    -------
    r : float
        Pearson's correlation coefficient.
    p-value : float
        Two-tailed p-value.

    Warns
    -----
    PearsonRConstantInputWarning
        Raised if an input is a constant array. The correlation coefficient
        is not defined in this case, so ``np.nan`` is returned.
    PearsonRNearConstantInputWarning
        Raised if an input is "nearly" constant. The array ``x`` is considered
        nearly constant if ``norm(x - mean(x)) < 1e-13 * abs(mean(x))``.
        Numerical errors in the calculation ``x - mean(x)`` in this case might
        result in an inaccurate calculation of r.

    See Also
    --------
    spearmanr : Spearman rank-order correlation coefficient.
    kendalltau : Kendall's tau, a correlation measure for ordinal data.

    Notes
    -----
    The correlation coefficient is calculated as follows:

    .. math::

        r = \frac{\sum (x - m_x) (y - m_y)}
                 {\sqrt{\sum (x - m_x)^2 \sum (y - m_y)^2}}

    where :math:`m_x` is the mean of the vector x and :math:`m_y` is
    the mean of the vector y.

    Under the assumption that x and y are drawn from
    independent normal distributions (so the population correlation coefficient
    is 0), the probability density function of the sample correlation
    coefficient r is ([1]_, [2]_):

    .. math::

        f(r) = \frac{{(1-r^2)}^{n/2-2}}{\mathrm{B}(\frac{1}{2},\frac{n}{2}-1)}

    where n is the number of samples, and B is the beta function. This
    is sometimes referred to as the exact distribution of r. This is
    the distribution that is used in `pearsonr` to compute the p-value.
    The distribution is a beta distribution on the interval [-1, 1],
    with equal shape parameters a = b = n/2 - 1. In terms of SciPy's
    implementation of the beta distribution, the distribution of r is::

        dist = scipy.stats.beta(n/2 - 1, n/2 - 1, loc=-1, scale=2)

    The p-value returned by `pearsonr` is a two-sided p-value. The p-value
    roughly indicates the probability of an uncorrelated system
    producing datasets that have a Pearson correlation at least as extreme
    as the one computed from these datasets. More precisely, for a
    given sample with correlation coefficient r, the p-value is
    the probability that abs(r') of a random sample x' and y' drawn from
    the population with zero correlation would be greater than or equal
    to abs(r). In terms of the object ``dist`` shown above, the p-value
    for a given r and length n can be computed as::

        p = 2*dist.cdf(-abs(r))

    When n is 2, the above continuous distribution is not well-defined.
    One can interpret the limit of the beta distribution as the shape
    parameters a and b approach a = b = 0 as a discrete distribution with
    equal probability masses at r = 1 and r = -1. More directly, one
    can observe that, given the data x = [x1, x2] and y = [y1, y2], and
    assuming x1 != x2 and y1 != y2, the only possible values for r are 1
    and -1. Because abs(r') for any sample x' and y' with length 2 will
    be 1, the two-sided p-value for a sample of length 2 is always 1.

    References
    ----------
    .. [1] "Pearson correlation coefficient", Wikipedia,
           https://en.wikipedia.org/wiki/Pearson_correlation_coefficient
    .. [2] Student, "Probable error of a correlation coefficient",
           Biometrika, Volume 6, Issue 2-3, 1 September 1908, pp. 302-310.
    .. [3] C. J. Kowalski, "On the Effects of Non-Normality on the Distribution
           of the Sample Product-Moment Correlation Coefficient"
           Journal of the Royal Statistical Society. Series C (Applied
           Statistics), Vol. 21, No. 1 (1972), pp. 1-12.

    Examples
    --------
    >>> import numpy as np
    >>> from scipy import stats
    >>> from scipy.stats import mstats
    >>> mstats.pearsonr([1, 2, 3, 4, 5], [10, 9, 2.5, 6, 4])
    (-0.7426106572325057, 0.1505558088534455)

    There is a linear dependence between x and y if y = a + b*x + e, where
    a,b are constants and e is a random error term, assumed to be independent
    of x. For simplicity, assume that x is standard normal, a=0, b=1 and let
    e follow a normal distribution with mean zero and standard deviation s>0.

    >>> s = 0.5
    >>> x = stats.norm.rvs(size=500)
    >>> e = stats.norm.rvs(scale=s, size=500)
    >>> y = x + e
    >>> mstats.pearsonr(x, y)
    (0.9029601878969703, 8.428978827629898e-185) # may vary

    This should be close to the exact value given by

    >>> 1/np.sqrt(1 + s**2)
    0.8944271909999159

    For s=0.5, we observe a high level of correlation. In general, a large
    variance of the noise reduces the correlation, while the correlation
    approaches one as the variance of the error goes to zero.

    It is important to keep in mind that no correlation does not imply
    independence unless (x, y) is jointly normal. Correlation can even be zero
    when there is a very simple dependence structure: if X follows a
    standard normal distribution, let y = abs(x). Note that the correlation
    between x and y is zero. Indeed, since the expectation of x is zero,
    cov(x, y) = E[x*y]. By definition, this equals E[x*abs(x)] which is zero
    by symmetry. The following lines of code illustrate this observation:

    >>> y = np.abs(x)
    >>> mstats.pearsonr(x, y)
    (-0.016172891856853524, 0.7182823678751942) # may vary

    A non-zero correlation coefficient can be misleading. For example, if X has
    a standard normal distribution, define y = x if x < 0 and y = 0 otherwise.
    A simple calculation shows that corr(x, y) = sqrt(2/Pi) = 0.797...,
    implying a high level of correlation:

    >>> y = np.where(x < 0, x, 0)
    >>> mstats.pearsonr(x, y)
    (0.8537091583771509, 3.183461621422181e-143) # may vary

    This is unintuitive since there is no dependence of x and y if x is larger
    than zero which happens in about half of the cases if we sample x and y.

    """
    (x, y, n) = _chk_size(x, y)
    (x, y) = (x.ravel(), y.ravel())
    # Get the common mask and the total nb of unmasked elements
    m = ma.mask_or(ma.getmask(x), ma.getmask(y))
    n -= m.sum()
    df = n-2
    if df < 0:
        return (masked, masked)

    return scipy.stats._stats_py.pearsonr(ma.masked_array(x, mask=m).compressed(),
                                          ma.masked_array(y, mask=m).compressed())
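
# Illustrative sketch (not part of the original module): missing values are
# handled pairwise, so a masked entry in `x` drops the whole pair from the
# calculation before delegating to the unmasked `pearsonr`:
#
#     >>> import numpy.ma as ma
#     >>> x = ma.array([1, 2, 3, 4, 5], mask=[0, 0, 0, 0, 1])
#     >>> y = [10, 9, 2.5, 6, 4]
#     >>> r, p = pearsonr(x, y)  # uses only the four unmasked pairs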

def spearmanr(x, y=None, use_ties=True, axis=None, nan_policy='propagate',
              alternative='two-sided'):
    """
    Calculates a Spearman rank-order correlation coefficient and the p-value
    to test for non-correlation.

    The Spearman correlation is a nonparametric measure of the linear
    relationship between two datasets. Unlike the Pearson correlation, the
    Spearman correlation does not assume that both datasets are normally
    distributed. Like other correlation coefficients, this one varies
    between -1 and +1 with 0 implying no correlation. Correlations of -1 or
    +1 imply a monotonic relationship. Positive correlations imply that
    as `x` increases, so does `y`. Negative correlations imply that as `x`
    increases, `y` decreases.

    Missing values are discarded pair-wise: if a value is missing in `x`, the
    corresponding value in `y` is masked.

    The p-value roughly indicates the probability of an uncorrelated system
    producing datasets that have a Spearman correlation at least as extreme
    as the one computed from these datasets. The p-values are not entirely
    reliable but are probably reasonable for datasets larger than 500 or so.

    Parameters
    ----------
    x, y : 1D or 2D array_like, y is optional
        One or two 1-D or 2-D arrays containing multiple variables and
        observations. When these are 1-D, each represents a vector of
        observations of a single variable. For the behavior in the 2-D case,
        see under ``axis``, below.
    use_ties : bool, optional
        DO NOT USE. Does not do anything, keyword is only left in place for
        backwards compatibility reasons.
    axis : int or None, optional
        If axis=0 (default), then each column represents a variable, with
        observations in the rows. If axis=1, the relationship is transposed:
        each row represents a variable, while the columns contain observations.
        If axis=None, then both arrays will be raveled.
    nan_policy : {'propagate', 'raise', 'omit'}, optional
        Defines how to handle when input contains nan. 'propagate' returns nan,
        'raise' throws an error, 'omit' performs the calculations ignoring nan
        values. Default is 'propagate'.
    alternative : {'two-sided', 'less', 'greater'}, optional
        Defines the alternative hypothesis. Default is 'two-sided'.
        The following options are available:

        * 'two-sided': the correlation is nonzero
        * 'less': the correlation is negative (less than zero)
        * 'greater': the correlation is positive (greater than zero)

        .. versionadded:: 1.7.0

    Returns
    -------
    res : SignificanceResult
        An object containing attributes:

        statistic : float or ndarray (2-D square)
            Spearman correlation matrix or correlation coefficient (if only 2
            variables are given as parameters). Correlation matrix is square
            with length equal to total number of variables (columns or rows) in
            ``a`` and ``b`` combined.
        pvalue : float
            The p-value for a hypothesis test whose null hypothesis
            is that two sets of data are linearly uncorrelated. See
            `alternative` above for alternative hypotheses. `pvalue` has the
            same shape as `statistic`.

    References
    ----------
    [CRCProbStat2000] section 14.7

    """
    if not use_ties:
        raise ValueError("`use_ties=False` is not supported in SciPy >= 1.2.0")

    # Always returns a masked array, raveled if axis=None
    x, axisout = _chk_asarray(x, axis)
    if y is not None:
        # Deal only with 2-D `x` case.
        y, _ = _chk_asarray(y, axis)
        if axisout == 0:
            x = ma.column_stack((x, y))
        else:
            x = ma.row_stack((x, y))

    if axisout == 1:
        # To simplify the code that follows (always use `n_obs, n_vars` shape)
        x = x.T

    if nan_policy == 'omit':
        x = ma.masked_invalid(x)

    def _spearmanr_2cols(x):
        # Mask the same observations for all variables, and then drop those
        # observations (can't leave them masked, rankdata is weird).
        x = ma.mask_rowcols(x, axis=0)
        x = x[~x.mask.any(axis=1), :]

        # If either column is entirely NaN or Inf
        if not np.any(x.data):
            res = scipy.stats._stats_py.SignificanceResult(np.nan, np.nan)
            res.correlation = np.nan
            return res

        m = ma.getmask(x)
        n_obs = x.shape[0]
        dof = n_obs - 2 - int(m.sum(axis=0)[0])
        if dof < 0:
            raise ValueError("The input must have at least 3 entries!")

        # Gets the ranks and rank differences
        x_ranked = rankdata(x, axis=0)
        rs = ma.corrcoef(x_ranked, rowvar=False).data

        # rs can have elements equal to 1, so avoid zero division warnings
        with np.errstate(divide='ignore'):
            # clip the small negative values possibly caused by rounding
            # errors before taking the square root
            t = rs * np.sqrt((dof / ((rs+1.0) * (1.0-rs))).clip(0))

        t, prob = scipy.stats._stats_py._ttest_finish(dof, t, alternative)

        # For backwards compatibility, return scalars when comparing 2 columns
        if rs.shape == (2, 2):
            res = scipy.stats._stats_py.SignificanceResult(rs[1, 0],
                                                           prob[1, 0])
            res.correlation = rs[1, 0]
            return res
        else:
            res = scipy.stats._stats_py.SignificanceResult(rs, prob)
            res.correlation = rs
            return res

    # Need to do this per pair of variables, otherwise the dropped observations
    # in a third column mess up the result for a pair.
    n_vars = x.shape[1]
    if n_vars == 2:
        return _spearmanr_2cols(x)
    else:
        rs = np.ones((n_vars, n_vars), dtype=float)
        prob = np.zeros((n_vars, n_vars), dtype=float)
        for var1 in range(n_vars - 1):
            for var2 in range(var1+1, n_vars):
                result = _spearmanr_2cols(x[:, [var1, var2]])
                rs[var1, var2] = result.correlation
                rs[var2, var1] = result.correlation
                prob[var1, var2] = result.pvalue
                prob[var2, var1] = result.pvalue

        res = scipy.stats._stats_py.SignificanceResult(rs, prob)
        res.correlation = rs
        return res
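
# Illustrative sketch (not part of the original module): with two 1-D inputs
# the result carries scalar attributes; ties in `y` are rank-averaged:
#
#     >>> res = spearmanr([1, 2, 3, 4, 5], [5, 6, 7, 8, 7])
#     >>> round(float(res.correlation), 3)
#     0.821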

def _kendall_p_exact(n, c, alternative='two-sided'):
    # Use the fact that distribution is symmetric: always calculate a CDF in
    # the left tail.
    # This will be the one-sided p-value if `c` is on the side of
    # the null distribution predicted by the alternative hypothesis.
    # The two-sided p-value will be twice this value.
    # If `c` is on the other side of the null distribution, we'll need to
    # take the complement and add back the probability mass at `c`.
    in_right_tail = (c >= (n*(n-1))//2 - c)
    alternative_greater = (alternative == 'greater')
    c = int(min(c, (n*(n-1))//2 - c))

    # Exact p-value, see Maurice G. Kendall, "Rank Correlation Methods"
    # (4th Edition), Charles Griffin & Co., 1970.
    if n <= 0:
        raise ValueError(f'n ({n}) must be positive')
    elif c < 0 or 4*c > n*(n-1):
        raise ValueError(f'c ({c}) must satisfy 0 <= 4c <= n(n-1) = {n*(n-1)}.')
    elif n == 1:
        prob = 1.0
        p_mass_at_c = 1
    elif n == 2:
        prob = 1.0
        p_mass_at_c = 0.5
    elif c == 0:
        prob = 2.0/math.factorial(n) if n < 171 else 0.0
        p_mass_at_c = prob/2
    elif c == 1:
        prob = 2.0/math.factorial(n-1) if n < 172 else 0.0
        p_mass_at_c = (n-1)/math.factorial(n)
    elif 4*c == n*(n-1) and alternative == 'two-sided':
        # I'm sure there's a simple formula for p_mass_at_c in this
        # case, but I don't know it. Use generic formula for one-sided p-value.
        prob = 1.0
    elif n < 171:
        new = np.zeros(c+1)
        new[0:2] = 1.0
        for j in range(3,n+1):
            new = np.cumsum(new)
            if j <= c:
                new[j:] -= new[:c+1-j]
        prob = 2.0*np.sum(new)/math.factorial(n)
        p_mass_at_c = new[-1]/math.factorial(n)
    else:
        new = np.zeros(c+1)
        new[0:2] = 1.0
        for j in range(3, n+1):
            new = np.cumsum(new)/j
            if j <= c:
                new[j:] -= new[:c+1-j]
        prob = np.sum(new)
        p_mass_at_c = new[-1]/2

    if alternative != 'two-sided':
        # If the observed statistic lies in the tail predicted by the
        # alternative hypothesis, the one-sided p-value is half the
        # two-sided p-value.
        if in_right_tail == alternative_greater:
            prob /= 2
        else:
            prob = 1 - prob/2 + p_mass_at_c

    prob = np.clip(prob, 0, 1)

    return prob

def kendalltau(x, y, use_ties=True, use_missing=False, method='auto',
               alternative='two-sided'):
    """
    Computes Kendall's rank correlation tau on two variables *x* and *y*.

    Parameters
    ----------
    x : sequence
        First data list (for example, time).
    y : sequence
        Second data list.
    use_ties : {True, False}, optional
        Whether ties correction should be performed.
    use_missing : {False, True}, optional
        Whether missing data should be allocated a rank of 0 (False) or the
        average rank (True)
    method : {'auto', 'asymptotic', 'exact'}, optional
        Defines which method is used to calculate the p-value [1]_.
        'asymptotic' uses a normal approximation valid for large samples.
        'exact' computes the exact p-value, but can only be used if no ties
        are present. As the sample size increases, the 'exact' computation
        time may grow and the result may lose some precision.
        'auto' is the default and selects the appropriate
        method based on a trade-off between speed and accuracy.
    alternative : {'two-sided', 'less', 'greater'}, optional
        Defines the alternative hypothesis. Default is 'two-sided'.
        The following options are available:

        * 'two-sided': the rank correlation is nonzero
        * 'less': the rank correlation is negative (less than zero)
        * 'greater': the rank correlation is positive (greater than zero)

    Returns
    -------
    res : SignificanceResult
        An object containing attributes:

        statistic : float
           The tau statistic.
        pvalue : float
           The p-value for a hypothesis test whose null hypothesis is
           an absence of association, tau = 0.

    References
    ----------
    .. [1] Maurice G. Kendall, "Rank Correlation Methods" (4th Edition),
           Charles Griffin & Co., 1970.

    """
    (x, y, n) = _chk_size(x, y)
    (x, y) = (x.flatten(), y.flatten())
    m = ma.mask_or(ma.getmask(x), ma.getmask(y))
    if m is not nomask:
        x = ma.array(x, mask=m, copy=True)
        y = ma.array(y, mask=m, copy=True)
        # need int() here, otherwise numpy defaults to 32 bit
        # integer on all Windows architectures, causing overflow.
        # int() will keep it infinite precision.
        n -= int(m.sum())

    if n < 2:
        res = scipy.stats._stats_py.SignificanceResult(np.nan, np.nan)
        res.correlation = np.nan
        return res

    rx = ma.masked_equal(rankdata(x, use_missing=use_missing), 0)
    ry = ma.masked_equal(rankdata(y, use_missing=use_missing), 0)
    idx = rx.argsort()
    (rx, ry) = (rx[idx], ry[idx])
    C = np.sum([((ry[i+1:] > ry[i]) * (rx[i+1:] > rx[i])).filled(0).sum()
                for i in range(len(ry)-1)], dtype=float)
    D = np.sum([((ry[i+1:] < ry[i])*(rx[i+1:] > rx[i])).filled(0).sum()
                for i in range(len(ry)-1)], dtype=float)

    xties = count_tied_groups(x)
    yties = count_tied_groups(y)
    if use_ties:
        corr_x = np.sum([v*k*(k-1) for (k,v) in xties.items()], dtype=float)
        corr_y = np.sum([v*k*(k-1) for (k,v) in yties.items()], dtype=float)
        denom = ma.sqrt((n*(n-1)-corr_x)/2. * (n*(n-1)-corr_y)/2.)
    else:
        denom = n*(n-1)/2.
    tau = (C-D) / denom

    if method == 'exact' and (xties or yties):
        raise ValueError("Ties found, exact method cannot be used.")

    if method == 'auto':
        if (not xties and not yties) and (n <= 33 or min(C, n*(n-1)/2.0-C) <= 1):
            method = 'exact'
        else:
            method = 'asymptotic'

    if not xties and not yties and method == 'exact':
        prob = _kendall_p_exact(n, C, alternative)

    elif method == 'asymptotic':
        var_s = n*(n-1)*(2*n+5)
        if use_ties:
            var_s -= np.sum([v*k*(k-1)*(2*k+5)*1. for (k,v) in xties.items()])
            var_s -= np.sum([v*k*(k-1)*(2*k+5)*1. for (k,v) in yties.items()])
            v1 = np.sum([v*k*(k-1) for (k, v) in xties.items()], dtype=float) *\
                 np.sum([v*k*(k-1) for (k, v) in yties.items()], dtype=float)
            v1 /= 2.*n*(n-1)
            if n > 2:
                v2 = np.sum([v*k*(k-1)*(k-2) for (k,v) in xties.items()],
                            dtype=float) * \
                     np.sum([v*k*(k-1)*(k-2) for (k,v) in yties.items()],
                            dtype=float)
                v2 /= 9.*n*(n-1)*(n-2)
            else:
                v2 = 0
        else:
            v1 = v2 = 0

        var_s /= 18.
        var_s += (v1 + v2)
        z = (C-D)/np.sqrt(var_s)
        _, prob = scipy.stats._stats_py._normtest_finish(z, alternative)
    else:
        raise ValueError("Unknown method "+str(method)+" specified, please "
                         "use auto, exact or asymptotic.")

    res = scipy.stats._stats_py.SignificanceResult(tau, prob)
    res.correlation = tau
    return res
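
# Illustrative sketch (not part of the original module): tau is
# (C - D) / denom, the normalized excess of concordant over discordant pairs.
# For these rankings there are C = 7 concordant and D = 3 discordant pairs
# out of 10, so tau = 4/10:
#
#     >>> res = kendalltau([1, 2, 3, 4, 5], [3, 1, 2, 5, 4])
#     >>> round(float(res.correlation), 1)
#     0.4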

def kendalltau_seasonal(x):
    """
    Computes a multivariate Kendall's rank correlation tau, for seasonal data.

    Parameters
    ----------
    x : 2-D ndarray
        Array of seasonal data, with seasons in columns.

    """
    x = ma.array(x, subok=True, copy=False, ndmin=2)
    (n,m) = x.shape
    n_p = x.count(0)

    S_szn = sum(msign(x[i:]-x[i]).sum(0) for i in range(n))
    S_tot = S_szn.sum()

    n_tot = x.count()
    ties = count_tied_groups(x.compressed())
    corr_ties = sum(v*k*(k-1) for (k,v) in ties.items())
    denom_tot = ma.sqrt(1.*n_tot*(n_tot-1)*(n_tot*(n_tot-1)-corr_ties))/2.

    R = rankdata(x, axis=0, use_missing=True)
    K = ma.empty((m,m), dtype=int)
    covmat = ma.empty((m,m), dtype=float)
    denom_szn = ma.empty(m, dtype=float)
    for j in range(m):
        ties_j = count_tied_groups(x[:,j].compressed())
        corr_j = sum(v*k*(k-1) for (k,v) in ties_j.items())
        cmb = n_p[j]*(n_p[j]-1)
        for k in range(j,m,1):
            K[j,k] = sum(msign((x[i:,j]-x[i,j])*(x[i:,k]-x[i,k])).sum()
                         for i in range(n))
            covmat[j,k] = (K[j,k] + 4*(R[:,j]*R[:,k]).sum() -
                           n*(n_p[j]+1)*(n_p[k]+1))/3.
            K[k,j] = K[j,k]
            covmat[k,j] = covmat[j,k]

        denom_szn[j] = ma.sqrt(cmb*(cmb-corr_j)) / 2.

    var_szn = covmat.diagonal()

    z_szn = msign(S_szn) * (abs(S_szn)-1) / ma.sqrt(var_szn)
    z_tot_ind = msign(S_tot) * (abs(S_tot)-1) / ma.sqrt(var_szn.sum())
    z_tot_dep = msign(S_tot) * (abs(S_tot)-1) / ma.sqrt(covmat.sum())

    prob_szn = special.erfc(abs(z_szn)/np.sqrt(2))
    prob_tot_ind = special.erfc(abs(z_tot_ind)/np.sqrt(2))
    prob_tot_dep = special.erfc(abs(z_tot_dep)/np.sqrt(2))

    chi2_tot = (z_szn*z_szn).sum()
    chi2_trd = m * z_szn.mean()**2
    output = {'seasonal tau': S_szn/denom_szn,
              'global tau': S_tot/denom_tot,
              'global tau (alt)': S_tot/denom_szn.sum(),
              'seasonal p-value': prob_szn,
              'global p-value (indep)': prob_tot_ind,
              'global p-value (dep)': prob_tot_dep,
              'chi2 total': chi2_tot,
              'chi2 trend': chi2_trd,
              }
    return output
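
# Illustrative sketch (not part of the original module): the returned dict maps
# descriptive keys to per-season and global statistics:
#
#     >>> import numpy as np
#     >>> out = kendalltau_seasonal(np.arange(12.).reshape(4, 3))
#     >>> sorted(out)
#     ['chi2 total', 'chi2 trend', 'global p-value (dep)',
#      'global p-value (indep)', 'global tau', 'global tau (alt)',
#      'seasonal p-value', 'seasonal tau']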

PointbiserialrResult = namedtuple('PointbiserialrResult', ('correlation',
                                                           'pvalue'))


def pointbiserialr(x, y):
    """Calculates a point biserial correlation coefficient and its p-value.

    Parameters
    ----------
    x : array_like of bools
        Input array.
    y : array_like
        Input array.

    Returns
    -------
    correlation : float
        R value
    pvalue : float
        2-tailed p-value

    Notes
    -----
    Missing values are considered pair-wise: if a value is missing in x,
    the corresponding value in y is masked.

    For more details on `pointbiserialr`, see `scipy.stats.pointbiserialr`.

    """
    x = ma.fix_invalid(x, copy=True).astype(bool)
    y = ma.fix_invalid(y, copy=True).astype(float)
    # Get rid of the missing data
    m = ma.mask_or(ma.getmask(x), ma.getmask(y))
    if m is not nomask:
        unmask = np.logical_not(m)
        x = x[unmask]
        y = y[unmask]

    n = len(x)
    # phat is the fraction of x values that are True
    phat = x.sum() / float(n)
    y0 = y[~x]  # y-values where x is False
    y1 = y[x]   # y-values where x is True
    y0m = y0.mean()
    y1m = y1.mean()

    rpb = (y1m - y0m)*np.sqrt(phat * (1-phat)) / y.std()

    df = n-2
    t = rpb*ma.sqrt(df/(1.0-rpb**2))
    prob = _betai(0.5*df, 0.5, df/(df+t*t))

    return PointbiserialrResult(rpb, prob)
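
# Illustrative sketch (not part of the original module): correlating a boolean
# grouping variable against a continuous response:
#
#     >>> x = [True, True, True, False, False, False]
#     >>> y = [3.0, 2.5, 3.5, 1.0, 0.5, 1.5]
#     >>> res = pointbiserialr(x, y)
#     >>> bool(res.correlation > 0.9)  # the True group has clearly larger y
#     True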

def linregress(x, y=None):
    r"""
    Linear regression calculation

    Note that the non-masked version is used, and that this docstring is
    replaced by the non-masked docstring + some info on missing data.

    """
    if y is None:
        x = ma.array(x)
        if x.shape[0] == 2:
            x, y = x
        elif x.shape[1] == 2:
            x, y = x.T
        else:
            raise ValueError("If only `x` is given as input, "
                             "it has to be of shape (2, N) or (N, 2), "
                             f"provided shape was {x.shape}")
    else:
        x = ma.array(x)
        y = ma.array(y)

    x = x.flatten()
    y = y.flatten()

    if np.amax(x) == np.amin(x) and len(x) > 1:
        raise ValueError("Cannot calculate a linear regression "
                         "if all x values are identical")

    m = ma.mask_or(ma.getmask(x), ma.getmask(y), shrink=False)
    if m is not nomask:
        x = ma.array(x, mask=m)
        y = ma.array(y, mask=m)
        if np.any(~m):
            result = stats_linregress(x.data[~m], y.data[~m])
        else:
            # All data is masked
            result = stats_LinregressResult(slope=None, intercept=None,
                                            rvalue=None, pvalue=None,
                                            stderr=None,
                                            intercept_stderr=None)
    else:
        result = stats_linregress(x.data, y.data)

    return result
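
# Illustrative sketch (not part of the original module): masked observations
# are dropped pairwise before delegating to the unmasked `linregress`:
#
#     >>> import numpy.ma as ma
#     >>> x = ma.array([0., 1., 2., 3.], mask=[0, 0, 0, 1])
#     >>> y = [1., 3., 5., 99.]  # the outlier pairs with the masked x value
#     >>> res = linregress(x, y)
#     >>> (res.slope, res.intercept) == (2.0, 1.0)  # fit on 3 unmasked pairs
#     True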

def theilslopes(y, x=None, alpha=0.95, method='separate'):
    r"""
    Computes the Theil-Sen estimator for a set of points (x, y).

    `theilslopes` implements a method for robust linear regression. It
    computes the slope as the median of all slopes between paired values.

    Parameters
    ----------
    y : array_like
        Dependent variable.
    x : array_like or None, optional
        Independent variable. If None, use ``arange(len(y))`` instead.
    alpha : float, optional
        Confidence degree between 0 and 1. Default is 95% confidence.
        Note that `alpha` is symmetric around 0.5, i.e. both 0.1 and 0.9 are
        interpreted as "find the 90% confidence interval".
    method : {'joint', 'separate'}, optional
        Method to be used for computing estimate for intercept.
        Following methods are supported,

        * 'joint': Uses np.median(y - slope * x) as intercept.
        * 'separate': Uses np.median(y) - slope * np.median(x)
          as intercept.

        The default is 'separate'.

        .. versionadded:: 1.8.0

    Returns
    -------
    result : ``TheilslopesResult`` instance
        The return value is an object with the following attributes:

        slope : float
            Theil slope.
        intercept : float
            Intercept of the Theil line.
        low_slope : float
            Lower bound of the confidence interval on `slope`.
        high_slope : float
            Upper bound of the confidence interval on `slope`.

    See Also
    --------
    siegelslopes : a similar technique using repeated medians

    Notes
    -----
    For more details on `theilslopes`, see `scipy.stats.theilslopes`.

    """
    y = ma.asarray(y).flatten()
    if x is None:
        x = ma.arange(len(y), dtype=float)
    else:
        x = ma.asarray(x).flatten()
        if len(x) != len(y):
            raise ValueError("Incompatible lengths ! (%s<>%s)" %
                             (len(y), len(x)))

    m = ma.mask_or(ma.getmask(x), ma.getmask(y))
    y._mask = x._mask = m
    # Disregard any masked elements of x or y
    y = y.compressed()
    x = x.compressed().astype(float)
    # We now have unmasked arrays so can use `scipy.stats.theilslopes`
    return stats_theilslopes(y, x, alpha=alpha, method=method)
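
# Illustrative sketch (not part of the original module): the Theil-Sen slope
# is the median of all pairwise slopes, so a single outlier barely moves it:
#
#     >>> y = [0., 1., 2., 3., 40.]  # last point is an outlier
#     >>> res = theilslopes(y)       # x defaults to arange(len(y))
#     >>> res.slope == 1.0           # median of the 10 pairwise slopes
#     True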

def siegelslopes(y, x=None, method="hierarchical"):
    r"""
    Computes the Siegel estimator for a set of points (x, y).

    `siegelslopes` implements a method for robust linear regression
    using repeated medians to fit a line to the points (x, y).
    The method is robust to outliers with an asymptotic breakdown point
    of 50%.

    Parameters
    ----------
    y : array_like
        Dependent variable.
    x : array_like or None, optional
        Independent variable. If None, use ``arange(len(y))`` instead.
    method : {'hierarchical', 'separate'}
        If 'hierarchical', estimate the intercept using the estimated
        slope ``slope`` (default option).
        If 'separate', estimate the intercept independent of the estimated
        slope. See Notes for details.

    Returns
    -------
    result : ``SiegelslopesResult`` instance
        The return value is an object with the following attributes:

        slope : float
            Estimate of the slope of the regression line.
        intercept : float
            Estimate of the intercept of the regression line.

    See Also
    --------
    theilslopes : a similar technique without repeated medians

    Notes
    -----
    For more details on `siegelslopes`, see `scipy.stats.siegelslopes`.

    """
    y = ma.asarray(y).ravel()
    if x is None:
        x = ma.arange(len(y), dtype=float)
    else:
        x = ma.asarray(x).ravel()
        if len(x) != len(y):
            raise ValueError("Incompatible lengths ! (%s<>%s)" %
                             (len(y), len(x)))

    m = ma.mask_or(ma.getmask(x), ma.getmask(y))
    y._mask = x._mask = m
    # Disregard any masked elements of x or y
    y = y.compressed()
    x = x.compressed().astype(float)
    # We now have unmasked arrays so can use `scipy.stats.siegelslopes`
    return stats_siegelslopes(y, x, method=method)
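
# Illustrative sketch (not part of the original module): like `theilslopes`,
# but using repeated medians (median per point, then median over points);
# the outlier is again ignored:
#
#     >>> y = [0., 1., 2., 3., 40.]
#     >>> res = siegelslopes(y)
#     >>> res.slope == 1.0
#     True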
  985. SenSeasonalSlopesResult = _make_tuple_bunch('SenSeasonalSlopesResult',
  986. ['intra_slope', 'inter_slope'])
  987. def sen_seasonal_slopes(x):
  988. r"""
  989. Computes seasonal Theil-Sen and Kendall slope estimators.
  990. The seasonal generalization of Sen's slope computes the slopes between all
  991. pairs of values within a "season" (column) of a 2D array. It returns an
  992. array containing the median of these "within-season" slopes for each
  993. season (the Theil-Sen slope estimator of each season), and it returns the
  994. median of the within-season slopes across all seasons (the seasonal Kendall
  995. slope estimator).
  996. Parameters
  997. ----------
  998. x : 2D array_like
  999. Each column of `x` contains measurements of the dependent variable
  1000. within a season. The independent variable (usually time) of each season
  1001. is assumed to be ``np.arange(x.shape[0])``.
  1002. Returns
  1003. -------
  1004. result : ``SenSeasonalSlopesResult`` instance
  1005. The return value is an object with the following attributes:
  1006. intra_slope : ndarray
  1007. For each season, the Theil-Sen slope estimator: the median of
  1008. within-season slopes.
  1009. inter_slope : float
  1010. The seasonal Kendall slope estimateor: the median of within-season
  1011. slopes *across all* seasons.
  1012. See Also
  1013. --------
  1014. theilslopes : the analogous function for non-seasonal data
  1015. scipy.stats.theilslopes : non-seasonal slopes for non-masked arrays
  1016. Notes
  1017. -----
  1018. The slopes :math:`d_{ijk}` within season :math:`i` are:
  1019. .. math::
  1020. d_{ijk} = \frac{x_{ij} - x_{ik}}
  1021. {j - k}
  1022. for pairs of distinct integer indices :math:`j, k` of :math:`x`.
  1023. Element :math:`i` of the returned `intra_slope` array is the median of the
  1024. :math:`d_{ijk}` over all :math:`j < k`; this is the Theil-Sen slope
  1025. estimator of season :math:`i`. The returned `inter_slope` value, better
  1026. known as the seasonal Kendall slope estimator, is the median of the
  1027. :math:`d_{ijk}` over all :math:`i, j, k`.
  1028. References
  1029. ----------
  1030. .. [1] Hirsch, Robert M., James R. Slack, and Richard A. Smith.
  1031. "Techniques of trend analysis for monthly water quality data."
  1032. *Water Resources Research* 18.1 (1982): 107-121.
  1033. Examples
  1034. --------
  1035. Suppose we have 100 observations of a dependent variable for each of four
  1036. seasons:
  1037. >>> import numpy as np
  1038. >>> rng = np.random.default_rng()
  1039. >>> x = rng.random(size=(100, 4))
  1040. We compute the seasonal slopes as:
  1041. >>> from scipy import stats
  1042. >>> intra_slope, inter_slope = stats.mstats.sen_seasonal_slopes(x)
  1043. If we define a function to compute all slopes between observations within
  1044. a season:
  1045. >>> def dijk(yi):
  1046. ... n = len(yi)
  1047. ... x = np.arange(n)
  1048. ... dy = yi - yi[:, np.newaxis]
  1049. ... dx = x - x[:, np.newaxis]
  1050. ... # we only want unique pairs of distinct indices
  1051. ... mask = np.triu(np.ones((n, n), dtype=bool), k=1)
  1052. ... return dy[mask]/dx[mask]
then element ``i`` of ``intra_slope`` is the median of ``dijk(x[:, i])``:
  1054. >>> i = 2
  1055. >>> np.allclose(np.median(dijk(x[:, i])), intra_slope[i])
  1056. True
  1057. and ``inter_slope`` is the median of the values returned by ``dijk`` for
  1058. all seasons:
  1059. >>> all_slopes = np.concatenate([dijk(x[:, i]) for i in range(x.shape[1])])
  1060. >>> np.allclose(np.median(all_slopes), inter_slope)
  1061. True
  1062. Because the data are randomly generated, we would expect the median slopes
  1063. to be nearly zero both within and across all seasons, and indeed they are:
  1064. >>> intra_slope.data
array([ 0.00124504, -0.00277761, -0.00221245, -0.00036338])  # may vary
  1066. >>> inter_slope
-0.0010511779872922058  # may vary
  1068. """
  1069. x = ma.array(x, subok=True, copy=False, ndmin=2)
  1070. (n,_) = x.shape
  1071. # Get list of slopes per season
  1072. szn_slopes = ma.vstack([(x[i+1:]-x[i])/np.arange(1,n-i)[:,None]
  1073. for i in range(n)])
  1074. szn_medslopes = ma.median(szn_slopes, axis=0)
  1075. medslope = ma.median(szn_slopes, axis=None)
  1076. return SenSeasonalSlopesResult(szn_medslopes, medslope)
  1077. Ttest_1sampResult = namedtuple('Ttest_1sampResult', ('statistic', 'pvalue'))
  1078. def ttest_1samp(a, popmean, axis=0, alternative='two-sided'):
  1079. """
  1080. Calculates the T-test for the mean of ONE group of scores.
  1081. Parameters
  1082. ----------
  1083. a : array_like
  1084. sample observation
  1085. popmean : float or array_like
expected value in null hypothesis; if array_like, it must have the
  1087. same shape as `a` excluding the axis dimension
  1088. axis : int or None, optional
  1089. Axis along which to compute test. If None, compute over the whole
  1090. array `a`.
  1091. alternative : {'two-sided', 'less', 'greater'}, optional
  1092. Defines the alternative hypothesis.
  1093. The following options are available (default is 'two-sided'):
  1094. * 'two-sided': the mean of the underlying distribution of the sample
is different from the given population mean (`popmean`)
  1096. * 'less': the mean of the underlying distribution of the sample is
  1097. less than the given population mean (`popmean`)
  1098. * 'greater': the mean of the underlying distribution of the sample is
  1099. greater than the given population mean (`popmean`)
  1100. .. versionadded:: 1.7.0
  1101. Returns
  1102. -------
  1103. statistic : float or array
  1104. t-statistic
  1105. pvalue : float or array
  1106. The p-value
  1107. Notes
  1108. -----
  1109. For more details on `ttest_1samp`, see `scipy.stats.ttest_1samp`.
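Examples
--------
An illustrative sketch on a tiny, hand-checkable sample; the masked
entry is ignored, so the statistic follows from mean 0.5, variance 0.5
and n = 2 of the remaining values:
>>> import numpy as np
>>> from scipy.stats import mstats
>>> a = np.ma.array([0., 1., 99.], mask=[False, False, True])
>>> res = mstats.ttest_1samp(a, popmean=0.)
>>> bool(np.isclose(res.statistic, 1.0))
True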
  1110. """
  1111. a, axis = _chk_asarray(a, axis)
  1112. if a.size == 0:
return Ttest_1sampResult(np.nan, np.nan)
  1114. x = a.mean(axis=axis)
  1115. v = a.var(axis=axis, ddof=1)
  1116. n = a.count(axis=axis)
  1117. # force df to be an array for masked division not to throw a warning
  1118. df = ma.asanyarray(n - 1.0)
  1119. svar = ((n - 1.0) * v) / df
  1120. with np.errstate(divide='ignore', invalid='ignore'):
  1121. t = (x - popmean) / ma.sqrt(svar / n)
  1122. t, prob = scipy.stats._stats_py._ttest_finish(df, t, alternative)
  1123. return Ttest_1sampResult(t, prob)
  1124. ttest_onesamp = ttest_1samp
  1125. Ttest_indResult = namedtuple('Ttest_indResult', ('statistic', 'pvalue'))
  1126. def ttest_ind(a, b, axis=0, equal_var=True, alternative='two-sided'):
  1127. """
  1128. Calculates the T-test for the means of TWO INDEPENDENT samples of scores.
  1129. Parameters
  1130. ----------
  1131. a, b : array_like
  1132. The arrays must have the same shape, except in the dimension
  1133. corresponding to `axis` (the first, by default).
  1134. axis : int or None, optional
  1135. Axis along which to compute test. If None, compute over the whole
  1136. arrays, `a`, and `b`.
  1137. equal_var : bool, optional
  1138. If True, perform a standard independent 2 sample test that assumes equal
  1139. population variances.
  1140. If False, perform Welch's t-test, which does not assume equal population
  1141. variance.
  1142. .. versionadded:: 0.17.0
  1143. alternative : {'two-sided', 'less', 'greater'}, optional
  1144. Defines the alternative hypothesis.
  1145. The following options are available (default is 'two-sided'):
  1146. * 'two-sided': the means of the distributions underlying the samples
  1147. are unequal.
  1148. * 'less': the mean of the distribution underlying the first sample
  1149. is less than the mean of the distribution underlying the second
  1150. sample.
  1151. * 'greater': the mean of the distribution underlying the first
  1152. sample is greater than the mean of the distribution underlying
  1153. the second sample.
  1154. .. versionadded:: 1.7.0
  1155. Returns
  1156. -------
  1157. statistic : float or array
  1158. The calculated t-statistic.
  1159. pvalue : float or array
  1160. The p-value.
  1161. Notes
  1162. -----
  1163. For more details on `ttest_ind`, see `scipy.stats.ttest_ind`.
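Examples
--------
A minimal sketch with two tiny samples: both have variance 0.5 and the
means are 0.5 and 1.5, so the pooled statistic works out to -sqrt(2):
>>> import numpy as np
>>> from scipy.stats import mstats
>>> res = mstats.ttest_ind([0., 1.], [1., 2.])
>>> bool(np.isclose(res.statistic, -np.sqrt(2)))
True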
  1164. """
  1165. a, b, axis = _chk2_asarray(a, b, axis)
  1166. if a.size == 0 or b.size == 0:
  1167. return Ttest_indResult(np.nan, np.nan)
  1168. (x1, x2) = (a.mean(axis), b.mean(axis))
  1169. (v1, v2) = (a.var(axis=axis, ddof=1), b.var(axis=axis, ddof=1))
  1170. (n1, n2) = (a.count(axis), b.count(axis))
  1171. if equal_var:
  1172. # force df to be an array for masked division not to throw a warning
  1173. df = ma.asanyarray(n1 + n2 - 2.0)
  1174. svar = ((n1-1)*v1+(n2-1)*v2) / df
  1175. denom = ma.sqrt(svar*(1.0/n1 + 1.0/n2)) # n-D computation here!
  1176. else:
  1177. vn1 = v1/n1
  1178. vn2 = v2/n2
  1179. with np.errstate(divide='ignore', invalid='ignore'):
  1180. df = (vn1 + vn2)**2 / (vn1**2 / (n1 - 1) + vn2**2 / (n2 - 1))
  1181. # If df is undefined, variances are zero.
  1182. # It doesn't matter what df is as long as it is not NaN.
  1183. df = np.where(np.isnan(df), 1, df)
  1184. denom = ma.sqrt(vn1 + vn2)
  1185. with np.errstate(divide='ignore', invalid='ignore'):
  1186. t = (x1-x2) / denom
  1187. t, prob = scipy.stats._stats_py._ttest_finish(df, t, alternative)
  1188. return Ttest_indResult(t, prob)
  1189. Ttest_relResult = namedtuple('Ttest_relResult', ('statistic', 'pvalue'))
  1190. def ttest_rel(a, b, axis=0, alternative='two-sided'):
  1191. """
  1192. Calculates the T-test on TWO RELATED samples of scores, a and b.
  1193. Parameters
  1194. ----------
  1195. a, b : array_like
  1196. The arrays must have the same shape.
  1197. axis : int or None, optional
  1198. Axis along which to compute test. If None, compute over the whole
  1199. arrays, `a`, and `b`.
  1200. alternative : {'two-sided', 'less', 'greater'}, optional
  1201. Defines the alternative hypothesis.
  1202. The following options are available (default is 'two-sided'):
  1203. * 'two-sided': the means of the distributions underlying the samples
  1204. are unequal.
  1205. * 'less': the mean of the distribution underlying the first sample
  1206. is less than the mean of the distribution underlying the second
  1207. sample.
  1208. * 'greater': the mean of the distribution underlying the first
  1209. sample is greater than the mean of the distribution underlying
  1210. the second sample.
  1211. .. versionadded:: 1.7.0
  1212. Returns
  1213. -------
  1214. statistic : float or array
  1215. t-statistic
  1216. pvalue : float or array
  1217. two-tailed p-value
  1218. Notes
  1219. -----
  1220. For more details on `ttest_rel`, see `scipy.stats.ttest_rel`.
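Examples
--------
A minimal sketch: the paired differences [1, 0, -1] have zero mean, so
the statistic is 0 and the two-sided p-value is 1:
>>> from scipy.stats import mstats
>>> res = mstats.ttest_rel([1., 2., 3.], [0., 2., 4.])
>>> float(res.statistic), float(res.pvalue)
(0.0, 1.0)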
  1221. """
  1222. a, b, axis = _chk2_asarray(a, b, axis)
  1223. if len(a) != len(b):
  1224. raise ValueError('unequal length arrays')
  1225. if a.size == 0 or b.size == 0:
  1226. return Ttest_relResult(np.nan, np.nan)
  1227. n = a.count(axis)
  1228. df = ma.asanyarray(n-1.0)
  1229. d = (a-b).astype('d')
  1230. dm = d.mean(axis)
  1231. v = d.var(axis=axis, ddof=1)
  1232. denom = ma.sqrt(v / n)
  1233. with np.errstate(divide='ignore', invalid='ignore'):
  1234. t = dm / denom
  1235. t, prob = scipy.stats._stats_py._ttest_finish(df, t, alternative)
  1236. return Ttest_relResult(t, prob)
  1237. MannwhitneyuResult = namedtuple('MannwhitneyuResult', ('statistic',
  1238. 'pvalue'))
  1239. def mannwhitneyu(x,y, use_continuity=True):
  1240. """
Computes the Mann-Whitney statistic.
  1242. Missing values in `x` and/or `y` are discarded.
  1243. Parameters
  1244. ----------
  1245. x : sequence
  1246. Input
  1247. y : sequence
  1248. Input
  1249. use_continuity : {True, False}, optional
  1250. Whether a continuity correction (1/2.) should be taken into account.
  1251. Returns
  1252. -------
  1253. statistic : float
  1254. The minimum of the Mann-Whitney statistics
  1255. pvalue : float
  1256. Approximate two-sided p-value assuming a normal distribution.
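Examples
--------
An illustrative sketch: with no overlap between the two samples, all
ranks of `x` fall below those of `y`, and the reported minimum statistic
is 0:
>>> from scipy.stats import mstats
>>> res = mstats.mannwhitneyu([1., 2., 3.], [4., 5., 6.])
>>> float(res.statistic)
0.0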
  1257. """
  1258. x = ma.asarray(x).compressed().view(ndarray)
  1259. y = ma.asarray(y).compressed().view(ndarray)
  1260. ranks = rankdata(np.concatenate([x,y]))
  1261. (nx, ny) = (len(x), len(y))
  1262. nt = nx + ny
  1263. U = ranks[:nx].sum() - nx*(nx+1)/2.
  1264. U = max(U, nx*ny - U)
  1265. u = nx*ny - U
  1266. mu = (nx*ny)/2.
  1267. sigsq = (nt**3 - nt)/12.
  1268. ties = count_tied_groups(ranks)
  1269. sigsq -= sum(v*(k**3-k) for (k,v) in ties.items())/12.
  1270. sigsq *= nx*ny/float(nt*(nt-1))
  1271. if use_continuity:
  1272. z = (U - 1/2. - mu) / ma.sqrt(sigsq)
  1273. else:
  1274. z = (U - mu) / ma.sqrt(sigsq)
  1275. prob = special.erfc(abs(z)/np.sqrt(2))
  1276. return MannwhitneyuResult(u, prob)
  1277. KruskalResult = namedtuple('KruskalResult', ('statistic', 'pvalue'))
  1278. def kruskal(*args):
  1279. """
Compute the Kruskal-Wallis H-test for independent samples.
  1281. Parameters
  1282. ----------
  1283. sample1, sample2, ... : array_like
  1284. Two or more arrays with the sample measurements can be given as
  1285. arguments.
  1286. Returns
  1287. -------
  1288. statistic : float
  1289. The Kruskal-Wallis H statistic, corrected for ties
  1290. pvalue : float
The p-value for the test, under the assumption that H has a chi-
square distribution.
  1293. Notes
  1294. -----
  1295. For more details on `kruskal`, see `scipy.stats.kruskal`.
  1296. Examples
  1297. --------
  1298. >>> from scipy.stats.mstats import kruskal
  1299. Random samples from three different brands of batteries were tested
  1300. to see how long the charge lasted. Results were as follows:
  1301. >>> a = [6.3, 5.4, 5.7, 5.2, 5.0]
  1302. >>> b = [6.9, 7.0, 6.1, 7.9]
  1303. >>> c = [7.2, 6.9, 6.1, 6.5]
Test the hypothesis that the distribution functions for all of the brands'
durations are identical, using a 5% level of significance.
  1306. >>> kruskal(a, b, c)
  1307. KruskalResult(statistic=7.113812154696133, pvalue=0.028526948491942164)
  1308. The null hypothesis is rejected at the 5% level of significance
because the returned p-value is less than the significance level of 5%.
  1310. """
  1311. output = argstoarray(*args)
  1312. ranks = ma.masked_equal(rankdata(output, use_missing=False), 0)
  1313. sumrk = ranks.sum(-1)
  1314. ngrp = ranks.count(-1)
  1315. ntot = ranks.count()
  1316. H = 12./(ntot*(ntot+1)) * (sumrk**2/ngrp).sum() - 3*(ntot+1)
  1317. # Tie correction
  1318. ties = count_tied_groups(ranks)
  1319. T = 1. - sum(v*(k**3-k) for (k,v) in ties.items())/float(ntot**3-ntot)
  1320. if T == 0:
  1321. raise ValueError('All numbers are identical in kruskal')
  1322. H /= T
  1323. df = len(output) - 1
  1324. prob = distributions.chi2.sf(H, df)
  1325. return KruskalResult(H, prob)
  1326. kruskalwallis = kruskal
  1327. @_rename_parameter("mode", "method")
  1328. def ks_1samp(x, cdf, args=(), alternative="two-sided", method='auto'):
  1329. """
  1330. Computes the Kolmogorov-Smirnov test on one sample of masked values.
  1331. Missing values in `x` are discarded.
  1332. Parameters
  1333. ----------
  1334. x : array_like
  1335. a 1-D array of observations of random variables.
  1336. cdf : str or callable
  1337. If a string, it should be the name of a distribution in `scipy.stats`.
  1338. If a callable, that callable is used to calculate the cdf.
  1339. args : tuple, sequence, optional
  1340. Distribution parameters, used if `cdf` is a string.
  1341. alternative : {'two-sided', 'less', 'greater'}, optional
  1342. Indicates the alternative hypothesis. Default is 'two-sided'.
  1343. method : {'auto', 'exact', 'asymp'}, optional
  1344. Defines the method used for calculating the p-value.
  1345. The following options are available (default is 'auto'):
  1346. * 'auto' : use 'exact' for small size arrays, 'asymp' for large
  1347. * 'exact' : use approximation to exact distribution of test statistic
  1348. * 'asymp' : use asymptotic distribution of test statistic
  1349. Returns
  1350. -------
  1351. d : float
Value of the Kolmogorov-Smirnov test statistic.
  1353. p : float
  1354. Corresponding p-value.
  1355. """
  1356. alternative = {'t': 'two-sided', 'g': 'greater', 'l': 'less'}.get(
  1357. alternative.lower()[0], alternative)
  1358. return scipy.stats._stats_py.ks_1samp(
  1359. x, cdf, args=args, alternative=alternative, method=method)
  1360. @_rename_parameter("mode", "method")
  1361. def ks_2samp(data1, data2, alternative="two-sided", method='auto'):
  1362. """
  1363. Computes the Kolmogorov-Smirnov test on two samples.
  1364. Missing values in `x` and/or `y` are discarded.
  1365. Parameters
  1366. ----------
  1367. data1 : array_like
  1368. First data set
  1369. data2 : array_like
  1370. Second data set
  1371. alternative : {'two-sided', 'less', 'greater'}, optional
  1372. Indicates the alternative hypothesis. Default is 'two-sided'.
  1373. method : {'auto', 'exact', 'asymp'}, optional
  1374. Defines the method used for calculating the p-value.
  1375. The following options are available (default is 'auto'):
  1376. * 'auto' : use 'exact' for small size arrays, 'asymp' for large
  1377. * 'exact' : use approximation to exact distribution of test statistic
  1378. * 'asymp' : use asymptotic distribution of test statistic
  1379. Returns
  1380. -------
  1381. d : float
Value of the Kolmogorov-Smirnov test statistic.
  1383. p : float
  1384. Corresponding p-value.
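Examples
--------
A minimal sketch: two identical samples have coinciding empirical CDFs,
so the distance is 0 and the p-value is 1:
>>> from scipy.stats import mstats
>>> d, p = mstats.ks_2samp([1., 2., 3.], [1., 2., 3.])
>>> float(d), float(p)
(0.0, 1.0)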
  1385. """
  1386. # Ideally this would be accomplished by
  1387. # ks_2samp = scipy.stats._stats_py.ks_2samp
  1388. # but the circular dependencies between _mstats_basic and stats prevent that.
  1389. alternative = {'t': 'two-sided', 'g': 'greater', 'l': 'less'}.get(
  1390. alternative.lower()[0], alternative)
  1391. return scipy.stats._stats_py.ks_2samp(data1, data2,
  1392. alternative=alternative,
  1393. method=method)
  1394. ks_twosamp = ks_2samp
  1395. @_rename_parameter("mode", "method")
  1396. def kstest(data1, data2, args=(), alternative='two-sided', method='auto'):
  1397. """
  1398. Parameters
  1399. ----------
  1400. data1 : array_like
  1401. data2 : str, callable or array_like
  1402. args : tuple, sequence, optional
  1403. Distribution parameters, used if `data1` or `data2` are strings.
  1404. alternative : str, as documented in stats.kstest
  1405. method : str, as documented in stats.kstest
  1406. Returns
  1407. -------
  1408. tuple of (K-S statistic, probability)
  1409. """
  1410. return scipy.stats._stats_py.kstest(data1, data2, args,
  1411. alternative=alternative, method=method)
  1412. def trima(a, limits=None, inclusive=(True,True)):
  1413. """
  1414. Trims an array by masking the data outside some given limits.
  1415. Returns a masked version of the input array.
  1416. Parameters
  1417. ----------
  1418. a : array_like
  1419. Input array.
  1420. limits : {None, tuple}, optional
  1421. Tuple of (lower limit, upper limit) in absolute values.
  1422. Values of the input array lower (greater) than the lower (upper) limit
will be masked. A limit of None indicates an open interval.
  1424. inclusive : (bool, bool) tuple, optional
  1425. Tuple of (lower flag, upper flag), indicating whether values exactly
  1426. equal to the lower (upper) limit are allowed.
  1427. Examples
  1428. --------
  1429. >>> from scipy.stats.mstats import trima
  1430. >>> import numpy as np
  1431. >>> a = np.arange(10)
  1432. The interval is left-closed and right-open, i.e., `[2, 8)`.
  1433. Trim the array by keeping only values in the interval.
  1434. >>> trima(a, limits=(2, 8), inclusive=(True, False))
  1435. masked_array(data=[--, --, 2, 3, 4, 5, 6, 7, --, --],
  1436. mask=[ True, True, False, False, False, False, False, False,
  1437. True, True],
  1438. fill_value=999999)
  1439. """
  1440. a = ma.asarray(a)
  1441. a.unshare_mask()
  1442. if (limits is None) or (limits == (None, None)):
  1443. return a
  1444. (lower_lim, upper_lim) = limits
  1445. (lower_in, upper_in) = inclusive
  1446. condition = False
  1447. if lower_lim is not None:
  1448. if lower_in:
  1449. condition |= (a < lower_lim)
  1450. else:
  1451. condition |= (a <= lower_lim)
  1452. if upper_lim is not None:
  1453. if upper_in:
  1454. condition |= (a > upper_lim)
  1455. else:
  1456. condition |= (a >= upper_lim)
  1457. a[condition.filled(True)] = masked
  1458. return a
  1459. def trimr(a, limits=None, inclusive=(True, True), axis=None):
  1460. """
  1461. Trims an array by masking some proportion of the data on each end.
  1462. Returns a masked version of the input array.
  1463. Parameters
  1464. ----------
  1465. a : sequence
  1466. Input array.
  1467. limits : {None, tuple}, optional
  1468. Tuple of the percentages to cut on each side of the array, with respect
  1469. to the number of unmasked data, as floats between 0. and 1.
Denoting by n the number of unmasked data before trimming, the
  1471. (n*limits[0])th smallest data and the (n*limits[1])th largest data are
  1472. masked, and the total number of unmasked data after trimming is
  1473. n*(1.-sum(limits)). The value of one limit can be set to None to
  1474. indicate an open interval.
  1475. inclusive : {(True,True) tuple}, optional
  1476. Tuple of flags indicating whether the number of data being masked on
  1477. the left (right) end should be truncated (True) or rounded (False) to
  1478. integers.
  1479. axis : {None,int}, optional
  1480. Axis along which to trim. If None, the whole array is trimmed, but its
  1481. shape is maintained.
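Examples
--------
An illustrative sketch: with ``limits=(0.1, 0.2)`` and the default
inclusive flags, the one smallest and the two largest of ten values are
masked:
>>> import numpy as np
>>> from scipy.stats.mstats import trimr
>>> print(trimr(np.arange(10), limits=(0.1, 0.2)))
[-- 1 2 3 4 5 6 7 -- --]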
  1482. """
  1483. def _trimr1D(a, low_limit, up_limit, low_inclusive, up_inclusive):
  1484. n = a.count()
  1485. idx = a.argsort()
  1486. if low_limit:
  1487. if low_inclusive:
  1488. lowidx = int(low_limit*n)
  1489. else:
  1490. lowidx = int(np.round(low_limit*n))
  1491. a[idx[:lowidx]] = masked
  1492. if up_limit is not None:
  1493. if up_inclusive:
  1494. upidx = n - int(n*up_limit)
  1495. else:
  1496. upidx = n - int(np.round(n*up_limit))
  1497. a[idx[upidx:]] = masked
  1498. return a
  1499. a = ma.asarray(a)
  1500. a.unshare_mask()
  1501. if limits is None:
  1502. return a
  1503. # Check the limits
  1504. (lolim, uplim) = limits
  1505. errmsg = "The proportion to cut from the %s should be between 0. and 1."
  1506. if lolim is not None:
  1507. if lolim > 1. or lolim < 0:
  1508. raise ValueError(errmsg % 'beginning' + "(got %s)" % lolim)
  1509. if uplim is not None:
  1510. if uplim > 1. or uplim < 0:
  1511. raise ValueError(errmsg % 'end' + "(got %s)" % uplim)
  1512. (loinc, upinc) = inclusive
  1513. if axis is None:
  1514. shp = a.shape
  1515. return _trimr1D(a.ravel(),lolim,uplim,loinc,upinc).reshape(shp)
  1516. else:
  1517. return ma.apply_along_axis(_trimr1D, axis, a, lolim,uplim,loinc,upinc)
  1518. trimdoc = """
  1519. Parameters
  1520. ----------
  1521. a : sequence
  1522. Input array
  1523. limits : {None, tuple}, optional
  1524. If `relative` is False, tuple (lower limit, upper limit) in absolute values.
  1525. Values of the input array lower (greater) than the lower (upper) limit are
  1526. masked.
  1527. If `relative` is True, tuple (lower percentage, upper percentage) to cut
  1528. on each side of the array, with respect to the number of unmasked data.
Denoting by n the number of unmasked data before trimming, the (n*limits[0])th
smallest data and the (n*limits[1])th largest data are masked, and the
total number of unmasked data after trimming is n*(1.-sum(limits)).
  1532. In each case, the value of one limit can be set to None to indicate an
  1533. open interval.
If limits is None, no trimming is performed.
  1535. inclusive : {(bool, bool) tuple}, optional
  1536. If `relative` is False, tuple indicating whether values exactly equal
  1537. to the absolute limits are allowed.
  1538. If `relative` is True, tuple indicating whether the number of data
being masked on each side should be truncated (True) or rounded
(False).
  1541. relative : bool, optional
  1542. Whether to consider the limits as absolute values (False) or proportions
  1543. to cut (True).
  1544. axis : int, optional
  1545. Axis along which to trim.
  1546. """
  1547. def trim(a, limits=None, inclusive=(True,True), relative=False, axis=None):
  1548. """
  1549. Trims an array by masking the data outside some given limits.
  1550. Returns a masked version of the input array.
  1551. %s
  1552. Examples
  1553. --------
  1554. >>> from scipy.stats.mstats import trim
>>> z = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
  1556. >>> print(trim(z,(3,8)))
  1557. [-- -- 3 4 5 6 7 8 -- --]
  1558. >>> print(trim(z,(0.1,0.2),relative=True))
  1559. [-- 2 3 4 5 6 7 8 -- --]
  1560. """
  1561. if relative:
  1562. return trimr(a, limits=limits, inclusive=inclusive, axis=axis)
  1563. else:
  1564. return trima(a, limits=limits, inclusive=inclusive)
  1565. if trim.__doc__:
  1566. trim.__doc__ = trim.__doc__ % trimdoc
  1567. def trimboth(data, proportiontocut=0.2, inclusive=(True,True), axis=None):
  1568. """
  1569. Trims the smallest and largest data values.
  1570. Trims the `data` by masking the ``int(proportiontocut * n)`` smallest and
  1571. ``int(proportiontocut * n)`` largest values of data along the given axis,
  1572. where n is the number of unmasked values before trimming.
  1573. Parameters
  1574. ----------
  1575. data : ndarray
  1576. Data to trim.
  1577. proportiontocut : float, optional
  1578. Percentage of trimming (as a float between 0 and 1).
  1579. If n is the number of unmasked values before trimming, the number of
  1580. values after trimming is ``(1 - 2*proportiontocut) * n``.
  1581. Default is 0.2.
  1582. inclusive : {(bool, bool) tuple}, optional
  1583. Tuple indicating whether the number of data being masked on each side
should be truncated (True) or rounded (False).
  1585. axis : int, optional
  1586. Axis along which to perform the trimming.
  1587. If None, the input array is first flattened.
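Examples
--------
An illustrative sketch with the default ``proportiontocut=0.2``: the two
smallest and the two largest of ten values are masked:
>>> import numpy as np
>>> from scipy.stats.mstats import trimboth
>>> print(trimboth(np.arange(10)))
[-- -- 2 3 4 5 6 7 -- --]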
  1588. """
  1589. return trimr(data, limits=(proportiontocut,proportiontocut),
  1590. inclusive=inclusive, axis=axis)
  1591. def trimtail(data, proportiontocut=0.2, tail='left', inclusive=(True,True),
  1592. axis=None):
  1593. """
  1594. Trims the data by masking values from one tail.
  1595. Parameters
  1596. ----------
  1597. data : array_like
  1598. Data to trim.
  1599. proportiontocut : float, optional
  1600. Percentage of trimming. If n is the number of unmasked values
  1601. before trimming, the number of values after trimming is
  1602. ``(1 - proportiontocut) * n``. Default is 0.2.
  1603. tail : {'left','right'}, optional
  1604. If 'left' the `proportiontocut` lowest values will be masked.
  1605. If 'right' the `proportiontocut` highest values will be masked.
  1606. Default is 'left'.
  1607. inclusive : {(bool, bool) tuple}, optional
  1608. Tuple indicating whether the number of data being masked on each side
should be truncated (True) or rounded (False). Default is
  1610. (True, True).
  1611. axis : int, optional
  1612. Axis along which to perform the trimming.
  1613. If None, the input array is first flattened. Default is None.
  1614. Returns
  1615. -------
  1616. trimtail : ndarray
  1617. Returned array of same shape as `data` with masked tail values.
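Examples
--------
An illustrative sketch: with ``proportiontocut=0.3`` and the default
left tail, the three smallest of ten values are masked:
>>> import numpy as np
>>> from scipy.stats.mstats import trimtail
>>> print(trimtail(np.arange(10), 0.3))
[-- -- -- 3 4 5 6 7 8 9]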
  1618. """
  1619. tail = str(tail).lower()[0]
  1620. if tail == 'l':
  1621. limits = (proportiontocut,None)
  1622. elif tail == 'r':
  1623. limits = (None, proportiontocut)
  1624. else:
  1625. raise TypeError("The tail argument should be in ('left','right')")
  1626. return trimr(data, limits=limits, axis=axis, inclusive=inclusive)
  1627. trim1 = trimtail
  1628. def trimmed_mean(a, limits=(0.1,0.1), inclusive=(1,1), relative=True,
  1629. axis=None):
  1630. """Returns the trimmed mean of the data along the given axis.
  1631. %s
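Examples
--------
An illustrative sketch: a relative 10 percent trim of ``np.arange(10)``
masks 0 and 9, leaving the mean of the values 1 through 8:
>>> import numpy as np
>>> from scipy.stats.mstats import trimmed_mean
>>> float(trimmed_mean(np.arange(10), 0.1))
4.5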
  1632. """
  1633. if (not isinstance(limits,tuple)) and isinstance(limits,float):
  1634. limits = (limits, limits)
  1635. if relative:
  1636. return trimr(a,limits=limits,inclusive=inclusive,axis=axis).mean(axis=axis)
  1637. else:
  1638. return trima(a,limits=limits,inclusive=inclusive).mean(axis=axis)
  1639. if trimmed_mean.__doc__:
  1640. trimmed_mean.__doc__ = trimmed_mean.__doc__ % trimdoc
  1641. def trimmed_var(a, limits=(0.1,0.1), inclusive=(1,1), relative=True,
  1642. axis=None, ddof=0):
  1643. """Returns the trimmed variance of the data along the given axis.
  1644. %s
  1645. ddof : {0,integer}, optional
  1646. Means Delta Degrees of Freedom. The denominator used during computations
  1647. is (n-ddof). DDOF=0 corresponds to a biased estimate, DDOF=1 to an un-
  1648. biased estimate of the variance.
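Examples
--------
An illustrative sketch: a relative 10 percent trim of ``np.arange(10)``
leaves the values 1 through 8, whose biased (ddof=0) variance is 5.25:
>>> import numpy as np
>>> from scipy.stats.mstats import trimmed_var
>>> float(trimmed_var(np.arange(10), 0.1))
5.25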
  1649. """
  1650. if (not isinstance(limits,tuple)) and isinstance(limits,float):
  1651. limits = (limits, limits)
  1652. if relative:
  1653. out = trimr(a,limits=limits, inclusive=inclusive,axis=axis)
  1654. else:
  1655. out = trima(a,limits=limits,inclusive=inclusive)
  1656. return out.var(axis=axis, ddof=ddof)
  1657. if trimmed_var.__doc__:
  1658. trimmed_var.__doc__ = trimmed_var.__doc__ % trimdoc
  1659. def trimmed_std(a, limits=(0.1,0.1), inclusive=(1,1), relative=True,
  1660. axis=None, ddof=0):
  1661. """Returns the trimmed standard deviation of the data along the given axis.
  1662. %s
  1663. ddof : {0,integer}, optional
  1664. Means Delta Degrees of Freedom. The denominator used during computations
  1665. is (n-ddof). DDOF=0 corresponds to a biased estimate, DDOF=1 to an un-
  1666. biased estimate of the variance.
  1667. """
  1668. if (not isinstance(limits,tuple)) and isinstance(limits,float):
  1669. limits = (limits, limits)
  1670. if relative:
  1671. out = trimr(a,limits=limits,inclusive=inclusive,axis=axis)
  1672. else:
  1673. out = trima(a,limits=limits,inclusive=inclusive)
  1674. return out.std(axis=axis,ddof=ddof)
  1675. if trimmed_std.__doc__:
  1676. trimmed_std.__doc__ = trimmed_std.__doc__ % trimdoc
  1677. def trimmed_stde(a, limits=(0.1,0.1), inclusive=(1,1), axis=None):
  1678. """
  1679. Returns the standard error of the trimmed mean along the given axis.
  1680. Parameters
  1681. ----------
  1682. a : sequence
  1683. Input array
  1684. limits : {(0.1,0.1), tuple of float}, optional
  1685. tuple (lower percentage, upper percentage) to cut on each side of the
  1686. array, with respect to the number of unmasked data.
  1687. If n is the number of unmasked data before trimming, the values
  1688. smaller than ``n * limits[0]`` and the values larger than
``n * limits[1]`` are masked, and the total number of unmasked
  1690. data after trimming is ``n * (1.-sum(limits))``. In each case,
  1691. the value of one limit can be set to None to indicate an open interval.
  1692. If `limits` is None, no trimming is performed.
inclusive : {(bool, bool) tuple}, optional
Tuple indicating whether the number of data being masked on each side
should be truncated (True) or rounded (False).
  1696. axis : int, optional
  1697. Axis along which to trim.
  1698. Returns
  1699. -------
  1700. trimmed_stde : scalar or ndarray
  1701. """
  1702. def _trimmed_stde_1D(a, low_limit, up_limit, low_inclusive, up_inclusive):
  1703. "Returns the standard error of the trimmed mean for a 1D input data."
n = a.count()
idx = a.argsort()
# Default to "trim nothing" so that both indices are always defined,
# even when one of the limits is None or 0.
lowidx = 0
upidx = n
if low_limit:
if low_inclusive:
lowidx = int(low_limit*n)
else:
lowidx = int(np.round(low_limit*n))
a[idx[:lowidx]] = masked
if up_limit is not None:
if up_inclusive:
upidx = n - int(n*up_limit)
else:
upidx = n - int(np.round(n*up_limit))
a[idx[upidx:]] = masked
a[idx[:lowidx]] = a[idx[lowidx]]
a[idx[upidx:]] = a[idx[upidx-1]]
  1720. winstd = a.std(ddof=1)
  1721. return winstd / ((1-low_limit-up_limit)*np.sqrt(len(a)))
  1722. a = ma.array(a, copy=True, subok=True)
  1723. a.unshare_mask()
  1724. if limits is None:
  1725. return a.std(axis=axis,ddof=1)/ma.sqrt(a.count(axis))
  1726. if (not isinstance(limits,tuple)) and isinstance(limits,float):
  1727. limits = (limits, limits)
  1728. # Check the limits
  1729. (lolim, uplim) = limits
  1730. errmsg = "The proportion to cut from the %s should be between 0. and 1."
  1731. if lolim is not None:
  1732. if lolim > 1. or lolim < 0:
  1733. raise ValueError(errmsg % 'beginning' + "(got %s)" % lolim)
  1734. if uplim is not None:
  1735. if uplim > 1. or uplim < 0:
  1736. raise ValueError(errmsg % 'end' + "(got %s)" % uplim)
  1737. (loinc, upinc) = inclusive
  1738. if (axis is None):
  1739. return _trimmed_stde_1D(a.ravel(),lolim,uplim,loinc,upinc)
  1740. else:
  1741. if a.ndim > 2:
  1742. raise ValueError("Array 'a' must be at most two dimensional, "
  1743. "but got a.ndim = %d" % a.ndim)
  1744. return ma.apply_along_axis(_trimmed_stde_1D, axis, a,
  1745. lolim,uplim,loinc,upinc)
  1746. def _mask_to_limits(a, limits, inclusive):
  1747. """Mask an array for values outside of given limits.
  1748. This is primarily a utility function.
  1749. Parameters
  1750. ----------
  1751. a : array
  1752. limits : (float or None, float or None)
  1753. A tuple consisting of the (lower limit, upper limit). Values in the
  1754. input array less than the lower limit or greater than the upper limit
  1755. will be masked out. None implies no limit.
  1756. inclusive : (bool, bool)
  1757. A tuple consisting of the (lower flag, upper flag). These flags
  1758. determine whether values exactly equal to lower or upper are allowed.
  1759. Returns
  1760. -------
  1761. A MaskedArray.
  1762. Raises
  1763. ------
ValueError
If there are no values within the given limits.
  1765. """
  1766. lower_limit, upper_limit = limits
  1767. lower_include, upper_include = inclusive
  1768. am = ma.MaskedArray(a)
  1769. if lower_limit is not None:
  1770. if lower_include:
  1771. am = ma.masked_less(am, lower_limit)
  1772. else:
  1773. am = ma.masked_less_equal(am, lower_limit)
  1774. if upper_limit is not None:
  1775. if upper_include:
  1776. am = ma.masked_greater(am, upper_limit)
  1777. else:
  1778. am = ma.masked_greater_equal(am, upper_limit)
  1779. if am.count() == 0:
  1780. raise ValueError("No array values within given limits")
  1781. return am
  1782. def tmean(a, limits=None, inclusive=(True, True), axis=None):
  1783. """
  1784. Compute the trimmed mean.
  1785. Parameters
  1786. ----------
  1787. a : array_like
  1788. Array of values.
  1789. limits : None or (lower limit, upper limit), optional
  1790. Values in the input array less than the lower limit or greater than the
  1791. upper limit will be ignored. When limits is None (default), then all
  1792. values are used. Either of the limit values in the tuple can also be
  1793. None representing a half-open interval.
  1794. inclusive : (bool, bool), optional
  1795. A tuple consisting of the (lower flag, upper flag). These flags
  1796. determine whether values exactly equal to the lower or upper limits
  1797. are included. The default value is (True, True).
  1798. axis : int or None, optional
  1799. Axis along which to operate. If None, compute over the
  1800. whole array. Default is None.
  1801. Returns
  1802. -------
  1803. tmean : float
  1804. Notes
  1805. -----
  1806. For more details on `tmean`, see `scipy.stats.tmean`.
  1807. Examples
  1808. --------
  1809. >>> import numpy as np
  1810. >>> from scipy.stats import mstats
  1811. >>> a = np.array([[6, 8, 3, 0],
  1812. ... [3, 9, 1, 2],
  1813. ... [8, 7, 8, 2],
  1814. ... [5, 6, 0, 2],
  1815. ... [4, 5, 5, 2]])
  1818. >>> mstats.tmean(a, (2,5))
  1819. 3.3
  1820. >>> mstats.tmean(a, (2,5), axis=0)
  1821. masked_array(data=[4.0, 5.0, 4.0, 2.0],
  1822. mask=[False, False, False, False],
  1823. fill_value=1e+20)
  1824. """
  1825. return trima(a, limits=limits, inclusive=inclusive).mean(axis=axis)
  1826. def tvar(a, limits=None, inclusive=(True, True), axis=0, ddof=1):
  1827. """
Compute the trimmed variance.
  1829. This function computes the sample variance of an array of values,
  1830. while ignoring values which are outside of given `limits`.
  1831. Parameters
  1832. ----------
  1833. a : array_like
  1834. Array of values.
  1835. limits : None or (lower limit, upper limit), optional
  1836. Values in the input array less than the lower limit or greater than the
  1837. upper limit will be ignored. When limits is None, then all values are
  1838. used. Either of the limit values in the tuple can also be None
  1839. representing a half-open interval. The default value is None.
  1840. inclusive : (bool, bool), optional
  1841. A tuple consisting of the (lower flag, upper flag). These flags
  1842. determine whether values exactly equal to the lower or upper limits
  1843. are included. The default value is (True, True).
  1844. axis : int or None, optional
  1845. Axis along which to operate. If None, compute over the
  1846. whole array. Default is zero.
  1847. ddof : int, optional
  1848. Delta degrees of freedom. Default is 1.
  1849. Returns
  1850. -------
  1851. tvar : float
  1852. Trimmed variance.
  1853. Notes
  1854. -----
  1855. For more details on `tvar`, see `scipy.stats.tvar`.
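Examples
--------
An illustrative sketch on a masked array without limits: the unmasked
values 0, 1 and 2 have unbiased sample variance 1:
>>> import numpy as np
>>> from scipy.stats import mstats
>>> x = np.ma.array([0., 1., 2., 30., 40.], mask=[0, 0, 0, 1, 1])
>>> float(mstats.tvar(x))
1.0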
  1856. """
  1857. a = a.astype(float).ravel()
  1858. if limits is None:
  1859. n = (~a.mask).sum() # todo: better way to do that?
  1860. return np.ma.var(a) * n/(n-1.)
  1861. am = _mask_to_limits(a, limits=limits, inclusive=inclusive)
  1862. return np.ma.var(am, axis=axis, ddof=ddof)
  1863. def tmin(a, lowerlimit=None, axis=0, inclusive=True):
  1864. """
Compute the trimmed minimum.
  1866. Parameters
  1867. ----------
  1868. a : array_like
  1869. array of values
  1870. lowerlimit : None or float, optional
  1871. Values in the input array less than the given limit will be ignored.
  1872. When lowerlimit is None, then all values are used. The default value
  1873. is None.
  1874. axis : int or None, optional
  1875. Axis along which to operate. Default is 0. If None, compute over the
  1876. whole array `a`.
  1877. inclusive : {True, False}, optional
  1878. This flag determines whether values exactly equal to the lower limit
  1879. are included. The default value is True.
  1880. Returns
  1881. -------
  1882. tmin : float, int or ndarray
  1883. Notes
  1884. -----
  1885. For more details on `tmin`, see `scipy.stats.tmin`.
  1886. Examples
  1887. --------
  1888. >>> import numpy as np
  1889. >>> from scipy.stats import mstats
  1890. >>> a = np.array([[6, 8, 3, 0],
  1891. ... [3, 2, 1, 2],
  1892. ... [8, 1, 8, 2],
  1893. ... [5, 3, 0, 2],
  1894. ... [4, 7, 5, 2]])
  1896. >>> mstats.tmin(a, 5)
  1897. masked_array(data=[5, 7, 5, --],
  1898. mask=[False, False, False, True],
  1899. fill_value=999999)
  1900. """
  1901. a, axis = _chk_asarray(a, axis)
  1902. am = trima(a, (lowerlimit, None), (inclusive, False))
  1903. return ma.minimum.reduce(am, axis)
  1904. def tmax(a, upperlimit=None, axis=0, inclusive=True):
  1905. """
Compute the trimmed maximum.
  1907. This function computes the maximum value of an array along a given axis,
  1908. while ignoring values larger than a specified upper limit.
  1909. Parameters
  1910. ----------
  1911. a : array_like
  1912. array of values
  1913. upperlimit : None or float, optional
  1914. Values in the input array greater than the given limit will be ignored.
  1915. When upperlimit is None, then all values are used. The default value
  1916. is None.
  1917. axis : int or None, optional
  1918. Axis along which to operate. Default is 0. If None, compute over the
  1919. whole array `a`.
  1920. inclusive : {True, False}, optional
  1921. This flag determines whether values exactly equal to the upper limit
  1922. are included. The default value is True.
  1923. Returns
  1924. -------
  1925. tmax : float, int or ndarray
  1926. Notes
  1927. -----
  1928. For more details on `tmax`, see `scipy.stats.tmax`.
  1929. Examples
  1930. --------
  1931. >>> import numpy as np
  1932. >>> from scipy.stats import mstats
  1933. >>> a = np.array([[6, 8, 3, 0],
  1934. ... [3, 9, 1, 2],
  1935. ... [8, 7, 8, 2],
  1936. ... [5, 6, 0, 2],
  1937. ... [4, 5, 5, 2]])
  1940. >>> mstats.tmax(a, 4)
  1941. masked_array(data=[4, --, 3, 2],
  1942. mask=[False, True, False, False],
  1943. fill_value=999999)
  1944. """
  1945. a, axis = _chk_asarray(a, axis)
  1946. am = trima(a, (None, upperlimit), (False, inclusive))
  1947. return ma.maximum.reduce(am, axis)
  1948. def tsem(a, limits=None, inclusive=(True, True), axis=0, ddof=1):
  1949. """
  1950. Compute the trimmed standard error of the mean.
  1951. This function finds the standard error of the mean for given
  1952. values, ignoring values outside the given `limits`.
  1953. Parameters
  1954. ----------
  1955. a : array_like
  1956. array of values
  1957. limits : None or (lower limit, upper limit), optional
  1958. Values in the input array less than the lower limit or greater than the
  1959. upper limit will be ignored. When limits is None, then all values are
  1960. used. Either of the limit values in the tuple can also be None
  1961. representing a half-open interval. The default value is None.
  1962. inclusive : (bool, bool), optional
  1963. A tuple consisting of the (lower flag, upper flag). These flags
  1964. determine whether values exactly equal to the lower or upper limits
  1965. are included. The default value is (True, True).
  1966. axis : int or None, optional
  1967. Axis along which to operate. If None, compute over the
  1968. whole array. Default is zero.
  1969. ddof : int, optional
  1970. Delta degrees of freedom. Default is 1.
  1971. Returns
  1972. -------
  1973. tsem : float
  1974. Notes
  1975. -----
  1976. For more details on `tsem`, see `scipy.stats.tsem`.
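Examples
--------
An illustrative sketch without limits: for the values 1 through 4, the
standard error is ``std(ddof=1)/sqrt(4)``, i.e. ``sqrt(5/3)/2``:
>>> import numpy as np
>>> from scipy.stats import mstats
>>> a = np.ma.array([1., 2., 3., 4.])
>>> bool(np.isclose(mstats.tsem(a), np.sqrt(5/3)/2))
True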
  1977. """
  1978. a = ma.asarray(a).ravel()
  1979. if limits is None:
  1980. n = float(a.count())
  1981. return a.std(axis=axis, ddof=ddof)/ma.sqrt(n)
  1982. am = trima(a.ravel(), limits, inclusive)
  1983. sd = np.sqrt(am.var(axis=axis, ddof=ddof))
  1984. return sd / np.sqrt(am.count())
  1985. def winsorize(a, limits=None, inclusive=(True, True), inplace=False,
  1986. axis=None, nan_policy='propagate'):
  1987. """Returns a Winsorized version of the input array.
  1988. The (limits[0])th lowest values are set to the (limits[0])th percentile,
  1989. and the (limits[1])th highest values are set to the (1 - limits[1])th
  1990. percentile.
  1991. Masked values are skipped.
  1992. Parameters
  1993. ----------
  1994. a : sequence
  1995. Input array.
  1996. limits : {None, tuple of float}, optional
  1997. Tuple of the percentages to cut on each side of the array, with respect
  1998. to the number of unmasked data, as floats between 0. and 1.
Denoting by n the number of unmasked data before trimming, the
  2000. (n*limits[0])th smallest data and the (n*limits[1])th largest data are
  2001. masked, and the total number of unmasked data after trimming
is n*(1.-sum(limits)). The value of one limit can be set to None to
  2003. indicate an open interval.
  2004. inclusive : {(True, True) tuple}, optional
  2005. Tuple indicating whether the number of data being masked on each side
  2006. should be truncated (True) or rounded (False).
  2007. inplace : {False, True}, optional
Whether to winsorize in place (True) or to use a copy (False).
  2009. axis : {None, int}, optional
  2010. Axis along which to trim. If None, the whole array is trimmed, but its
  2011. shape is maintained.
  2012. nan_policy : {'propagate', 'raise', 'omit'}, optional
  2013. Defines how to handle when input contains nan.
  2014. The following options are available (default is 'propagate'):
  2015. * 'propagate': allows nan values and may overwrite or propagate them
  2016. * 'raise': throws an error
  2017. * 'omit': performs the calculations ignoring nan values
  2018. Notes
  2019. -----
  2020. This function is applied to reduce the effect of possibly spurious outliers
  2021. by limiting the extreme values.
  2022. Examples
  2023. --------
  2024. >>> import numpy as np
  2025. >>> from scipy.stats.mstats import winsorize
  2026. A shuffled array contains integers from 1 to 10.
  2027. >>> a = np.array([10, 4, 9, 8, 5, 3, 7, 2, 1, 6])
The lowest 10% of values (i.e., `1`) and the highest 20% of values
(i.e., `9` and `10`) are replaced.
  2030. >>> winsorize(a, limits=[0.1, 0.2])
  2031. masked_array(data=[8, 4, 8, 8, 5, 3, 7, 2, 2, 6],
  2032. mask=False,
  2033. fill_value=999999)
  2034. """
  2035. def _winsorize1D(a, low_limit, up_limit, low_include, up_include,
  2036. contains_nan, nan_policy):
  2037. n = a.count()
  2038. idx = a.argsort()
  2039. if contains_nan:
  2040. nan_count = np.count_nonzero(np.isnan(a))
  2041. if low_limit:
  2042. if low_include:
  2043. lowidx = int(low_limit * n)
  2044. else:
  2045. lowidx = np.round(low_limit * n).astype(int)
  2046. if contains_nan and nan_policy == 'omit':
  2047. lowidx = min(lowidx, n-nan_count-1)
  2048. a[idx[:lowidx]] = a[idx[lowidx]]
  2049. if up_limit is not None:
  2050. if up_include:
  2051. upidx = n - int(n * up_limit)
  2052. else:
  2053. upidx = n - np.round(n * up_limit).astype(int)
  2054. if contains_nan and nan_policy == 'omit':
  2055. a[idx[upidx:-nan_count]] = a[idx[upidx - 1]]
  2056. else:
  2057. a[idx[upidx:]] = a[idx[upidx - 1]]
  2058. return a
  2059. contains_nan, nan_policy = _contains_nan(a, nan_policy)
  2060. # We are going to modify a: better make a copy
a = ma.array(a, copy=not inplace)
  2062. if limits is None:
  2063. return a
  2064. if (not isinstance(limits, tuple)) and isinstance(limits, float):
  2065. limits = (limits, limits)
  2066. # Check the limits
  2067. (lolim, uplim) = limits
  2068. errmsg = "The proportion to cut from the %s should be between 0. and 1."
  2069. if lolim is not None:
  2070. if lolim > 1. or lolim < 0:
  2071. raise ValueError(errmsg % 'beginning' + "(got %s)" % lolim)
  2072. if uplim is not None:
  2073. if uplim > 1. or uplim < 0:
  2074. raise ValueError(errmsg % 'end' + "(got %s)" % uplim)
  2075. (loinc, upinc) = inclusive
  2076. if axis is None:
  2077. shp = a.shape
  2078. return _winsorize1D(a.ravel(), lolim, uplim, loinc, upinc,
  2079. contains_nan, nan_policy).reshape(shp)
  2080. else:
  2081. return ma.apply_along_axis(_winsorize1D, axis, a, lolim, uplim, loinc,
  2082. upinc, contains_nan, nan_policy)
  2083. def moment(a, moment=1, axis=0):
  2084. """
  2085. Calculates the nth moment about the mean for a sample.
  2086. Parameters
  2087. ----------
  2088. a : array_like
  2089. data
  2090. moment : int, optional
  2091. order of central moment that is returned
  2092. axis : int or None, optional
  2093. Axis along which the central moment is computed. Default is 0.
  2094. If None, compute over the whole array `a`.
  2095. Returns
  2096. -------
  2097. n-th central moment : ndarray or float
  2098. The appropriate moment along the given axis or over all values if axis
  2099. is None. The denominator for the moment calculation is the number of
observations; no degrees of freedom correction is done.
  2101. Notes
  2102. -----
  2103. For more details about `moment`, see `scipy.stats.moment`.
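Examples
--------
An illustrative sketch: the second central moment of 0, 1, 2, 3 (mean
1.5, denominator n = 4) is (2.25 + 0.25 + 0.25 + 2.25)/4 = 1.25:
>>> import numpy as np
>>> from scipy.stats import mstats
>>> float(mstats.moment(np.arange(4.), moment=2))
1.25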
  2104. """
  2105. a, axis = _chk_asarray(a, axis)
  2106. if a.size == 0:
  2107. moment_shape = list(a.shape)
  2108. del moment_shape[axis]
  2109. dtype = a.dtype.type if a.dtype.kind in 'fc' else np.float64
  2110. # empty array, return nan(s) with shape matching `moment`
  2111. out_shape = (moment_shape if np.isscalar(moment)
  2112. else [len(moment)] + moment_shape)
  2113. if len(out_shape) == 0:
  2114. return dtype(np.nan)
  2115. else:
  2116. return ma.array(np.full(out_shape, np.nan, dtype=dtype))
  2117. # for array_like moment input, return a value for each.
  2118. if not np.isscalar(moment):
  2119. mean = a.mean(axis, keepdims=True)
  2120. mmnt = [_moment(a, i, axis, mean=mean) for i in moment]
  2121. return ma.array(mmnt)
  2122. else:
  2123. return _moment(a, moment, axis)
  2124. # Moment with optional pre-computed mean, equal to a.mean(axis, keepdims=True)
  2125. def _moment(a, moment, axis, *, mean=None):
  2126. if np.abs(moment - np.round(moment)) > 0:
  2127. raise ValueError("All moment parameters must be integers")
  2128. if moment == 0 or moment == 1:
  2129. # By definition the zeroth moment about the mean is 1, and the first
  2130. # moment is 0.
  2131. shape = list(a.shape)
  2132. del shape[axis]
  2133. dtype = a.dtype.type if a.dtype.kind in 'fc' else np.float64
  2134. if len(shape) == 0:
  2135. return dtype(1.0 if moment == 0 else 0.0)
  2136. else:
  2137. return (ma.ones(shape, dtype=dtype) if moment == 0
  2138. else ma.zeros(shape, dtype=dtype))
  2139. else:
  2140. # Exponentiation by squares: form exponent sequence
  2141. n_list = [moment]
  2142. current_n = moment
  2143. while current_n > 2:
  2144. if current_n % 2:
  2145. current_n = (current_n-1)/2
  2146. else:
  2147. current_n /= 2
  2148. n_list.append(current_n)
  2149. # Starting point for exponentiation by squares
  2150. mean = a.mean(axis, keepdims=True) if mean is None else mean
  2151. a_zero_mean = a - mean
  2152. if n_list[-1] == 1:
  2153. s = a_zero_mean.copy()
  2154. else:
  2155. s = a_zero_mean**2
  2156. # Perform multiplications
  2157. for n in n_list[-2::-1]:
  2158. s = s**2
  2159. if n % 2:
  2160. s *= a_zero_mean
  2161. return s.mean(axis)
  2162. def variation(a, axis=0, ddof=0):
  2163. """
  2164. Compute the coefficient of variation.
  2165. The coefficient of variation is the standard deviation divided by the
  2166. mean. This function is equivalent to::
  2167. np.std(x, axis=axis, ddof=ddof) / np.mean(x)
  2168. The default for ``ddof`` is 0, but many definitions of the coefficient
  2169. of variation use the square root of the unbiased sample variance
  2170. for the sample standard deviation, which corresponds to ``ddof=1``.
  2171. Parameters
  2172. ----------
  2173. a : array_like
  2174. Input array.
  2175. axis : int or None, optional
  2176. Axis along which to calculate the coefficient of variation. Default
  2177. is 0. If None, compute over the whole array `a`.
  2178. ddof : int, optional
  2179. Delta degrees of freedom. Default is 0.
  2180. Returns
  2181. -------
  2182. variation : ndarray
  2183. The calculated variation along the requested axis.
  2184. Notes
  2185. -----
  2186. For more details about `variation`, see `scipy.stats.variation`.
  2187. Examples
  2188. --------
  2189. >>> import numpy as np
  2190. >>> from scipy.stats.mstats import variation
  2191. >>> a = np.array([2,8,4])
  2192. >>> variation(a)
  2193. 0.5345224838248487
  2194. >>> b = np.array([2,8,3,4])
  2195. >>> c = np.ma.masked_array(b, mask=[0,0,1,0])
  2196. >>> variation(c)
  2197. 0.5345224838248487
  2198. In the example above, it can be seen that this works the same as
`scipy.stats.variation` except that `stats.mstats.variation` ignores masked
  2200. array elements.
  2201. """
  2202. a, axis = _chk_asarray(a, axis)
  2203. return a.std(axis, ddof=ddof)/a.mean(axis)
  2204. def skew(a, axis=0, bias=True):
  2205. """
  2206. Computes the skewness of a data set.
  2207. Parameters
  2208. ----------
  2209. a : ndarray
  2210. data
  2211. axis : int or None, optional
  2212. Axis along which skewness is calculated. Default is 0.
  2213. If None, compute over the whole array `a`.
  2214. bias : bool, optional
  2215. If False, then the calculations are corrected for statistical bias.
  2216. Returns
  2217. -------
  2218. skewness : ndarray
  2219. The skewness of values along an axis, returning 0 where all values are
  2220. equal.
  2221. Notes
  2222. -----
  2223. For more details about `skew`, see `scipy.stats.skew`.
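Examples
--------
An illustrative sketch: a symmetric sample has zero third central
moment, hence zero skewness:
>>> import numpy as np
>>> from scipy.stats import mstats
>>> float(mstats.skew(np.ma.array([1., 2., 3.])))
0.0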
  2224. """
  2225. a, axis = _chk_asarray(a,axis)
  2226. mean = a.mean(axis, keepdims=True)
  2227. m2 = _moment(a, 2, axis, mean=mean)
  2228. m3 = _moment(a, 3, axis, mean=mean)
  2229. zero = (m2 <= (np.finfo(m2.dtype).resolution * mean.squeeze(axis))**2)
  2230. with np.errstate(all='ignore'):
  2231. vals = ma.where(zero, 0, m3 / m2**1.5)
  2232. if not bias and zero is not ma.masked and m2 is not ma.masked:
  2233. n = a.count(axis)
  2234. can_correct = ~zero & (n > 2)
  2235. if can_correct.any():
  2236. n = np.extract(can_correct, n)
  2237. m2 = np.extract(can_correct, m2)
  2238. m3 = np.extract(can_correct, m3)
  2239. nval = ma.sqrt((n-1.0)*n)/(n-2.0)*m3/m2**1.5
  2240. np.place(vals, can_correct, nval)
  2241. return vals
  2242. def kurtosis(a, axis=0, fisher=True, bias=True):
  2243. """
  2244. Computes the kurtosis (Fisher or Pearson) of a dataset.
  2245. Kurtosis is the fourth central moment divided by the square of the
  2246. variance. If Fisher's definition is used, then 3.0 is subtracted from
  2247. the result to give 0.0 for a normal distribution.
If bias is False, then the kurtosis is calculated using k statistics to
eliminate bias coming from biased moment estimators.
  2250. Use `kurtosistest` to see if result is close enough to normal.
  2251. Parameters
  2252. ----------
  2253. a : array
  2254. data for which the kurtosis is calculated
  2255. axis : int or None, optional
  2256. Axis along which the kurtosis is calculated. Default is 0.
  2257. If None, compute over the whole array `a`.
  2258. fisher : bool, optional
  2259. If True, Fisher's definition is used (normal ==> 0.0). If False,
  2260. Pearson's definition is used (normal ==> 3.0).
  2261. bias : bool, optional
  2262. If False, then the calculations are corrected for statistical bias.
  2263. Returns
  2264. -------
  2265. kurtosis : array
  2266. The kurtosis of values along an axis. If all values are equal,
  2267. return -3 for Fisher's definition and 0 for Pearson's definition.
  2268. Notes
  2269. -----
  2270. For more details about `kurtosis`, see `scipy.stats.kurtosis`.
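Examples
--------
An illustrative sketch with Pearson's definition: for 1, 2, 3, 4 the
central moments are m4 = 2.5625 and m2 = 1.25, so m4/m2**2 = 1.64:
>>> from scipy.stats import mstats
>>> float(mstats.kurtosis([1., 2., 3., 4.], fisher=False))
1.64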
  2271. """
  2272. a, axis = _chk_asarray(a, axis)
  2273. mean = a.mean(axis, keepdims=True)
  2274. m2 = _moment(a, 2, axis, mean=mean)
  2275. m4 = _moment(a, 4, axis, mean=mean)
  2276. zero = (m2 <= (np.finfo(m2.dtype).resolution * mean.squeeze(axis))**2)
  2277. with np.errstate(all='ignore'):
  2278. vals = ma.where(zero, 0, m4 / m2**2.0)
  2279. if not bias and zero is not ma.masked and m2 is not ma.masked:
  2280. n = a.count(axis)
  2281. can_correct = ~zero & (n > 3)
  2282. if can_correct.any():
  2283. n = np.extract(can_correct, n)
  2284. m2 = np.extract(can_correct, m2)
  2285. m4 = np.extract(can_correct, m4)
  2286. nval = 1.0/(n-2)/(n-3)*((n*n-1.0)*m4/m2**2.0-3*(n-1)**2.0)
  2287. np.place(vals, can_correct, nval+3.0)
  2288. if fisher:
  2289. return vals - 3
  2290. else:
  2291. return vals
  2292. DescribeResult = namedtuple('DescribeResult', ('nobs', 'minmax', 'mean',
  2293. 'variance', 'skewness',
  2294. 'kurtosis'))
  2295. def describe(a, axis=0, ddof=0, bias=True):
  2296. """
  2297. Computes several descriptive statistics of the passed array.
  2298. Parameters
  2299. ----------
  2300. a : array_like
  2301. Data array
  2302. axis : int or None, optional
  2303. Axis along which to calculate statistics. Default 0. If None,
  2304. compute over the whole array `a`.
  2305. ddof : int, optional
  2306. degree of freedom (default 0); note that default ddof is different
  2307. from the same routine in stats.describe
  2308. bias : bool, optional
  2309. If False, then the skewness and kurtosis calculations are corrected for
  2310. statistical bias.
  2311. Returns
  2312. -------
  2313. nobs : int
size of the data (discarding missing values)
  2315. minmax : (int, int)
  2316. min, max
  2317. mean : float
  2318. arithmetic mean
  2319. variance : float
variance, computed with the given `ddof` (biased for the default ddof=0)
skewness : float
skewness, biased unless ``bias=False``
kurtosis : float
kurtosis, biased unless ``bias=False``
  2325. Examples
  2326. --------
  2327. >>> import numpy as np
  2328. >>> from scipy.stats.mstats import describe
  2329. >>> ma = np.ma.array(range(6), mask=[0, 0, 0, 1, 1, 1])
  2330. >>> describe(ma)
  2331. DescribeResult(nobs=3, minmax=(masked_array(data=0,
  2332. mask=False,
  2333. fill_value=999999), masked_array(data=2,
  2334. mask=False,
  2335. fill_value=999999)), mean=1.0, variance=0.6666666666666666,
  2336. skewness=masked_array(data=0., mask=False, fill_value=1e+20),
  2337. kurtosis=-1.5)
  2338. """
  2339. a, axis = _chk_asarray(a, axis)
  2340. n = a.count(axis)
  2341. mm = (ma.minimum.reduce(a, axis=axis), ma.maximum.reduce(a, axis=axis))
  2342. m = a.mean(axis)
  2343. v = a.var(axis, ddof=ddof)
  2344. sk = skew(a, axis, bias=bias)
  2345. kurt = kurtosis(a, axis, bias=bias)
  2346. return DescribeResult(n, mm, m, v, sk, kurt)
  2347. def stde_median(data, axis=None):
  2348. """Returns the McKean-Schrader estimate of the standard error of the sample
median along the given axis. Masked values are discarded.
  2350. Parameters
  2351. ----------
  2352. data : ndarray
Input data.
axis : {None,int}, optional
Axis along which to compute the standard error of the median.
  2356. If None, the input array is first flattened.
  2357. """
  2358. def _stdemed_1D(data):
  2359. data = np.sort(data.compressed())
  2360. n = len(data)
  2361. z = 2.5758293035489004
  2362. k = int(np.round((n+1)/2. - z * np.sqrt(n/4.),0))
  2363. return ((data[n-k] - data[k-1])/(2.*z))
  2364. data = ma.array(data, copy=False, subok=True)
  2365. if (axis is None):
  2366. return _stdemed_1D(data)
  2367. else:
  2368. if data.ndim > 2:
  2369. raise ValueError("Array 'data' must be at most two dimensional, "
  2370. "but got data.ndim = %d" % data.ndim)
  2371. return ma.apply_along_axis(_stdemed_1D, axis, data)
  2372. SkewtestResult = namedtuple('SkewtestResult', ('statistic', 'pvalue'))
  2373. def skewtest(a, axis=0, alternative='two-sided'):
  2374. """
  2375. Tests whether the skew is different from the normal distribution.
  2376. Parameters
  2377. ----------
  2378. a : array_like
  2379. The data to be tested
  2380. axis : int or None, optional
  2381. Axis along which statistics are calculated. Default is 0.
  2382. If None, compute over the whole array `a`.
  2383. alternative : {'two-sided', 'less', 'greater'}, optional
  2384. Defines the alternative hypothesis. Default is 'two-sided'.
  2385. The following options are available:
  2386. * 'two-sided': the skewness of the distribution underlying the sample
  2387. is different from that of the normal distribution (i.e. 0)
  2388. * 'less': the skewness of the distribution underlying the sample
  2389. is less than that of the normal distribution
  2390. * 'greater': the skewness of the distribution underlying the sample
  2391. is greater than that of the normal distribution
  2392. .. versionadded:: 1.7.0
  2393. Returns
  2394. -------
  2395. statistic : array_like
  2396. The computed z-score for this test.
  2397. pvalue : array_like
  2398. A p-value for the hypothesis test
  2399. Notes
  2400. -----
  2401. For more details about `skewtest`, see `scipy.stats.skewtest`.
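Examples
--------
A usage sketch (at least 8 unmasked observations are required); the
seed below is arbitrary:
>>> import numpy as np
>>> from scipy.stats import mstats
>>> rng = np.random.default_rng(1234)
>>> x = np.ma.array(rng.normal(size=30))
>>> stat, p = mstats.skewtest(x)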
  2402. """
  2403. a, axis = _chk_asarray(a, axis)
  2404. if axis is None:
  2405. a = a.ravel()
  2406. axis = 0
  2407. b2 = skew(a,axis)
  2408. n = a.count(axis)
  2409. if np.min(n) < 8:
  2410. raise ValueError(
  2411. "skewtest is not valid with less than 8 samples; %i samples"
  2412. " were given." % np.min(n))
  2413. y = b2 * ma.sqrt(((n+1)*(n+3)) / (6.0*(n-2)))
  2414. beta2 = (3.0*(n*n+27*n-70)*(n+1)*(n+3)) / ((n-2.0)*(n+5)*(n+7)*(n+9))
  2415. W2 = -1 + ma.sqrt(2*(beta2-1))
  2416. delta = 1/ma.sqrt(0.5*ma.log(W2))
  2417. alpha = ma.sqrt(2.0/(W2-1))
  2418. y = ma.where(y == 0, 1, y)
  2419. Z = delta*ma.log(y/alpha + ma.sqrt((y/alpha)**2+1))
  2420. return SkewtestResult(*scipy.stats._stats_py._normtest_finish(Z, alternative))


KurtosistestResult = namedtuple('KurtosistestResult', ('statistic', 'pvalue'))


def kurtosistest(a, axis=0, alternative='two-sided'):
    """
    Tests whether a dataset has normal kurtosis.

    Parameters
    ----------
    a : array_like
        Array of the sample data.
    axis : int or None, optional
        Axis along which to compute the test. Default is 0. If None,
        compute over the whole array `a`.
    alternative : {'two-sided', 'less', 'greater'}, optional
        Defines the alternative hypothesis.
        The following options are available (default is 'two-sided'):

        * 'two-sided': the kurtosis of the distribution underlying the sample
          is different from that of the normal distribution
        * 'less': the kurtosis of the distribution underlying the sample
          is less than that of the normal distribution
        * 'greater': the kurtosis of the distribution underlying the sample
          is greater than that of the normal distribution

        .. versionadded:: 1.7.0

    Returns
    -------
    statistic : array_like
        The computed z-score for this test.
    pvalue : array_like
        The p-value for the hypothesis test.

    Notes
    -----
    For more details about `kurtosistest`, see `scipy.stats.kurtosistest`.
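
    Examples
    --------
    A minimal usage sketch on synthetic data; at least 5 unmasked
    observations are required, and at least 20 to avoid a validity warning:

    >>> import numpy as np
    >>> from scipy.stats.mstats import kurtosistest
    >>> rng = np.random.default_rng(1234)
    >>> x = np.ma.masked_invalid(rng.normal(size=30))
    >>> res = kurtosistest(x)  # z-score and p-value for normal kurtosis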
  2451. """
  2452. a, axis = _chk_asarray(a, axis)
  2453. n = a.count(axis=axis)
  2454. if np.min(n) < 5:
  2455. raise ValueError(
  2456. "kurtosistest requires at least 5 observations; %i observations"
  2457. " were given." % np.min(n))
  2458. if np.min(n) < 20:
  2459. warnings.warn(
  2460. "kurtosistest only valid for n>=20 ... continuing anyway, n=%i" %
  2461. np.min(n))
  2462. b2 = kurtosis(a, axis, fisher=False)
  2463. E = 3.0*(n-1) / (n+1)
  2464. varb2 = 24.0*n*(n-2.)*(n-3) / ((n+1)*(n+1.)*(n+3)*(n+5))
  2465. x = (b2-E)/ma.sqrt(varb2)
  2466. sqrtbeta1 = 6.0*(n*n-5*n+2)/((n+7)*(n+9)) * np.sqrt((6.0*(n+3)*(n+5)) /
  2467. (n*(n-2)*(n-3)))
  2468. A = 6.0 + 8.0/sqrtbeta1 * (2.0/sqrtbeta1 + np.sqrt(1+4.0/(sqrtbeta1**2)))
  2469. term1 = 1 - 2./(9.0*A)
  2470. denom = 1 + x*ma.sqrt(2/(A-4.0))
  2471. if np.ma.isMaskedArray(denom):
  2472. # For multi-dimensional array input
  2473. denom[denom == 0.0] = masked
  2474. elif denom == 0.0:
  2475. denom = masked
  2476. term2 = np.ma.where(denom > 0, ma.power((1-2.0/A)/denom, 1/3.0),
  2477. -ma.power(-(1-2.0/A)/denom, 1/3.0))
  2478. Z = (term1 - term2) / np.sqrt(2/(9.0*A))
  2479. return KurtosistestResult(
  2480. *scipy.stats._stats_py._normtest_finish(Z, alternative)
  2481. )


NormaltestResult = namedtuple('NormaltestResult', ('statistic', 'pvalue'))


def normaltest(a, axis=0):
    """
    Tests whether a sample differs from a normal distribution.

    Parameters
    ----------
    a : array_like
        The array containing the data to be tested.
    axis : int or None, optional
        Axis along which to compute the test. Default is 0. If None,
        compute over the whole array `a`.

    Returns
    -------
    statistic : float or array
        ``s^2 + k^2``, where ``s`` is the z-score returned by `skewtest` and
        ``k`` is the z-score returned by `kurtosistest`.
    pvalue : float or array
        A 2-sided chi squared probability for the hypothesis test.

    Notes
    -----
    For more details about `normaltest`, see `scipy.stats.normaltest`.
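
    Examples
    --------
    A minimal usage sketch on synthetic data; the statistic combines
    `skewtest` and `kurtosistest`, so their sample-size requirements apply:

    >>> import numpy as np
    >>> from scipy.stats.mstats import normaltest
    >>> rng = np.random.default_rng(1234)
    >>> x = np.ma.array(rng.normal(size=50))
    >>> stat, p = normaltest(x)  # a large p is consistent with normality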
  2503. """
  2504. a, axis = _chk_asarray(a, axis)
  2505. s, _ = skewtest(a, axis)
  2506. k, _ = kurtosistest(a, axis)
  2507. k2 = s*s + k*k
  2508. return NormaltestResult(k2, distributions.chi2.sf(k2, 2))


def mquantiles(a, prob=list([.25, .5, .75]), alphap=.4, betap=.4, axis=None,
               limit=()):
    """
    Computes empirical quantiles for a data array.

    Sample quantiles are defined by ``Q(p) = (1-gamma)*x[j] + gamma*x[j+1]``,
    where ``x[j]`` is the j-th order statistic, and gamma is a function of
    ``j = floor(n*p + m)``, ``m = alphap + p*(1 - alphap - betap)`` and
    ``g = n*p + m - j``.

    Reinterpreting the above equations to compare to **R** leads to the
    equation: ``p(k) = (k - alphap)/(n + 1 - alphap - betap)``

    Typical values of (alphap, betap) are:

    - (0,1)    : ``p(k) = k/n`` : linear interpolation of cdf
      (**R** type 4)
    - (.5,.5)  : ``p(k) = (k - 1/2.)/n`` : piecewise linear function
      (**R** type 5)
    - (0,0)    : ``p(k) = k/(n+1)`` :
      (**R** type 6)
    - (1,1)    : ``p(k) = (k-1)/(n-1)``: p(k) = mode[F(x[k])].
      (**R** type 7, **R** default)
    - (1/3,1/3): ``p(k) = (k-1/3)/(n+1/3)``: Then p(k) ~ median[F(x[k])].
      The resulting quantile estimates are approximately median-unbiased
      regardless of the distribution of x.
      (**R** type 8)
    - (3/8,3/8): ``p(k) = (k-3/8)/(n+1/4)``: Blom.
      The resulting quantile estimates are approximately unbiased
      if x is normally distributed
      (**R** type 9)
    - (.4,.4)  : approximately quantile unbiased (Cunnane)
    - (.35,.35): APL, used with PWM

    Parameters
    ----------
    a : array_like
        Input data, as a sequence or array of dimension at most 2.
    prob : array_like, optional
        List of quantiles to compute.
    alphap : float, optional
        Plotting positions parameter, default is 0.4.
    betap : float, optional
        Plotting positions parameter, default is 0.4.
    axis : int, optional
        Axis along which to compute the quantiles.
        If None (default), the input array is first flattened.
    limit : tuple, optional
        Tuple of (lower, upper) values.
        Values of `a` outside this open interval are ignored.

    Returns
    -------
    mquantiles : MaskedArray
        An array containing the calculated quantiles.

    Notes
    -----
    This formulation is very similar to **R** except the calculation of
    ``m`` from ``alphap`` and ``betap``, where in **R** ``m`` is defined
    with each type.

    References
    ----------
    .. [1] *R* statistical software: https://www.r-project.org/
    .. [2] *R* ``quantile`` function:
           http://stat.ethz.ch/R-manual/R-devel/library/stats/html/quantile.html

    Examples
    --------
    >>> import numpy as np
    >>> from scipy.stats.mstats import mquantiles
    >>> a = np.array([6., 47., 49., 15., 42., 41., 7., 39., 43., 40., 36.])
    >>> mquantiles(a)
    array([ 19.2,  40. ,  42.8])

    Using a 2D array, specifying axis and limit.

    >>> data = np.array([[   6.,    7.,    1.],
    ...                  [  47.,   15.,    2.],
    ...                  [  49.,   36.,    3.],
    ...                  [  15.,   39.,    4.],
    ...                  [  42.,   40., -999.],
    ...                  [  41.,   41., -999.],
    ...                  [   7., -999., -999.],
    ...                  [  39., -999., -999.],
    ...                  [  43., -999., -999.],
    ...                  [  40., -999., -999.],
    ...                  [  36., -999., -999.]])
    >>> print(mquantiles(data, axis=0, limit=(0, 50)))
    [[19.2  14.6   1.45]
     [40.   37.5   2.5 ]
     [42.8  40.05  3.55]]

    >>> data[:, 2] = -999.
    >>> print(mquantiles(data, axis=0, limit=(0, 50)))
    [[19.200000000000003 14.6 --]
     [40.0 37.5 --]
     [42.800000000000004 40.05 --]]

    """
    def _quantiles1D(data, m, p):
        x = np.sort(data.compressed())
        n = len(x)
        if n == 0:
            return ma.array(np.empty(len(p), dtype=float), mask=True)
        elif n == 1:
            return ma.array(np.resize(x, p.shape), mask=nomask)
        aleph = (n*p + m)
        k = np.floor(aleph.clip(1, n-1)).astype(int)
        gamma = (aleph-k).clip(0, 1)
        return (1.-gamma)*x[(k-1).tolist()] + gamma*x[k.tolist()]

    data = ma.array(a, copy=False)
    if data.ndim > 2:
        raise TypeError("Array should be 2D at most!")

    if limit:
        condition = (limit[0] < data) & (data < limit[1])
        data[~condition.filled(True)] = masked

    p = np.array(prob, copy=False, ndmin=1)
    m = alphap + p*(1.-alphap-betap)
    # Computes quantiles along axis (or globally)
    if (axis is None):
        return _quantiles1D(data, m, p)

    return ma.apply_along_axis(_quantiles1D, axis, data, m, p)


def scoreatpercentile(data, per, limit=(), alphap=.4, betap=.4):
    """Calculate the score at the given 'per' percentile of the
    sequence `data`. For example, the score at per=50 is the median.

    This function is a shortcut to `mquantiles`.
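
    Examples
    --------
    A minimal usage sketch; with the default plotting positions, the score
    at ``per=50`` is the sample median (2.5 for this made-up data):

    >>> import numpy as np
    >>> from scipy.stats.mstats import scoreatpercentile
    >>> a = np.ma.array([1., 2., 3., 4.])
    >>> med = scoreatpercentile(a, 50)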
  2624. """
  2625. if (per < 0) or (per > 100.):
  2626. raise ValueError("The percentile should be between 0. and 100. !"
  2627. " (got %s)" % per)
  2628. return mquantiles(data, prob=[per/100.], alphap=alphap, betap=betap,
  2629. limit=limit, axis=0).squeeze()


def plotting_positions(data, alpha=0.4, beta=0.4):
    """
    Returns plotting positions (or empirical percentile points) for the data.

    Plotting positions are defined as ``(i-alpha)/(n+1-alpha-beta)``, where:
        - i is the rank order statistic
        - n is the number of unmasked values along the given axis
        - `alpha` and `beta` are two parameters.

    Typical values for `alpha` and `beta` are:
        - (0,1)    : ``p(k) = k/n``, linear interpolation of cdf (R, type 4)
        - (.5,.5)  : ``p(k) = (k-1/2.)/n``, piecewise linear function
          (R, type 5)
        - (0,0)    : ``p(k) = k/(n+1)``, Weibull (R type 6)
        - (1,1)    : ``p(k) = (k-1)/(n-1)``, in this case,
          ``p(k) = mode[F(x[k])]``. That's R default (R type 7)
        - (1/3,1/3): ``p(k) = (k-1/3)/(n+1/3)``, then
          ``p(k) ~ median[F(x[k])]``.
          The resulting quantile estimates are approximately median-unbiased
          regardless of the distribution of x. (R type 8)
        - (3/8,3/8): ``p(k) = (k-3/8)/(n+1/4)``, Blom.
          The resulting quantile estimates are approximately unbiased
          if x is normally distributed (R type 9)
        - (.4,.4)  : approximately quantile unbiased (Cunnane)
        - (.35,.35): APL, used with PWM
        - (.3175, .3175): used in scipy.stats.probplot

    Parameters
    ----------
    data : array_like
        Input data, as a sequence or array of dimension at most 2.
    alpha : float, optional
        Plotting positions parameter. Default is 0.4.
    beta : float, optional
        Plotting positions parameter. Default is 0.4.

    Returns
    -------
    positions : MaskedArray
        The calculated plotting positions.
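
    Examples
    --------
    A minimal usage sketch; with ``alpha=0, beta=0`` the positions reduce to
    the Weibull form ``k/(n+1)``, so this made-up sample gets positions
    0.6, 0.2, 0.8 and 0.4, matching each value's rank:

    >>> import numpy as np
    >>> from scipy.stats.mstats import plotting_positions
    >>> x = np.ma.array([30., 10., 40., 20.])
    >>> pos = plotting_positions(x, alpha=0, beta=0)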
  2666. """
  2667. data = ma.array(data, copy=False).reshape(1,-1)
  2668. n = data.count()
  2669. plpos = np.empty(data.size, dtype=float)
  2670. plpos[n:] = 0
  2671. plpos[data.argsort(axis=None)[:n]] = ((np.arange(1, n+1) - alpha) /
  2672. (n + 1.0 - alpha - beta))
  2673. return ma.array(plpos, mask=data._mask)
  2674. meppf = plotting_positions


def obrientransform(*args):
    """
    Computes a transform on input data (any number of columns). Used to
    test for homogeneity of variance prior to running one-way stats. Each
    array in ``*args`` is one level of a factor. If an `f_oneway()` run on
    the transformed data is found significant, the variances are unequal.
    From Maxwell and Delaney, p.112.

    Returns: transformed data for use in an ANOVA
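
    Examples
    --------
    A minimal usage sketch with made-up groups; the transformed columns
    (one per input group) can then be fed to `f_oneway`:

    >>> from scipy.stats.mstats import obrientransform, f_oneway
    >>> g1 = [10., 12., 14., 16., 18.]
    >>> g2 = [5., 15., 25., 35., 45.]
    >>> t = obrientransform(g1, g2)  # one column per group
    >>> res = f_oneway(t[:, 0], t[:, 1])  # significant -> unequal variances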
  2683. """
  2684. data = argstoarray(*args).T
  2685. v = data.var(axis=0,ddof=1)
  2686. m = data.mean(0)
  2687. n = data.count(0).astype(float)
  2688. # result = ((N-1.5)*N*(a-m)**2 - 0.5*v*(n-1))/((n-1)*(n-2))
  2689. data -= m
  2690. data **= 2
  2691. data *= (n-1.5)*n
  2692. data -= 0.5*v*(n-1)
  2693. data /= (n-1.)*(n-2.)
  2694. if not ma.allclose(v,data.mean(0)):
  2695. raise ValueError("Lack of convergence in obrientransform.")
  2696. return data


def sem(a, axis=0, ddof=1):
    """
    Calculates the standard error of the mean of the input array.

    Also sometimes called standard error of measurement.

    Parameters
    ----------
    a : array_like
        An array containing the values for which the standard error is
        returned.
    axis : int or None, optional
        If axis is None, ravel `a` first. If axis is an integer, this will be
        the axis over which to operate. Defaults to 0.
    ddof : int, optional
        Delta degrees-of-freedom. How many degrees of freedom to adjust
        for bias in limited samples relative to the population estimate
        of variance. Defaults to 1.

    Returns
    -------
    s : ndarray or float
        The standard error of the mean in the sample(s), along the input axis.

    Notes
    -----
    The default value for `ddof` changed in scipy 0.15.0 to be consistent with
    `scipy.stats.sem` as well as with the most common definition used (like in
    the R documentation).

    Examples
    --------
    Find standard error along the first axis:

    >>> import numpy as np
    >>> from scipy import stats
    >>> a = np.arange(20).reshape(5, 4)
    >>> print(stats.mstats.sem(a))
    [2.8284271247461903 2.8284271247461903 2.8284271247461903
     2.8284271247461903]

    Find standard error across the whole array, using n degrees of freedom:

    >>> print(stats.mstats.sem(a, axis=None, ddof=0))
    1.2893796958227628

    """
    a, axis = _chk_asarray(a, axis)
    n = a.count(axis=axis)
    s = a.std(axis=axis, ddof=ddof) / ma.sqrt(n)
    return s


F_onewayResult = namedtuple('F_onewayResult', ('statistic', 'pvalue'))


def f_oneway(*args):
    """
    Performs a 1-way ANOVA, returning an F-value and probability given
    any number of groups. From Heiman, pp.394-7.

    Usage: ``f_oneway(*args)``, where ``*args`` is 2 or more arrays,
    one per treatment group.

    Returns
    -------
    statistic : float
        The computed F-value of the test.
    pvalue : float
        The associated p-value from the F-distribution.
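
    Examples
    --------
    A minimal usage sketch with three made-up treatment groups; the printed
    values are omitted since they depend on the data:

    >>> from scipy.stats.mstats import f_oneway
    >>> res = f_oneway([6., 7., 8.], [10., 12., 14.], [5., 6., 7.])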
  2752. """
  2753. # Construct a single array of arguments: each row is a group
  2754. data = argstoarray(*args)
  2755. ngroups = len(data)
  2756. ntot = data.count()
  2757. sstot = (data**2).sum() - (data.sum())**2/float(ntot)
  2758. ssbg = (data.count(-1) * (data.mean(-1)-data.mean())**2).sum()
  2759. sswg = sstot-ssbg
  2760. dfbg = ngroups-1
  2761. dfwg = ntot - ngroups
  2762. msb = ssbg/float(dfbg)
  2763. msw = sswg/float(dfwg)
  2764. f = msb/msw
  2765. prob = special.fdtrc(dfbg, dfwg, f) # equivalent to stats.f.sf
  2766. return F_onewayResult(f, prob)


FriedmanchisquareResult = namedtuple('FriedmanchisquareResult',
                                     ('statistic', 'pvalue'))


def friedmanchisquare(*args):
    """Friedman Chi-Square is a non-parametric, one-way within-subjects ANOVA.
    This function calculates the Friedman Chi-square test for repeated measures
    and returns the result, along with the associated probability value.

    Each input is considered a given group. Ideally, the number of treatments
    among each group should be equal. If this is not the case, only the first
    n treatments are taken into account, where n is the number of treatments
    of the smallest group.
    If a group has some missing values, the corresponding treatments are masked
    in the other groups.
    The test statistic is corrected for ties.

    Masked values in one group are propagated to the other groups.

    Returns
    -------
    statistic : float
        The test statistic.
    pvalue : float
        The associated p-value.
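
    Examples
    --------
    A minimal usage sketch with three made-up repeated measurements on five
    subjects; at least 3 groups are required:

    >>> from scipy.stats.mstats import friedmanchisquare
    >>> t1 = [7.0, 9.9, 8.5, 5.1, 10.3]
    >>> t2 = [5.3, 5.7, 4.7, 3.5, 7.7]
    >>> t3 = [4.9, 7.6, 5.5, 2.8, 8.4]
    >>> res = friedmanchisquare(t1, t2, t3)  # statistic and p-value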
  2787. """
  2788. data = argstoarray(*args).astype(float)
  2789. k = len(data)
  2790. if k < 3:
  2791. raise ValueError("Less than 3 groups (%i): " % k +
  2792. "the Friedman test is NOT appropriate.")
  2793. ranked = ma.masked_values(rankdata(data, axis=0), 0)
  2794. if ranked._mask is not nomask:
  2795. ranked = ma.mask_cols(ranked)
  2796. ranked = ranked.compressed().reshape(k,-1).view(ndarray)
  2797. else:
  2798. ranked = ranked._data
  2799. (k,n) = ranked.shape
  2800. # Ties correction
  2801. repeats = [find_repeats(row) for row in ranked.T]
  2802. ties = np.array([y for x, y in repeats if x.size > 0])
  2803. tie_correction = 1 - (ties**3-ties).sum()/float(n*(k**3-k))
  2804. ssbg = np.sum((ranked.sum(-1) - n*(k+1)/2.)**2)
  2805. chisq = ssbg * 12./(n*k*(k+1)) * 1./tie_correction
  2806. return FriedmanchisquareResult(chisq,
  2807. distributions.chi2.sf(chisq, k-1))


BrunnerMunzelResult = namedtuple('BrunnerMunzelResult', ('statistic', 'pvalue'))


def brunnermunzel(x, y, alternative="two-sided", distribution="t"):
    """
    Computes the Brunner-Munzel test on samples x and y.

    Missing values in `x` and/or `y` are discarded.

    Parameters
    ----------
    x, y : array_like
        Array of samples, should be one-dimensional.
    alternative : 'less', 'two-sided', or 'greater', optional
        Whether to get the p-value for the one-sided hypothesis ('less'
        or 'greater') or for the two-sided hypothesis ('two-sided').
        Default value is 'two-sided'.
    distribution : 't' or 'normal', optional
        Whether to get the p-value by t-distribution or by standard normal
        distribution.
        Default value is 't'.

    Returns
    -------
    statistic : float
        The Brunner-Munzel W statistic.
    pvalue : float
        p-value assuming a t distribution. One-sided or
        two-sided, depending on the choice of `alternative` and `distribution`.

    See Also
    --------
    mannwhitneyu : Mann-Whitney rank test on two samples.

    Notes
    -----
    For more details on `brunnermunzel`, see `scipy.stats.brunnermunzel`.
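
    Examples
    --------
    A minimal usage sketch with made-up samples; invalid entries are masked
    and then discarded by the test:

    >>> import numpy as np
    >>> from scipy.stats.mstats import brunnermunzel
    >>> x = np.ma.masked_invalid([1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 4, 1, 1])
    >>> y = np.ma.masked_invalid([3, 3, 4, 3, 1, 2, 3, 1, 1, 5, 4, np.nan])
    >>> res = brunnermunzel(x, y)  # W statistic and p-value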
  2838. """
  2839. x = ma.asarray(x).compressed().view(ndarray)
  2840. y = ma.asarray(y).compressed().view(ndarray)
  2841. nx = len(x)
  2842. ny = len(y)
  2843. if nx == 0 or ny == 0:
  2844. return BrunnerMunzelResult(np.nan, np.nan)
  2845. rankc = rankdata(np.concatenate((x,y)))
  2846. rankcx = rankc[0:nx]
  2847. rankcy = rankc[nx:nx+ny]
  2848. rankcx_mean = np.mean(rankcx)
  2849. rankcy_mean = np.mean(rankcy)
  2850. rankx = rankdata(x)
  2851. ranky = rankdata(y)
  2852. rankx_mean = np.mean(rankx)
  2853. ranky_mean = np.mean(ranky)
  2854. Sx = np.sum(np.power(rankcx - rankx - rankcx_mean + rankx_mean, 2.0))
  2855. Sx /= nx - 1
  2856. Sy = np.sum(np.power(rankcy - ranky - rankcy_mean + ranky_mean, 2.0))
  2857. Sy /= ny - 1
  2858. wbfn = nx * ny * (rankcy_mean - rankcx_mean)
  2859. wbfn /= (nx + ny) * np.sqrt(nx * Sx + ny * Sy)
  2860. if distribution == "t":
  2861. df_numer = np.power(nx * Sx + ny * Sy, 2.0)
  2862. df_denom = np.power(nx * Sx, 2.0) / (nx - 1)
  2863. df_denom += np.power(ny * Sy, 2.0) / (ny - 1)
  2864. df = df_numer / df_denom
  2865. p = distributions.t.cdf(wbfn, df)
  2866. elif distribution == "normal":
  2867. p = distributions.norm.cdf(wbfn)
  2868. else:
  2869. raise ValueError(
  2870. "distribution should be 't' or 'normal'")
  2871. if alternative == "greater":
  2872. pass
  2873. elif alternative == "less":
  2874. p = 1 - p
  2875. elif alternative == "two-sided":
  2876. p = 2 * np.min([p, 1-p])
  2877. else:
  2878. raise ValueError(
  2879. "alternative should be 'less', 'greater' or 'two-sided'")
  2880. return BrunnerMunzelResult(wbfn, p)