- """
- An extension of scipy.stats._stats_py to support masked arrays
- """
- # Original author (2007): Pierre GF Gerard-Marchant
- __all__ = ['argstoarray',
- 'count_tied_groups',
- 'describe',
- 'f_oneway', 'find_repeats','friedmanchisquare',
- 'kendalltau','kendalltau_seasonal','kruskal','kruskalwallis',
- 'ks_twosamp', 'ks_2samp', 'kurtosis', 'kurtosistest',
- 'ks_1samp', 'kstest',
- 'linregress',
- 'mannwhitneyu', 'meppf','mode','moment','mquantiles','msign',
- 'normaltest',
- 'obrientransform',
- 'pearsonr','plotting_positions','pointbiserialr',
- 'rankdata',
- 'scoreatpercentile','sem',
- 'sen_seasonal_slopes','skew','skewtest','spearmanr',
- 'siegelslopes', 'theilslopes',
- 'tmax','tmean','tmin','trim','trimboth',
- 'trimtail','trima','trimr','trimmed_mean','trimmed_std',
- 'trimmed_stde','trimmed_var','tsem','ttest_1samp','ttest_onesamp',
- 'ttest_ind','ttest_rel','tvar',
- 'variation',
- 'winsorize',
- 'brunnermunzel',
- ]
- import numpy as np
- from numpy import ndarray
- import numpy.ma as ma
- from numpy.ma import masked, nomask
- import math
- import itertools
- import warnings
- from collections import namedtuple
- from . import distributions
- from scipy._lib._util import _rename_parameter, _contains_nan
- from scipy._lib._bunch import _make_tuple_bunch
- import scipy.special as special
- import scipy.stats._stats_py
- from ._stats_mstats_common import (
- _find_repeats,
- linregress as stats_linregress,
- LinregressResult as stats_LinregressResult,
- theilslopes as stats_theilslopes,
- siegelslopes as stats_siegelslopes
- )
- def _chk_asarray(a, axis):
- # Always returns a masked array, raveled for axis=None
- a = ma.asanyarray(a)
- if axis is None:
- a = ma.ravel(a)
- outaxis = 0
- else:
- outaxis = axis
- return a, outaxis
- def _chk2_asarray(a, b, axis):
- a = ma.asanyarray(a)
- b = ma.asanyarray(b)
- if axis is None:
- a = ma.ravel(a)
- b = ma.ravel(b)
- outaxis = 0
- else:
- outaxis = axis
- return a, b, outaxis
- def _chk_size(a, b):
- a = ma.asanyarray(a)
- b = ma.asanyarray(b)
- (na, nb) = (a.size, b.size)
- if na != nb:
- raise ValueError("The size of the input array should match!"
- " (%s <> %s)" % (na, nb))
- return (a, b, na)
- def argstoarray(*args):
- """
- Constructs a 2D array from a group of sequences.
- Sequences are filled with missing values to match the length of the longest
- sequence.
- Parameters
- ----------
- *args : sequences
- Group of sequences.
- Returns
- -------
- argstoarray : MaskedArray
- A (`m` x `n`) masked array, where `m` is the number of arguments and
- `n` the length of the longest argument.
- Notes
- -----
- `numpy.ma.row_stack` has identical behavior, but is called with a sequence
- of sequences.
- Examples
- --------
- A 2D masked array constructed from a group of sequences is returned.
- >>> from scipy.stats.mstats import argstoarray
- >>> argstoarray([1, 2, 3], [4, 5, 6])
- masked_array(
- data=[[1.0, 2.0, 3.0],
- [4.0, 5.0, 6.0]],
- mask=[[False, False, False],
- [False, False, False]],
- fill_value=1e+20)
- The returned masked array is filled with missing values when the
- lengths of the sequences differ.
- >>> argstoarray([1, 3], [4, 5, 6])
- masked_array(
- data=[[1.0, 3.0, --],
- [4.0, 5.0, 6.0]],
- mask=[[False, False, True],
- [False, False, False]],
- fill_value=1e+20)
- """
- if len(args) == 1 and not isinstance(args[0], ndarray):
- output = ma.asarray(args[0])
- if output.ndim != 2:
- raise ValueError("The input should be 2D")
- else:
- n = len(args)
- m = max([len(k) for k in args])
- output = ma.array(np.empty((n,m), dtype=float), mask=True)
- for (k,v) in enumerate(args):
- output[k,:len(v)] = v
- output[np.logical_not(np.isfinite(output._data))] = masked
- return output
- def find_repeats(arr):
- """Find repeats in arr and return a tuple (repeats, repeat_count).
- The input is cast to float64. Masked values are discarded.
- Parameters
- ----------
- arr : sequence
- Input array. The array is flattened if it is not 1D.
- Returns
- -------
- repeats : ndarray
- Array of repeated values.
- counts : ndarray
- Array of counts.
- Examples
- --------
- >>> from scipy.stats import mstats
- >>> mstats.find_repeats([2, 1, 2, 3, 2, 2, 5])
- (array([2.]), array([4]))
- In the above example, 2 repeats 4 times.
- >>> mstats.find_repeats([[10, 20, 1, 2], [5, 5, 4, 4]])
- (array([4., 5.]), array([2, 2]))
- In the above example, both 4 and 5 repeat 2 times.
- """
- # Make sure we get a copy. ma.compressed promises a "new array", but can
- # actually return a reference.
- compr = np.asarray(ma.compressed(arr), dtype=np.float64)
- try:
- need_copy = np.may_share_memory(compr, arr)
- except AttributeError:
- # numpy < 1.8.2 bug: np.may_share_memory([], []) raises,
- # while in numpy 1.8.2 and above it just (correctly) returns False.
- need_copy = False
- if need_copy:
- compr = compr.copy()
- return _find_repeats(compr)
- def count_tied_groups(x, use_missing=False):
- """
- Counts the number of tied values.
- Parameters
- ----------
- x : sequence
- Sequence of data on which to count the ties
- use_missing : bool, optional
- Whether to consider missing values as tied.
- Returns
- -------
- count_tied_groups : dict
- Returns a dictionary mapping the size of each group of tied values to
- the number of such groups.
- Examples
- --------
- >>> from scipy.stats import mstats
- >>> import numpy as np
- >>> z = [0, 0, 0, 2, 2, 2, 3, 3, 4, 5, 6]
- >>> mstats.count_tied_groups(z)
- {2: 1, 3: 2}
- In the above example, the ties were 0 (3x), 2 (3x) and 3 (2x).
- >>> z = np.ma.array([0, 0, 1, 2, 2, 2, 3, 3, 4, 5, 6])
- >>> mstats.count_tied_groups(z)
- {2: 2, 3: 1}
- >>> z[[1,-1]] = np.ma.masked
- >>> mstats.count_tied_groups(z, use_missing=True)
- {2: 2, 3: 1}
- """
- nmasked = ma.getmask(x).sum()
- # We need the copy as find_repeats will overwrite the initial data
- data = ma.compressed(x).copy()
- (ties, counts) = find_repeats(data)
- nties = {}
- if len(ties):
- nties = dict(zip(np.unique(counts), itertools.repeat(1)))
- nties.update(dict(zip(*find_repeats(counts))))
- if nmasked and use_missing:
- try:
- nties[nmasked] += 1
- except KeyError:
- nties[nmasked] = 1
- return nties
- def rankdata(data, axis=None, use_missing=False):
- """Returns the rank (also known as order statistics) of each data point
- along the given axis.
- If some values are tied, their rank is averaged.
- If some values are masked, their rank is set to 0 if use_missing is False,
- or set to the average rank of the unmasked values if use_missing is True.
- Parameters
- ----------
- data : sequence
- Input data. The data is transformed to a masked array.
- axis : {None, int}, optional
- Axis along which to perform the ranking.
- If None, the array is first flattened. An exception is raised if
- the axis is specified for arrays with a dimension larger than 2.
- use_missing : bool, optional
- Whether the masked values have a rank of 0 (False) or equal to the
- average rank of the unmasked values (True).
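- Examples
- --------
- A small illustrative sketch (values chosen by hand); the second value
- of 1 is masked, so it is excluded from the ranking:
- >>> import numpy as np
- >>> from scipy.stats import mstats
- >>> z = np.ma.array([3, 1, 4, 1], mask=[0, 0, 0, 1])
- >>> mstats.rankdata(z)
- array([2., 1., 3., 0.])
- >>> mstats.rankdata(z, use_missing=True)
- array([2., 1., 3., 2.])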
- """
- def _rank1d(data, use_missing=False):
- n = data.count()
- rk = np.empty(data.size, dtype=float)
- idx = data.argsort()
- rk[idx[:n]] = np.arange(1,n+1)
- if use_missing:
- rk[idx[n:]] = (n+1)/2.
- else:
- rk[idx[n:]] = 0
- repeats = find_repeats(data.copy())
- for r in repeats[0]:
- condition = (data == r).filled(False)
- rk[condition] = rk[condition].mean()
- return rk
- data = ma.array(data, copy=False)
- if axis is None:
- if data.ndim > 1:
- return _rank1d(data.ravel(), use_missing).reshape(data.shape)
- else:
- return _rank1d(data, use_missing)
- else:
- return ma.apply_along_axis(_rank1d,axis,data,use_missing).view(ndarray)
- ModeResult = namedtuple('ModeResult', ('mode', 'count'))
- def mode(a, axis=0):
- """
- Returns an array of the modal (most common) value in the passed array.
- Parameters
- ----------
- a : array_like
- n-dimensional array of which to find mode(s).
- axis : int or None, optional
- Axis along which to operate. Default is 0. If None, compute over
- the whole array `a`.
- Returns
- -------
- mode : ndarray
- Array of modal values.
- count : ndarray
- Array of counts for each mode.
- Notes
- -----
- For more details, see `scipy.stats.mode`.
- Examples
- --------
- >>> import numpy as np
- >>> from scipy import stats
- >>> from scipy.stats import mstats
- >>> m_arr = np.ma.array([1, 1, 0, 0, 0, 0], mask=[0, 0, 1, 1, 1, 0])
- >>> mstats.mode(m_arr) # note that most zeros are masked
- ModeResult(mode=array([1.]), count=array([2.]))
- """
- return _mode(a, axis=axis, keepdims=True)
- def _mode(a, axis=0, keepdims=True):
- # Don't want to expose `keepdims` from the public `mstats.mode`
- a, axis = _chk_asarray(a, axis)
- def _mode1D(a):
- (rep,cnt) = find_repeats(a)
- if not cnt.ndim:
- return (0, 0)
- elif cnt.size:
- return (rep[cnt.argmax()], cnt.max())
- else:
- return (a.min(), 1)
- if axis is None:
- output = _mode1D(ma.ravel(a))
- output = (ma.array(output[0]), ma.array(output[1]))
- else:
- output = ma.apply_along_axis(_mode1D, axis, a)
- if keepdims is None or keepdims:
- newshape = list(a.shape)
- newshape[axis] = 1
- slices = [slice(None)] * output.ndim
- slices[axis] = 0
- modes = output[tuple(slices)].reshape(newshape)
- slices[axis] = 1
- counts = output[tuple(slices)].reshape(newshape)
- output = (modes, counts)
- else:
- output = np.moveaxis(output, axis, 0)
- return ModeResult(*output)
- def _betai(a, b, x):
- x = np.asanyarray(x)
- x = ma.where(x < 1.0, x, 1.0) # if x > 1 then return 1.0
- return special.betainc(a, b, x)
- def msign(x):
- """Returns the sign of x, or 0 if x is masked."""
- return ma.filled(np.sign(x), 0)
- def pearsonr(x, y):
- r"""
- Pearson correlation coefficient and p-value for testing non-correlation.
- The Pearson correlation coefficient [1]_ measures the linear relationship
- between two datasets. The calculation of the p-value relies on the
- assumption that each dataset is normally distributed. (See Kowalski [3]_
- for a discussion of the effects of non-normality of the input on the
- distribution of the correlation coefficient.) Like other correlation
- coefficients, this one varies between -1 and +1 with 0 implying no
- correlation. Correlations of -1 or +1 imply an exact linear relationship.
- Parameters
- ----------
- x : (N,) array_like
- Input array.
- y : (N,) array_like
- Input array.
- Returns
- -------
- r : float
- Pearson's correlation coefficient.
- p-value : float
- Two-tailed p-value.
- Warns
- -----
- PearsonRConstantInputWarning
- Raised if an input is a constant array. The correlation coefficient
- is not defined in this case, so ``np.nan`` is returned.
- PearsonRNearConstantInputWarning
- Raised if an input is "nearly" constant. The array ``x`` is considered
- nearly constant if ``norm(x - mean(x)) < 1e-13 * abs(mean(x))``.
- Numerical errors in the calculation ``x - mean(x)`` in this case might
- result in an inaccurate calculation of r.
- See Also
- --------
- spearmanr : Spearman rank-order correlation coefficient.
- kendalltau : Kendall's tau, a correlation measure for ordinal data.
- Notes
- -----
- The correlation coefficient is calculated as follows:
- .. math::
- r = \frac{\sum (x - m_x) (y - m_y)}
- {\sqrt{\sum (x - m_x)^2 \sum (y - m_y)^2}}
- where :math:`m_x` is the mean of the vector x and :math:`m_y` is
- the mean of the vector y.
- Under the assumption that x and y are drawn from
- independent normal distributions (so the population correlation coefficient
- is 0), the probability density function of the sample correlation
- coefficient r is ([1]_, [2]_):
- .. math::
- f(r) = \frac{{(1-r^2)}^{n/2-2}}{\mathrm{B}(\frac{1}{2},\frac{n}{2}-1)}
- where n is the number of samples, and B is the beta function. This
- is sometimes referred to as the exact distribution of r. This is
- the distribution that is used in `pearsonr` to compute the p-value.
- The distribution is a beta distribution on the interval [-1, 1],
- with equal shape parameters a = b = n/2 - 1. In terms of SciPy's
- implementation of the beta distribution, the distribution of r is::
- dist = scipy.stats.beta(n/2 - 1, n/2 - 1, loc=-1, scale=2)
- The p-value returned by `pearsonr` is a two-sided p-value. The p-value
- roughly indicates the probability of an uncorrelated system
- producing datasets that have a Pearson correlation at least as extreme
- as the one computed from these datasets. More precisely, for a
- given sample with correlation coefficient r, the p-value is
- the probability that abs(r') of a random sample x' and y' drawn from
- the population with zero correlation would be greater than or equal
- to abs(r). In terms of the object ``dist`` shown above, the p-value
- for a given r and length n can be computed as::
- p = 2*dist.cdf(-abs(r))
- When n is 2, the above continuous distribution is not well-defined.
- One can interpret the limit of the beta distribution as the shape
- parameters a and b approach a = b = 0 as a discrete distribution with
- equal probability masses at r = 1 and r = -1. More directly, one
- can observe that, given the data x = [x1, x2] and y = [y1, y2], and
- assuming x1 != x2 and y1 != y2, the only possible values for r are 1
- and -1. Because abs(r') for any sample x' and y' with length 2 will
- be 1, the two-sided p-value for a sample of length 2 is always 1.
- References
- ----------
- .. [1] "Pearson correlation coefficient", Wikipedia,
- https://en.wikipedia.org/wiki/Pearson_correlation_coefficient
- .. [2] Student, "Probable error of a correlation coefficient",
- Biometrika, Volume 6, Issue 2-3, 1 September 1908, pp. 302-310.
- .. [3] C. J. Kowalski, "On the Effects of Non-Normality on the Distribution
- of the Sample Product-Moment Correlation Coefficient"
- Journal of the Royal Statistical Society. Series C (Applied
- Statistics), Vol. 21, No. 1 (1972), pp. 1-12.
- Examples
- --------
- >>> import numpy as np
- >>> from scipy import stats
- >>> from scipy.stats import mstats
- >>> mstats.pearsonr([1, 2, 3, 4, 5], [10, 9, 2.5, 6, 4])
- (-0.7426106572325057, 0.1505558088534455)
- There is a linear dependence between x and y if y = a + b*x + e, where
- a,b are constants and e is a random error term, assumed to be independent
- of x. For simplicity, assume that x is standard normal, a=0, b=1 and let
- e follow a normal distribution with mean zero and standard deviation s>0.
- >>> s = 0.5
- >>> x = stats.norm.rvs(size=500)
- >>> e = stats.norm.rvs(scale=s, size=500)
- >>> y = x + e
- >>> mstats.pearsonr(x, y)
- (0.9029601878969703, 8.428978827629898e-185) # may vary
- This should be close to the exact value given by
- >>> 1/np.sqrt(1 + s**2)
- 0.8944271909999159
- For s=0.5, we observe a high level of correlation. In general, a large
- variance of the noise reduces the correlation, while the correlation
- approaches one as the variance of the error goes to zero.
- It is important to keep in mind that no correlation does not imply
- independence unless (x, y) is jointly normal. Correlation can even be zero
- when there is a very simple dependence structure: if X follows a
- standard normal distribution, let y = abs(x). Note that the correlation
- between x and y is zero. Indeed, since the expectation of x is zero,
- cov(x, y) = E[x*y]. By definition, this equals E[x*abs(x)] which is zero
- by symmetry. The following lines of code illustrate this observation:
- >>> y = np.abs(x)
- >>> mstats.pearsonr(x, y)
- (-0.016172891856853524, 0.7182823678751942) # may vary
- A non-zero correlation coefficient can be misleading. For example, if X has
- a standard normal distribution, define y = x if x < 0 and y = 0 otherwise.
- A simple calculation shows that corr(x, y) = sqrt(2/Pi) = 0.797...,
- implying a high level of correlation:
- >>> y = np.where(x < 0, x, 0)
- >>> mstats.pearsonr(x, y)
- (0.8537091583771509, 3.183461621422181e-143) # may vary
- This is counterintuitive, since there is no dependence of x and y if x is
- larger than zero, which happens in about half of the cases if we sample x and y.
- """
- (x, y, n) = _chk_size(x, y)
- (x, y) = (x.ravel(), y.ravel())
- # Get the common mask and the total nb of unmasked elements
- m = ma.mask_or(ma.getmask(x), ma.getmask(y))
- n -= m.sum()
- df = n-2
- if df < 0:
- return (masked, masked)
- return scipy.stats._stats_py.pearsonr(ma.masked_array(x, mask=m).compressed(),
- ma.masked_array(y, mask=m).compressed())
- def spearmanr(x, y=None, use_ties=True, axis=None, nan_policy='propagate',
- alternative='two-sided'):
- """
- Calculates a Spearman rank-order correlation coefficient and the p-value
- to test for non-correlation.
- The Spearman correlation is a nonparametric measure of the monotonicity of
- the relationship between two datasets. Unlike the Pearson correlation, the
- Spearman correlation does not assume that both datasets are normally
- distributed. Like other correlation coefficients, this one varies
- between -1 and +1 with 0 implying no correlation. Correlations of -1 or
- +1 imply a monotonic relationship. Positive correlations imply that
- as `x` increases, so does `y`. Negative correlations imply that as `x`
- increases, `y` decreases.
- Missing values are discarded pair-wise: if a value is missing in `x`, the
- corresponding value in `y` is masked.
- The p-value roughly indicates the probability of an uncorrelated system
- producing datasets that have a Spearman correlation at least as extreme
- as the one computed from these datasets. The p-values are not entirely
- reliable but are probably reasonable for datasets larger than 500 or so.
- Parameters
- ----------
- x, y : 1D or 2D array_like, y is optional
- One or two 1-D or 2-D arrays containing multiple variables and
- observations. When these are 1-D, each represents a vector of
- observations of a single variable. For the behavior in the 2-D case,
- see under ``axis``, below.
- use_ties : bool, optional
- DO NOT USE. Does not do anything, keyword is only left in place for
- backwards compatibility reasons.
- axis : int or None, optional
- If axis=0 (default), then each column represents a variable, with
- observations in the rows. If axis=1, the relationship is transposed:
- each row represents a variable, while the columns contain observations.
- If axis=None, then both arrays will be raveled.
- nan_policy : {'propagate', 'raise', 'omit'}, optional
- Defines how to handle when input contains nan. 'propagate' returns nan,
- 'raise' throws an error, 'omit' performs the calculations ignoring nan
- values. Default is 'propagate'.
- alternative : {'two-sided', 'less', 'greater'}, optional
- Defines the alternative hypothesis. Default is 'two-sided'.
- The following options are available:
- * 'two-sided': the correlation is nonzero
- * 'less': the correlation is negative (less than zero)
- * 'greater': the correlation is positive (greater than zero)
- .. versionadded:: 1.7.0
- Returns
- -------
- res : SignificanceResult
- An object containing attributes:
- statistic : float or ndarray (2-D square)
- Spearman correlation matrix or correlation coefficient (if only 2
- variables are given as parameters). Correlation matrix is square
- with length equal to total number of variables (columns or rows) in
- ``x`` and ``y`` combined.
- pvalue : float
- The p-value for a hypothesis test whose null hypothesis
- is that two sets of data are linearly uncorrelated. See
- `alternative` above for alternative hypotheses. `pvalue` has the
- same shape as `statistic`.
- References
- ----------
- [CRCProbStat2000] section 14.7
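- Examples
- --------
- A minimal sketch; the pair containing the masked entry is discarded, and
- the remaining values are perfectly monotonic (output rounded to keep the
- doctest stable):
- >>> import numpy as np
- >>> from scipy.stats import mstats
- >>> x = np.ma.array([1, 2, 3, 4, 5], mask=[0, 0, 0, 0, 1])
- >>> y = [5, 6, 7, 8, 7]
- >>> res = mstats.spearmanr(x, y)
- >>> print(np.round(res.correlation, 6))
- 1.0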
- """
- if not use_ties:
- raise ValueError("`use_ties=False` is not supported in SciPy >= 1.2.0")
- # Always returns a masked array, raveled if axis=None
- x, axisout = _chk_asarray(x, axis)
- if y is not None:
- # Deal only with 2-D `x` case.
- y, _ = _chk_asarray(y, axis)
- if axisout == 0:
- x = ma.column_stack((x, y))
- else:
- x = ma.row_stack((x, y))
- if axisout == 1:
- # To simplify the code that follows (always use `n_obs, n_vars` shape)
- x = x.T
- if nan_policy == 'omit':
- x = ma.masked_invalid(x)
- def _spearmanr_2cols(x):
- # Mask the same observations for all variables, and then drop those
- # observations (can't leave them masked, rankdata is weird).
- x = ma.mask_rowcols(x, axis=0)
- x = x[~x.mask.any(axis=1), :]
- # If either column is entirely NaN or Inf
- if not np.any(x.data):
- res = scipy.stats._stats_py.SignificanceResult(np.nan, np.nan)
- res.correlation = np.nan
- return res
- m = ma.getmask(x)
- n_obs = x.shape[0]
- dof = n_obs - 2 - int(m.sum(axis=0)[0])
- if dof < 0:
- raise ValueError("The input must have at least 3 entries!")
- # Gets the ranks and rank differences
- x_ranked = rankdata(x, axis=0)
- rs = ma.corrcoef(x_ranked, rowvar=False).data
- # rs can have elements equal to 1, so avoid zero division warnings
- with np.errstate(divide='ignore'):
- # clip the small negative values possibly caused by rounding
- # errors before taking the square root
- t = rs * np.sqrt((dof / ((rs+1.0) * (1.0-rs))).clip(0))
- t, prob = scipy.stats._stats_py._ttest_finish(dof, t, alternative)
- # For backwards compatibility, return scalars when comparing 2 columns
- if rs.shape == (2, 2):
- res = scipy.stats._stats_py.SignificanceResult(rs[1, 0],
- prob[1, 0])
- res.correlation = rs[1, 0]
- return res
- else:
- res = scipy.stats._stats_py.SignificanceResult(rs, prob)
- res.correlation = rs
- return res
- # Need to do this per pair of variables, otherwise the dropped observations
- # in a third column mess up the result for a pair.
- n_vars = x.shape[1]
- if n_vars == 2:
- return _spearmanr_2cols(x)
- else:
- rs = np.ones((n_vars, n_vars), dtype=float)
- prob = np.zeros((n_vars, n_vars), dtype=float)
- for var1 in range(n_vars - 1):
- for var2 in range(var1+1, n_vars):
- result = _spearmanr_2cols(x[:, [var1, var2]])
- rs[var1, var2] = result.correlation
- rs[var2, var1] = result.correlation
- prob[var1, var2] = result.pvalue
- prob[var2, var1] = result.pvalue
- res = scipy.stats._stats_py.SignificanceResult(rs, prob)
- res.correlation = rs
- return res
- def _kendall_p_exact(n, c, alternative='two-sided'):
- # Use the fact that distribution is symmetric: always calculate a CDF in
- # the left tail.
- # This will be the one-sided p-value if `c` is on the side of
- # the null distribution predicted by the alternative hypothesis.
- # The two-sided p-value will be twice this value.
- # If `c` is on the other side of the null distribution, we'll need to
- # take the complement and add back the probability mass at `c`.
- in_right_tail = (c >= (n*(n-1))//2 - c)
- alternative_greater = (alternative == 'greater')
- c = int(min(c, (n*(n-1))//2 - c))
- # Exact p-value, see Maurice G. Kendall, "Rank Correlation Methods"
- # (4th Edition), Charles Griffin & Co., 1970.
- if n <= 0:
- raise ValueError(f'n ({n}) must be positive')
- elif c < 0 or 4*c > n*(n-1):
- raise ValueError(f'c ({c}) must satisfy 0 <= 4c <= n(n-1) = {n*(n-1)}.')
- elif n == 1:
- prob = 1.0
- p_mass_at_c = 1
- elif n == 2:
- prob = 1.0
- p_mass_at_c = 0.5
- elif c == 0:
- prob = 2.0/math.factorial(n) if n < 171 else 0.0
- p_mass_at_c = prob/2
- elif c == 1:
- prob = 2.0/math.factorial(n-1) if n < 172 else 0.0
- p_mass_at_c = (n-1)/math.factorial(n)
- elif 4*c == n*(n-1) and alternative == 'two-sided':
- # I'm sure there's a simple formula for p_mass_at_c in this
- # case, but I don't know it. Use generic formula for one-sided p-value.
- prob = 1.0
- elif n < 171:
- new = np.zeros(c+1)
- new[0:2] = 1.0
- for j in range(3,n+1):
- new = np.cumsum(new)
- if j <= c:
- new[j:] -= new[:c+1-j]
- prob = 2.0*np.sum(new)/math.factorial(n)
- p_mass_at_c = new[-1]/math.factorial(n)
- else:
- new = np.zeros(c+1)
- new[0:2] = 1.0
- for j in range(3, n+1):
- new = np.cumsum(new)/j
- if j <= c:
- new[j:] -= new[:c+1-j]
- prob = np.sum(new)
- p_mass_at_c = new[-1]/2
- if alternative != 'two-sided':
- # if the observed statistic lies in the tail predicted by the
- # alternative hypothesis, the one-sided p-value is half the
- # two-sided p-value
- if in_right_tail == alternative_greater:
- prob /= 2
- else:
- prob = 1 - prob/2 + p_mass_at_c
- prob = np.clip(prob, 0, 1)
- return prob
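- # A hedged cross-check for `_kendall_p_exact`: for tiny `n`, the null
- # distribution of the statistic can be enumerated by brute force over all
- # permutations. This helper is illustrative only and is not used elsewhere.
- def _kendall_p_brute(n, c):
-     # Two-sided p-value under the uniform-permutation null: twice the
-     # left-tail mass P(C <= min(c, n*(n-1)/2 - c)), capped at 1.
-     c = min(c, n*(n-1)//2 - c)
-     inversions = [sum(p[i] > p[j] for i in range(n) for j in range(i+1, n))
-                   for p in itertools.permutations(range(n))]
-     hits = sum(k <= c for k in inversions)
-     return min(1.0, 2.0*hits/len(inversions))
- # For example, _kendall_p_brute(5, 1) and _kendall_p_exact(5, 1) both give
- # 2/4! = 1/12.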
- def kendalltau(x, y, use_ties=True, use_missing=False, method='auto',
- alternative='two-sided'):
- """
- Computes Kendall's rank correlation tau on two variables *x* and *y*.
- Parameters
- ----------
- x : sequence
- First data list (for example, time).
- y : sequence
- Second data list.
- use_ties : {True, False}, optional
- Whether ties correction should be performed.
- use_missing : {False, True}, optional
- Whether missing data should be allocated a rank of 0 (False) or the
- average rank (True)
- method : {'auto', 'asymptotic', 'exact'}, optional
- Defines which method is used to calculate the p-value [1]_.
- 'asymptotic' uses a normal approximation valid for large samples.
- 'exact' computes the exact p-value, but can only be used if no ties
- are present. As the sample size increases, the 'exact' computation
- time may grow and the result may lose some precision.
- 'auto' is the default and selects the appropriate
- method based on a trade-off between speed and accuracy.
- alternative : {'two-sided', 'less', 'greater'}, optional
- Defines the alternative hypothesis. Default is 'two-sided'.
- The following options are available:
- * 'two-sided': the rank correlation is nonzero
- * 'less': the rank correlation is negative (less than zero)
- * 'greater': the rank correlation is positive (greater than zero)
- Returns
- -------
- res : SignificanceResult
- An object containing attributes:
- statistic : float
- The tau statistic.
- pvalue : float
- The p-value for a hypothesis test whose null hypothesis is
- an absence of association, tau = 0.
- References
- ----------
- .. [1] Maurice G. Kendall, "Rank Correlation Methods" (4th Edition),
- Charles Griffin & Co., 1970.
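- Examples
- --------
- A small illustration (values chosen by hand); the masked pair is
- discarded, leaving 5 concordant and 1 discordant pairs:
- >>> import numpy as np
- >>> from scipy.stats import mstats
- >>> x = np.ma.array([1, 2, 3, 4, 5], mask=[0, 0, 0, 0, 1])
- >>> y = [1, 3, 2, 4, 5]
- >>> res = mstats.kendalltau(x, y)
- >>> float(res.correlation)
- 0.6666666666666666
- >>> float(res.pvalue)
- 0.3333333333333333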
- """
- (x, y, n) = _chk_size(x, y)
- (x, y) = (x.flatten(), y.flatten())
- m = ma.mask_or(ma.getmask(x), ma.getmask(y))
- if m is not nomask:
- x = ma.array(x, mask=m, copy=True)
- y = ma.array(y, mask=m, copy=True)
- # need int() here, otherwise numpy defaults to 32 bit
- # integer on all Windows architectures, causing overflow.
- # Python's int has arbitrary precision, avoiding the overflow.
- n -= int(m.sum())
- if n < 2:
- res = scipy.stats._stats_py.SignificanceResult(np.nan, np.nan)
- res.correlation = np.nan
- return res
- rx = ma.masked_equal(rankdata(x, use_missing=use_missing), 0)
- ry = ma.masked_equal(rankdata(y, use_missing=use_missing), 0)
- idx = rx.argsort()
- (rx, ry) = (rx[idx], ry[idx])
- C = np.sum([((ry[i+1:] > ry[i]) * (rx[i+1:] > rx[i])).filled(0).sum()
- for i in range(len(ry)-1)], dtype=float)
- D = np.sum([((ry[i+1:] < ry[i])*(rx[i+1:] > rx[i])).filled(0).sum()
- for i in range(len(ry)-1)], dtype=float)
- xties = count_tied_groups(x)
- yties = count_tied_groups(y)
- if use_ties:
- corr_x = np.sum([v*k*(k-1) for (k,v) in xties.items()], dtype=float)
- corr_y = np.sum([v*k*(k-1) for (k,v) in yties.items()], dtype=float)
- denom = ma.sqrt((n*(n-1)-corr_x)/2. * (n*(n-1)-corr_y)/2.)
- else:
- denom = n*(n-1)/2.
- tau = (C-D) / denom
- if method == 'exact' and (xties or yties):
- raise ValueError("Ties found, exact method cannot be used.")
- if method == 'auto':
- if (not xties and not yties) and (n <= 33 or min(C, n*(n-1)/2.0-C) <= 1):
- method = 'exact'
- else:
- method = 'asymptotic'
- if not xties and not yties and method == 'exact':
- prob = _kendall_p_exact(n, C, alternative)
- elif method == 'asymptotic':
- var_s = n*(n-1)*(2*n+5)
- if use_ties:
- var_s -= np.sum([v*k*(k-1)*(2*k+5)*1. for (k,v) in xties.items()])
- var_s -= np.sum([v*k*(k-1)*(2*k+5)*1. for (k,v) in yties.items()])
- v1 = np.sum([v*k*(k-1) for (k, v) in xties.items()], dtype=float) *\
- np.sum([v*k*(k-1) for (k, v) in yties.items()], dtype=float)
- v1 /= 2.*n*(n-1)
- if n > 2:
- v2 = np.sum([v*k*(k-1)*(k-2) for (k,v) in xties.items()],
- dtype=float) * \
- np.sum([v*k*(k-1)*(k-2) for (k,v) in yties.items()],
- dtype=float)
- v2 /= 9.*n*(n-1)*(n-2)
- else:
- v2 = 0
- else:
- v1 = v2 = 0
- var_s /= 18.
- var_s += (v1 + v2)
- z = (C-D)/np.sqrt(var_s)
- _, prob = scipy.stats._stats_py._normtest_finish(z, alternative)
- else:
- raise ValueError("Unknown method "+str(method)+" specified, please "
- "use auto, exact or asymptotic.")
- res = scipy.stats._stats_py.SignificanceResult(tau, prob)
- res.correlation = tau
- return res
- def kendalltau_seasonal(x):
- """
- Computes a multivariate Kendall's rank correlation tau for seasonal data.
- Parameters
- ----------
- x : 2-D ndarray
- Array of seasonal data, with seasons in columns.
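- Examples
- --------
- A hedged sketch on random data; only the keys of the returned dictionary
- are shown, since the values depend on the input:
- >>> import numpy as np
- >>> from scipy.stats import mstats
- >>> rng = np.random.default_rng(1234)
- >>> x = rng.random((10, 4))  # 10 observations in each of 4 seasons
- >>> out = mstats.kendalltau_seasonal(x)
- >>> for key in sorted(out):
- ...     print(key)
- chi2 total
- chi2 trend
- global p-value (dep)
- global p-value (indep)
- global tau
- global tau (alt)
- seasonal p-value
- seasonal tau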
- """
- x = ma.array(x, subok=True, copy=False, ndmin=2)
- (n,m) = x.shape
- n_p = x.count(0)
- S_szn = sum(msign(x[i:]-x[i]).sum(0) for i in range(n))
- S_tot = S_szn.sum()
- n_tot = x.count()
- ties = count_tied_groups(x.compressed())
- corr_ties = sum(v*k*(k-1) for (k,v) in ties.items())
- denom_tot = ma.sqrt(1.*n_tot*(n_tot-1)*(n_tot*(n_tot-1)-corr_ties))/2.
- R = rankdata(x, axis=0, use_missing=True)
- K = ma.empty((m,m), dtype=int)
- covmat = ma.empty((m,m), dtype=float)
- denom_szn = ma.empty(m, dtype=float)
- for j in range(m):
- ties_j = count_tied_groups(x[:,j].compressed())
- corr_j = sum(v*k*(k-1) for (k,v) in ties_j.items())
- cmb = n_p[j]*(n_p[j]-1)
- for k in range(j,m,1):
- K[j,k] = sum(msign((x[i:,j]-x[i,j])*(x[i:,k]-x[i,k])).sum()
- for i in range(n))
- covmat[j,k] = (K[j,k] + 4*(R[:,j]*R[:,k]).sum() -
- n*(n_p[j]+1)*(n_p[k]+1))/3.
- K[k,j] = K[j,k]
- covmat[k,j] = covmat[j,k]
- denom_szn[j] = ma.sqrt(cmb*(cmb-corr_j)) / 2.
- var_szn = covmat.diagonal()
- z_szn = msign(S_szn) * (abs(S_szn)-1) / ma.sqrt(var_szn)
- z_tot_ind = msign(S_tot) * (abs(S_tot)-1) / ma.sqrt(var_szn.sum())
- z_tot_dep = msign(S_tot) * (abs(S_tot)-1) / ma.sqrt(covmat.sum())
- prob_szn = special.erfc(abs(z_szn)/np.sqrt(2))
- prob_tot_ind = special.erfc(abs(z_tot_ind)/np.sqrt(2))
- prob_tot_dep = special.erfc(abs(z_tot_dep)/np.sqrt(2))
- chi2_tot = (z_szn*z_szn).sum()
- chi2_trd = m * z_szn.mean()**2
- output = {'seasonal tau': S_szn/denom_szn,
- 'global tau': S_tot/denom_tot,
- 'global tau (alt)': S_tot/denom_szn.sum(),
- 'seasonal p-value': prob_szn,
- 'global p-value (indep)': prob_tot_ind,
- 'global p-value (dep)': prob_tot_dep,
- 'chi2 total': chi2_tot,
- 'chi2 trend': chi2_trd,
- }
- return output
- PointbiserialrResult = namedtuple('PointbiserialrResult', ('correlation',
- 'pvalue'))
- def pointbiserialr(x, y):
- """Calculates a point biserial correlation coefficient and its p-value.
- Parameters
- ----------
- x : array_like of bools
- Input array.
- y : array_like
- Input array.
- Returns
- -------
- correlation : float
- R value
- pvalue : float
- 2-tailed p-value
- Notes
- -----
- Missing values are considered pair-wise: if a value is missing in x,
- the corresponding value in y is masked.
- For more details on `pointbiserialr`, see `scipy.stats.pointbiserialr`.
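- Examples
- --------
- A minimal sketch (values chosen by hand; output rounded to keep the
- doctest stable):
- >>> import numpy as np
- >>> from scipy.stats import mstats
- >>> x = np.array([0, 0, 0, 1, 1, 1], dtype=bool)
- >>> y = np.ma.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
- >>> res = mstats.pointbiserialr(x, y)
- >>> print(np.round(res.correlation, 5))
- 0.87831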
- """
- x = ma.fix_invalid(x, copy=True).astype(bool)
- y = ma.fix_invalid(y, copy=True).astype(float)
- # Get rid of the missing data
- m = ma.mask_or(ma.getmask(x), ma.getmask(y))
- if m is not nomask:
- unmask = np.logical_not(m)
- x = x[unmask]
- y = y[unmask]
- n = len(x)
- # phat is the fraction of x values that are True
- phat = x.sum() / float(n)
- y0 = y[~x] # y-values where x is False
- y1 = y[x] # y-values where x is True
- y0m = y0.mean()
- y1m = y1.mean()
- rpb = (y1m - y0m)*np.sqrt(phat * (1-phat)) / y.std()
- df = n-2
- t = rpb*ma.sqrt(df/(1.0-rpb**2))
- prob = _betai(0.5*df, 0.5, df/(df+t*t))
- return PointbiserialrResult(rpb, prob)
- def linregress(x, y=None):
- r"""
- Linear regression calculation
- Note that the non-masked version is used, and that this docstring is
- replaced by the non-masked docstring + some info on missing data.
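- A hedged illustration of the missing-data handling; the masked pair is
- dropped, and the remaining points are collinear:
- >>> import numpy as np
- >>> from scipy.stats import mstats
- >>> x = np.ma.array([1., 2., 3., 4.], mask=[0, 0, 0, 1])
- >>> y = [2., 4., 6., 100.]
- >>> res = mstats.linregress(x, y)
- >>> float(res.slope), float(res.intercept)
- (2.0, 0.0)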
- """
- if y is None:
- x = ma.array(x)
- if x.shape[0] == 2:
- x, y = x
- elif x.shape[1] == 2:
- x, y = x.T
- else:
- raise ValueError("If only `x` is given as input, "
- "it has to be of shape (2, N) or (N, 2), "
- f"provided shape was {x.shape}")
- else:
- x = ma.array(x)
- y = ma.array(y)
- x = x.flatten()
- y = y.flatten()
- if np.amax(x) == np.amin(x) and len(x) > 1:
- raise ValueError("Cannot calculate a linear regression "
- "if all x values are identical")
- m = ma.mask_or(ma.getmask(x), ma.getmask(y), shrink=False)
- if m is not nomask:
- x = ma.array(x, mask=m)
- y = ma.array(y, mask=m)
- if np.any(~m):
- result = stats_linregress(x.data[~m], y.data[~m])
- else:
- # All data is masked
- result = stats_LinregressResult(slope=None, intercept=None,
- rvalue=None, pvalue=None,
- stderr=None,
- intercept_stderr=None)
- else:
- result = stats_linregress(x.data, y.data)
- return result
- def theilslopes(y, x=None, alpha=0.95, method='separate'):
- r"""
- Computes the Theil-Sen estimator for a set of points (x, y).
- `theilslopes` implements a method for robust linear regression. It
- computes the slope as the median of all slopes between paired values.
- Parameters
- ----------
- y : array_like
- Dependent variable.
- x : array_like or None, optional
- Independent variable. If None, use ``arange(len(y))`` instead.
- alpha : float, optional
- Confidence degree between 0 and 1. Default is 95% confidence.
- Note that `alpha` is symmetric around 0.5, i.e. both 0.1 and 0.9 are
- interpreted as "find the 90% confidence interval".
- method : {'joint', 'separate'}, optional
- Method to be used for computing estimate for intercept.
- Following methods are supported,
- * 'joint': Uses np.median(y - slope * x) as intercept.
- * 'separate': Uses np.median(y) - slope * np.median(x)
- as intercept.
- The default is 'separate'.
- .. versionadded:: 1.8.0
- Returns
- -------
- result : ``TheilslopesResult`` instance
- The return value is an object with the following attributes:
- slope : float
- Theil slope.
- intercept : float
- Intercept of the Theil line.
- low_slope : float
- Lower bound of the confidence interval on `slope`.
- high_slope : float
- Upper bound of the confidence interval on `slope`.
- See Also
- --------
- siegelslopes : a similar technique using repeated medians
- Notes
- -----
- For more details on `theilslopes`, see `scipy.stats.theilslopes`.
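- Examples
- --------
- A minimal sketch; the masked outlier is disregarded and the remaining
- points have unit slope:
- >>> import numpy as np
- >>> from scipy.stats import mstats
- >>> y = np.ma.array([0., 1., 2., 300.], mask=[0, 0, 0, 1])
- >>> res = mstats.theilslopes(y)
- >>> float(res.slope)
- 1.0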
- """
- y = ma.asarray(y).flatten()
- if x is None:
- x = ma.arange(len(y), dtype=float)
- else:
- x = ma.asarray(x).flatten()
- if len(x) != len(y):
- raise ValueError("Incompatible lengths ! (%s<>%s)" % (len(y),len(x)))
- m = ma.mask_or(ma.getmask(x), ma.getmask(y))
- y._mask = x._mask = m
- # Disregard any masked elements of x or y
- y = y.compressed()
- x = x.compressed().astype(float)
- # We now have unmasked arrays so can use `scipy.stats.theilslopes`
- return stats_theilslopes(y, x, alpha=alpha, method=method)
- def siegelslopes(y, x=None, method="hierarchical"):
- r"""
- Computes the Siegel estimator for a set of points (x, y).
- `siegelslopes` implements a method for robust linear regression
- using repeated medians to fit a line to the points (x, y).
- The method is robust to outliers with an asymptotic breakdown point
- of 50%.
- Parameters
- ----------
- y : array_like
- Dependent variable.
- x : array_like or None, optional
- Independent variable. If None, use ``arange(len(y))`` instead.
- method : {'hierarchical', 'separate'}
- If 'hierarchical', estimate the intercept using the estimated
- slope ``slope`` (default option).
- If 'separate', estimate the intercept independent of the estimated
- slope. See Notes for details.
- Returns
- -------
- result : ``SiegelslopesResult`` instance
- The return value is an object with the following attributes:
- slope : float
- Estimate of the slope of the regression line.
- intercept : float
- Estimate of the intercept of the regression line.
- See Also
- --------
- theilslopes : a similar technique without repeated medians
- Notes
- -----
- For more details on `siegelslopes`, see `scipy.stats.siegelslopes`.
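- Examples
- --------
- A minimal sketch mirroring the `theilslopes` example above:
- >>> import numpy as np
- >>> from scipy.stats import mstats
- >>> y = np.ma.array([0., 1., 2., 300.], mask=[0, 0, 0, 1])
- >>> res = mstats.siegelslopes(y)
- >>> float(res.slope)
- 1.0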
- """
- y = ma.asarray(y).ravel()
- if x is None:
- x = ma.arange(len(y), dtype=float)
- else:
- x = ma.asarray(x).ravel()
- if len(x) != len(y):
- raise ValueError("Incompatible lengths ! (%s<>%s)" % (len(y), len(x)))
- m = ma.mask_or(ma.getmask(x), ma.getmask(y))
- y._mask = x._mask = m
- # Disregard any masked elements of x or y
- y = y.compressed()
- x = x.compressed().astype(float)
- # We now have unmasked arrays so can use `scipy.stats.siegelslopes`
- return stats_siegelslopes(y, x, method=method)
- SenSeasonalSlopesResult = _make_tuple_bunch('SenSeasonalSlopesResult',
- ['intra_slope', 'inter_slope'])
- def sen_seasonal_slopes(x):
- r"""
- Computes seasonal Theil-Sen and Kendall slope estimators.
- The seasonal generalization of Sen's slope computes the slopes between all
- pairs of values within a "season" (column) of a 2D array. It returns an
- array containing the median of these "within-season" slopes for each
- season (the Theil-Sen slope estimator of each season), and it returns the
- median of the within-season slopes across all seasons (the seasonal Kendall
- slope estimator).
- Parameters
- ----------
- x : 2D array_like
- Each column of `x` contains measurements of the dependent variable
- within a season. The independent variable (usually time) of each season
- is assumed to be ``np.arange(x.shape[0])``.
- Returns
- -------
- result : ``SenSeasonalSlopesResult`` instance
- The return value is an object with the following attributes:
- intra_slope : ndarray
- For each season, the Theil-Sen slope estimator: the median of
- within-season slopes.
- inter_slope : float
- The seasonal Kendall slope estimator: the median of within-season
- slopes *across all* seasons.
- See Also
- --------
- theilslopes : the analogous function for non-seasonal data
- scipy.stats.theilslopes : non-seasonal slopes for non-masked arrays
- Notes
- -----
- The slopes :math:`d_{ijk}` within season :math:`i` are:
- .. math::
- d_{ijk} = \frac{x_{ij} - x_{ik}}
- {j - k}
- for pairs of distinct integer indices :math:`j, k` of :math:`x`.
- Element :math:`i` of the returned `intra_slope` array is the median of the
- :math:`d_{ijk}` over all :math:`j < k`; this is the Theil-Sen slope
- estimator of season :math:`i`. The returned `inter_slope` value, better
- known as the seasonal Kendall slope estimator, is the median of the
- :math:`d_{ijk}` over all :math:`i, j, k`.
- References
- ----------
- .. [1] Hirsch, Robert M., James R. Slack, and Richard A. Smith.
- "Techniques of trend analysis for monthly water quality data."
- *Water Resources Research* 18.1 (1982): 107-121.
- Examples
- --------
- Suppose we have 100 observations of a dependent variable for each of four
- seasons:
- >>> import numpy as np
- >>> rng = np.random.default_rng()
- >>> x = rng.random(size=(100, 4))
- We compute the seasonal slopes as:
- >>> from scipy import stats
- >>> intra_slope, inter_slope = stats.mstats.sen_seasonal_slopes(x)
- If we define a function to compute all slopes between observations within
- a season:
- >>> def dijk(yi):
- ... n = len(yi)
- ... x = np.arange(n)
- ... dy = yi - yi[:, np.newaxis]
- ... dx = x - x[:, np.newaxis]
- ... # we only want unique pairs of distinct indices
- ... mask = np.triu(np.ones((n, n), dtype=bool), k=1)
- ... return dy[mask]/dx[mask]
- then element ``i`` of ``intra_slope`` is the median of ``dijk[x[:, i]]``:
- >>> i = 2
- >>> np.allclose(np.median(dijk(x[:, i])), intra_slope[i])
- True
- and ``inter_slope`` is the median of the values returned by ``dijk`` for
- all seasons:
- >>> all_slopes = np.concatenate([dijk(x[:, i]) for i in range(x.shape[1])])
- >>> np.allclose(np.median(all_slopes), inter_slope)
- True
- Because the data are randomly generated, we would expect the median slopes
- to be nearly zero both within and across all seasons, and indeed they are:
- >>> intra_slope.data
- array([ 0.00124504, -0.00277761, -0.00221245, -0.00036338])
- >>> inter_slope
- -0.0010511779872922058
- """
- x = ma.array(x, subok=True, copy=False, ndmin=2)
- (n,_) = x.shape
- # Get list of slopes per season
- szn_slopes = ma.vstack([(x[i+1:]-x[i])/np.arange(1,n-i)[:,None]
- for i in range(n)])
- szn_medslopes = ma.median(szn_slopes, axis=0)
- medslope = ma.median(szn_slopes, axis=None)
- return SenSeasonalSlopesResult(szn_medslopes, medslope)
- Ttest_1sampResult = namedtuple('Ttest_1sampResult', ('statistic', 'pvalue'))
- def ttest_1samp(a, popmean, axis=0, alternative='two-sided'):
- """
- Calculates the T-test for the mean of ONE group of scores.
- Parameters
- ----------
- a : array_like
- sample observation
- popmean : float or array_like
- expected value in null hypothesis; if array_like, it must have the
- same shape as `a`, excluding the axis dimension
- axis : int or None, optional
- Axis along which to compute test. If None, compute over the whole
- array `a`.
- alternative : {'two-sided', 'less', 'greater'}, optional
- Defines the alternative hypothesis.
- The following options are available (default is 'two-sided'):
- * 'two-sided': the mean of the underlying distribution of the sample
- is different than the given population mean (`popmean`)
- * 'less': the mean of the underlying distribution of the sample is
- less than the given population mean (`popmean`)
- * 'greater': the mean of the underlying distribution of the sample is
- greater than the given population mean (`popmean`)
- .. versionadded:: 1.7.0
- Returns
- -------
- statistic : float or array
- t-statistic
- pvalue : float or array
- The p-value
- Notes
- -----
- For more details on `ttest_1samp`, see `scipy.stats.ttest_1samp`.
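- Examples
- --------
- A small sketch; the masked outlier is excluded, and the remaining values
- have mean exactly equal to `popmean`, so the statistic is zero:
- >>> import numpy as np
- >>> from scipy.stats import mstats
- >>> a = np.ma.array([1., 2., 3., 4., 99.], mask=[0, 0, 0, 0, 1])
- >>> res = mstats.ttest_1samp(a, popmean=2.5)
- >>> float(res.statistic), float(res.pvalue)
- (0.0, 1.0)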
- """
- a, axis = _chk_asarray(a, axis)
- if a.size == 0:
- return Ttest_1sampResult(np.nan, np.nan)
- x = a.mean(axis=axis)
- v = a.var(axis=axis, ddof=1)
- n = a.count(axis=axis)
- # force df to be an array so that masked division does not emit a warning
- df = ma.asanyarray(n - 1.0)
- svar = ((n - 1.0) * v) / df
- with np.errstate(divide='ignore', invalid='ignore'):
- t = (x - popmean) / ma.sqrt(svar / n)
- t, prob = scipy.stats._stats_py._ttest_finish(df, t, alternative)
- return Ttest_1sampResult(t, prob)
- ttest_onesamp = ttest_1samp
- Ttest_indResult = namedtuple('Ttest_indResult', ('statistic', 'pvalue'))
- def ttest_ind(a, b, axis=0, equal_var=True, alternative='two-sided'):
- """
- Calculates the T-test for the means of TWO INDEPENDENT samples of scores.
- Parameters
- ----------
- a, b : array_like
- The arrays must have the same shape, except in the dimension
- corresponding to `axis` (the first, by default).
- axis : int or None, optional
- Axis along which to compute test. If None, compute over the whole
- arrays, `a`, and `b`.
- equal_var : bool, optional
- If True, perform a standard independent 2 sample test that assumes equal
- population variances.
- If False, perform Welch's t-test, which does not assume equal population
- variance.
- .. versionadded:: 0.17.0
- alternative : {'two-sided', 'less', 'greater'}, optional
- Defines the alternative hypothesis.
- The following options are available (default is 'two-sided'):
- * 'two-sided': the means of the distributions underlying the samples
- are unequal.
- * 'less': the mean of the distribution underlying the first sample
- is less than the mean of the distribution underlying the second
- sample.
- * 'greater': the mean of the distribution underlying the first
- sample is greater than the mean of the distribution underlying
- the second sample.
- .. versionadded:: 1.7.0
- Returns
- -------
- statistic : float or array
- The calculated t-statistic.
- pvalue : float or array
- The p-value.
- Notes
- -----
- For more details on `ttest_ind`, see `scipy.stats.ttest_ind`.
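- Examples
- --------
- A small sketch; the masked value is dropped, the two samples then have
- equal means, and the statistic is zero:
- >>> import numpy as np
- >>> from scipy.stats import mstats
- >>> a = np.ma.array([1., 2., 3., 99.], mask=[0, 0, 0, 1])
- >>> b = [1., 2., 3.]
- >>> res = mstats.ttest_ind(a, b)
- >>> float(res.statistic), float(res.pvalue)
- (0.0, 1.0)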
- """
- a, b, axis = _chk2_asarray(a, b, axis)
- if a.size == 0 or b.size == 0:
- return Ttest_indResult(np.nan, np.nan)
- (x1, x2) = (a.mean(axis), b.mean(axis))
- (v1, v2) = (a.var(axis=axis, ddof=1), b.var(axis=axis, ddof=1))
- (n1, n2) = (a.count(axis), b.count(axis))
- if equal_var:
- # force df to be an array so that masked division does not emit a warning
- df = ma.asanyarray(n1 + n2 - 2.0)
- svar = ((n1-1)*v1+(n2-1)*v2) / df
- denom = ma.sqrt(svar*(1.0/n1 + 1.0/n2)) # n-D computation here!
- else:
- vn1 = v1/n1
- vn2 = v2/n2
- with np.errstate(divide='ignore', invalid='ignore'):
- df = (vn1 + vn2)**2 / (vn1**2 / (n1 - 1) + vn2**2 / (n2 - 1))
- # If df is undefined, variances are zero.
- # It doesn't matter what df is as long as it is not NaN.
- df = np.where(np.isnan(df), 1, df)
- denom = ma.sqrt(vn1 + vn2)
- with np.errstate(divide='ignore', invalid='ignore'):
- t = (x1-x2) / denom
- t, prob = scipy.stats._stats_py._ttest_finish(df, t, alternative)
- return Ttest_indResult(t, prob)
- Ttest_relResult = namedtuple('Ttest_relResult', ('statistic', 'pvalue'))
- def ttest_rel(a, b, axis=0, alternative='two-sided'):
- """
- Calculates the T-test on TWO RELATED samples of scores, a and b.
- Parameters
- ----------
- a, b : array_like
- The arrays must have the same shape.
- axis : int or None, optional
- Axis along which to compute test. If None, compute over the whole
- arrays, `a`, and `b`.
- alternative : {'two-sided', 'less', 'greater'}, optional
- Defines the alternative hypothesis.
- The following options are available (default is 'two-sided'):
- * 'two-sided': the means of the distributions underlying the samples
- are unequal.
- * 'less': the mean of the distribution underlying the first sample
- is less than the mean of the distribution underlying the second
- sample.
- * 'greater': the mean of the distribution underlying the first
- sample is greater than the mean of the distribution underlying
- the second sample.
- .. versionadded:: 1.7.0
- Returns
- -------
- statistic : float or array
- t-statistic
- pvalue : float or array
- The p-value (its sidedness follows `alternative`).
- Notes
- -----
- For more details on `ttest_rel`, see `scipy.stats.ttest_rel`.
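- Examples
- --------
- A minimal usage sketch on illustrative paired data (arbitrary values;
- a masked pair is discarded from the test):
- >>> import numpy as np
- >>> from scipy.stats import mstats
- >>> before = np.ma.array([10.1, 9.8, 11.2, 10.6], mask=[0, 0, 0, 1])
- >>> after = np.ma.array([9.9, 9.5, 10.8, 10.2], mask=[0, 0, 0, 1])
- >>> statistic, pvalue = mstats.ttest_rel(before, after)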
- """
- a, b, axis = _chk2_asarray(a, b, axis)
- if len(a) != len(b):
- raise ValueError('unequal length arrays')
- if a.size == 0 or b.size == 0:
- return Ttest_relResult(np.nan, np.nan)
- n = a.count(axis)
- df = ma.asanyarray(n-1.0)
- d = (a-b).astype('d')
- dm = d.mean(axis)
- v = d.var(axis=axis, ddof=1)
- denom = ma.sqrt(v / n)
- with np.errstate(divide='ignore', invalid='ignore'):
- t = dm / denom
- t, prob = scipy.stats._stats_py._ttest_finish(df, t, alternative)
- return Ttest_relResult(t, prob)
- MannwhitneyuResult = namedtuple('MannwhitneyuResult', ('statistic',
- 'pvalue'))
- def mannwhitneyu(x,y, use_continuity=True):
- """
- Computes the Mann-Whitney statistic.
- Missing values in `x` and/or `y` are discarded.
- Parameters
- ----------
- x : sequence
- Input
- y : sequence
- Input
- use_continuity : {True, False}, optional
- Whether a continuity correction (1/2.) should be taken into account.
- Returns
- -------
- statistic : float
- The minimum of the Mann-Whitney statistics
- pvalue : float
- Approximate two-sided p-value assuming a normal distribution.
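- Examples
- --------
- A minimal usage sketch on illustrative data (arbitrary values):
- >>> from scipy.stats.mstats import mannwhitneyu
- >>> x = [2.1, 3.5, 2.8, 4.0, 3.3]
- >>> y = [3.9, 4.4, 4.1, 5.0]
- >>> statistic, pvalue = mannwhitneyu(x, y)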
- """
- x = ma.asarray(x).compressed().view(ndarray)
- y = ma.asarray(y).compressed().view(ndarray)
- ranks = rankdata(np.concatenate([x,y]))
- (nx, ny) = (len(x), len(y))
- nt = nx + ny
- U = ranks[:nx].sum() - nx*(nx+1)/2.
- U = max(U, nx*ny - U)
- u = nx*ny - U
- mu = (nx*ny)/2.
- sigsq = (nt**3 - nt)/12.
- ties = count_tied_groups(ranks)
- sigsq -= sum(v*(k**3-k) for (k,v) in ties.items())/12.
- sigsq *= nx*ny/float(nt*(nt-1))
- if use_continuity:
- z = (U - 1/2. - mu) / ma.sqrt(sigsq)
- else:
- z = (U - mu) / ma.sqrt(sigsq)
- prob = special.erfc(abs(z)/np.sqrt(2))
- return MannwhitneyuResult(u, prob)
- KruskalResult = namedtuple('KruskalResult', ('statistic', 'pvalue'))
- def kruskal(*args):
- """
- Compute the Kruskal-Wallis H-test for independent samples
- Parameters
- ----------
- sample1, sample2, ... : array_like
- Two or more arrays with the sample measurements can be given as
- arguments.
- Returns
- -------
- statistic : float
- The Kruskal-Wallis H statistic, corrected for ties
- pvalue : float
- The p-value for the test using the assumption that H has a chi-square
- distribution.
- Notes
- -----
- For more details on `kruskal`, see `scipy.stats.kruskal`.
- Examples
- --------
- >>> from scipy.stats.mstats import kruskal
- Random samples from three different brands of batteries were tested
- to see how long the charge lasted. Results were as follows:
- >>> a = [6.3, 5.4, 5.7, 5.2, 5.0]
- >>> b = [6.9, 7.0, 6.1, 7.9]
- >>> c = [7.2, 6.9, 6.1, 6.5]
- Test the hypothesis that the distribution functions for all of the brands'
- durations are identical. Use a 5% level of significance.
- >>> kruskal(a, b, c)
- KruskalResult(statistic=7.113812154696133, pvalue=0.028526948491942164)
- The null hypothesis is rejected at the 5% level of significance
- because the returned p-value is less than the significance level of 5%.
- """
- output = argstoarray(*args)
- ranks = ma.masked_equal(rankdata(output, use_missing=False), 0)
- sumrk = ranks.sum(-1)
- ngrp = ranks.count(-1)
- ntot = ranks.count()
- H = 12./(ntot*(ntot+1)) * (sumrk**2/ngrp).sum() - 3*(ntot+1)
- # Tie correction
- ties = count_tied_groups(ranks)
- T = 1. - sum(v*(k**3-k) for (k,v) in ties.items())/float(ntot**3-ntot)
- if T == 0:
- raise ValueError('All numbers are identical in kruskal')
- H /= T
- df = len(output) - 1
- prob = distributions.chi2.sf(H, df)
- return KruskalResult(H, prob)
- kruskalwallis = kruskal
- @_rename_parameter("mode", "method")
- def ks_1samp(x, cdf, args=(), alternative="two-sided", method='auto'):
- """
- Computes the Kolmogorov-Smirnov test on one sample of masked values.
- Missing values in `x` are discarded.
- Parameters
- ----------
- x : array_like
- a 1-D array of observations of random variables.
- cdf : str or callable
- If a string, it should be the name of a distribution in `scipy.stats`.
- If a callable, that callable is used to calculate the cdf.
- args : tuple, sequence, optional
- Distribution parameters, used if `cdf` is a string.
- alternative : {'two-sided', 'less', 'greater'}, optional
- Indicates the alternative hypothesis. Default is 'two-sided'.
- method : {'auto', 'exact', 'asymp'}, optional
- Defines the method used for calculating the p-value.
- The following options are available (default is 'auto'):
- * 'auto' : use 'exact' for small size arrays, 'asymp' for large
- * 'exact' : use an approximation to the exact distribution of the
- test statistic
- * 'asymp' : use the asymptotic distribution of the test statistic
- Returns
- -------
- d : float
- Value of the Kolmogorov-Smirnov test statistic.
- p : float
- Corresponding p-value.
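- Examples
- --------
- A minimal usage sketch, testing an illustrative sample against the
- standard normal cdf (the seed is arbitrary):
- >>> import numpy as np
- >>> from scipy.stats import mstats, norm
- >>> rng = np.random.default_rng(1234)
- >>> x = rng.standard_normal(25)
- >>> d, p = mstats.ks_1samp(x, norm.cdf)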
- """
- alternative = {'t': 'two-sided', 'g': 'greater', 'l': 'less'}.get(
- alternative.lower()[0], alternative)
- return scipy.stats._stats_py.ks_1samp(
- x, cdf, args=args, alternative=alternative, method=method)
- @_rename_parameter("mode", "method")
- def ks_2samp(data1, data2, alternative="two-sided", method='auto'):
- """
- Computes the Kolmogorov-Smirnov test on two samples.
- Missing values in `data1` and/or `data2` are discarded.
- Parameters
- ----------
- data1 : array_like
- First data set
- data2 : array_like
- Second data set
- alternative : {'two-sided', 'less', 'greater'}, optional
- Indicates the alternative hypothesis. Default is 'two-sided'.
- method : {'auto', 'exact', 'asymp'}, optional
- Defines the method used for calculating the p-value.
- The following options are available (default is 'auto'):
- * 'auto' : use 'exact' for small size arrays, 'asymp' for large
- * 'exact' : use an approximation to the exact distribution of the
- test statistic
- * 'asymp' : use the asymptotic distribution of the test statistic
- Returns
- -------
- d : float
- Value of the Kolmogorov-Smirnov test statistic.
- p : float
- Corresponding p-value.
- """
- # Ideally this would be accomplished by
- # ks_2samp = scipy.stats._stats_py.ks_2samp
- # but the circular dependencies between _mstats_basic and stats prevent that.
- alternative = {'t': 'two-sided', 'g': 'greater', 'l': 'less'}.get(
- alternative.lower()[0], alternative)
- return scipy.stats._stats_py.ks_2samp(data1, data2,
- alternative=alternative,
- method=method)
- ks_twosamp = ks_2samp
- @_rename_parameter("mode", "method")
- def kstest(data1, data2, args=(), alternative='two-sided', method='auto'):
- """
- Parameters
- ----------
- data1 : array_like
- The first sample of observations.
- data2 : str, callable or array_like
- Name of a distribution in `scipy.stats`, a callable used as the cdf,
- or a second sample of observations.
- args : tuple, sequence, optional
- Distribution parameters, used if `data1` or `data2` are strings.
- alternative : str, as documented in stats.kstest
- method : str, as documented in stats.kstest
- Returns
- -------
- tuple of (K-S statistic, probability)
- """
- return scipy.stats._stats_py.kstest(data1, data2, args,
- alternative=alternative, method=method)
- def trima(a, limits=None, inclusive=(True,True)):
- """
- Trims an array by masking the data outside some given limits.
- Returns a masked version of the input array.
- Parameters
- ----------
- a : array_like
- Input array.
- limits : {None, tuple}, optional
- Tuple of (lower limit, upper limit) in absolute values.
- Values of the input array lower (greater) than the lower (upper) limit
- will be masked. A limit of None indicates an open interval.
- inclusive : (bool, bool) tuple, optional
- Tuple of (lower flag, upper flag), indicating whether values exactly
- equal to the lower (upper) limit are allowed.
- Examples
- --------
- >>> from scipy.stats.mstats import trima
- >>> import numpy as np
- >>> a = np.arange(10)
- The interval is left-closed and right-open, i.e., `[2, 8)`.
- Trim the array by keeping only values in the interval.
- >>> trima(a, limits=(2, 8), inclusive=(True, False))
- masked_array(data=[--, --, 2, 3, 4, 5, 6, 7, --, --],
- mask=[ True, True, False, False, False, False, False, False,
- True, True],
- fill_value=999999)
- """
- a = ma.asarray(a)
- a.unshare_mask()
- if (limits is None) or (limits == (None, None)):
- return a
- (lower_lim, upper_lim) = limits
- (lower_in, upper_in) = inclusive
- condition = False
- if lower_lim is not None:
- if lower_in:
- condition |= (a < lower_lim)
- else:
- condition |= (a <= lower_lim)
- if upper_lim is not None:
- if upper_in:
- condition |= (a > upper_lim)
- else:
- condition |= (a >= upper_lim)
- a[condition.filled(True)] = masked
- return a
- def trimr(a, limits=None, inclusive=(True, True), axis=None):
- """
- Trims an array by masking some proportion of the data on each end.
- Returns a masked version of the input array.
- Parameters
- ----------
- a : sequence
- Input array.
- limits : {None, tuple}, optional
- Tuple of the percentages to cut on each side of the array, with respect
- to the number of unmasked data, as floats between 0. and 1.
- If n is the number of unmasked data before trimming, the
- (n*limits[0])th smallest data and the (n*limits[1])th largest data are
- masked, and the total number of unmasked data after trimming is
- n*(1.-sum(limits)). The value of one limit can be set to None to
- indicate an open interval.
- inclusive : {(True,True) tuple}, optional
- Tuple of flags indicating whether the number of data being masked on
- the left (right) end should be truncated (True) or rounded (False) to
- integers.
- axis : {None,int}, optional
- Axis along which to trim. If None, the whole array is trimmed, but its
- shape is maintained.
- """
- def _trimr1D(a, low_limit, up_limit, low_inclusive, up_inclusive):
- n = a.count()
- idx = a.argsort()
- if low_limit:
- if low_inclusive:
- lowidx = int(low_limit*n)
- else:
- lowidx = int(np.round(low_limit*n))
- a[idx[:lowidx]] = masked
- if up_limit is not None:
- if up_inclusive:
- upidx = n - int(n*up_limit)
- else:
- upidx = n - int(np.round(n*up_limit))
- a[idx[upidx:]] = masked
- return a
- a = ma.asarray(a)
- a.unshare_mask()
- if limits is None:
- return a
- # Check the limits
- (lolim, uplim) = limits
- errmsg = "The proportion to cut from the %s should be between 0. and 1."
- if lolim is not None:
- if lolim > 1. or lolim < 0:
- raise ValueError(errmsg % 'beginning' + " (got %s)" % lolim)
- if uplim is not None:
- if uplim > 1. or uplim < 0:
- raise ValueError(errmsg % 'end' + " (got %s)" % uplim)
- (loinc, upinc) = inclusive
- if axis is None:
- shp = a.shape
- return _trimr1D(a.ravel(),lolim,uplim,loinc,upinc).reshape(shp)
- else:
- return ma.apply_along_axis(_trimr1D, axis, a, lolim,uplim,loinc,upinc)
- trimdoc = """
- Parameters
- ----------
- a : sequence
- Input array
- limits : {None, tuple}, optional
- If `relative` is False, tuple (lower limit, upper limit) in absolute values.
- Values of the input array lower (greater) than the lower (upper) limit are
- masked.
- If `relative` is True, tuple (lower percentage, upper percentage) to cut
- on each side of the array, with respect to the number of unmasked data.
- If n is the number of unmasked data before trimming, the (n*limits[0])th
- smallest data and the (n*limits[1])th largest data are masked, and the
- total number of unmasked data after trimming is n*(1.-sum(limits)).
- In each case, the value of one limit can be set to None to indicate an
- open interval.
- If limits is None, no trimming is performed.
- inclusive : {(bool, bool) tuple}, optional
- If `relative` is False, tuple indicating whether values exactly equal
- to the absolute limits are allowed.
- If `relative` is True, tuple indicating whether the number of data
- being masked on each side should be truncated (True) or rounded
- (False).
- relative : bool, optional
- Whether to consider the limits as absolute values (False) or proportions
- to cut (True).
- axis : int, optional
- Axis along which to trim.
- """
- def trim(a, limits=None, inclusive=(True,True), relative=False, axis=None):
- """
- Trims an array by masking the data outside some given limits.
- Returns a masked version of the input array.
- %s
- Examples
- --------
- >>> from scipy.stats.mstats import trim
- >>> z = [ 1, 2, 3, 4, 5, 6, 7, 8, 9,10]
- >>> print(trim(z,(3,8)))
- [-- -- 3 4 5 6 7 8 -- --]
- >>> print(trim(z,(0.1,0.2),relative=True))
- [-- 2 3 4 5 6 7 8 -- --]
- """
- if relative:
- return trimr(a, limits=limits, inclusive=inclusive, axis=axis)
- else:
- return trima(a, limits=limits, inclusive=inclusive)
- if trim.__doc__:
- trim.__doc__ = trim.__doc__ % trimdoc
- def trimboth(data, proportiontocut=0.2, inclusive=(True,True), axis=None):
- """
- Trims the smallest and largest data values.
- Trims the `data` by masking the ``int(proportiontocut * n)`` smallest and
- ``int(proportiontocut * n)`` largest values of data along the given axis,
- where n is the number of unmasked values before trimming.
- Parameters
- ----------
- data : ndarray
- Data to trim.
- proportiontocut : float, optional
- Percentage of trimming (as a float between 0 and 1).
- If n is the number of unmasked values before trimming, the number of
- values after trimming is ``(1 - 2*proportiontocut) * n``.
- Default is 0.2.
- inclusive : {(bool, bool) tuple}, optional
- Tuple indicating whether the number of data being masked on each side
- should be truncated (True) or rounded (False).
- axis : int, optional
- Axis along which to perform the trimming.
- If None, the input array is first flattened.
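- Examples
- --------
- A minimal usage sketch (illustrative integer data):
- >>> import numpy as np
- >>> from scipy.stats.mstats import trimboth
- >>> z = np.ma.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
- >>> trimmed = trimboth(z, proportiontocut=0.2)  # masks 2 lowest, 2 highest
- >>> int(trimmed.count())
- 6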
- """
- return trimr(data, limits=(proportiontocut,proportiontocut),
- inclusive=inclusive, axis=axis)
- def trimtail(data, proportiontocut=0.2, tail='left', inclusive=(True,True),
- axis=None):
- """
- Trims the data by masking values from one tail.
- Parameters
- ----------
- data : array_like
- Data to trim.
- proportiontocut : float, optional
- Percentage of trimming. If n is the number of unmasked values
- before trimming, the number of values after trimming is
- ``(1 - proportiontocut) * n``. Default is 0.2.
- tail : {'left','right'}, optional
- If 'left' the `proportiontocut` lowest values will be masked.
- If 'right' the `proportiontocut` highest values will be masked.
- Default is 'left'.
- inclusive : {(bool, bool) tuple}, optional
- Tuple indicating whether the number of data being masked on each side
- should be truncated (True) or rounded (False). Default is
- (True, True).
- axis : int, optional
- Axis along which to perform the trimming.
- If None, the input array is first flattened. Default is None.
- Returns
- -------
- trimtail : ndarray
- Returned array of same shape as `data` with masked tail values.
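- Examples
- --------
- A minimal usage sketch (illustrative integer data):
- >>> import numpy as np
- >>> from scipy.stats.mstats import trimtail
- >>> z = np.ma.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
- >>> trimmed = trimtail(z, proportiontocut=0.2, tail='left')
- >>> int(trimmed.count())  # the 2 lowest values are now masked
- 8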
- """
- tail = str(tail).lower()[0]
- if tail == 'l':
- limits = (proportiontocut,None)
- elif tail == 'r':
- limits = (None, proportiontocut)
- else:
- raise TypeError("The tail argument should be in ('left','right')")
- return trimr(data, limits=limits, axis=axis, inclusive=inclusive)
- trim1 = trimtail
- def trimmed_mean(a, limits=(0.1,0.1), inclusive=(1,1), relative=True,
- axis=None):
- """Returns the trimmed mean of the data along the given axis.
- %s
- """
- if (not isinstance(limits,tuple)) and isinstance(limits,float):
- limits = (limits, limits)
- if relative:
- return trimr(a,limits=limits,inclusive=inclusive,axis=axis).mean(axis=axis)
- else:
- return trima(a,limits=limits,inclusive=inclusive).mean(axis=axis)
- if trimmed_mean.__doc__:
- trimmed_mean.__doc__ = trimmed_mean.__doc__ % trimdoc
- def trimmed_var(a, limits=(0.1,0.1), inclusive=(1,1), relative=True,
- axis=None, ddof=0):
- """Returns the trimmed variance of the data along the given axis.
- %s
- ddof : {0,integer}, optional
- Delta Degrees of Freedom. The denominator used during computations
- is (n-ddof). DDOF=0 corresponds to a biased estimate, DDOF=1 to an
- unbiased estimate of the variance.
- """
- if (not isinstance(limits,tuple)) and isinstance(limits,float):
- limits = (limits, limits)
- if relative:
- out = trimr(a,limits=limits, inclusive=inclusive,axis=axis)
- else:
- out = trima(a,limits=limits,inclusive=inclusive)
- return out.var(axis=axis, ddof=ddof)
- if trimmed_var.__doc__:
- trimmed_var.__doc__ = trimmed_var.__doc__ % trimdoc
- def trimmed_std(a, limits=(0.1,0.1), inclusive=(1,1), relative=True,
- axis=None, ddof=0):
- """Returns the trimmed standard deviation of the data along the given axis.
- %s
- ddof : {0,integer}, optional
- Delta Degrees of Freedom. The denominator used during computations
- is (n-ddof). DDOF=0 corresponds to a biased estimate, DDOF=1 to an
- unbiased estimate of the variance.
- """
- if (not isinstance(limits,tuple)) and isinstance(limits,float):
- limits = (limits, limits)
- if relative:
- out = trimr(a,limits=limits,inclusive=inclusive,axis=axis)
- else:
- out = trima(a,limits=limits,inclusive=inclusive)
- return out.std(axis=axis,ddof=ddof)
- if trimmed_std.__doc__:
- trimmed_std.__doc__ = trimmed_std.__doc__ % trimdoc
- def trimmed_stde(a, limits=(0.1,0.1), inclusive=(1,1), axis=None):
- """
- Returns the standard error of the trimmed mean along the given axis.
- Parameters
- ----------
- a : sequence
- Input array
- limits : {(0.1,0.1), tuple of float}, optional
- tuple (lower percentage, upper percentage) to cut on each side of the
- array, with respect to the number of unmasked data.
- If n is the number of unmasked data before trimming, the
- ``n * limits[0]`` smallest values and the ``n * limits[1]``
- largest values are masked, and the total number of unmasked
- data after trimming is ``n * (1.-sum(limits))``. In each case,
- the value of one limit can be set to None to indicate an open interval.
- If `limits` is None, no trimming is performed.
- inclusive : {(bool, bool) tuple}, optional
- Tuple indicating whether the number of data being masked on each side
- should be truncated (True) or rounded (False).
- axis : int, optional
- Axis along which to trim.
- Returns
- -------
- trimmed_stde : scalar or ndarray
- """
- def _trimmed_stde_1D(a, low_limit, up_limit, low_inclusive, up_inclusive):
- "Returns the standard error of the trimmed mean for a 1D input data."
- # Treat a missing limit as trimming nothing on that side, so that the
- # indices below are always defined and the final formula is valid.
- low_limit = low_limit or 0.
- up_limit = up_limit or 0.
- n = a.count()
- idx = a.argsort()
- if low_inclusive:
- lowidx = int(low_limit*n)
- else:
- lowidx = int(np.round(low_limit*n))
- a[idx[:lowidx]] = masked
- if up_inclusive:
- upidx = n - int(n*up_limit)
- else:
- upidx = n - int(np.round(n*up_limit))
- a[idx[upidx:]] = masked
- # Winsorize the trimmed tails before computing the standard deviation
- a[idx[:lowidx]] = a[idx[lowidx]]
- a[idx[upidx:]] = a[idx[upidx-1]]
- winstd = a.std(ddof=1)
- return winstd / ((1.-low_limit-up_limit)*np.sqrt(len(a)))
- a = ma.array(a, copy=True, subok=True)
- a.unshare_mask()
- if limits is None:
- return a.std(axis=axis,ddof=1)/ma.sqrt(a.count(axis))
- if (not isinstance(limits,tuple)) and isinstance(limits,float):
- limits = (limits, limits)
- # Check the limits
- (lolim, uplim) = limits
- errmsg = "The proportion to cut from the %s should be between 0. and 1."
- if lolim is not None:
- if lolim > 1. or lolim < 0:
- raise ValueError(errmsg % 'beginning' + " (got %s)" % lolim)
- if uplim is not None:
- if uplim > 1. or uplim < 0:
- raise ValueError(errmsg % 'end' + " (got %s)" % uplim)
- (loinc, upinc) = inclusive
- if (axis is None):
- return _trimmed_stde_1D(a.ravel(),lolim,uplim,loinc,upinc)
- else:
- if a.ndim > 2:
- raise ValueError("Array 'a' must be at most two dimensional, "
- "but got a.ndim = %d" % a.ndim)
- return ma.apply_along_axis(_trimmed_stde_1D, axis, a,
- lolim,uplim,loinc,upinc)
- def _mask_to_limits(a, limits, inclusive):
- """Mask an array for values outside of given limits.
- This is primarily a utility function.
- Parameters
- ----------
- a : array
- limits : (float or None, float or None)
- A tuple consisting of the (lower limit, upper limit). Values in the
- input array less than the lower limit or greater than the upper limit
- will be masked out. None implies no limit.
- inclusive : (bool, bool)
- A tuple consisting of the (lower flag, upper flag). These flags
- determine whether values exactly equal to lower or upper are allowed.
- Returns
- -------
- A MaskedArray.
- Raises
- ------
- A ValueError if there are no values within the given limits.
- """
- lower_limit, upper_limit = limits
- lower_include, upper_include = inclusive
- am = ma.MaskedArray(a)
- if lower_limit is not None:
- if lower_include:
- am = ma.masked_less(am, lower_limit)
- else:
- am = ma.masked_less_equal(am, lower_limit)
- if upper_limit is not None:
- if upper_include:
- am = ma.masked_greater(am, upper_limit)
- else:
- am = ma.masked_greater_equal(am, upper_limit)
- if am.count() == 0:
- raise ValueError("No array values within given limits")
- return am
- def tmean(a, limits=None, inclusive=(True, True), axis=None):
- """
- Compute the trimmed mean.
- Parameters
- ----------
- a : array_like
- Array of values.
- limits : None or (lower limit, upper limit), optional
- Values in the input array less than the lower limit or greater than the
- upper limit will be ignored. When limits is None (default), then all
- values are used. Either of the limit values in the tuple can also be
- None representing a half-open interval.
- inclusive : (bool, bool), optional
- A tuple consisting of the (lower flag, upper flag). These flags
- determine whether values exactly equal to the lower or upper limits
- are included. The default value is (True, True).
- axis : int or None, optional
- Axis along which to operate. If None, compute over the
- whole array. Default is None.
- Returns
- -------
- tmean : float
- Notes
- -----
- For more details on `tmean`, see `scipy.stats.tmean`.
- Examples
- --------
- >>> import numpy as np
- >>> from scipy.stats import mstats
- >>> a = np.array([[6, 8, 3, 0],
- ... [3, 9, 1, 2],
- ... [8, 7, 8, 2],
- ... [5, 6, 0, 2],
- ... [4, 5, 5, 2]])
- ...
- >>> mstats.tmean(a, (2,5))
- 3.3
- >>> mstats.tmean(a, (2,5), axis=0)
- masked_array(data=[4.0, 5.0, 4.0, 2.0],
- mask=[False, False, False, False],
- fill_value=1e+20)
- """
- return trima(a, limits=limits, inclusive=inclusive).mean(axis=axis)
- def tvar(a, limits=None, inclusive=(True, True), axis=0, ddof=1):
- """
- Compute the trimmed variance
- This function computes the sample variance of an array of values,
- while ignoring values which are outside of given `limits`.
- Parameters
- ----------
- a : array_like
- Array of values.
- limits : None or (lower limit, upper limit), optional
- Values in the input array less than the lower limit or greater than the
- upper limit will be ignored. When limits is None, then all values are
- used. Either of the limit values in the tuple can also be None
- representing a half-open interval. The default value is None.
- inclusive : (bool, bool), optional
- A tuple consisting of the (lower flag, upper flag). These flags
- determine whether values exactly equal to the lower or upper limits
- are included. The default value is (True, True).
- axis : int or None, optional
- Axis along which to operate. If None, compute over the
- whole array. Default is zero.
- ddof : int, optional
- Delta degrees of freedom. Default is 1.
- Returns
- -------
- tvar : float
- Trimmed variance.
- Notes
- -----
- For more details on `tvar`, see `scipy.stats.tvar`.
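- Examples
- --------
- A minimal usage sketch (illustrative data; the limits are arbitrary):
- >>> import numpy as np
- >>> from scipy.stats import mstats
- >>> x = np.ma.array([1., 2., 3., 4., 5., 100.])
- >>> v = mstats.tvar(x, limits=(0, 10))  # the outlier 100. is excluded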
- """
- a = a.astype(float).ravel()
- if limits is None:
- n = (~a.mask).sum() # todo: better way to do that?
- return np.ma.var(a) * n/(n-1.)
- am = _mask_to_limits(a, limits=limits, inclusive=inclusive)
- return np.ma.var(am, axis=axis, ddof=ddof)
- def tmin(a, lowerlimit=None, axis=0, inclusive=True):
- """
- Compute the trimmed minimum
- Parameters
- ----------
- a : array_like
- array of values
- lowerlimit : None or float, optional
- Values in the input array less than the given limit will be ignored.
- When lowerlimit is None, then all values are used. The default value
- is None.
- axis : int or None, optional
- Axis along which to operate. Default is 0. If None, compute over the
- whole array `a`.
- inclusive : {True, False}, optional
- This flag determines whether values exactly equal to the lower limit
- are included. The default value is True.
- Returns
- -------
- tmin : float, int or ndarray
- Notes
- -----
- For more details on `tmin`, see `scipy.stats.tmin`.
- Examples
- --------
- >>> import numpy as np
- >>> from scipy.stats import mstats
- >>> a = np.array([[6, 8, 3, 0],
- ... [3, 2, 1, 2],
- ... [8, 1, 8, 2],
- ... [5, 3, 0, 2],
- ... [4, 7, 5, 2]])
- ...
- >>> mstats.tmin(a, 5)
- masked_array(data=[5, 7, 5, --],
- mask=[False, False, False, True],
- fill_value=999999)
- """
- a, axis = _chk_asarray(a, axis)
- am = trima(a, (lowerlimit, None), (inclusive, False))
- return ma.minimum.reduce(am, axis)
- def tmax(a, upperlimit=None, axis=0, inclusive=True):
- """
- Compute the trimmed maximum
- This function computes the maximum value of an array along a given axis,
- while ignoring values larger than a specified upper limit.
- Parameters
- ----------
- a : array_like
- array of values
- upperlimit : None or float, optional
- Values in the input array greater than the given limit will be ignored.
- When upperlimit is None, then all values are used. The default value
- is None.
- axis : int or None, optional
- Axis along which to operate. Default is 0. If None, compute over the
- whole array `a`.
- inclusive : {True, False}, optional
- This flag determines whether values exactly equal to the upper limit
- are included. The default value is True.
- Returns
- -------
- tmax : float, int or ndarray
- Notes
- -----
- For more details on `tmax`, see `scipy.stats.tmax`.
- Examples
- --------
- >>> import numpy as np
- >>> from scipy.stats import mstats
- >>> a = np.array([[6, 8, 3, 0],
- ... [3, 9, 1, 2],
- ... [8, 7, 8, 2],
- ... [5, 6, 0, 2],
- ... [4, 5, 5, 2]])
- ...
- >>> mstats.tmax(a, 4)
- masked_array(data=[4, --, 3, 2],
- mask=[False, True, False, False],
- fill_value=999999)
- """
- a, axis = _chk_asarray(a, axis)
- am = trima(a, (None, upperlimit), (False, inclusive))
- return ma.maximum.reduce(am, axis)
- def tsem(a, limits=None, inclusive=(True, True), axis=0, ddof=1):
- """
- Compute the trimmed standard error of the mean.
- This function finds the standard error of the mean for given
- values, ignoring values outside the given `limits`.
- Parameters
- ----------
- a : array_like
- array of values
- limits : None or (lower limit, upper limit), optional
- Values in the input array less than the lower limit or greater than the
- upper limit will be ignored. When limits is None, then all values are
- used. Either of the limit values in the tuple can also be None
- representing a half-open interval. The default value is None.
- inclusive : (bool, bool), optional
- A tuple consisting of the (lower flag, upper flag). These flags
- determine whether values exactly equal to the lower or upper limits
- are included. The default value is (True, True).
- axis : int or None, optional
- Axis along which to operate. If None, compute over the
- whole array. Default is zero.
- ddof : int, optional
- Delta degrees of freedom. Default is 1.
- Returns
- -------
- tsem : float
- Notes
- -----
- For more details on `tsem`, see `scipy.stats.tsem`.
- """
- a = ma.asarray(a).ravel()
- if limits is None:
- n = float(a.count())
- return a.std(axis=axis, ddof=ddof)/ma.sqrt(n)
- am = trima(a.ravel(), limits, inclusive)
- sd = np.sqrt(am.var(axis=axis, ddof=ddof))
- return sd / np.sqrt(am.count())
- def winsorize(a, limits=None, inclusive=(True, True), inplace=False,
- axis=None, nan_policy='propagate'):
- """Returns a Winsorized version of the input array.
- The fraction ``limits[0]`` of the lowest values is set to the
- ``limits[0]``th percentile, and the fraction ``limits[1]`` of the
- highest values is set to the ``(1 - limits[1])``th percentile.
- Masked values are skipped.
- Parameters
- ----------
- a : sequence
- Input array.
- limits : {None, tuple of float}, optional
- Tuple of the percentages to cut on each side of the array, with respect
- to the number of unmasked data, as floats between 0. and 1.
- If n is the number of unmasked data before trimming, the
- (n*limits[0])th smallest data and the (n*limits[1])th largest data are
- masked, and the total number of unmasked data after trimming
- is n*(1.-sum(limits)). The value of one limit can be set to None to
- indicate an open interval.
- inclusive : {(True, True) tuple}, optional
- Tuple indicating whether the number of data being masked on each side
- should be truncated (True) or rounded (False).
- inplace : {False, True}, optional
- Whether to winsorize in place (True) or to use a copy (False)
- axis : {None, int}, optional
- Axis along which to trim. If None, the whole array is trimmed, but its
- shape is maintained.
- nan_policy : {'propagate', 'raise', 'omit'}, optional
- Defines how to handle when input contains nan.
- The following options are available (default is 'propagate'):
- * 'propagate': allows nan values and may overwrite or propagate them
- * 'raise': throws an error
- * 'omit': performs the calculations ignoring nan values
- Notes
- -----
- This function is applied to reduce the effect of possibly spurious outliers
- by limiting the extreme values.
- Examples
- --------
- >>> import numpy as np
- >>> from scipy.stats.mstats import winsorize
- A shuffled array contains integers from 1 to 10.
- >>> a = np.array([10, 4, 9, 8, 5, 3, 7, 2, 1, 6])
- The lowest 10% of values (i.e., `1`) and the highest 20% of
- values (i.e., `9` and `10`) are replaced.
- >>> winsorize(a, limits=[0.1, 0.2])
- masked_array(data=[8, 4, 8, 8, 5, 3, 7, 2, 2, 6],
- mask=False,
- fill_value=999999)
- """
- def _winsorize1D(a, low_limit, up_limit, low_include, up_include,
- contains_nan, nan_policy):
- n = a.count()
- idx = a.argsort()
- if contains_nan:
- nan_count = np.count_nonzero(np.isnan(a))
- if low_limit:
- if low_include:
- lowidx = int(low_limit * n)
- else:
- lowidx = np.round(low_limit * n).astype(int)
- if contains_nan and nan_policy == 'omit':
- lowidx = min(lowidx, n-nan_count-1)
- a[idx[:lowidx]] = a[idx[lowidx]]
- if up_limit is not None:
- if up_include:
- upidx = n - int(n * up_limit)
- else:
- upidx = n - np.round(n * up_limit).astype(int)
- if contains_nan and nan_policy == 'omit':
- a[idx[upidx:-nan_count]] = a[idx[upidx - 1]]
- else:
- a[idx[upidx:]] = a[idx[upidx - 1]]
- return a
- contains_nan, nan_policy = _contains_nan(a, nan_policy)
- # We are going to modify a: better make a copy
- a = ma.array(a, copy=np.logical_not(inplace))
- if limits is None:
- return a
- if (not isinstance(limits, tuple)) and isinstance(limits, float):
- limits = (limits, limits)
- # Check the limits
- (lolim, uplim) = limits
- errmsg = "The proportion to cut from the %s should be between 0. and 1."
- if lolim is not None:
- if lolim > 1. or lolim < 0:
- raise ValueError(errmsg % 'beginning' + " (got %s)" % lolim)
- if uplim is not None:
- if uplim > 1. or uplim < 0:
- raise ValueError(errmsg % 'end' + " (got %s)" % uplim)
- (loinc, upinc) = inclusive
- if axis is None:
- shp = a.shape
- return _winsorize1D(a.ravel(), lolim, uplim, loinc, upinc,
- contains_nan, nan_policy).reshape(shp)
- else:
- return ma.apply_along_axis(_winsorize1D, axis, a, lolim, uplim, loinc,
- upinc, contains_nan, nan_policy)
- def moment(a, moment=1, axis=0):
- """
- Calculates the nth moment about the mean for a sample.
- Parameters
- ----------
- a : array_like
- data
- moment : int, optional
- order of central moment that is returned
- axis : int or None, optional
- Axis along which the central moment is computed. Default is 0.
- If None, compute over the whole array `a`.
- Returns
- -------
- n-th central moment : ndarray or float
- The appropriate moment along the given axis or over all values if axis
- is None. The denominator for the moment calculation is the number of
- observations; no degrees of freedom correction is done.
- Notes
- -----
- For more details about `moment`, see `scipy.stats.moment`.
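- Examples
- --------
- A minimal usage sketch (illustrative data; the masked entry is ignored):
- >>> import numpy as np
- >>> from scipy.stats import mstats
- >>> a = np.ma.array([0., 1., 2., 3., 4.], mask=[0, 0, 0, 0, 1])
- >>> m2 = mstats.moment(a, moment=2)  # second central moment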
- """
- a, axis = _chk_asarray(a, axis)
- if a.size == 0:
- moment_shape = list(a.shape)
- del moment_shape[axis]
- dtype = a.dtype.type if a.dtype.kind in 'fc' else np.float64
- # empty array, return nan(s) with shape matching `moment`
- out_shape = (moment_shape if np.isscalar(moment)
- else [len(moment)] + moment_shape)
- if len(out_shape) == 0:
- return dtype(np.nan)
- else:
- return ma.array(np.full(out_shape, np.nan, dtype=dtype))
- # for array_like moment input, return a value for each.
- if not np.isscalar(moment):
- mean = a.mean(axis, keepdims=True)
- mmnt = [_moment(a, i, axis, mean=mean) for i in moment]
- return ma.array(mmnt)
- else:
- return _moment(a, moment, axis)
- # Moment with optional pre-computed mean, equal to a.mean(axis, keepdims=True)
- def _moment(a, moment, axis, *, mean=None):
- if np.abs(moment - np.round(moment)) > 0:
- raise ValueError("All moment parameters must be integers")
- if moment == 0 or moment == 1:
- # By definition the zeroth moment about the mean is 1, and the first
- # moment is 0.
- shape = list(a.shape)
- del shape[axis]
- dtype = a.dtype.type if a.dtype.kind in 'fc' else np.float64
- if len(shape) == 0:
- return dtype(1.0 if moment == 0 else 0.0)
- else:
- return (ma.ones(shape, dtype=dtype) if moment == 0
- else ma.zeros(shape, dtype=dtype))
- else:
- # Exponentiation by squares: form exponent sequence
- n_list = [moment]
- current_n = moment
- while current_n > 2:
- if current_n % 2:
- current_n = (current_n-1)/2
- else:
- current_n /= 2
- n_list.append(current_n)
- # Starting point for exponentiation by squares
- mean = a.mean(axis, keepdims=True) if mean is None else mean
- a_zero_mean = a - mean
- if n_list[-1] == 1:
- s = a_zero_mean.copy()
- else:
- s = a_zero_mean**2
- # Perform multiplications
- for n in n_list[-2::-1]:
- s = s**2
- if n % 2:
- s *= a_zero_mean
- return s.mean(axis)
- def variation(a, axis=0, ddof=0):
- """
- Compute the coefficient of variation.
- The coefficient of variation is the standard deviation divided by the
- mean. This function is equivalent to::
- np.std(x, axis=axis, ddof=ddof) / np.mean(x)
- The default for ``ddof`` is 0, but many definitions of the coefficient
- of variation use the square root of the unbiased sample variance
- for the sample standard deviation, which corresponds to ``ddof=1``.
- Parameters
- ----------
- a : array_like
- Input array.
- axis : int or None, optional
- Axis along which to calculate the coefficient of variation. Default
- is 0. If None, compute over the whole array `a`.
- ddof : int, optional
- Delta degrees of freedom. Default is 0.
- Returns
- -------
- variation : ndarray
- The calculated variation along the requested axis.
- Notes
- -----
- For more details about `variation`, see `scipy.stats.variation`.
- Examples
- --------
- >>> import numpy as np
- >>> from scipy.stats.mstats import variation
- >>> a = np.array([2,8,4])
- >>> variation(a)
- 0.5345224838248487
- >>> b = np.array([2,8,3,4])
- >>> c = np.ma.masked_array(b, mask=[0,0,1,0])
- >>> variation(c)
- 0.5345224838248487
- In the example above, it can be seen that this works the same as
- `scipy.stats.variation` except `stats.mstats.variation` ignores masked
- array elements.
- """
- a, axis = _chk_asarray(a, axis)
- return a.std(axis, ddof=ddof)/a.mean(axis)
- def skew(a, axis=0, bias=True):
- """
- Computes the skewness of a data set.
- Parameters
- ----------
- a : ndarray
- data
- axis : int or None, optional
- Axis along which skewness is calculated. Default is 0.
- If None, compute over the whole array `a`.
- bias : bool, optional
- If False, then the calculations are corrected for statistical bias.
- Returns
- -------
- skewness : ndarray
- The skewness of values along an axis, returning 0 where all values are
- equal.
- Notes
- -----
- For more details about `skew`, see `scipy.stats.skew`.
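- Examples
- --------
- A minimal usage sketch (illustrative data):
- >>> import numpy as np
- >>> from scipy.stats import mstats
- >>> a = np.ma.array([1., 2., 3., 4., 20.])
- >>> g1 = mstats.skew(a)              # biased estimate
- >>> G1 = mstats.skew(a, bias=False)  # bias-corrected estimate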
- """
- a, axis = _chk_asarray(a,axis)
- mean = a.mean(axis, keepdims=True)
- m2 = _moment(a, 2, axis, mean=mean)
- m3 = _moment(a, 3, axis, mean=mean)
- zero = (m2 <= (np.finfo(m2.dtype).resolution * mean.squeeze(axis))**2)
- with np.errstate(all='ignore'):
- vals = ma.where(zero, 0, m3 / m2**1.5)
- if not bias and zero is not ma.masked and m2 is not ma.masked:
- n = a.count(axis)
- can_correct = ~zero & (n > 2)
- if can_correct.any():
- n = np.extract(can_correct, n)
- m2 = np.extract(can_correct, m2)
- m3 = np.extract(can_correct, m3)
- nval = ma.sqrt((n-1.0)*n)/(n-2.0)*m3/m2**1.5
- np.place(vals, can_correct, nval)
- return vals
- def kurtosis(a, axis=0, fisher=True, bias=True):
- """
- Computes the kurtosis (Fisher or Pearson) of a dataset.
- Kurtosis is the fourth central moment divided by the square of the
- variance. If Fisher's definition is used, then 3.0 is subtracted from
- the result to give 0.0 for a normal distribution.
- If bias is False then the kurtosis is calculated using k statistics to
- eliminate bias coming from biased moment estimators
- Use `kurtosistest` to see if result is close enough to normal.
- Parameters
- ----------
- a : array
- data for which the kurtosis is calculated
- axis : int or None, optional
- Axis along which the kurtosis is calculated. Default is 0.
- If None, compute over the whole array `a`.
- fisher : bool, optional
- If True, Fisher's definition is used (normal ==> 0.0). If False,
- Pearson's definition is used (normal ==> 3.0).
- bias : bool, optional
- If False, then the calculations are corrected for statistical bias.
- Returns
- -------
- kurtosis : array
- The kurtosis of values along an axis. If all values are equal,
- return -3 for Fisher's definition and 0 for Pearson's definition.
- Notes
- -----
- For more details about `kurtosis`, see `scipy.stats.kurtosis`.
- """
- a, axis = _chk_asarray(a, axis)
- mean = a.mean(axis, keepdims=True)
- m2 = _moment(a, 2, axis, mean=mean)
- m4 = _moment(a, 4, axis, mean=mean)
- zero = (m2 <= (np.finfo(m2.dtype).resolution * mean.squeeze(axis))**2)
- with np.errstate(all='ignore'):
- vals = ma.where(zero, 0, m4 / m2**2.0)
- if not bias and zero is not ma.masked and m2 is not ma.masked:
- n = a.count(axis)
- can_correct = ~zero & (n > 3)
- if can_correct.any():
- n = np.extract(can_correct, n)
- m2 = np.extract(can_correct, m2)
- m4 = np.extract(can_correct, m4)
- nval = 1.0/(n-2)/(n-3)*((n*n-1.0)*m4/m2**2.0-3*(n-1)**2.0)
- np.place(vals, can_correct, nval+3.0)
- if fisher:
- return vals - 3
- else:
- return vals
- DescribeResult = namedtuple('DescribeResult', ('nobs', 'minmax', 'mean',
- 'variance', 'skewness',
- 'kurtosis'))
- def describe(a, axis=0, ddof=0, bias=True):
- """
- Computes several descriptive statistics of the passed array.
- Parameters
- ----------
- a : array_like
- Data array
- axis : int or None, optional
- Axis along which to calculate statistics. Default 0. If None,
- compute over the whole array `a`.
- ddof : int, optional
- degrees of freedom (default 0); note that the default ddof is
- different from that of the same routine in stats.describe
- bias : bool, optional
- If False, then the skewness and kurtosis calculations are corrected for
- statistical bias.
- Returns
- -------
- nobs : int
- size of the data (discarding missing values)
- minmax : (int, int)
- min, max
- mean : float
- arithmetic mean
- variance : float
- unbiased variance
- skewness : float
- biased skewness
- kurtosis : float
- biased kurtosis
- Examples
- --------
- >>> import numpy as np
- >>> from scipy.stats.mstats import describe
- >>> ma = np.ma.array(range(6), mask=[0, 0, 0, 1, 1, 1])
- >>> describe(ma)
- DescribeResult(nobs=3, minmax=(masked_array(data=0,
- mask=False,
- fill_value=999999), masked_array(data=2,
- mask=False,
- fill_value=999999)), mean=1.0, variance=0.6666666666666666,
- skewness=masked_array(data=0., mask=False, fill_value=1e+20),
- kurtosis=-1.5)
- """
- a, axis = _chk_asarray(a, axis)
- n = a.count(axis)
- mm = (ma.minimum.reduce(a, axis=axis), ma.maximum.reduce(a, axis=axis))
- m = a.mean(axis)
- v = a.var(axis, ddof=ddof)
- sk = skew(a, axis, bias=bias)
- kurt = kurtosis(a, axis, bias=bias)
- return DescribeResult(n, mm, m, v, sk, kurt)
- def stde_median(data, axis=None):
- """Returns the McKean-Schrader estimate of the standard error of the sample
- median along the given axis. masked values are discarded.
- Parameters
- ----------
- data : ndarray
- Input data.
- axis : {None,int}, optional
- Axis along which to compute the standard error.
- If None, the input array is first flattened.
- """
- def _stdemed_1D(data):
- data = np.sort(data.compressed())
- n = len(data)
- z = 2.5758293035489004  # 0.995 quantile of the standard normal
- k = int(np.round((n+1)/2. - z * np.sqrt(n/4.),0))
- return ((data[n-k] - data[k-1])/(2.*z))
- data = ma.array(data, copy=False, subok=True)
- if (axis is None):
- return _stdemed_1D(data)
- else:
- if data.ndim > 2:
- raise ValueError("Array 'data' must be at most two dimensional, "
- "but got data.ndim = %d" % data.ndim)
- return ma.apply_along_axis(_stdemed_1D, axis, data)
- SkewtestResult = namedtuple('SkewtestResult', ('statistic', 'pvalue'))
- def skewtest(a, axis=0, alternative='two-sided'):
- """
- Tests whether the skew is different from the normal distribution.
- Parameters
- ----------
- a : array_like
- The data to be tested
- axis : int or None, optional
- Axis along which statistics are calculated. Default is 0.
- If None, compute over the whole array `a`.
- alternative : {'two-sided', 'less', 'greater'}, optional
- Defines the alternative hypothesis. Default is 'two-sided'.
- The following options are available:
- * 'two-sided': the skewness of the distribution underlying the sample
- is different from that of the normal distribution (i.e. 0)
- * 'less': the skewness of the distribution underlying the sample
- is less than that of the normal distribution
- * 'greater': the skewness of the distribution underlying the sample
- is greater than that of the normal distribution
- .. versionadded:: 1.7.0
- Returns
- -------
- statistic : array_like
- The computed z-score for this test.
- pvalue : array_like
- A p-value for the hypothesis test
- Notes
- -----
- For more details about `skewtest`, see `scipy.stats.skewtest`.
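- Examples
- --------
- A minimal usage sketch (illustrative data; at least 8 observations are
- required):
- >>> import numpy as np
- >>> from scipy.stats import mstats
- >>> a = np.ma.array([1., 2., 2., 3., 3., 3., 4., 4., 5., 9.])
- >>> statistic, pvalue = mstats.skewtest(a)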
- """
- a, axis = _chk_asarray(a, axis)
- if axis is None:
- a = a.ravel()
- axis = 0
- b2 = skew(a,axis)
- n = a.count(axis)
- if np.min(n) < 8:
- raise ValueError(
- "skewtest is not valid with less than 8 samples; %i samples"
- " were given." % np.min(n))
- y = b2 * ma.sqrt(((n+1)*(n+3)) / (6.0*(n-2)))
- beta2 = (3.0*(n*n+27*n-70)*(n+1)*(n+3)) / ((n-2.0)*(n+5)*(n+7)*(n+9))
- W2 = -1 + ma.sqrt(2*(beta2-1))
- delta = 1/ma.sqrt(0.5*ma.log(W2))
- alpha = ma.sqrt(2.0/(W2-1))
- y = ma.where(y == 0, 1, y)
- Z = delta*ma.log(y/alpha + ma.sqrt((y/alpha)**2+1))
- return SkewtestResult(*scipy.stats._stats_py._normtest_finish(Z, alternative))
- KurtosistestResult = namedtuple('KurtosistestResult', ('statistic', 'pvalue'))
- def kurtosistest(a, axis=0, alternative='two-sided'):
- """
- Tests whether a dataset has normal kurtosis.
- Parameters
- ----------
- a : array_like
- array of the sample data
- axis : int or None, optional
- Axis along which to compute test. Default is 0. If None,
- compute over the whole array `a`.
- alternative : {'two-sided', 'less', 'greater'}, optional
- Defines the alternative hypothesis.
- The following options are available (default is 'two-sided'):
- * 'two-sided': the kurtosis of the distribution underlying the sample
- is different from that of the normal distribution
- * 'less': the kurtosis of the distribution underlying the sample
- is less than that of the normal distribution
- * 'greater': the kurtosis of the distribution underlying the sample
- is greater than that of the normal distribution
- .. versionadded:: 1.7.0
- Returns
- -------
- statistic : array_like
- The computed z-score for this test.
- pvalue : array_like
- The p-value for the hypothesis test
- Notes
- -----
- For more details about `kurtosistest`, see `scipy.stats.kurtosistest`.
- """
- a, axis = _chk_asarray(a, axis)
- n = a.count(axis=axis)
- if np.min(n) < 5:
- raise ValueError(
- "kurtosistest requires at least 5 observations; %i observations"
- " were given." % np.min(n))
- if np.min(n) < 20:
- warnings.warn(
- "kurtosistest only valid for n>=20 ... continuing anyway, n=%i" %
- np.min(n))
- b2 = kurtosis(a, axis, fisher=False)
- E = 3.0*(n-1) / (n+1)
- varb2 = 24.0*n*(n-2.)*(n-3) / ((n+1)*(n+1.)*(n+3)*(n+5))
- x = (b2-E)/ma.sqrt(varb2)
- sqrtbeta1 = 6.0*(n*n-5*n+2)/((n+7)*(n+9)) * np.sqrt((6.0*(n+3)*(n+5)) /
- (n*(n-2)*(n-3)))
- A = 6.0 + 8.0/sqrtbeta1 * (2.0/sqrtbeta1 + np.sqrt(1+4.0/(sqrtbeta1**2)))
- term1 = 1 - 2./(9.0*A)
- denom = 1 + x*ma.sqrt(2/(A-4.0))
- if np.ma.isMaskedArray(denom):
- # For multi-dimensional array input
- denom[denom == 0.0] = masked
- elif denom == 0.0:
- denom = masked
- term2 = np.ma.where(denom > 0, ma.power((1-2.0/A)/denom, 1/3.0),
- -ma.power(-(1-2.0/A)/denom, 1/3.0))
- Z = (term1 - term2) / np.sqrt(2/(9.0*A))
- return KurtosistestResult(
- *scipy.stats._stats_py._normtest_finish(Z, alternative)
- )
- NormaltestResult = namedtuple('NormaltestResult', ('statistic', 'pvalue'))
- def normaltest(a, axis=0):
- """
- Tests whether a sample differs from a normal distribution.
- Parameters
- ----------
- a : array_like
- The array containing the data to be tested.
- axis : int or None, optional
- Axis along which to compute test. Default is 0. If None,
- compute over the whole array `a`.
- Returns
- -------
- statistic : float or array
- ``s^2 + k^2``, where ``s`` is the z-score returned by `skewtest` and
- ``k`` is the z-score returned by `kurtosistest`.
- pvalue : float or array
- A 2-sided chi squared probability for the hypothesis test.
- Notes
- -----
- For more details about `normaltest`, see `scipy.stats.normaltest`.
- """
- a, axis = _chk_asarray(a, axis)
- s, _ = skewtest(a, axis)
- k, _ = kurtosistest(a, axis)
- k2 = s*s + k*k
- return NormaltestResult(k2, distributions.chi2.sf(k2, 2))
- def mquantiles(a, prob=list([.25,.5,.75]), alphap=.4, betap=.4, axis=None,
- limit=()):
- """
- Computes empirical quantiles for a data array.
- Samples quantile are defined by ``Q(p) = (1-gamma)*x[j] + gamma*x[j+1]``,
- where ``x[j]`` is the j-th order statistic, and gamma is a function of
- ``j = floor(n*p + m)``, ``m = alphap + p*(1 - alphap - betap)`` and
- ``g = n*p + m - j``.
- Reinterpreting the above equations to compare to **R** leads to the
- equation: ``p(k) = (k - alphap)/(n + 1 - alphap - betap)``
- Typical values of (alphap,betap) are:
- - (0,1) : ``p(k) = k/n`` : linear interpolation of cdf
- (**R** type 4)
- - (.5,.5) : ``p(k) = (k - 1/2.)/n`` : piecewise linear function
- (**R** type 5)
- - (0,0) : ``p(k) = k/(n+1)`` :
- (**R** type 6)
- - (1,1) : ``p(k) = (k-1)/(n-1)``: p(k) = mode[F(x[k])].
- (**R** type 7, **R** default)
- - (1/3,1/3): ``p(k) = (k-1/3)/(n+1/3)``: Then p(k) ~ median[F(x[k])].
- The resulting quantile estimates are approximately median-unbiased
- regardless of the distribution of x.
- (**R** type 8)
- - (3/8,3/8): ``p(k) = (k-3/8)/(n+1/4)``: Blom.
- The resulting quantile estimates are approximately unbiased
- if x is normally distributed
- (**R** type 9)
- - (.4,.4) : approximately quantile unbiased (Cunnane)
- - (.35,.35): APL, used with PWM
- Parameters
- ----------
- a : array_like
- Input data, as a sequence or array of dimension at most 2.
- prob : array_like, optional
- List of quantiles to compute.
- alphap : float, optional
- Plotting positions parameter, default is 0.4.
- betap : float, optional
- Plotting positions parameter, default is 0.4.
- axis : int, optional
- Axis along which to perform the trimming.
- If None (default), the input array is first flattened.
- limit : tuple, optional
- Tuple of (lower, upper) values.
- Values of `a` outside this open interval are ignored.
- Returns
- -------
- mquantiles : MaskedArray
- An array containing the calculated quantiles.
- Notes
- -----
- This formulation is very similar to **R** except for the calculation
- of ``m`` from ``alphap`` and ``betap``; in **R**, ``m`` is defined
- separately for each type.
- References
- ----------
- .. [1] *R* statistical software: https://www.r-project.org/
- .. [2] *R* ``quantile`` function:
- http://stat.ethz.ch/R-manual/R-devel/library/stats/html/quantile.html
- Examples
- --------
- >>> import numpy as np
- >>> from scipy.stats.mstats import mquantiles
- >>> a = np.array([6., 47., 49., 15., 42., 41., 7., 39., 43., 40., 36.])
- >>> mquantiles(a)
- array([ 19.2, 40. , 42.8])
- Using a 2D array, specifying axis and limit.
- >>> data = np.array([[ 6., 7., 1.],
- ... [ 47., 15., 2.],
- ... [ 49., 36., 3.],
- ... [ 15., 39., 4.],
- ... [ 42., 40., -999.],
- ... [ 41., 41., -999.],
- ... [ 7., -999., -999.],
- ... [ 39., -999., -999.],
- ... [ 43., -999., -999.],
- ... [ 40., -999., -999.],
- ... [ 36., -999., -999.]])
- >>> print(mquantiles(data, axis=0, limit=(0, 50)))
- [[19.2 14.6 1.45]
- [40. 37.5 2.5 ]
- [42.8 40.05 3.55]]
- >>> data[:, 2] = -999.
- >>> print(mquantiles(data, axis=0, limit=(0, 50)))
- [[19.200000000000003 14.6 --]
- [40.0 37.5 --]
- [42.800000000000004 40.05 --]]
- """
- def _quantiles1D(data,m,p):
- x = np.sort(data.compressed())
- n = len(x)
- if n == 0:
- return ma.array(np.empty(len(p), dtype=float), mask=True)
- elif n == 1:
- return ma.array(np.resize(x, p.shape), mask=nomask)
- aleph = (n*p + m)
- k = np.floor(aleph.clip(1, n-1)).astype(int)
- gamma = (aleph-k).clip(0,1)
- return (1.-gamma)*x[(k-1).tolist()] + gamma*x[k.tolist()]
- data = ma.array(a, copy=False)
- if data.ndim > 2:
- raise TypeError("Array should be 2D at most !")
- if limit:
- condition = (limit[0] < data) & (data < limit[1])
- data[~condition.filled(True)] = masked
- p = np.array(prob, copy=False, ndmin=1)
- m = alphap + p*(1.-alphap-betap)
- # Computes quantiles along axis (or globally)
- if (axis is None):
- return _quantiles1D(data, m, p)
- return ma.apply_along_axis(_quantiles1D, axis, data, m, p)
- def scoreatpercentile(data, per, limit=(), alphap=.4, betap=.4):
- """Calculate the score at the given 'per' percentile of the
- sequence a. For example, the score at per=50 is the median.
- This function is a shortcut to mquantile
- """
- if (per < 0) or (per > 100.):
- raise ValueError("The percentile should be between 0. and 100. !"
- " (got %s)" % per)
- return mquantiles(data, prob=[per/100.], alphap=alphap, betap=betap,
- limit=limit, axis=0).squeeze()
- def plotting_positions(data, alpha=0.4, beta=0.4):
- """
- Returns plotting positions (or empirical percentile points) for the data.
- Plotting positions are defined as ``(i-alpha)/(n+1-alpha-beta)``, where:
- - i is the rank order statistic
- - n is the number of unmasked values along the given axis
- - `alpha` and `beta` are two parameters.
- Typical values for `alpha` and `beta` are:
- - (0,1) : ``p(k) = k/n``, linear interpolation of cdf (R, type 4)
- - (.5,.5) : ``p(k) = (k-1/2.)/n``, piecewise linear function
- (R, type 5)
- - (0,0) : ``p(k) = k/(n+1)``, Weibull (R type 6)
- - (1,1) : ``p(k) = (k-1)/(n-1)``, in this case,
- ``p(k) = mode[F(x[k])]``. That's R default (R type 7)
- - (1/3,1/3): ``p(k) = (k-1/3)/(n+1/3)``, then
- ``p(k) ~ median[F(x[k])]``.
- The resulting quantile estimates are approximately median-unbiased
- regardless of the distribution of x. (R type 8)
- - (3/8,3/8): ``p(k) = (k-3/8)/(n+1/4)``, Blom.
- The resulting quantile estimates are approximately unbiased
- if x is normally distributed (R type 9)
- - (.4,.4) : approximately quantile unbiased (Cunnane)
- - (.35,.35): APL, used with PWM
- - (.3175, .3175): used in scipy.stats.probplot
- Parameters
- ----------
- data : array_like
- Input data, as a sequence or array of dimension at most 2.
- alpha : float, optional
- Plotting positions parameter. Default is 0.4.
- beta : float, optional
- Plotting positions parameter. Default is 0.4.
- Returns
- -------
- positions : MaskedArray
- The calculated plotting positions.
- """
- data = ma.array(data, copy=False).reshape(1,-1)
- n = data.count()
- plpos = np.empty(data.size, dtype=float)
- plpos[n:] = 0
- plpos[data.argsort(axis=None)[:n]] = ((np.arange(1, n+1) - alpha) /
- (n + 1.0 - alpha - beta))
- return ma.array(plpos, mask=data._mask)
- meppf = plotting_positions
- def obrientransform(*args):
- """
- Computes a transform on input data (any number of columns). Used to
- test for homogeneity of variance prior to running one-way stats. Each
- array in ``*args`` is one level of a factor. If an `f_oneway()` run on
- the transformed data is found significant, the variances are unequal. From
- Maxwell and Delaney, p.112.
- Returns: transformed data for use in an ANOVA
- """
- data = argstoarray(*args).T
- v = data.var(axis=0,ddof=1)
- m = data.mean(0)
- n = data.count(0).astype(float)
- # result = ((n-1.5)*n*(data-m)**2 - 0.5*v*(n-1)) / ((n-1.)*(n-2.))
- data -= m
- data **= 2
- data *= (n-1.5)*n
- data -= 0.5*v*(n-1)
- data /= (n-1.)*(n-2.)
- if not ma.allclose(v,data.mean(0)):
- raise ValueError("Lack of convergence in obrientransform.")
- return data
- def sem(a, axis=0, ddof=1):
- """
- Calculates the standard error of the mean of the input array.
- Also sometimes called standard error of measurement.
- Parameters
- ----------
- a : array_like
- An array containing the values for which the standard error is
- returned.
- axis : int or None, optional
- If axis is None, ravel `a` first. If axis is an integer, this will be
- the axis over which to operate. Defaults to 0.
- ddof : int, optional
- Delta degrees-of-freedom. How many degrees of freedom to adjust
- for bias in limited samples relative to the population estimate
- of variance. Defaults to 1.
- Returns
- -------
- s : ndarray or float
- The standard error of the mean in the sample(s), along the input axis.
- Notes
- -----
- The default value for `ddof` changed in scipy 0.15.0 to be consistent with
- `scipy.stats.sem` as well as with the most common definition used (like in
- the R documentation).
- Examples
- --------
- Find standard error along the first axis:
- >>> import numpy as np
- >>> from scipy import stats
- >>> a = np.arange(20).reshape(5,4)
- >>> print(stats.mstats.sem(a))
- [2.8284271247461903 2.8284271247461903 2.8284271247461903
- 2.8284271247461903]
- Find standard error across the whole array, using n degrees of freedom:
- >>> print(stats.mstats.sem(a, axis=None, ddof=0))
- 1.2893796958227628
- """
- a, axis = _chk_asarray(a, axis)
- n = a.count(axis=axis)
- s = a.std(axis=axis, ddof=ddof) / ma.sqrt(n)
- return s
- F_onewayResult = namedtuple('F_onewayResult', ('statistic', 'pvalue'))
- def f_oneway(*args):
- """
- Performs a 1-way ANOVA, returning an F-value and probability given
- any number of groups. From Heiman, pp.394-7.
- Usage: ``f_oneway(*args)``, where ``*args`` is 2 or more arrays,
- one per treatment group.
- Returns
- -------
- statistic : float
- The computed F-value of the test.
- pvalue : float
- The associated p-value from the F-distribution.
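- Examples
- --------
- A minimal usage sketch with three illustrative treatment groups
- (arbitrary values):
- >>> from scipy.stats.mstats import f_oneway
- >>> g1 = [6.4, 6.8, 7.2, 8.3, 8.4]
- >>> g2 = [7.1, 7.3, 8.2, 8.9]
- >>> g3 = [5.9, 6.2, 6.4, 7.4]
- >>> statistic, pvalue = f_oneway(g1, g2, g3)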
- """
- # Construct a single array of arguments: each row is a group
- data = argstoarray(*args)
- ngroups = len(data)
- ntot = data.count()
- sstot = (data**2).sum() - (data.sum())**2/float(ntot)
- ssbg = (data.count(-1) * (data.mean(-1)-data.mean())**2).sum()
- sswg = sstot-ssbg
- dfbg = ngroups-1
- dfwg = ntot - ngroups
- msb = ssbg/float(dfbg)
- msw = sswg/float(dfwg)
- f = msb/msw
- prob = special.fdtrc(dfbg, dfwg, f) # equivalent to stats.f.sf
- return F_onewayResult(f, prob)
FriedmanchisquareResult = namedtuple('FriedmanchisquareResult',
                                     ('statistic', 'pvalue'))


def friedmanchisquare(*args):
    """Friedman Chi-Square is a non-parametric, one-way within-subjects ANOVA.

    This function calculates the Friedman Chi-square test for repeated
    measures and returns the result, along with the associated probability
    value.

    Each input is considered a given group. Ideally, the number of treatments
    in each group should be equal. If this is not the case, only the first
    n treatments are taken into account, where n is the number of treatments
    of the smallest group.
    If a group has some missing values, the corresponding treatments are
    masked in the other groups; that is, masked values in one group are
    propagated to the others.
    The test statistic is corrected for ties.

    Returns
    -------
    statistic : float
        The test statistic.
    pvalue : float
        The associated p-value.

    """
    data = argstoarray(*args).astype(float)
    k = len(data)
    if k < 3:
        raise ValueError("Less than 3 groups (%i): " % k +
                         "the Friedman test is NOT appropriate.")

    ranked = ma.masked_values(rankdata(data, axis=0), 0)
    if ranked._mask is not nomask:
        ranked = ma.mask_cols(ranked)
        ranked = ranked.compressed().reshape(k, -1).view(ndarray)
    else:
        ranked = ranked._data

    (k, n) = ranked.shape
    # Correct the test statistic for ties
    repeats = [find_repeats(row) for row in ranked.T]
    ties = np.array([y for x, y in repeats if x.size > 0])
    tie_correction = 1 - (ties**3 - ties).sum() / float(n * (k**3 - k))

    ssbg = np.sum((ranked.sum(-1) - n*(k + 1)/2.)**2)
    chisq = ssbg * 12./(n*k*(k + 1)) * 1./tie_correction
    return FriedmanchisquareResult(chisq,
                                   distributions.chi2.sf(chisq, k-1))
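

# Usage sketch: each argument is one treatment measured on the same subjects,
# so the arrays must line up subject-by-subject. Illustrative data for five
# subjects measured under three conditions:
#
# >>> from scipy.stats import mstats
# >>> before = [72, 96, 88, 92, 74]
# >>> week1 = [76, 95, 91, 98, 69]
# >>> week4 = [82, 98, 95, 99, 76]
# >>> res = mstats.friedmanchisquare(before, week1, week4)
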
BrunnerMunzelResult = namedtuple('BrunnerMunzelResult', ('statistic', 'pvalue'))


def brunnermunzel(x, y, alternative="two-sided", distribution="t"):
    """
    Computes the Brunner-Munzel test on samples x and y.

    Missing values in `x` and/or `y` are discarded.

    Parameters
    ----------
    x, y : array_like
        Arrays of samples; should be one-dimensional.
    alternative : {'two-sided', 'less', 'greater'}, optional
        Whether to get the p-value for the one-sided hypothesis ('less'
        or 'greater') or for the two-sided hypothesis ('two-sided').
        Default is 'two-sided'.
    distribution : {'t', 'normal'}, optional
        Whether to get the p-value from the t-distribution or from the
        standard normal distribution. Default is 't'.

    Returns
    -------
    statistic : float
        The Brunner-Munzel W statistic.
    pvalue : float
        The p-value assuming a t or a standard normal distribution.
        One-sided or two-sided, depending on the choice of `alternative`
        and `distribution`.

    See Also
    --------
    mannwhitneyu : Mann-Whitney rank test on two samples.

    Notes
    -----
    For more details on `brunnermunzel`, see `scipy.stats.brunnermunzel`.

    """
    x = ma.asarray(x).compressed().view(ndarray)
    y = ma.asarray(y).compressed().view(ndarray)
    nx = len(x)
    ny = len(y)
    if nx == 0 or ny == 0:
        return BrunnerMunzelResult(np.nan, np.nan)

    rankc = rankdata(np.concatenate((x, y)))
    rankcx = rankc[0:nx]
    rankcy = rankc[nx:nx+ny]
    rankcx_mean = np.mean(rankcx)
    rankcy_mean = np.mean(rankcy)
    rankx = rankdata(x)
    ranky = rankdata(y)
    rankx_mean = np.mean(rankx)
    ranky_mean = np.mean(ranky)

    Sx = np.sum(np.power(rankcx - rankx - rankcx_mean + rankx_mean, 2.0))
    Sx /= nx - 1
    Sy = np.sum(np.power(rankcy - ranky - rankcy_mean + ranky_mean, 2.0))
    Sy /= ny - 1

    wbfn = nx * ny * (rankcy_mean - rankcx_mean)
    wbfn /= (nx + ny) * np.sqrt(nx * Sx + ny * Sy)

    if distribution == "t":
        df_numer = np.power(nx * Sx + ny * Sy, 2.0)
        df_denom = np.power(nx * Sx, 2.0) / (nx - 1)
        df_denom += np.power(ny * Sy, 2.0) / (ny - 1)
        df = df_numer / df_denom
        p = distributions.t.cdf(wbfn, df)
    elif distribution == "normal":
        p = distributions.norm.cdf(wbfn)
    else:
        raise ValueError(
            "distribution should be 't' or 'normal'")

    if alternative == "greater":
        pass
    elif alternative == "less":
        p = 1 - p
    elif alternative == "two-sided":
        p = 2 * np.min([p, 1 - p])
    else:
        raise ValueError(
            "alternative should be 'less', 'greater' or 'two-sided'")

    return BrunnerMunzelResult(wbfn, p)
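

# Usage sketch with illustrative data; masked or invalid entries are dropped
# by `compressed()` above, so masked arrays can be passed directly.
#
# >>> from scipy.stats import mstats
# >>> x1 = [1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 4, 1, 1]
# >>> x2 = [3, 3, 4, 3, 1, 2, 3, 1, 1, 5, 4]
# >>> res = mstats.brunnermunzel(x1, x2)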