00001
00002
00003 import math,random
00004
00005
00006
00007
00008
00009 from ScrumPy.Util import Sci
00010 from ScrumPy.ThirdParty import Special
00011
00012 Pi2 = 2*math.pi
00013 RootPi2 = math.sqrt(Pi2)
00014 Root2 = math.sqrt(2.0)
00015 NaN=float("NaN")
00016
00017 def NormPdf(x,mu=0,sigma=1):
00018 frac = (x-mu)/sigma
00019 Exp = math.exp(-0.5*frac*frac)
00020 recip = 1/(sigma*RootPi2)
00021 return recip * Exp
00022
00023
00024 def NormCdf(x,mu=0,sigma=1):
00025
00026 erfarg = (x-mu)/(sigma*Root2)
00027 return (1+Special.Erf(erfarg))/2
00028
00029
00030
00031 def Binv(p,q):
00032 """inverse beta function"""
00033
00034 lgamma = Special.Lgamma
00035 return math.exp(lgamma(p+q)-lgamma(p)-lgamma(q))
00036
00037
00038
00039 def t_pdf(x,nu):
00040 """ student's t pdf for nu d.f. """
00041
00042
00043
00044 return Binv(0.5*nu,0.5)/math.sqrt(nu)*(1+(x*x)/nu)**(-0.5*(nu+1.0))
00045
00046
00047
00048 def t_cdf(x,nu):
00049 """ student's t cdf for nu d.f. """
00050
00051
00052 ibeta = Special.betai
00053
00054 if x<0:
00055 sgn5 = -0.5
00056 else:
00057 sgn5 = 0.5
00058
00059 return 0.5+sgn5*(1-ibeta(0.5*nu,0.5,nu/(nu+x*x)))
00060
00061
00062 def inv_t_cdf(c,nu, tol=1e-6, lim=30):
00063
00064
00065
00066
00067 def itcdf(lo,hi,n=0):
00068
00069 mid = (lo+hi)/2
00070 cur = t_cdf(mid,nu)
00071 err = 1 - lo/mid
00072
00073 if err <= tol:
00074 return mid
00075
00076 if n>lim:
00077 print "!! itcdf: only achieved tol=", err, "requested", tol, "!!"
00078 return mid
00079
00080 if cur > c:
00081 return itcdf(lo,mid,n+1)
00082
00083 return itcdf(mid,hi,n+1)
00084
00085 inihi = 5.
00086 while t_cdf(inihi,nu)<c:
00087 inihi *=2
00088
00089
00090 return itcdf(0.0,inihi)
00091
00092
00093 def Conflims(conf, s, df):
00094
00095
00096
00097 c = 1 - (1-conf)/2
00098 tval = inv_t_cdf(c,df-1)
00099 return tval *s/math.sqrt(df)
00100
00101
00102
00103
00104
00105
00106
00107
00108
00109
00110 def NormList(mu,sigma,size):
00111
00112 return map(lambda x: random.normalvariate(mu,sigma), [0]*size)
00113
00114
00115 def SumProds(x,y):
00116 """ pre: len(x) == len(y), items are numeric
00117 post: sum(x[i] * y[i]) : i = 0..len(x)-1 """
00118 return sum(map(lambda a,b: a*b, x,y))
00119
00120 def SqSum(x):
00121 """ pre: sum(x)
00122 post: sum(x)**2 """
00123
00124 s = sum(x)
00125 return s*s
00126
00127 def SumSq(x):
00128 return sum(map(lambda a:a*a,x))
00129
00130 def SumSqDiff(vals, const):
00131
00132 return(SumSq(map(lambda x: x-const, vals)))
00133
00134
00135 def Adj_p(plist):
00136 rv = plist[:]
00137 rv.sort()
00138 mul = len(rv)
00139 for i in range(len(plist)):
00140 rv[i] *= mul
00141 mul -= 1
00142 return rv
00143
00144 def sortidx(x):
00145 """
00146 faster equivalent of stats.lshellsort(inlist)
00147 """
00148 n = len(x)
00149 xs,x = x[:],x[:]
00150 xs.sort()
00151 rv = [0] *n
00152 for i in range(n):
00153 rv[i] = x.index(xs[i])
00154 x[rv[i]] = None
00155 return xs,rv
00156
00157
00158 def Rank(inlist):
00159 """
00160 Ranks the data in inlist, dealing with ties appropritely. Assumes
00161 a 1D inlist. Adapted from Gary Perlman's |Stat ranksort.
00162
00163 Usage: lrankdata(inlist)
00164 Returns: a list of length equal to inlist, containing rank scores
00165 """
00166 n = len(inlist)
00167 svec, ivec = sortidx(inlist)
00168 sumranks = 0
00169 dupcount = 0
00170 newlist = [0]*n
00171 for i in range(n):
00172 sumranks += i
00173 dupcount += 1
00174 if i==n-1 or svec[i] <> svec[i+1]:
00175 averank = sumranks / float(dupcount )+ 1
00176 for j in range(i-dupcount+1,i+1):
00177 newlist[ivec[j]] = averank
00178 sumranks = 0
00179 dupcount = 0
00180 return newlist
00181
00182
00183
00184
00185
00186
00187 def Mean(a):
00188 return sum(a)/len(a)
00189
00190 def Var(a,abar=None):
00191 if abar ==None:
00192 abar = Mean(a)
00193
00194 return sum( map(lambda x:(x-abar)**2,a))/(len(a)-1)
00195
00196
00197 def StdDev(a, abar = None):
00198
00199 if abar == None:
00200 abar = Mean(a)
00201
00202 return math.sqrt(Var(a,abar))
00203
00204
00205 def StdErr(a):
00206
00207 return StdDev(a)/math.sqrt(len(a))
00208
00209
00210 def Skewness(a,mu=None,sigma=None):
00211
00212 lena = len(a)
00213 if mu==None:
00214 mu = Mean(a)
00215 if sigma==None:
00216 sigma = StdDev(a,mu)
00217
00218 return sum( map(lambda x: ((x - mu)/sigma)**3,a))/lena
00219
00220
00221 def Kurtosis(a):
00222 lena = len(a)
00223 mean = Mean(a)
00224 sigma = StdDev(a)
00225
00226 return (sum( map(lambda x: ((x - mean)/sigma)**4,a))/lena)-3
00227
00228
00229
00230 def Lower_n_ile(vals, n,UseCopy=True):
00231 """ pre: len(self[Name]) > 0, 0 < n <= 1
00232 post: self.Lower_n_ile(self, Name, n) == value of self[Name] which is the limit of the
00233 lower nth portion of self[Name] """
00234
00235 if UseCopy:
00236 vals = vals[:]
00237 vals.sort()
00238 index = int(len(vals)*n)
00239 return vals[index]
00240
00241
00242 def Upper_n_ile(vals, n,UseCopy=True):
00243 """ pre: len(self[Name]) > 0, 0 < n <= 1
00244 post: self.Lower_n_ile(self, Name, n) == value of self[Name] which is the limit of the
00245 upper nth portion of self[Name] """
00246
00247 return Lower_n_ile(vals, 1.0-n,UseCopy)
00248
00249 def Median(vals, UseCopy=True):
00250 """ pre: len(self[Name]) > 0
00251 post: self.Median(Name) == Median value of self[Name] """
00252
00253 return Lower_n_ile(vals, 0.5,UseCopy)
00254
00255
00256
00257 def FTest(vals1, vals2):
00258 """ F and p(F) for unequal variance
00259 """
00260
00261 var1 = Var(vals1)
00262 var2 = Var(vals2)
00263 df1 = len(vals1)-1
00264 df2 = len(vals2)-1
00265
00266 f = var1/var2
00267 if f<1:
00268 f = 1/f
00269 df1, df2 = df2, df1
00270
00271 p = 2* Special.betai(0.5*df2, 0.5*df1, df2/(f*df1+df2))
00272
00273 if p>1:
00274 p = 2-p
00275
00276 return f, p
00277
00278
00279 def TTest(vals1, vals2):
00280 """ Student's (two tailed) t and p(t) fo different means
00281 """
00282
00283
00284
00285
00286 v1bar = Mean(vals1)
00287 v2bar = Mean(vals2)
00288 v1var = Var(vals1)
00289 v2var = Var(vals2)
00290 n1 = len(vals1)
00291 n2 = len(vals2)
00292
00293 t = (v1bar-v2bar)/(
00294 math.sqrt(
00295 v1var/n1 + v2var/n2
00296 )
00297 )
00298
00299 df = (v1var/n1 + v2var/n2)**2/(
00300 ((v1var/n1)**2/(n1-1)) +
00301 ((v2var/n2)**2/(n2-1))
00302 )
00303
00304
00305 return t, Prob_of_t(t, df)
00306
00307
00308 def Prob_of_t(t, df):
00309
00310 df = float(df)
00311
00312 return Special.betai(0.5*df, 0.5, df/(df+t*t))
00313
00314
00315
00316
00317
00318 def CoVar(a,b,abar=None, bbar=None):
00319
00320 if abar == None: abar = Mean(a)
00321 if bbar == None: bbar = Mean(b)
00322
00323 return sum(map(lambda x,y:(x-abar)*(y-bbar),a,b))/(len(a)-1)
00324
00325
00326 def Pearson_r(x,y):
00327 """
00328 pre: len(x) == len(y), items are numeric type
00329 post: Pearsons r
00330 """
00331
00332 xmean = Mean(x)
00333 ymean = Mean(y)
00334 xsubmean = map(lambda a: a-xmean,x)
00335 ysubmean = map(lambda a: a-ymean, y)
00336 sq = lambda n: n*n
00337
00338 r_num = SumProds(xsubmean, ysubmean)
00339 r_den = math.sqrt(sum(map(sq, xsubmean))) * math.sqrt(sum(map(sq, ysubmean)))
00340
00341 if r_den == 0:
00342 return NaN
00343
00344 return r_num / r_den
00345
00346
00347
00348 def Spearman_r(x, y, rankxy=None,):
00349
00350 """
00351 Calculates a Spearman rank-order correlation coefficient. Taken
00352 from Heiman's Basic Statistics for the Behav. Sci (1st), p.192.
00353
00354 Usage: lspearmanr(x,y) where x and y are equal-length lists
00355 Returns: Spearman's r, two-tailed p-value
00356 """
00357
00358 n = len(x)
00359 if rankxy == None:
00360 rankx = Rank(x)
00361 ranky = Rank(y)
00362 else:
00363 rankx, ranky = rankxy
00364
00365 diffs = map(lambda a,b: a-b, rankx,ranky)
00366 dsq = SumSq(diffs)
00367 return 1 - 6*dsq / float(n*(n**2-1))
00368
00369 def Prob_of_r(r, n):
00370 df = n-2
00371 t = r * math.sqrt(df / ( (r+1.0)*(1.0-r)))
00372 return Special.betai(0.5*df, 0.5, df/(df+t*t) )
00373
00374
00375
00376
00377
00378 def Prob_of_chisq(chisq,n):
00379 """ probability of getting <chisq for correct model with n deg freedom.
00380 NRiC 6.2.15 """
00381
00382 return Special.Gamma_p(float(n),chisq)
00383
00384
00385
00386
00387 """
00388 Calculates a one-way chi square for list of observed frequencies and returns
00389 the result. If no expected frequencies are given, the total N is assumed to
00390 be equally distributed across all groups.
00391
00392 Usage: lchisquare(Samp, f_exp=None) f_obs = list of observed cell freq.
00393 Returns: chisquare-statistic, associated p-value
00394 """
00395
00396 def ChiSquare(Samp, Ref, NormRef=True):
00397
00398 if NormRef:
00399 coeff = float(sum(Ref))/sum(Samp)
00400 Ref = map(lambda x:x/coeff,Ref)
00401
00402 return sum(map(lambda s,r:((s-r)**2)/r,Samp,Ref))
00403
00404
00405 def LinFit(Xseq, Yseq, Resids=False):
00406 """
00407 pre: Xseq, Yseq equal length sequence of numbers
00408
00409 Calculate best m,c for y = mx+c
00410
00411 post:
00412 Resids=False : return (m,c)
00413 else : return (m,c, [resids])
00414 """
00415
00416 m,c = Sci.polyfit(Xseq,Yseq)
00417 if Resids:
00418 best = map(lambda x:m*x+c, Xseq)
00419 resids = map(lambda x,y:x-y, best, Yseq)
00420 rv = m,c,resids
00421 else:
00422 rv = m,c
00423
00424 return rv
00425
00426
00427
00428
00429
00430
00431
00432
00433
00434
00435
00436
00437
00438
00439
00440
00441
00442
00443 def ColMeansMtx(mtx):
00444
00445 return map(lambda c: Mean(mtx.GetCol(c)),mtx.cnames)
00446
00447
00448 def CoVarMtx(mtx):
00449
00450 lenr = len(mtx.rnames)
00451 coeff = math.sqrt(lenr-1.)
00452 mtx = mtx.Copy()
00453 negmeans = map(lambda x:-x,ColMeansMtx(mtx))
00454
00455 for r in range(lenr):
00456 mtx.AddRow(r, negmeans)
00457 mtx.DivRow(r,k=coeff)
00458
00459 return mtx.Copy(tx=1).Mul(mtx)
00460
00461
00462 def CorMtx(mtx):
00463 raise "use pearson instead"
00464
00465 def Rank_mtx(mtx):
00466 rv = mtx.Copy()
00467 for c in mtx.cnames:
00468 rv.SetCol(c, Rank(mtx.GetCol(c)))
00469 return rv
00470
00471 def Spearman_rmtx(mtx):
00472 rmtx = Rank_mtx(mtx)
00473 rmtx.Transpose()
00474
00475 def spear(m,a,b):
00476 return Spearman_r(m[a],m[b],rankxy=(rmtx[a],rmtx[b]))
00477
00478 mtx.Transpose()
00479 rv = mtx.RowDiffMtx2(spear)
00480 for n in range(len(rv)):
00481 rv[n,n] = 1.0
00482 mtx.Transpose()
00483 return rv
00484
00485 def Pearson_rmtx(mtx):
00486 mtx.Transpose()
00487 rv = mtx.RowDiffMtx(Pearson_r)
00488 mtx.Transpose()
00489 for n in range(len(rv)):
00490 rv[n,n] = 1.0
00491 return rv
00492
00493 def Prob_rmtx(rmtx,n):
00494 """
00495 pre: rmtx is a matrix of r values, n>2 the number of samples used to determine it
00496 post: matrix of corresponding p values
00497 """
00498 rv = rmtx.Copy()
00499 n = len(rmtx)
00500 for r1 in range(n):
00501 for r2 in range(r1+1,n):
00502 rv[r1,r2] = rv[r2,r1] = Prob_of_r(rmtx[r1,r2],n)
00503
00504 for r in range(n):
00505 rv[r,r] = 0.0
00506
00507 return rv
00508
00509
00510
00511
00512
00513
00514
00515