view mytools/cdf.r @ 0:39217fa39ff2

Uploaded
author xuebing
date Tue, 13 Mar 2012 23:34:52 -0400
parents
children
line wrap: on
line source

# bin and average
binaverage = function(x,nbin,rankORvalue){
# use x[,1] to bin x[,2]
	binned = list()
	if (rankORvalue == 'value'){
		mi = min(x[,1])
		ma = max(x[,1])
		bins = seq(mi,ma,length.out=nbin+1)
		bins[1] = bins[1] - abs(mi)/100
		bins[nbin+1] = bins[nbin+1] + abs(ma)/100
		for (i in 1:nbin){
			binned[[i]] = x[x[,1] >= bins[i] & x[,1] < bins[i+1],2]
		}
		bins[1] = bins[1] + abs(mi)/100
		bins[nbin+1] = bins[nbin+1] - abs(ma)/100
	} else {
		x = x[order(x[,1]),]
		step = round(nrow(x)/nbin)
		bins = x[1,1]
		for (i in 1:(nbin-1)){
			binned[[i]] = x[((i-1)*step+1):(i*step),2]
			bins = c(bins,x[i*step+1,1])
		}
		binned[[nbin]] = x[((nbin-1)*step+1):nrow(x),2]
		bins[nbin+1] = x[nrow(x),1]
	}
# bin lavel
	labels = character(0)
	for (i in 1:nbin){
		labels = c(labels,paste(format(bins[i],digits=2,nsmall=2),format(bins[i+1],digits=2,nsmall=2),sep='~'))	
	}
    list(binned=binned,bins=bins,labels=labels)
}
#res = binaverage(x,3,'rank')

# CDF plot and KS.test
mycdf = function(list,title,labels,legendposition,xlabel,legend_title){
    L = length(list)
    
    # barplot for mean and std
    avg = numeric(L)
    err = numeric(L)
    for (i in 1:L){
        avg[i] = mean(list[[i]])
        err[i] = sd(list[[i]])
    }
    #print(list[[1]])
    #print(list[[2]])
    #print(avg)
    #print(err)
	par(cex=1.5,mar=c(8,6,6,4))
    xticks <- barplot(avg,names.arg=labels,las=2,ylab=xlabel,main='mean and standard deviation',xlab=legend_title,ylim=c(0,max(avg+err)))
    arrows(xticks,avg+err, xticks, avg-err, angle=90, code=3, length=0.0)
    
	if (L>1){
    # ks test
	cat('\nKS test:\n')
	cat('sample1\tsample2\tp-value\n')
	cat('-------------------------------------------------\n')
    for (i in 1:(L-1)){
        for (j in (i+1):L){
        cat(labels[i],'\t',labels[j],'\t')
        ks = ks.test(list[[i]],list[[j]])
        pv = max(2.2e-16,ks$p.value)
	    pv = format(pv,digits=3,nsmall=2)
		cat(pv,'\n')
        }
    }
	cat('-------------------------------------------------\n')
	}
	if (L == 2){
		title = paste(title,'\np=',pv,sep='')
	}
    # cdf plot
    listx = list()
    listy = list()
    mi = 1e10
    ma = 1e-10
    for (i in 1:L){
        mi = min(mi,list[[i]])
        ma = max(ma,list[[i]])
    }
    for (i in 1:L){
        listx[[i]] = c(mi,listx[i],ma)
        listy[[i]] = c(0,listy[i],1)
    }
    for (i in 1:L){
        mi = min(mi,list[[i]])
        ma = max(ma,list[[i]])
        listx[[i]] = sort(list[[i]])
        listy[[i]] = c(1:length(list[[i]]))/length(list[[i]])          
    }
#par(xlog=(xlog=='xlog'))
    plot(listx[[1]],listy[[1]],type='l',lty=1,lwd=2,col=2,main=title,xlab=xlabel,ylab='cumulative frequency')
    for (i in 2:L){
        lines(listx[[i]],listy[[i]],type='l',lty=i,lwd=2,col=i+1)
    }
    # legend
    for (i in 1:L){
        labels[i] = paste(labels[i],', n=',length(list[[i]]),sep='')
    }
    legend(legendposition,legend=labels,col=2:(L+1), lty=1:L,lwd=2, bty='n',title=legend_title)
}