view abims_anova.r @ 11:102049093b7d draft default tip

planemo upload for repository https://github.com/workflow4metabolomics/anova commit 4922313a0e9569326b7723c41babb89f998dbfd9
author lecorguille
date Tue, 13 Mar 2018 09:47:21 -0400
parents b147b17759a6
children
line wrap: on
line source

#!/usr/local/public/bin/Rscript
# version="1.1"

# date: 06-06-2012
# update: 18-02-2014
# **Authors** Gildas Le Corguille  ABiMS - UPMC/CNRS - Station Biologique de Roscoff - gildas.lecorguille|at|sb-roscoff.fr

# abims_anova.r version 20140218

library(batch)


# function avova
anova = function (file, sampleinfo, varinfo, mode="column", condition=1, interaction=F, method="BH", threshold=0.01, selection_method="intersection", sep=";", dec=".", outputdatapvalue="anova.data.output", outputdatasignif="anova.datasignif.output") {


	if (sep=="tabulation") sep="\t"
    	if (sep=="semicolon") sep=";"
    	if (sep=="comma") sep=","

	anova_formula_operator = "+"
	if (interaction) anova_formula_operator = "*"

  	# -- import --
	data=read.table(file, header = TRUE, row.names=1, sep = sep, quote="\"", dec = dec, fill = TRUE, comment.char="",na.strings = "NA", check.names=FALSE)

  	if (mode == "row") {data=t(data)} else {data=as.matrix(data)}

	sampleinfoTab=read.table(sampleinfo, header = TRUE, row.names=1, sep = sep, quote="\"")
	rownames(sampleinfoTab) = make.names(rownames(sampleinfoTab))

	varinfoTab=read.table(varinfo, header = TRUE, sep = sep, quote="\"")
	if(sum(colnames(data)!=varinfoTab[,1])!=0){ # if ID not exactly identical
		if(sum(colnames(data)[order(colnames(data))]!=varinfoTab[order(varinfoTab[,1]),1])==0){
			# reorder data matrix to match variable metadata order
			data = data[,match(varinfoTab[,1],colnames(data))]
		}else{
			stop(paste0("\nVariables' ID do not match between your data matrix and your variable",
			"metadata file. \nPlease check your data."))
		}
	}

	# -- group --
	match_data_sampleinfoTab = match(rownames(data),rownames(sampleinfoTab))
	if (sum(is.na(match_data_sampleinfoTab)) > 0) {
	  write("ERROR: There is a problem during to match sample names from the data matrix and from the sample info (presence of NA).", stderr())
	  write("You may need to use change the mode (column/row)", stderr())
	  write("10 first sample names in the data matrix:", stderr())
	  write(head(colnames(data)), stderr())
	  write("10 first sample names in the sample info:", stderr())
	  write(head(rownames(sampleinfoTab)), stderr())
	  quit("no",status=10)
	}


	# -- anova --

  	# formula
	grps=list()
	anova_formula_s = "data ~ "
	cat("\ncontrasts:\n")
	for (i in 1:length(condition)) {
	  grps[[i]] = factor(sampleinfoTab[,condition[i]][match_data_sampleinfoTab])
	  anova_formula_s = paste(anova_formula_s, "grps[[",i,"]]",anova_formula_operator, sep="")
	  cat(condition[i],"\t",levels(grps[[i]]),"\n")
	# write("Current groups: ", stderr())
	# write(grp[[i]], stderr())
	}
	anova_formula_s = substr(anova_formula_s, 1, nchar(anova_formula_s)-1)
	anova_formula = as.formula(anova_formula_s)



	# anova
	manovaObjectList = manova(anova_formula)
	manovaList = summary.aov(manovaObjectList)

  	# condition renaming
	manovaRownames = gsub(" ","",rownames(manovaList[[1]]))
	manovaNbrPvalue = length(manovaRownames)-1
	manovaRownames = manovaRownames[-(manovaNbrPvalue+1)]

	for (i in 1:length(condition)) {
	  manovaRownames = sub(paste("grps\\[\\[",i,"\\]\\]",sep=""),condition[i],manovaRownames)
  	  anova_formula_s = sub(paste("grps\\[\\[",i,"\\]\\]",sep=""),condition[i],anova_formula_s)
	}

  	# log
  	cat("\nanova_formula",anova_formula_s,"\n")

	# p-value
	aovPValue = sapply(manovaList,function(x){x[-(manovaNbrPvalue+1),5]})
	if(length(condition) == 1) aovPValue = t(aovPValue)
	rownames(aovPValue) = paste("pvalue_",manovaRownames,sep="")

	# p-value adjusted
	if(length(condition) == 1) {
		aovAdjPValue = t(p.adjust(aovPValue,method=method))
	} else {
		aovAdjPValue = t(apply(aovPValue,1,p.adjust, method=method))
	}
	rownames(aovAdjPValue) = paste("pval.",method,".",manovaRownames,sep="")

	# selection
	colSumThreshold = colSums(aovAdjPValue <= threshold)
	if (selection_method == "intersection") {
		datafiltered = data[,colSumThreshold == nrow(aovAdjPValue ), drop=FALSE]
	} else {
		datafiltered = data[,colSumThreshold != 0, drop=FALSE]
	}
	selected.var = rep("no",ncol(data))
	selected.var[colnames(data)%in%colnames(datafiltered)] = "yes"

	#data=rbind(data, aovPValue, aovAdjPValue)
	varinfoTab=cbind(varinfoTab, round(t(aovAdjPValue),10), selected.var)

	# group means
	for (i in 1:length(condition)) {
		for(j in levels(grps[[i]])){
			subgp = rownames(sampleinfoTab[which(sampleinfoTab[,condition[i]]==j),])
			modmean = colMeans(data[which(rownames(data)%in%subgp),],na.rm=TRUE)
			varinfoTab=cbind(varinfoTab, modmean)
			colnames(varinfoTab)[ncol(varinfoTab)] = paste0("Mean_",condition[i],".",j)
		}
	}
	colnames(varinfoTab) = make.unique(colnames(varinfoTab))

	# pdf for significant variables
	pdf(outputdatasignif)
	### Venn diagram
	if(nrow(aovAdjPValue)>5){
		pie(100,labels=NA,main=paste0("Venn diagram only available for maximum 5 terms\n",
		    "(your analysis concerns ",nrow(aovAdjPValue)," terms)\n\n",
			"Number of significant variables relatively to\nyour chosen threshold and ",
			"selection method: ",ncol(datafiltered)),cex.main=0.8)
	}else{
		vennlist = list(NULL)
		names(vennlist) = rownames(aovAdjPValue)[1]
		if(length(colnames(aovAdjPValue))==0){colnames(aovAdjPValue)=varinfoTab[,1]}
		for(i in 1:nrow(aovAdjPValue)){
			vennlist[[rownames(aovAdjPValue)[i]]]=colnames(aovAdjPValue[i,which(aovAdjPValue[i,]<=threshold),drop=FALSE])
		}
		if(length(vennlist)==0){ pie(100,labels=NA,main="No significant ions was found.")
		}else{ library(venn) ; venn(vennlist, zcolor="style", cexil=2, cexsn=1.5) }
	}
	### Boxplot
	par(mfrow=c(2,2),mai=rep(0.5,4))
	data <- as.data.frame(data)
	for(i in 1:nrow(aovAdjPValue)){
		factmain = gsub(paste0("pval.",method,"."),"",rownames(aovAdjPValue)[i])
		factsignif = aovAdjPValue[i,which(aovAdjPValue[i,]<=threshold),drop=FALSE]
		if((ncol(factsignif)!=0)&(factmain%in%colnames(sampleinfoTab))){
			for(j in 1:ncol(factsignif)){
				varsignif = gsub(" Response ","",colnames(factsignif)[j])
				boxplot(as.formula(paste0("data$",varsignif," ~ sampleinfoTab$",factmain)),
				        main=paste0(factmain,"\n",varsignif), col="grey", mai=7)
			}
		}
	}
	dev.off()

	# summary for significant variables
	cat("\n\n- - - - - - - number of significant variables - - - - - - -\n\n")
	for(i in 1:nrow(aovAdjPValue)){
		cat(rownames(aovAdjPValue)[i],"-",
		    sum(aovAdjPValue[i,]<=threshold),"significant variable(s)\n")
	}
	cat("\nIf some of your factors are missing here, this may be due to\neffects",
	    "not estimable; your design may not be balanced enough.\n")

	# -- output / return --
	write.table(varinfoTab, outputdatapvalue, sep=sep, quote=F, row.names=FALSE)

	# log
	cat("\nthreshold:",threshold,"\n")
	cat("result:",ncol(datafiltered),"/",ncol(data),"\n\n")

	quit("no",status=0)
}

# log
cat("ANOVA\n\n")
cat("Arguments\n")
args <- commandArgs(trailingOnly = TRUE)
print(args)

listArguments = parseCommandArgs(evaluate=FALSE)
do.call(anova, listArguments)