Mercurial > repos > vandelj > giant_gsea_format
changeset 0:3022feec50fe draft
"planemo upload for repository https://github.com/juliechevalier/GIANT/tree/master commit cb276a594444c8f32e9819fefde3a21f121d35df"
author | vandelj |
---|---|
date | Fri, 26 Jun 2020 09:36:46 -0400 |
parents | |
children | d72f1bc5ce9e |
files | galaxy/wrappers/FormatForGSEA.xml src/ExprPlotsScript.R src/General_functions.py src/LIMMA_options.py src/LIMMAscriptV4.R src/VolcanoPlotsScript.R src/getopt.R src/heatMapClustering.R src/utils.R |
diffstat | 9 files changed, 4469 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/galaxy/wrappers/FormatForGSEA.xml Fri Jun 26 09:36:46 2020 -0400 @@ -0,0 +1,228 @@ +<tool name="GIANT-GSEA Formatting" id="giant_gsea_format" version="0.2.0"> + <description>Format input files for GSEA software</description> + <code file="../../src/General_functions.py"/> + <stdio> + <regex match="Execution halted" + source="both" + level="fatal" + description="Execution halted." /> + <exit_code range="10" level="fatal" description="Error during file generation, see log file for more information." /> + </stdio> + <command> <![CDATA[ +#if $mainCondition.selection=="classicGSEA": + + awk 'BEGIN{FS="\t";OFS="";ORS="";nlines=0} ARGIND==1 && FNR>1{nlines++} ARGIND==2 && FNR==1{print "\#1.2\n"nlines"\t"NF-1"\n";print "NAME\tDESCRIPTION"; for(i=2;i<=NF;i++)print"\t"\$i;print "\n"} ARGIND==2 && FNR>1{print \$1"\tna";for(i=2;i<=NF;i++)print "\t"\$i;print "\n"}' $mainCondition.expressionData $mainCondition.expressionData > $outExpression; + + if [ ! -e $outExpression ]; then + printf "[ERROR]Expression file failed" >> $log; + exit 10; + fi + ; + awk -v factor="$mainCondition.factorToInclude" 'BEGIN{FS="\t";OFS="";ORS="";nameCond="";line="";cpt=0;lgt=0} ARGIND==1 && FNR==1{for(iCond=2;iCond<=NF;iCond++){conditionOrder[iCond-1]=\$iCond};cpt=NF-1} ARGIND==2 && FNR==1{for(i=1;i<=NF;i++)if(\$i==factor)factorInd=i} ARGIND==2 && FNR>1{valueFact[\$1]=\$factorInd} END{for(i=1;i<=cpt;i++){ line=line""valueFact[conditionOrder[i]]" "; if(dico[valueFact[conditionOrder[i]]]!=1){lgt++; nameCond=nameCond""valueFact[conditionOrder[i]]" ";dico[valueFact[conditionOrder[i]]]=1}};print cpt" "lgt" 1\n";print "\# "nameCond"\n";print line}' $mainCondition.expressionData $mainCondition.conditionInformation > $outPhenotypes; + + if [ ! -e $outPhenotypes ]; then + printf "[ERROR]Phenotype file failed" >> $log; + exit 10; + fi + ; +#else: + + if [ \$(awk 'NR==1{print (NF-2)%5}' $mainCondition.differentialAnalysis ) -ne 0 ]; then + printf "[ERROR] please check that differential analysis file respect requested input format especially concerning the column number" >> $log; + exit 10; + fi + ; + + awk -v comparison="$mainCondition.comparisonsToUse" -v rkIndice="$mainCondition.rankingIndice" -v pvalTh="$mainCondition.pvalThreshold" 'BEGIN{FS="\t"} NR==1{if(rkIndice=="Tstat" || rkIndice=="absTstat"){start=7}else{start=6};for(i=start;i<=NF;i=i+5){if(\$i==comparison)refCol=i};} NR>2{if(rkIndice=="Tstat" || rkIndice=="absTstat"){if(\$(refCol-3)<pvalTh){if(rkIndice=="Tstat"){val=\$refCol}else{if(\$refCol>0){val=\$refCol}else{val=-\$refCol}};print \$1"\t"val}}else{if(\$(refCol-2)<pvalTh){if(rkIndice=="FC"){val=\$refCol}else{if(\$refCol>0){val=\$refCol}else{val=-\$refCol}};print \$1"\t"val}}}' $mainCondition.differentialAnalysis > ./temp.txt; + printf "#NAME\tSCORE\n" > $outRankedGenes; + LC_ALL=C sort -t$'\t' -k2,2 -gr ./temp.txt >> $outRankedGenes; + rm ./temp.txt; + + if [ ! -e $outRankedGenes ]; then + printf "[ERROR]Rank file failed" >> $log; + exit 10; + fi + ; +#end if + + printf "[INFO]End of tool script" >> $log; + ]]> + </command> + <inputs> + <param type="text" name="title" value="GSEAformat_toPersonalize" label="Title for output (without space)"/> + <conditional name="mainCondition"> + <param name="selection" type="select" label="GSEA configuration" force_select="true"> + <option value="classicGSEA">GSEA analysis </option> + <option value="rankedGSEA">Pre-ranked GSEA analysis</option> + </param> + <when value="classicGSEA"> + <param type="data" name="expressionData" format="tabular" label="Normalized expression tabular file" optional="false" multiple="false"> + </param> + <param type="data" name="conditionInformation" format="tabular" label="Factor information tabular file" optional="false" multiple="false"> + </param> + <param name="factorToInclude" type="select" label="Reference factor" multiple="false" optional="false" refresh_on_change="true" + dynamic_options="get_column_names_filteredList(mainCondition['conditionInformation'].file_name,[0])"> + </param> + </when> + <when value="rankedGSEA"> + <param type="data" name="differentialAnalysis" format="tabular" label="Differential analysis tabular file (as given by LIMMA diff.exp. tool)" multiple="false" help="This file should contain only annotated gene names or only probe identifiers as rows but no both kinds."> + </param> + <param name="comparisonsToUse" type="select" label="Reference contrast" optional="false" multiple="false" refresh_on_change="true" dynamic_options="get_column_names_filteredList(mainCondition['differentialAnalysis'].file_name,[0,1],5)"> + <validator type="empty_field" message="You should specify one factor"></validator> + </param> + <param type="select" name="rankingIndice" display="radio" label="Reference statistic"> + <option value="FC">Relative value of Log2(Fold Change)</option> + <option value="absFC">Absolute value of Log2(Fold Change)</option> + <option value="Tstat">Relative value of moderated t-statistic</option> + <option value="absTstat">Absolute value of moderated t-statistic</option> + </param> + <param name="pvalThreshold" type="float" value="0.05" label="FDR p-val threshold"> + <validator type="in_range" min="0" max="1" exclude_min="true" message="Threshold should be between 0 and 1"/> + </param> + </when> + </conditional> + </inputs> + + <outputs> + <data format="gct" name="outExpression" label="${title}_Expressions"> + <filter>mainCondition['selection']=="classicGSEA"</filter> + </data> + + <data format="cls" name="outPhenotypes" label="${title}_Phenotypes"> + <filter>mainCondition['selection']=="classicGSEA"</filter> + </data> + + <data format="rnk" name="outRankedGenes" label="${title}_Ranked_Genes"> + <filter>mainCondition['selection']=="rankedGSEA"</filter> + </data> + + <data format="txt" name="log" label="${title}_Log" /> + </outputs> + + <tests> + <test maxseconds="3600" > + <param name="selection" value="classicGSEA" /> + <param name="expressionData" value="./NormalizedData.tabular" /> + <param name="conditionInformation" value="./conditionGroups.txt" /> + <param name="factorToInclude" value="Treatment" /> + <output name="outExpression" file="./GSEA-Formatting/outputExpression.gct" /> + <output name="outPhenotypes" file="./GSEA-Formatting/outputPhenotypesTreatment.cls" /> + <output name="log" file="./GSEA-Formatting/outputRanks.log" /> + </test> + <test maxseconds="3600" > + <param name="selection" value="classicGSEA" /> + <param name="expressionData" value="./NormalizedData.tabular" /> + <param name="conditionInformation" value="./conditionGroups.txt" /> + <param name="factorToInclude" value="Type" /> + <output name="outExpression" file="./GSEA-Formatting/outputExpression.gct" /> + <output name="outPhenotypes" file="./GSEA-Formatting/outputPhenotypesType.cls" /> + <output name="log" file="./GSEA-Formatting/outputRanks.log" /> + </test> + <test maxseconds="3600" > + <param name="selection" value="rankedGSEA" /> + <param name="differentialAnalysis" value="./LIMMAstatistics.tabular" /> + <param name="comparisonsToUse" value="WT*Control-KO*Control" /> + <param name="rankingIndice" value="FC" /> + <param name="pvalThreshold" value="0.05" /> + <output name="outRankedGenes" file="./GSEA-Formatting/outputRanks.rnk" /> + <output name="log" file="./GSEA-Formatting/outputRanks.log" /> + </test> +</tests> + <help> + <![CDATA[ +**What it does** + +Generate input files required for GSEA analysis. + +----- + +**Parameters** + +\- **Title** to personalize output file names (please avoid special characters). + +\- **GSEA configuration** : "GSEA analysis" requires expression dataset and phenotype label for each sample whereas "Pre-ranked GSEA Analysis" needs ranked list of samples extracted from differential analysis results. + +- **GSEA Analysis** + + \- **Expression tabular file** with samples as columns and genes as rows (header row contains sample names and first column gene identifiers). + + :: + + Conditions 157_(HuGene-2_0-st).CEL 156_(HuGene-2_0-st).CEL 155_(HuGene-2_0-st).CEL 154_(HuGene-2_0-st).CEL + DDX11L2 4.500872 4.429759 4.780281 4.996189 + MIR1302-2 3.415065 3.520472 3.471503 3.567988 + OR4F5 3.737956 3.011586 3.424494 3.497545 + VWA1 5.189621 5.129595 4.806793 5.227014 + + \- **Factor information tabular file** with factors as columns and samples as rows (header row contains factor names and first column sample names). + + :: + + Conditions Sex Treatment Reaction + 138_(HuGene-2_0-st).CEL 1 TreatA Pos + 148_(HuGene-2_0-st).CEL 0 NoTreat Pos + 139_(HuGene-2_0-st).CEL 0 TreatB Neg + 149_(HuGene-2_0-st).CEL 0 NoTreat Neg + + \- **Reference factor** to use as phenotype in GSEA amongst available factors in factor information file. + +- **Pre-ranked GSEA Analysis** + + \- **Differential analysis tabular file** with contrasts statistics (p-val, FDR p-val, FC, log2(FC) and moderated t-statistic) as columns and genes as rows (first and second rows contain contrasts definition and first and second columns contain gene identifiers and functional informations). Please respect the GIANT-Differential Expression Analysis with LIMMA tool output format. + + :: + + LIMMA comparison WT*Treat WT*Treat WT*Treat WT*Treat WT*Treat + Gene Info p-val FDR.p-val FC log2(FC) t-stat + ARSD na 0.0057 0.41 0.8389 -0.2534 -5.175 + TTTY10 na 1.6e-07 0.0074 0.6403 -0.6432 -6.122 + MIR548AL na 0.072 0.2914 1.711 0.775 10.43 + + + \- **Reference contrast** from available contrasts in differential analysis file to use for gene ranking. + + \- **Reference statistic** from reference contrast used to rank genes. Relative or absolute value of log2(FC) or moderated t-statistic is used to sort genes in a decreasing way. + + \- **FDR p-val threshold** to discard genes with higher FDR p-val than specific threshold [0.05 recommended]. Genes with high FDR p-val are not significant for differential expression in reference contrast. + + +----- + +**Outputs** + +Depends on GSEA configuration: + +- **GSEA Analysis** + + \- **phenotype file (.cls)** to use as "phenotype labels" file for GSEA. + + \- **expression file (.gct)** to use as "expression dataset" file for GSEA. + +- **Pre-ranked GSEA Analysis** + + \- **pre-ranked file (.rnk)** to use as "ranked list" file for GSEA pre-ranked. + +For all configurations: + +\- **LOG file** containing information about execution. Useful especially if tool execution fails. Please attach this log file in any bug report. +]]> + </help> + + <citations> + <citation type="bibtex">@misc{vandel_jimmy_2018_1477870, author = {Vandel, J. and Gheeraert, C. and Eeckhoute, J. and Staels, B. and Lefebvre, P. and Dubois-Chevalier, J.}, title = {GIANT: Galaxy-based Interactive tools for ANalaysis of Transcriptomic data}, month = nov, year = 2018, doi = {10.5281/zenodo.1477870}, url = {https://doi.org/10.5281/zenodo.1477870} + }</citation> + + <citation type="bibtex">@article {Subramanian15545, + author = {Subramanian, Aravind and Tamayo, Pablo and Mootha, Vamsi K. and Mukherjee, Sayan and Ebert, Benjamin L. and Gillette, Michael A. and Paulovich, Amanda and Pomeroy, Scott L. and Golub, Todd R. and Lander, Eric S. and Mesirov, Jill P.}, + title = {Gene set enrichment analysis: A knowledge-based approach for interpreting genome-wide expression profiles}, + volume = {102}, + number = {43}, + pages = {15545--15550}, + year = {2005}, + publisher = {National Academy of Sciences}, + issn = {0027-8424}, + journal = {Proceedings of the National Academy of Sciences} + }</citation> + </citations> + +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/ExprPlotsScript.R Fri Jun 26 09:36:46 2020 -0400 @@ -0,0 +1,465 @@ +# A command-line interface to basic plots for use with Galaxy +# written by Jimmy Vandel +# one of these arguments is required: +# +# +initial.options <- commandArgs(trailingOnly = FALSE) +file.arg.name <- "--file=" +script.name <- sub(file.arg.name, "", initial.options[grep(file.arg.name, initial.options)]) +script.basename <- dirname(script.name) +source(file.path(script.basename, "utils.R")) +source(file.path(script.basename, "getopt.R")) + +#addComment("Welcome R!") + +# setup R error handling to go to stderr +options( show.error.messages=F, error = function () { cat(geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) + +# we need that to not crash galaxy with an UTF8 error on German LC settings. +loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") +loc <- Sys.setlocale("LC_NUMERIC", "C") + +#get starting time +start.time <- Sys.time() + +#get options +options(stringAsfactors = FALSE, useFancyQuotes = FALSE) +args <- commandArgs() + + +# get options, using the spec as defined by the enclosed list. +# we read the options from the default: commandArgs(TRUE). +spec <- matrix(c( + "dataFile", "i", 1, "character", + "factorInfo","t", 1, "character", + "dataFileFormat","j",1,"character", + "conditionNames","c",1,"character", + "format", "f", 1, "character", + "quiet", "q", 0, "logical", + "log", "l", 1, "character", + "histo" , "h", 1, "character", + "maPlot" , "a", 1, "character", + "boxplot" , "b", 1, "character", + "microarray" , "m", 1, "character", + "acp" , "p" , 1, "character", + "screePlot" , "s" , 1, "character"), + byrow=TRUE, ncol=4) +opt <- getopt(spec) + +# enforce the following required arguments +if (is.null(opt$log)) { + addComment("[ERROR]'log file' is required") + q( "no", 1, F ) +} +addComment("[INFO]Start of R script",T,opt$log,display=FALSE) +if (is.null(opt$dataFile) || is.null(opt$dataFileFormat)) { + addComment("[ERROR]'dataFile' and it format are required",T,opt$log) + q( "no", 1, F ) +} +if (is.null(opt$format)) { + addComment("[ERROR]'output format' is required",T,opt$log) + q( "no", 1, F ) +} +if (is.null(opt$histo) & is.null(opt$maPlot) & is.null(opt$boxplot) & is.null(opt$microarray) & is.null(opt$acp)){ + addComment("[ERROR]Select at least one plot to draw",T,opt$log) + q( "no", 1, F ) +} + +verbose <- if (is.null(opt$quiet)) { + TRUE +}else{ + FALSE} + +addComment("[INFO]Parameters checked!",T,opt$log,display=FALSE) + +addComment(c("[INFO]Working directory: ",getwd()),TRUE,opt$log,display=FALSE) +addComment(c("[INFO]Command line: ",args),TRUE,opt$log,display=FALSE) + +#directory for plots +dir.create(file.path(getwd(), "plotDir")) +dir.create(file.path(getwd(), "plotLyDir")) + +#silent package loading +suppressPackageStartupMessages({ + library("oligo") + library("ff") + library("ggplot2") + library("plotly") +}) + + +#chargement des fichiers en entrée +#fichier de type CEL +dataAreFromCel=FALSE +if(toupper(opt$dataFileFormat)=="CEL"){ + dataAreFromCel=TRUE + celData=read.celfiles(unlist(strsplit(opt$dataFile,","))) + #load all expressions + dataMatrix=exprs(celData) + #select "pm" probes + probeInfo=getProbeInfo(celData,probeType = c("pm"),target="probeset") + #reduce dataMatrix to log expression matrix for a randomly probe selection + dataMatrix=log2(dataMatrix[sample(unique(probeInfo[,1]),min(100000,length(unique(probeInfo[,1])))),]) + addComment("[INFO]Raw data are log2 transformed",TRUE,opt$log,display=FALSE) + remove(probeInfo) +}else{ + #fichier deja tabule + dataMatrix=read.csv(file=opt$dataFile,header=F,sep="\t",colClasses="character") + #remove first row to convert it as colnames (to avoid X before colnames with header=T) + colNamesData=dataMatrix[1,-1] + dataMatrix=dataMatrix[-1,] + #remove first colum to convert it as rownames + rowNamesData=dataMatrix[,1] + dataMatrix=dataMatrix[,-1] + if(is.data.frame(dataMatrix)){ + dataMatrix=data.matrix(dataMatrix) + }else{ + dataMatrix=data.matrix(as.numeric(dataMatrix)) + } + dimnames(dataMatrix)=list(rowNamesData,colNamesData) + if(any(duplicated(rowNamesData)))addComment("[WARNING] several rows share the same probe/gene name, you should merge or rename them to avoid further analysis mistakes",TRUE,opt$log,display=FALSE) +} + +addComment("[INFO]Input data loaded",TRUE,opt$log,display=FALSE) +addComment(c("[INFO]Dim of data matrix:",dim(dataMatrix)),T,opt$log,display=FALSE) + +#get number of conditions +nbConditions=ncol(dataMatrix) + +#get condition names if they are specified +if(!is.null(opt$conditionNames) && length(opt$conditionNames)==nbConditions){ + nameConditions=opt$conditionNames + colnames(dataMatrix)=nameConditions + #rownames(phenoData(celData)@data)=nameConditions + #rownames(protocolData(celData)@data)=nameConditions +}else{ + nameConditions=colnames(dataMatrix) +} + +#create a correspondance table between plot file names and name displayed in figure legend and html items +correspondanceNameTable=matrix("",ncol=2,nrow=nbConditions) +correspondanceNameTable[,1]=paste("Condition",1:nbConditions,sep="") +correspondanceNameTable[,2]=nameConditions +rownames(correspondanceNameTable)=correspondanceNameTable[,2] + +addComment("[INFO]Retreive condition names",TRUE,opt$log,display=FALSE) + +if(!is.null(opt$factorInfo)){ + #chargement du fichier des facteurs + factorInfoMatrix=read.csv(file=file.path(getwd(), opt$factorInfo),header=F,sep="\t",colClasses="character") + #remove first row to convert it as colnames + colnames(factorInfoMatrix)=factorInfoMatrix[1,] + factorInfoMatrix=factorInfoMatrix[-1,] + #use first colum to convert it as rownames but not removing it to avoid conversion as vector in unique factor case + rownames(factorInfoMatrix)=factorInfoMatrix[,1] + + + if(length(setdiff(colnames(dataMatrix),rownames(factorInfoMatrix)))!=0){ + addComment("[ERROR]Missing samples in factor file",T,opt$log) + q( "no", 1, F ) + } + + #order sample as in expression matrix and remove spurious sample + factorInfoMatrix=factorInfoMatrix[colnames(dataMatrix),] + + addComment("[INFO]Factors OK",T,opt$log,display=FALSE) + addComment(c("[INFO]Dim of factorInfo matrix:",dim(factorInfoMatrix)),T,opt$log,display=FALSE) + +} + +addComment("[INFO]Ready to plot",T,opt$log,display=FALSE) + + +##---------------------- + +###plot histograms### +histogramPerFigure=50 +if (!is.null(opt$histo)) { + for(iToPlot in 1:(((nbConditions-1)%/%histogramPerFigure)+1)){ + firstPlot=1+histogramPerFigure*(iToPlot-1) + lastPlot=min(nbConditions,histogramPerFigure*iToPlot) + dataToPlot=data.frame(x=c(dataMatrix[,firstPlot:lastPlot]),Experiment=rep(colnames(dataMatrix)[firstPlot:lastPlot],each=nrow(dataMatrix))) + p <- ggplot(data=dataToPlot, aes(x = x, color=Experiment)) + stat_density(geom="line", size=1, position="identity") + + ggtitle("Intensity densities") + theme_bw() + ylab(label="Density") + + theme(panel.border=element_blank(),plot.title = element_text(hjust = 0.5)) + if(dataAreFromCel){ + #original ploting function + #hist(celData[,firstPlot:lastPlot],lty=rep(1,nbConditions)[firstPlot:lastPlot],lwd=2,which='pm',target="probeset",transfo=log2,col=rainbow(nbConditions)[firstPlot:lastPlot]) + p <- p + xlab(label="Log2 intensities") + }else{ + p <- p + xlab(label="Intensities") + } + if(opt$format=="pdf"){ + pdf(paste(c("./plotDir/",opt$histo,iToPlot,".pdf"),collapse=""))}else{ + png(paste(c("./plotDir/",opt$histo,iToPlot,".png"),collapse="")) + } + print(p) + dev.off() + #save plotly files + pp <- ggplotly(p) + htmlwidgets::saveWidget(as_widget(pp), paste(c(file.path(getwd(), "plotLyDir"),"/",opt$histo,iToPlot,".html"),collapse=""),selfcontained = F) + } + remove(p,dataToPlot) + addComment("[INFO]Histograms drawn",T,opt$log,display=FALSE) +} + +##---------------------- + +###plot MAplots### +MAplotPerPage=4 +if (!is.null(opt$maPlot)) { + iToPlot=1 + plotVector=list() + toTake=sample(nrow(dataMatrix),min(200000,nrow(dataMatrix))) + refMedianColumn=rowMedians(as.matrix(dataMatrix[toTake,])) + if(length(toTake)>100000)addComment(c("[INFO]high number of input data rows ",length(toTake),"; the generation of MA plot can take a while, please be patient"),TRUE,opt$log,display=FALSE) + for (iCondition in 1:nbConditions){ + #MAplot(celData,which=i,what=pm,transfo=log2) + #smoothScatter(x=xToPlot,y=yToPlot,main=nameConditions[iCondition]) + dataA=dataMatrix[toTake,iCondition] + dataB=refMedianColumn####ATTENTION PAR DEFAUT + xToPlot=0.5*(dataA+dataB) + yToPlot=dataA-dataB + tempX=seq(min(xToPlot),max(xToPlot),0.1) + tempY=unlist(lapply(tempX,function(x){median(yToPlot[intersect(which(xToPlot>=(x-0.1/2)),which(xToPlot<(x+0.1/2)))])})) + + dataToPlot=data.frame(x=xToPlot,y=yToPlot) + dataMedianToPlot=data.frame(x=tempX,y=tempY) + p <- ggplot(data=dataToPlot, aes(x,y)) + stat_density2d(aes(fill = ..density..^0.25), geom = "tile", contour = FALSE, n = 100) + + scale_fill_continuous(low = "white", high = "dodgerblue4") + geom_smooth(data=dataMedianToPlot,colour="red", size=0.5, se=FALSE) + + ggtitle(correspondanceNameTable[iCondition,2]) + theme_bw() + xlab(label="") + ylab(label="") + + theme(panel.border=element_blank(),plot.title = element_text(hjust = 0.5),legend.position = "none") + plotVector[[length(plotVector)+1]]=p + + #save plotly files + pp <- ggplotly(p) + htmlwidgets::saveWidget(as_widget(pp), paste(c(file.path(getwd(), "plotLyDir"),"/",opt$maPlot,"_",correspondanceNameTable[iCondition,1],".html"),collapse=""),selfcontained = F) + + if(iCondition==nbConditions || length(plotVector)==MAplotPerPage){ + #define a new plotting file + if(opt$format=="pdf"){ + pdf(paste(c("./plotDir/",opt$maPlot,iToPlot,".pdf"),collapse=""))}else{ + png(paste(c("./plotDir/",opt$maPlot,iToPlot,".png"),collapse="")) + } + multiplot(plotlist=plotVector,cols=2) + dev.off() + if(iCondition<nbConditions){ + #prepare for a new plotting file if necessary + plotVector=list() + iToPlot=iToPlot+1 + } + } + } + remove(p,dataToPlot,dataA,dataB,toTake,xToPlot,yToPlot) + addComment("[INFO]MAplots drawn",T,opt$log,display=FALSE) +} + +##---------------------- + +###plot boxplots### +boxplotPerFigure=50 +if (!is.null(opt$boxplot)) { + for(iToPlot in 1:(((nbConditions-1)%/%boxplotPerFigure)+1)){ + firstPlot=1+boxplotPerFigure*(iToPlot-1) + lastPlot=min(nbConditions,boxplotPerFigure*iToPlot) + dataToPlot=data.frame(intensities=c(dataMatrix[,firstPlot:lastPlot]),Experiment=rep(colnames(dataMatrix)[firstPlot:lastPlot],each=nrow(dataMatrix))) + #to make HTML file lighter, sampling will be done amongst outliers + #get outliers for each boxplot + boxplotsOutliers=apply(dataMatrix[,firstPlot:lastPlot],2,function(x)boxplot.stats(x)$out) + #sample amongst them to keep at maximum of 1000 points and include both min and max outliers values + boxplotsOutliers=lapply(boxplotsOutliers,function(x)if(length(x)>0)c(sample(c(x),min(length(x),1000)),max(c(x)),min(c(x)))) + dataOutliers=data.frame(yVal=unlist(boxplotsOutliers),xVal=unlist(lapply(seq_along(boxplotsOutliers),function(x)rep(names(boxplotsOutliers)[x],length(boxplotsOutliers[[x]]))))) + #plot boxplots without outliers + p <- ggplot(data=dataToPlot, aes(y = intensities, x=Experiment ,color=Experiment)) + geom_boxplot(outlier.colour=NA,outlier.shape =NA) + + ggtitle("Intensities") + theme_bw() + xlab(label="") + + theme(panel.border=element_blank(),plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 45, hjust = 1),plot.margin=unit(c(10,10,max(unlist(lapply(dataToPlot$Experiment,function(x)nchar(as.character(x))))),15+max(unlist(lapply(dataToPlot$Experiment,function(x)nchar(as.character(x)))))),"mm")) + #add to plot sampled outliers + p <- p + geom_point(data=dataOutliers,aes(x=xVal,y=yVal,color=xVal),inherit.aes = F) + if(dataAreFromCel){ + #original plotting function + #boxplot(celData[,firstPlot:lastPlot],which='pm',col=rainbow(nbConditions)[firstPlot:lastPlot],target="probeset",transfo=log2,names=nameConditions[firstPlot:lastPlot],main="Intensities") + p <- p + ylab(label="Log2 intensities") + }else{ + p <- p + ylab(label="Intensities") + } + if(opt$format=="pdf"){ + pdf(paste(c("./plotDir/",opt$boxplot,iToPlot,".pdf"),collapse=""))}else{ + png(paste(c("./plotDir/",opt$boxplot,iToPlot,".png"),collapse="")) + } + print(p) + dev.off() + #save plotly files + pp <- ggplotly(p) + + #modify plotly object to get HTML file not too heavy for loading + for(iData in 1:length(pp$x$data)){ + ##get kept outliers y values + #yPointsToKeep=dataOutliers$yVal[which(dataOutliers$xVal==pp$x$data[[iData]]$name)] + if(pp$x$data[[iData]]$type=="scatter"){ + ##scatter plot represent outliers points added to boxplot through geom_point + ##nothing to do as outliers have been sampled allready, just have to modify hover text + #if(length(yPointsToKeep)>0){ + #pointsToKeep=which(pp$x$data[[iData]]$y %in% yPointsToKeep) + #pp$x$data[[iData]]$x=pp$x$data[[iData]]$x[pointsToKeep] + #pp$x$data[[iData]]$y=pp$x$data[[iData]]$y[pointsToKeep] + #pp$x$data[[iData]]$text=pp$x$data[[iData]]$text[pointsToKeep] + #}else{ + #pp$x$data[[iData]]$x=NULL + #pp$x$data[[iData]]$y=NULL + #pp$x$data[[iData]]$marker$opacity=0 + #pp$x$data[[iData]]$hoverinfo=NULL + #pp$x$data[[iData]]$text=NULL + #} + #modify text to display + if(dataAreFromCel){ + pp$x$data[[iData]]$text=unlist(lapply(seq_along(pp$x$data[[iData]]$y),function(x)return(paste(c("log2(intensity) ",prettyNum(pp$x$data[[iData]]$y[x],digits=4)),collapse = "")))) + }else{ + pp$x$data[[iData]]$text=unlist(lapply(seq_along(pp$x$data[[iData]]$y),function(x)return(paste(c("intensity ",prettyNum(pp$x$data[[iData]]$y[x],digits=4)),collapse = "")))) + } + }else{ + ##disable marker plotting to keep only box and whiskers plot (outliers are displayed through scatter plot) + pp$x$data[[iData]]$marker$opacity=0 + + #sample 50000 points amongst all data to get a lighter html file, sampling size should not be too low to avoid modifying limit of boxplots + pp$x$data[[iData]]$y=c(sample(dataMatrix[,pp$x$data[[iData]]$name],min(length(dataMatrix[,pp$x$data[[iData]]$name]),50000)),min(dataMatrix[,pp$x$data[[iData]]$name]),max(dataMatrix[,pp$x$data[[iData]]$name])) + pp$x$data[[iData]]$x=rep(pp$x$data[[iData]]$x[1],length(pp$x$data[[iData]]$y)) + + ##first remove outliers info + #downUpValues=boxplot.stats(dataMatrix[,pp$x$data[[iData]]$name])$stats + #if(verbose)addComment(c("filter values for boxplot",pp$x$data[[iData]]$name,"between",min(downUpValues),"and",max(downUpValues)),T,opt$log) + #pointsToRemove=which(pp$x$data[[iData]]$y<min(downUpValues)) + #if(length(pointsToRemove)>0)pp$x$data[[iData]]$y=pp$x$data[[iData]]$y[-pointsToRemove] + #pointsToRemove=which(pp$x$data[[iData]]$y>max(downUpValues)) + #if(length(pointsToRemove)>0)pp$x$data[[iData]]$y=pp$x$data[[iData]]$y[-pointsToRemove] + #then add sampled outliers info + #pp$x$data[[iData]]$y=c(yPointsToKeep,pp$x$data[[iData]]$y) + #pp$x$data[[iData]]$x=rep(pp$x$data[[iData]]$x[1],length(pp$x$data[[iData]]$y)) + } + } + + htmlwidgets::saveWidget(as_widget(pp), paste(c(file.path(getwd(), "plotLyDir"),"/",opt$boxplot,iToPlot,".html"),collapse=""),selfcontained = F) + } + remove(p,dataToPlot) + addComment("[INFO]Boxplots drawn",T,opt$log,display=FALSE) + +} + +##---------------------- + +###plot microarrays (only for .CEL files)### +if (!is.null(opt$microarray) && dataAreFromCel) { + for (iCondition in 1:nbConditions){ + if(opt$format=="pdf"){ + pdf(paste(c("./plotDir/",opt$microarray,"_",correspondanceNameTable[iCondition,1],".pdf"),collapse=""),onefile = F,width = 5,height = 5)}else{ + png(paste(c("./plotDir/",opt$microarray,"_",correspondanceNameTable[iCondition,1],".png"),collapse="")) + } + image(celData[,iCondition],main=correspondanceNameTable[iCondition,2]) + dev.off() + } + addComment("[INFO]Microarray drawn",T,opt$log,display=FALSE) +} + +##---------------------- + +###plot PCA plot### +if (!is.null(opt$acp)){ + ##to avoid error when nrow is too large, results quite stable with 200k random selected rows + randomSelection=sample(nrow(dataMatrix),min(200000,nrow(dataMatrix))) + #remove constant variables + + dataFiltered=dataMatrix[randomSelection,] + toRemove=which(unlist(apply(dataFiltered,1,var))==0) + if(length(toRemove)>0){ + dataFiltered=dataFiltered[-toRemove,] + } + ##geom_text(aes(label=Experiments,hjust=1, vjust=1.3), y = PC2+0.01) + PACres = prcomp(t(dataFiltered),scale.=TRUE) + + if(!is.null(opt$screePlot)){ + #scree plot + #p <- fviz_eig(PACres) + dataToPlot=data.frame(compo=seq(1,length(PACres$sdev)),var=(PACres$sdev^2/sum(PACres$sdev^2))*100) + p<-ggplot(data=dataToPlot, aes(x=compo, y=var)) + geom_bar(stat="identity", fill="steelblue") + geom_line() + geom_point() + + ggtitle("Scree plot") + theme_bw() + theme(panel.border=element_blank(),plot.title = element_text(hjust = 0.5)) + + xlab(label="Dimensions") + ylab(label="% explained variances") + scale_x_discrete(limits=dataToPlot$compo) + pp <- ggplotly(p) + + if(opt$format=="pdf"){ + pdf(paste(c("./plotDir/",opt$screePlot,".pdf"),collapse=""))}else{ + png(paste(c("./plotDir/",opt$screePlot,".png"),collapse="")) + } + plot(p) + dev.off() + htmlwidgets::saveWidget(as_widget(pp), paste(c(file.path(getwd(), "plotLyDir"),"/",opt$screePlot,".html"),collapse=""),selfcontained = F) + } + + #now plot pca plots + + if(!is.null(opt$factorInfo)){ + fileIdent="" + symbolset = c("circle","cross","square","diamond","circle-open","square-open","diamond-open","x") + + #save equivalence between real factor names and generic ones in correspondanceNameTable + correspondanceNameTable=rbind(correspondanceNameTable,matrix(c(paste("Factor",1:(ncol(factorInfoMatrix)-1),sep=""),colnames(factorInfoMatrix)[-1]),ncol=2,nrow=ncol(factorInfoMatrix)-1)) + rownames(correspondanceNameTable)=correspondanceNameTable[,2] + + #first order factors from decreasing groups number + orderedFactors=colnames(factorInfoMatrix)[-1][order(unlist(lapply(colnames(factorInfoMatrix)[-1],function(x)length(table(factorInfoMatrix[,x])))),decreasing = T)] + allFactorsBigger=length(table(factorInfoMatrix[,orderedFactors[length(orderedFactors)]]))>length(symbolset) + if(allFactorsBigger)addComment("[WARNING]All factors are composed of too many groups to display two factors at same time, each PCA plot will display only one factor groups",T,opt$log,display=FALSE) + for(iFactor in 1:length(orderedFactors)){ + #if it is the last factor of the list or if all factor + if(iFactor==length(orderedFactors) || allFactorsBigger){ + if(length(orderedFactors)==1 || allFactorsBigger){ + dataToPlot=data.frame(PC1=PACres$x[,1],PC2=PACres$x[,2],PC3=PACres$x[,3],Experiments=rownames(PACres$x), Attribute1=factorInfoMatrix[rownames(PACres$x),orderedFactors[iFactor]], hoverLabel=unlist(lapply(rownames(PACres$x),function(x)paste(factorInfoMatrix[x,-1],collapse=",")))) + p <- plot_ly(dataToPlot,x = ~PC1, y = ~PC2, z = ~PC3, type = 'scatter3d', mode="markers", color=~Attribute1,colors=rainbow(length(levels(dataToPlot$Attribute1))+2),hoverinfo = 'text', text = ~paste(Experiments,"\n",hoverLabel),marker=list(size=5))%>% + layout(title = "Principal Component Analysis", scene = list(xaxis = list(title = "Component 1"),yaxis = list(title = "Component 2"),zaxis = list(title = "Component 3")), + legend=list(font = list(family = "sans-serif",size = 15,color = "#000"))) + fileIdent=correspondanceNameTable[orderedFactors[iFactor],1] + #add text label to plot + ##p <- add_text(p,x = dataToPlot$PC1, y = dataToPlot$PC2 + (max(PACres$x[,2])-min(PACres$x[,2]))*0.02, z = dataToPlot$PC3, mode = 'text', inherit = F, text=rownames(PACres$x), hoverinfo='skip', showlegend = FALSE, color=I('black')) + #save the plotly plot + htmlwidgets::saveWidget(as_widget(p), paste(c(file.path(getwd(), "plotLyDir"),"/",opt$acp,"_",fileIdent,".html"),collapse=""),selfcontained = F) + } + }else{ + for(iFactorBis in (iFactor+1):length(orderedFactors)){ + if(length(table(factorInfoMatrix[,orderedFactors[iFactorBis]]))<=length(symbolset)){ + dataToPlot=data.frame(PC1=PACres$x[,1],PC2=PACres$x[,2],PC3=PACres$x[,3],Experiments=rownames(PACres$x), Attribute1=factorInfoMatrix[rownames(PACres$x),orderedFactors[iFactor]], Attribute2=factorInfoMatrix[rownames(PACres$x),orderedFactors[iFactorBis]], hoverLabel=unlist(lapply(rownames(PACres$x),function(x)paste(factorInfoMatrix[x,-1],collapse=",")))) + p <- plot_ly(dataToPlot,x = ~PC1, y = ~PC2, z = ~PC3, type = 'scatter3d', mode="markers", color=~Attribute1,colors=rainbow(length(levels(dataToPlot$Attribute1))+2),symbol=~Attribute2,symbols = symbolset,hoverinfo = 'text', text = ~paste(Experiments,"\n",hoverLabel),marker=list(size=5))%>% + layout(title = "Principal Component Analysis", scene = list(xaxis = list(title = "Component 1"),yaxis = list(title = "Component 2"),zaxis = list(title = "Component 3")), + legend=list(font = list(family = "sans-serif",size = 15,color = "#000"))) + fileIdent=paste(correspondanceNameTable[orderedFactors[c(iFactor,iFactorBis)],1],collapse="_AND_") + #add text label to plot + ##p <- add_text(p,x = dataToPlot$PC1, y = dataToPlot$PC2 + (max(PACres$x[,2])-min(PACres$x[,2]))*0.02, z = dataToPlot$PC3, mode = 'text', inherit = F, text=rownames(PACres$x), hoverinfo='skip', showlegend = FALSE, color=I('black')) + #save the plotly plot + htmlwidgets::saveWidget(as_widget(p), paste(c(file.path(getwd(), "plotLyDir"),"/",opt$acp,"_",fileIdent,".html"),collapse=""),selfcontained = F) + }else{ + addComment(c("[WARNING]PCA with",orderedFactors[iFactor],"and",orderedFactors[iFactorBis],"groups cannot be displayed, too many groups (max",length(symbolset),")"),T,opt$log,display=FALSE) + } + } + } + } + }else{ + dataToPlot=data.frame(PC1=PACres$x[,1],PC2=PACres$x[,2],PC3=PACres$x[,3],Experiments=rownames(PACres$x)) + p <- plot_ly(dataToPlot,x = ~PC1, y = ~PC2, z = ~PC3, type = 'scatter3d', mode="markers",marker=list(size=5,color="salmon"),hoverinfo = 'text',text = ~paste(Experiments))%>% + layout(title = "Principal Component Analysis", scene = list(xaxis = list(title = "Component 1"),yaxis = list(title = "Component 2"),zaxis = list(title = "Component 3")), + legend=list(font = list(family = "sans-serif",size = 15,color = "#000"))) + ##p <- add_text(p,x = dataToPlot$PC1, y = dataToPlot$PC2 + (max(PACres$x[,2])-min(PACres$x[,2]))*0.02, z = dataToPlot$PC3, mode = 'text', inherit = F, text=rownames(PACres$x), hoverinfo='skip', showlegend = FALSE, color=I('black')) + + #save plotly files + htmlwidgets::saveWidget(as_widget(p), paste(c(file.path(getwd(), "plotLyDir"),"/",opt$acp,"_plot.html"),collapse=""),selfcontained = F) + } + remove(p,dataToPlot,dataFiltered) + addComment("[INFO]ACP plot drawn",T,opt$log,display=FALSE) +} + +#write correspondances between plot file names and displayed names in figure legends, usefull to define html items in xml file +write.table(correspondanceNameTable,file=file.path(getwd(), "correspondanceFileNames.csv"),quote=FALSE,sep="\t",col.names = F,row.names = F) + +end.time <- Sys.time() +addComment(c("[INFO]Total execution time for R script:",as.numeric(end.time - start.time,units="mins"),"mins"),T,opt$log,display=FALSE) + +addComment("[INFO]End of R script",T,opt$log,display=FALSE) + +printSessionInfo(opt$log) +#sessionInfo()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/General_functions.py Fri Jun 26 09:36:46 2020 -0400 @@ -0,0 +1,206 @@ +import re +import numpy as np + +def get_column_names( file_path, toNotConsider=-1, each=1): + options=[] + inputfile = open(file_path) + firstLine = next(inputfile).strip().split("\t") + cpt=0 + for i, field_component in enumerate( firstLine ): + if i!=toNotConsider:#to squeeze the first column + if cpt==0: + options.append( ( field_component, field_component, False ) ) + cpt+=1 + if cpt==each: + cpt=0 + inputfile.close() + return options + +def get_column_names_filteredList( file_path, toNotConsider=[], each=1): + options=[] + inputfile = open(file_path) + firstLine = next(inputfile).strip().split("\t") + cpt=0 + for i, field_component in enumerate( firstLine ): + if i not in toNotConsider:#to squeeze the first columns + if cpt==0: + options.append( ( field_component, field_component, False ) ) + cpt+=1 + if cpt==each: + cpt=0 + inputfile.close() + return options + +def get_column_names_mergeNumber(file_path, numberToMerge=1, toNotConsider=[]): + options=[] + inputfile = open(file_path) + if int(numberToMerge)>0: + iHeader=0 + for iCurrentLine in inputfile: + iHeader=iHeader+1 + if iHeader>int(numberToMerge): + break + currentLine=iCurrentLine.strip().split("\t") + iOption=-1 + for i, field_component in enumerate( currentLine ): + if i not in toNotConsider:#to squeeze specified columns + iOption=iOption+1 + if iHeader==1: + options.append( ( str(field_component), str(field_component), False ) ) + else: + options[iOption]=(options[iOption][0]+"_"+str(field_component),options[iOption][1]+"_"+str(field_component),False) + else: + currentLine = next(inputfile).strip().split("\t") + for i, field_component in enumerate( currentLine ): + if i not in toNotConsider:#to squeeze specified columns + options.append( ( "Column_"+str(i), "Column_"+str(i), False ) ) + inputfile.close() + return options + +def get_row_names( file_path, factorName ): + inputfile = open(file_path) + firstLine = next(inputfile).strip().split("\t") + iColumn=-1 + for i, field_component in enumerate( firstLine ): + if field_component==factorName:#to test + iColumn=i + options=[] + if iColumn!=-1: + for nextLine in inputfile: + nextLine=nextLine.strip().split("\t") + if len(nextLine)>1: + if (nextLine[iColumn], nextLine[iColumn], False) not in options: + options.append( (nextLine[iColumn], nextLine[iColumn], False) ) + inputfile.close() + return options + +def get_condition_file_names( file_list, toNotConsider=-1, each=1): + options=[] + if not isinstance(file_list,list):#if input file is a tabular file, act as get_column_names + inputfile = open(file_list.file_name) + firstLine = next(inputfile).strip().split("\t") + cpt=0 + for i, field_component in enumerate( firstLine ): + if i!=toNotConsider:#to squeeze the first column + if cpt==0: + options.append( ( field_component, field_component, False ) ) + cpt+=1 + if cpt==each: + cpt=0 + inputfile.close() + else:#if input file is a .cel file list or a collection + if not hasattr(file_list[0],'collection'):#if it is not a collection, get name easily + for i, field_component in enumerate( file_list ): + options.append( ( field_component.name, field_component.name, False ) ) + else:#if the file is a collection, have to get deeper in the corresponding HistoryDatasetCollectionAssociation object + for i, field_component in enumerate( file_list[0].collection.elements ): + options.append( ( field_component.element_identifier, field_component.element_identifier, False ) ) + return options + +def generateFactorFile( file_list, factor_list, outputFileName, logFile): + forbidenCharacters={"*",":",",","|"} + outputfile = open(outputFileName, 'w') + outputLog = open(logFile, 'w') + sampleList=[] + if not isinstance(file_list,list): + conditionNames=get_condition_file_names(file_list,0) #unique expression file, remove the first column (index=0) + else : + conditionNames=get_condition_file_names(file_list) #.CEL files + for iSample, sample_component in enumerate (conditionNames): + sampleList.append(str(sample_component[1])) + outputLog.write("[INFO] "+str(len(sampleList))+" sample are detected as input\n") + globalDict=dict() + factorNameList=[] + firstLine="Conditions" + if len(factor_list)==0:#check if there is at least one factor available + outputLog.write("[ERROR] no factor was defined !\n") + return 1 + else: + for iFactor, factor_component in enumerate( factor_list ): + currentSampleList=list(sampleList) + currentFactor=str(factor_component['factorName']) + #check if factor name contains forbidden characters + for specialCharacter in forbidenCharacters: + if currentFactor.find(specialCharacter)!=-1: + outputLog.write("[ERROR] '"+specialCharacter+"' character is forbidden in factor name : '"+currentFactor+"'\n") + return 4 + #check if factor allready named like that + if not globalDict.get(currentFactor) is None: + outputLog.write("[ERROR] '"+currentFactor+"' is used several times as factor name\n") + return 3 + globalDict[currentFactor]=dict() + firstLine=firstLine+"\t"+currentFactor + factorNameList.append(currentFactor) + if len(factor_component['valueList'])<=1:#check if there is at least two value available + outputLog.write("[ERROR] at least two different values are necessary for '"+currentFactor+"' factor\n") + return 1 + else: + for iValue, value_component in enumerate( factor_component['valueList'] ): + currentValue=str(value_component['valueName']) + #check if factor name contains forbidden characters + for specialCharacter in forbidenCharacters: + if currentValue.find(specialCharacter)!=-1: + outputLog.write("[ERROR] '"+specialCharacter+"' character is forbidden in value name : '"+currentValue+"'\n") + return 4 + currentSample=str(value_component['valueConditions']).split(",") + for iSample, sample_component in enumerate (currentSample): + if not sample_component in currentSampleList: + outputLog.write("[ERROR] sample "+sample_component+" was assigned several times for factor '"+currentFactor+"'\n") + return 2 + currentSampleList.remove(sample_component) + globalDict[currentFactor][sample_component]=currentValue + if(len(currentSampleList)>0): + outputLog.write("[ERROR] for factor '"+currentFactor+"'' sample "+str(currentSampleList)+" are not assigned to any value\n") + return 2 + outputLog.write("[INFO] "+str(len(globalDict))+" factors are detected\n") + #start writing the factor file + outputfile.write(firstLine+"\n") + for iSample, sample_component in enumerate(sampleList): + newLine=sample_component + for iFactor, factor_component in enumerate(factorNameList): + newLine=newLine+"\t"+globalDict[factor_component][sample_component] + outputfile.write(newLine+"\n") + outputfile.close() + outputLog.close() + return 0 + +def selectSubSetTable(file_path,headerLine_number,columnsToAdd,columnNamesToKeep,outputFileName,logFile): + outputLog = open(logFile, 'w') + outputLog.write("[INFO] header line number : "+ headerLine_number+" lines\n") + availableColumnsTuple=get_column_names_mergeNumber(file_path, headerLine_number) + #convert tuple list as a simple array + availableColumns=[] + for iTuple, tuple_content in enumerate (availableColumnsTuple): + availableColumns.append(str(tuple_content[0])) + if len(availableColumns)==0: + outputLog.write("[ERROR] No detected columns in input file\n") + return 1 + selectedColumns=list(columnsToAdd) + for iVolcano, volcano_content in enumerate(columnNamesToKeep): + selectedColumns.append(availableColumns.index(volcano_content['pvalColumn'])) + if volcano_content['fdrColumn'] in availableColumns: + selectedColumns.append(availableColumns.index(volcano_content['fdrColumn'])) + else: + selectedColumns.append(0) + selectedColumns.append(availableColumns.index(volcano_content['fcColumn'])) + if len(selectedColumns)!=(3*len(columnNamesToKeep)+len(columnsToAdd)): + outputLog.write("[ERROR] matching between input file colnames and requested column names failed\n") + return 1 + outputLog.write("[INFO] columns kept : "+str(selectedColumns)+"\n") + #start writting formatted file + inputfile = open(file_path) + outputfile = open(outputFileName, 'w') + iLineCpt=-1 + for iCurrentLine in inputfile: + iLineCpt=iLineCpt+1 + if iLineCpt>=int(headerLine_number): + currentLineFields=np.array(iCurrentLine.strip().split("\t")) + newLine="\t".join(currentLineFields[selectedColumns]) + outputfile.write(newLine+"\n") + if iLineCpt<int(headerLine_number): + outputLog.write("[ERROR] not enough lines in input files ("+(iLineCpt+1)+" lines)\n") + return 1 + inputfile.close() + outputfile.close() + outputLog.close() + return 0 \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/LIMMA_options.py Fri Jun 26 09:36:46 2020 -0400 @@ -0,0 +1,330 @@ +import re + +def get_column_names( file_path, toNotConsider=None, toNotConsiderBis=None): + options=[] + inputfile = open(file_path) + firstLine = next(inputfile).strip().split("\t") + for i, field_component in enumerate( firstLine ): + if i!=0 and field_component!=toNotConsider and field_component!=toNotConsiderBis:#to squeeze the first column + options.append( ( field_component, field_component, False ) ) + inputfile.close() + return options + +def get_row_names( file_path, factorName ): + inputfile = open(file_path) + firstLine = next(inputfile).strip().split("\t") + iColumn=-1 + for i, field_component in enumerate( firstLine ): + if field_component==factorName:#to test + iColumn=i + options=[] + if iColumn!=-1: + for nextLine in inputfile: + nextLine=nextLine.strip().split("\t") + if len(nextLine)>1: + if (nextLine[iColumn], nextLine[iColumn], False) not in options: + options.append( (nextLine[iColumn], nextLine[iColumn], False) ) + inputfile.close() + return options + +def get_row_names_interaction( file_path, factorNameA, factorNameB ): + inputfile = open(file_path) + firstLine = next(inputfile).strip().split("\t") + iColumnA=-1 + iColumnB=-1 + for i, field_component in enumerate( firstLine ): + if field_component==factorNameA:#to test + iColumnA=i + if field_component==factorNameB:#to test + iColumnB=i + possibleValuesA=[] + possibleValuesB=[] + if iColumnA!=-1 and iColumnB!=-1: + for nextLine in inputfile: + nextLine=nextLine.strip().split("\t") + if len(nextLine)>1: + if nextLine[iColumnA] not in possibleValuesA: + possibleValuesA.append(nextLine[iColumnA]) + if nextLine[iColumnB] not in possibleValuesB: + possibleValuesB.append(nextLine[iColumnB]) + inputfile.close() + options=[] + if len(possibleValuesA)>=1 and len(possibleValuesB)>=1 and possibleValuesA[0]!="None" and possibleValuesB[0]!="None": + for counterA in range(len(possibleValuesA)): + for counterB in range(len(possibleValuesB)): + options.append( (possibleValuesA[counterA]+"*"+possibleValuesB[counterB], possibleValuesA[counterA]+"*"+possibleValuesB[counterB], False) ) + return options + +def get_comparisonsA( factorA, valuesA ): + options=[] + formatValuesA=re.sub("(^\[u')|('\]$)","", str(valuesA)) + possibleValues=formatValuesA.split("', u'") + if len(possibleValues)>=2: + for counter in range(len(possibleValues)-1): + for innerCounter in range(counter+1,len(possibleValues)): + options.append( (possibleValues[counter]+" - "+possibleValues[innerCounter], possibleValues[counter]+" - "+possibleValues[innerCounter], False) ) + options.append( (possibleValues[innerCounter]+" - "+possibleValues[counter], possibleValues[innerCounter]+" - "+possibleValues[counter], False) ) + return options + +def get_comparisonsAB(factorA, valuesA, factorB, valuesB, interaction): + options=[] + formatValuesA=re.sub("(^\[u')|('\]$)","", str(valuesA)) + possibleValuesA=formatValuesA.split("', u'") + formatValuesB=re.sub("(^\[u')|('\]$)","", str(valuesB)) + possibleValuesB=formatValuesB.split("', u'") + if str(interaction)=="False": + if len(possibleValuesA)>=2: + for counter in range(len(possibleValuesA)-1): + for innerCounter in range(counter+1,len(possibleValuesA)): + options.append( (possibleValuesA[counter]+" - "+possibleValuesA[innerCounter], possibleValuesA[counter]+" - "+possibleValuesA[innerCounter], False) ) + options.append( (possibleValuesA[innerCounter]+" - "+possibleValuesA[counter], possibleValuesA[innerCounter]+" - "+possibleValuesA[counter], False) ) + if len(possibleValuesB)>=2: + for counter in range(len(possibleValuesB)-1): + for innerCounter in range(counter+1,len(possibleValuesB)): + options.append( (possibleValuesB[counter]+" - "+possibleValuesB[innerCounter], possibleValuesB[counter]+" - "+possibleValuesB[innerCounter], False) ) + options.append( (possibleValuesB[innerCounter]+" - "+possibleValuesB[counter], possibleValuesB[innerCounter]+" - "+possibleValuesB[counter], False) ) + else: + if len(possibleValuesA)>=1 and len(possibleValuesB)>=1 and possibleValuesA[0]!="None" and possibleValuesB[0]!="None": + for counterA in range(len(possibleValuesA)): + for innerCounterA in range(len(possibleValuesA)): + for counterB in range(len(possibleValuesB)): + for innerCounterB in range(len(possibleValuesB)): + if not(counterA==innerCounterA and counterB==innerCounterB): + options.append( ("("+possibleValuesA[counterA]+" * "+possibleValuesB[counterB]+") - ("+possibleValuesA[innerCounterA]+" * "+possibleValuesB[innerCounterB]+")","("+possibleValuesA[counterA]+" * "+possibleValuesB[counterB]+") - ("+possibleValuesA[innerCounterA]+" * "+possibleValuesB[innerCounterB]+")", False) ) + return options + +def get_row_names_allInteractions( file_path, factorSelected): + formatFactors=re.sub("(^\[u')|('\]$)","", str(factorSelected)) + factorsList=formatFactors.split("', u'") + iColumn=[None] * len(factorsList) + valuesList=[None] * len(factorsList) + + inputfile = open(file_path) + firstLine = next(inputfile).strip().split("\t") + for iField, fieldComponent in enumerate( firstLine ): + for iFactor, factorComponent in enumerate(factorsList): + if fieldComponent==factorComponent: + iColumn[iFactor]=iField + valuesList[iFactor]=[] + + for nextLine in inputfile: + nextLine=nextLine.strip().split("\t") + if len(nextLine)>1: + for iFactor, factorComponent in enumerate(factorsList): + if nextLine[iColumn[iFactor]] not in valuesList[iFactor]: + valuesList[iFactor].append(nextLine[iColumn[iFactor]]) + inputfile.close() + + allCombinations=[] + for iFactor, factorComponent in enumerate(factorsList): + if iFactor==0: + allCombinations=valuesList[iFactor] + else: + currentCombinations=allCombinations + allCombinations=[] + for iValue, valueComponent in enumerate(valuesList[iFactor]): + for iCombination, combination in enumerate(currentCombinations): + allCombinations.append(combination+"*"+valueComponent) + + options=[] + for iCombination, combination in enumerate(allCombinations): + options.append((combination,combination,False)) + + return options + +def get_allrow_names( file_path, factorSelected ): + formatFactors=re.sub("(^\[u')|('\]$)","", str(factorSelected)) + factorsList=formatFactors.split("', u'") + iColumn=[None] * len(factorsList) + valuesList=[None] * len(factorsList) + + inputfile = open(file_path) + firstLine = next(inputfile).strip().split("\t") + for iField, fieldComponent in enumerate( firstLine ): + for iFactor, factorComponent in enumerate(factorsList): + if fieldComponent==factorComponent: + iColumn[iFactor]=iField + valuesList[iFactor]=[] + + for nextLine in inputfile: + nextLine=nextLine.strip().split("\t") + if len(nextLine)>1: + for iFactor, factorComponent in enumerate(factorsList): + if nextLine[iColumn[iFactor]] not in valuesList[iFactor]: + valuesList[iFactor].append(nextLine[iColumn[iFactor]]) + inputfile.close() + + allValues=[] + for iFactor, factorComponent in enumerate(factorsList): + for iValue, valueComponent in enumerate(valuesList[iFactor]): + allValues.append(factorComponent+":"+valueComponent) + + options=[] + for iValue, valueComponent in enumerate(allValues): + options.append((valueComponent,valueComponent,False)) + + return options + +def replaceNamesInFiles(expressionFile_name,conditionFile_name,outputExpressionFile,outputConditionFile,ouputDictionnary): + dico={} + forbidenCharacters={"*",":",",","|"} + ##start with expression file, read only the first line + inputfile = open(expressionFile_name) + outputfile = open(outputExpressionFile, 'w') + firstLine = next(inputfile).rstrip().split("\t") + iCondition=1 + newFirstLine="" + for i, field_component in enumerate( firstLine ): + if (i>0): + #conditions names should not be redundant with other conditions + if(field_component not in dico): + dico[field_component]="Condition"+str(iCondition) + newFirstLine+="\t"+"Condition"+str(iCondition) + iCondition+=1 + else: + raise NameError('condition name allready exists!') + else: + newFirstLine+=field_component + outputfile.write(newFirstLine+"\n") + for line in inputfile: + outputfile.write(line) + outputfile.close() + inputfile.close() + #then parse condition file, read all lines in this case + inputfile = open(conditionFile_name) + outputfile = open(outputConditionFile, 'w') + firstLine=1 + iFactor=1 + iValue=1 + for line in inputfile: + currentLine = line.rstrip().split("\t") + newCurrentLine="" + for i, field_component in enumerate( currentLine ): + #special treatment for the first line + if (firstLine==1): + if (i==0): + newCurrentLine=field_component + else: + #factor names should not be redundant with other factors or conditions + if(field_component not in dico): + dico[field_component]="Factor"+str(iFactor) + newCurrentLine+="\t"+"Factor"+str(iFactor) + iFactor+=1 + else: + raise NameError('factor name allready exists!') + else: + if (i==0): + #check if condition name allready exist and used it if it is, or create a new one if not + if(field_component not in dico): + dico[field_component]="Condition"+str(iCondition) + newCurrentLine="Condition"+str(iCondition) + iCondition+=1 + else: + newCurrentLine=dico[field_component] + else: + if(field_component not in dico): + dico[field_component]="Value"+str(iValue) + newCurrentLine+="\tValue"+str(iValue) + iValue+=1 + else: + newCurrentLine+="\t"+dico[field_component] + outputfile.write(newCurrentLine+"\n") + firstLine=0 + outputfile.close() + inputfile.close() + ##check if any entries in dictionnary contains forbiden character + for key, value in dico.items(): + for specialCharacter in forbidenCharacters: + if value.startswith("Condition")==False and key.find(specialCharacter)!=-1: + return 1 + ##then write dictionnary in a additional file + outputfile = open(ouputDictionnary, 'w') + for key, value in dico.items(): + outputfile.write(key+"\t"+value+"\n") + outputfile.close() + return 0 + + +def replaceNamesBlockInFiles(expressionFile_name,conditionFile_name,blockingFile_name,outputExpressionFile,outputConditionFile,outputBlockingFile,ouputDictionnary): + dico={} + forbidenCharacters={"*",":",",","|"} + ##start with expression file, read only the first line + inputfile = open(expressionFile_name) + outputfile = open(outputExpressionFile, 'w') + firstLine = next(inputfile).rstrip().split("\t") + iCondition=1 + newFirstLine="" + for i, field_component in enumerate( firstLine ): + if (i>0): + #conditions names should not be redundant with other conditions + if(field_component not in dico): + dico[field_component]="Condition"+str(iCondition) + newFirstLine+="\t"+"Condition"+str(iCondition) + iCondition+=1 + else: + raise NameError('condition name allready exists!') + else: + newFirstLine+=field_component + outputfile.write(newFirstLine+"\n") + for line in inputfile: + outputfile.write(line) + outputfile.close() + inputfile.close() + #then parse condition file, read all lines in this case + iFactor=1 + iValue=1 + for fileNum in range(2): + if fileNum==0: + inputfile = open(conditionFile_name) + outputfile = open(outputConditionFile, 'w') + else: + inputfile = open(blockingFile_name) + outputfile = open(outputBlockingFile, 'w') + firstLine=1 + for line in inputfile: + currentLine = line.rstrip().split("\t") + newCurrentLine="" + for i, field_component in enumerate( currentLine ): + #special treatment for the first line + if (firstLine==1): + if (i==0): + newCurrentLine=field_component + else: + #factor names should not be redundant with other factors or conditions + if(field_component not in dico): + dico[field_component]="Factor"+str(iFactor) + newCurrentLine+="\t"+"Factor"+str(iFactor) + iFactor+=1 + else: + raise NameError('factor name allready exists!') + else: + if (i==0): + #check if condition name allready exist and used it if it is, or create a new one if not + if(field_component not in dico): + dico[field_component]="Condition"+str(iCondition) + newCurrentLine="Condition"+str(iCondition) + iCondition+=1 + else: + newCurrentLine=dico[field_component] + else: + if(field_component not in dico): + dico[field_component]="Value"+str(iValue) + newCurrentLine+="\tValue"+str(iValue) + iValue+=1 + else: + newCurrentLine+="\t"+dico[field_component] + outputfile.write(newCurrentLine+"\n") + firstLine=0 + outputfile.close() + inputfile.close() + ##check if any entries in dictionnary contains forbiden character + for key, value in dico.items(): + for specialCharacter in forbidenCharacters: + if value.startswith("Condition")==False and key.find(specialCharacter)!=-1: + return 1 + ##then write dictionnary in a additional file + outputfile = open(ouputDictionnary, 'w') + for key, value in dico.items(): + outputfile.write(key+"\t"+value+"\n") + outputfile.close() + return 0
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/LIMMAscriptV4.R Fri Jun 26 09:36:46 2020 -0400 @@ -0,0 +1,1002 @@ +# A command-line interface for LIMMA to use with Galaxy +# written by Jimmy Vandel +# one of these arguments is required: +# +# +initial.options <- commandArgs(trailingOnly = FALSE) +file.arg.name <- "--file=" +script.name <- sub(file.arg.name, "", initial.options[grep(file.arg.name, initial.options)]) +script.basename <- dirname(script.name) +source(file.path(script.basename, "utils.R")) +source(file.path(script.basename, "getopt.R")) + +#addComment("Welcome R!") + +# setup R error handling to go to stderr +options( show.error.messages=F, error = function () { cat(geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) + +# we need that to not crash galaxy with an UTF8 error on German LC settings. +loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") +loc <- Sys.setlocale("LC_NUMERIC", "C") + +#get starting time +start.time <- Sys.time() + +options(stringAsfactors = FALSE, useFancyQuotes = FALSE) +args <- commandArgs() + +# get options, using the spec as defined by the enclosed list. +# we read the options from the default: commandArgs(TRUE). +spec <- matrix(c( + "dataFile", "i", 1, "character", + "factorInfo","a", 1, "character", + "blockingInfo","b", 1, "character", + "dicoRenaming","g",1,"character", + "blockingPolicy","u", 1, "character", + "fdrThreshold","t", 1, "double", + "thresholdFC","d", 1, "double", + "format", "f", 1, "character", + "histo","h", 1, "character", + "volcano","v", 1, "character", + "factorsContrast","r", 1, "character", + "contrastNames","p", 1, "character", + "firstGroupContrast","m", 1, "character", + "secondGroupContrast","n", 1, "character", + "controlGroups","c", 1, "character", + "fratioFile","s",1,"character", + "organismID","x",1,"character", + "rowNameType","y",1,"character", + "quiet", "q", 0, "logical", + "log", "l", 1, "character", + "outputFile" , "o", 1, "character", + "outputDfFile" , "z", 1, "character"), + byrow=TRUE, ncol=4) +opt <- getopt(spec) + +# enforce the following required arguments +if (is.null(opt$log)) { + addComment("[ERROR]'log file' is required\n") + q( "no", 1, F ) +} +addComment("[INFO]Start of R script",T,opt$log,display=FALSE) +if (is.null(opt$dataFile)) { + addComment("[ERROR]'dataFile' is required",T,opt$log) + q( "no", 1, F ) +} +if (!is.null(opt$blockingInfo) && is.null(opt$blockingPolicy) ) { + addComment("[ERROR]blocking policy is missing",T,opt$log) + q( "no", 1, F ) +} +if (is.null(opt$dicoRenaming)) { + addComment("[ERROR]renaming dictionnary is missing",T,opt$log) + q( "no", 1, F ) +} +if (is.null(opt$factorsContrast)) { + addComment("[ERROR]factor informations are missing",T,opt$log) + q( "no", 1, F ) +} +if (length(opt$firstGroupContrast)!=length(opt$secondGroupContrast)) { + addComment("[ERROR]some contrast groups seems to be empty",T,opt$log) + q( "no", 1, F ) +} +if (is.null(opt$factorInfo)) { + addComment("[ERROR]factors info is missing",T,opt$log) + q( "no", 1, F ) +} +if (is.null(opt$format)) { + addComment("[ERROR]'output format' is required",T,opt$log) + q( "no", 1, F ) +} +if (is.null(opt$fdrThreshold)) { + addComment("[ERROR]'FDR threshold' is required",T,opt$log) + q( "no", 1, F ) +} +if (is.null(opt$outputFile) || is.null(opt$outputDfFile)){ + addComment("[ERROR]'output files' are required",T,opt$log) + q( "no", 1, F ) +} +if (is.null(opt$thresholdFC)){ + addComment("[ERROR]'FC threshold' is required",T,opt$log) + q( "no", 1, F ) +} +if (is.null(opt$fratioFile)) { + addComment("[ERROR]F-ratio parameter is missing",T,opt$log) + q( "no", 1, F ) +} + +#demande si le script sera bavard +verbose <- if (is.null(opt$quiet)) { + TRUE +}else{ + FALSE +} + +#paramètres internes +#pour savoir si on remplace les FC calculés par LIMMA par un calcul du LS-MEAN (ie moyenne de moyennes de chaque groupe dans chaque terme du contraste plutôt qu'une moyenne globale dans chaque terme) +useLSmean=FALSE + +addComment("[INFO]Parameters checked!",T,opt$log,display=FALSE) + +addComment(c("[INFO]Working directory: ",getwd()),TRUE,opt$log,display=FALSE) +addComment(c("[INFO]Command line: ",args),TRUE,opt$log,display=FALSE) + +#directory for plots +dir.create(file.path(getwd(), "plotDir")) +dir.create(file.path(getwd(), "plotLyDir")) + +#charge des packages silencieusement +suppressPackageStartupMessages({ + library("methods") + library("limma") + library("biomaRt") + library("ggplot2") + library("plotly") + library("stringr") + library("RColorBrewer") +}) + + +#chargement du fichier dictionnaire de renommage +renamingDico=read.csv(file=file.path(getwd(), opt$dicoRenaming),header=F,sep="\t",colClasses="character") +rownames(renamingDico)=renamingDico[,2] + + +#chargement des fichiers en entrée +expDataMatrix=read.csv(file=file.path(getwd(), opt$dataFile),header=F,sep="\t",colClasses="character") +#remove first row to convert it as colnames (to avoid X before colnames with header=T) +colNamesData=expDataMatrix[1,-1] +expDataMatrix=expDataMatrix[-1,] +#remove first colum to convert it as rownames +rowNamesData=expDataMatrix[,1] +expDataMatrix=expDataMatrix[,-1] +if(is.data.frame(expDataMatrix)){ + expDataMatrix=data.matrix(expDataMatrix) +}else{ + expDataMatrix=data.matrix(as.numeric(expDataMatrix)) +} +dimnames(expDataMatrix)=list(rowNamesData,colNamesData) + +#test the number of rows that are constant in dataMatrix +nbConstantRows=length(which(unlist(apply(expDataMatrix,1,var))==0)) +if(nbConstantRows>0){ + addComment(c("[WARNING]",nbConstantRows,"rows are constant across conditions in input data file"),T,opt$log,display=FALSE) +} + +#test if all condition names are present in dico +if(!all(colnames(expDataMatrix) %in% rownames(renamingDico))){ + addComment("[ERROR]Missing condition names in renaming dictionary",T,opt$log) + q( "no", 1, F ) +} + +addComment("[INFO]Expression data loaded and checked",T,opt$log,display=FALSE) +addComment(c("[INFO]Dim of expression matrix:",dim(expDataMatrix)),T,opt$log,display=FALSE) + +#chargement du fichier des facteurs +factorInfoMatrix=read.csv(file=file.path(getwd(), opt$factorInfo),header=F,sep="\t",colClasses="character") +#remove first row to convert it as colnames +colnames(factorInfoMatrix)=factorInfoMatrix[1,] +factorInfoMatrix=factorInfoMatrix[-1,] +#use first colum to convert it as rownames but not removing it to avoid conversion as vector in unique factor case +rownames(factorInfoMatrix)=factorInfoMatrix[,1] + +if(length(setdiff(colnames(expDataMatrix),rownames(factorInfoMatrix)))!=0){ + addComment("[ERROR]Missing samples in factor file",T,opt$log) + q( "no", 1, F ) +} + +#order sample as in expression matrix and remove spurious sample +factorInfoMatrix=factorInfoMatrix[colnames(expDataMatrix),] + +#test if all values names are present in dico +if(!all(unlist(factorInfoMatrix) %in% rownames(renamingDico))){ + addComment("[ERROR]Missing factor names in renaming dictionary",T,opt$log) + q( "no", 1, F ) +} + +addComment("[INFO]Factors OK",T,opt$log,display=FALSE) +addComment(c("[INFO]Dim of factorInfo matrix:",dim(factorInfoMatrix)),T,opt$log,display=FALSE) + +##manage blocking factor +blockingFactor=NULL +blockinFactorsList=NULL +if(!is.null(opt$blockingInfo)){ + + #chargement du fichier des blocking factors + blockingInfoMatrix=read.csv(file=file.path(getwd(), opt$blockingInfo),header=F,sep="\t",colClasses="character") + #remove first row to convert it as colnames + colnames(blockingInfoMatrix)=blockingInfoMatrix[1,] + blockingInfoMatrix=blockingInfoMatrix[-1,] + #use first colum to convert it as rownames but not removing it to avoid conversion as vector in unique factor case + rownames(blockingInfoMatrix)=blockingInfoMatrix[,1] + + + if(length(setdiff(colnames(expDataMatrix),rownames(blockingInfoMatrix)))!=0){ + addComment("[ERROR]Missing samples in blocking factor file",T,opt$log) + q( "no", 1, F ) + } + + #order sample as in expression matrix + blockingInfoMatrix=blockingInfoMatrix[colnames(expDataMatrix),] + + #test if all blocking names are present in dico + if(!all(unlist(blockingInfoMatrix) %in% rownames(renamingDico))){ + addComment("[ERROR]Missing blocking names in renaming dictionary",T,opt$log) + q( "no", 1, F ) + } + + #remove blocking factors allready present as real factors + blockingNotInMainFactors=setdiff(colnames(blockingInfoMatrix)[-1],colnames(factorInfoMatrix)[-1]) + + if(length(blockingNotInMainFactors)<(ncol(blockingInfoMatrix)-1))addComment("[WARNING]Blocking factors cannot be principal factors",T,opt$log,display=FALSE) + + if(length(blockingNotInMainFactors)>0){ + + blockingInfoMatrix=blockingInfoMatrix[,c(colnames(blockingInfoMatrix)[1],blockingNotInMainFactors)] + + groupBlocking=rep("c",ncol(expDataMatrix)) + #for each blocking factor + for(blockingFact in blockingNotInMainFactors){ + if(opt$blockingPolicy=="correlated"){ + indNewFact=as.numeric(factor(blockingInfoMatrix[,blockingFact])) + groupBlocking=paste(groupBlocking,indNewFact,sep="_") + }else{ + if(is.null(blockinFactorsList))blockinFactorsList=list() + blockinFactorsList[[blockingFact]]=factor(unlist(lapply(blockingInfoMatrix[,blockingFact],function(x)paste(c(blockingFact,"_",x),collapse="")))) + } + } + if(opt$blockingPolicy=="correlated"){ + blockingFactor=factor(groupBlocking) + if(length(levels(blockingFactor))==1){ + addComment("[ERROR]Selected blocking factors seems to be constant",T,opt$log) + q( "no", 1, F ) + } + } + addComment("[INFO]Blocking info OK",T,opt$log,display=FALSE) + }else{ + addComment("[WARNING]No blocking factors will be considered",T,opt$log,display=FALSE) + } +} + + +##rename different input parameters using renamingDictionary +opt$factorsContrast=renamingDico[unlist(lapply(unlist(strsplit(opt$factorsContrast,",")),function(x)which(renamingDico[,1]==x))),2] + +userDefinedContrasts=FALSE +if(!is.null(opt$firstGroupContrast) && !is.null(opt$secondGroupContrast)){ + userDefinedContrasts=TRUE + for(iContrast in 1:length(opt$firstGroupContrast)){ + opt$firstGroupContrast[iContrast]=paste(unlist(lapply(unlist(strsplit(opt$firstGroupContrast[iContrast],",")),function(x)paste(renamingDico[unlist(lapply(unlist(strsplit(x,"\\*")),function(x)which(renamingDico[,1]==x))),2],collapse="*"))),collapse=",") + opt$secondGroupContrast[iContrast]=paste(unlist(lapply(unlist(strsplit(opt$secondGroupContrast[iContrast],",")),function(x)paste(renamingDico[unlist(lapply(unlist(strsplit(x,"\\*")),function(x)which(renamingDico[,1]==x))),2],collapse="*"))),collapse=",") + } +} + +if(!is.null(opt$controlGroups)){ + renamedGroups=c() + for(iGroup in unlist(strsplit(opt$controlGroups,","))){ + renamedControlGroup=paste(renamingDico[unlist(lapply(unlist(strsplit(iGroup,":")),function(x)which(renamingDico[,1]==x))),2],collapse=":") + if(length(renamedControlGroup)==0 || any(which(unlist(gregexpr(text = renamedControlGroup,pattern = ":"))==-1))){ + addComment("[ERROR]Control groups for interaction seem to mismatch, please check them.",T,opt$log) + q( "no", 1, F ) + } + renamedGroups=c(renamedGroups,renamedControlGroup) + } + opt$controlGroups=renamedGroups +} +addComment("[INFO]Contrast variables are renamed to avoid confusion",T,opt$log,display=FALSE) +##renaming done + +#to convert factor as numeric value --> useless now ? +#expDataMatrix=apply(expDataMatrix,c(1,2),function(x)as.numeric(paste(x))) + +#get factors info for LIMMA +factorsList=list() +for(iFactor in opt$factorsContrast){ + if(!(iFactor %in% colnames(factorInfoMatrix))){ + addComment("[ERROR]Required factors are missing in input file",T,opt$log) + q( "no", 1, F ) + } + factorsList[[iFactor]]=factor(unlist(lapply(factorInfoMatrix[,iFactor],function(x)paste(c(iFactor,"_",x),collapse="")))) + if(length(levels(factorsList[[iFactor]]))==1){ + addComment("[ERROR]One selected factor seems to be constant",T,opt$log) + q( "no", 1, F ) + } +} + +#check if there is at least 2 factors to allow interaction computation +if(!is.null(opt$controlGroups) && length(factorsList)<2){ + addComment("[ERROR]You cannot ask for interaction with less than 2 factors",T,opt$log) + q( "no", 1, F ) +} + +#merge all factors as a single one +factorsMerged=as.character(factorsList[[opt$factorsContrast[1]]]) +for(iFactor in opt$factorsContrast[-1]){ + factorsMerged=paste(factorsMerged,as.character(factorsList[[iFactor]]),sep=".") +} +factorsMerged=factor(factorsMerged) + +#checked that coefficient number (ie. factorsMerged levels) is strictly smaller than sample size +if(length(levels(factorsMerged))>=length(factorsMerged)){ + addComment(c("[ERROR]No enough samples (",length(factorsMerged),") to estimate ",length(levels(factorsMerged))," coefficients"),T,opt$log) + q( "no", 1, F ) +} + +#get the sample size of each factor values +sampleSizeFactor=table(factorsMerged) + + +if(!is.null(blockinFactorsList)){ + factorString=c("blockinFactorsList[['", names(blockinFactorsList)[1],"']]") + for(blockingFact in names(blockinFactorsList)[-1]){ + factorString=c(factorString," + blockinFactorsList[['",blockingFact,"']]") + } + design = model.matrix(as.formula(paste(c("~ factorsMerged +",factorString," + 0"),collapse=""))) + + #rename design columns + coeffMeaning = levels(factorsMerged) + for(blockingFact in blockinFactorsList){ + coeffMeaning=c(coeffMeaning,levels(blockingFact)[-1]) + } + colnames(design) = coeffMeaning +}else{ + design = model.matrix(as.formula( ~ factorsMerged + 0)) + + #rename degin columns + coeffMeaning = levels(factorsMerged) + colnames(design) = coeffMeaning +} + +addComment(c("[INFO]Available coefficients: ",coeffMeaning),T,opt$log,display=F) + +estimableCoeff=which(colSums(design)!=0) + +addComment("[INFO]Design done",T,opt$log,display=F) + + #use blocking factor if exists +if(!is.null(blockingFactor)){ + corfit <- duplicateCorrelation(expDataMatrix, design, block=blockingFactor) + + addComment(c("[INFO]Correlation within groups: ",corfit$consensus.correlation),T,opt$log,display=F) + + #run linear model fit + data.fit = lmFit(expDataMatrix,design,block = blockingFactor, correlation=corfit$consensus.correlation) +}else{ + #run linear model fit + data.fit = lmFit(expDataMatrix,design) +} + +estimatedCoeff=which(!is.na(data.fit$coefficients[1,])) + +addComment("[INFO]Lmfit done",T,opt$log,display=F) + +#catch situation where some coefficients cannot be estimated, probably due to dependances between design columns +#if(length(setdiff(estimableCoeff,estimatedCoeff))>0){ +# addComment("[ERROR]Error in design matrix, check your group definitions",T,opt$log) +# q( "no", 1, F ) +#} +#to strong condition, should return ERROR only when coefficients relative to principal factors cannot be estimated, otherwise, return a simple WARNING + +#define requested contrasts +requiredContrasts=c() +humanReadingContrasts=c() +persoContrastName=c() +if(userDefinedContrasts){ + for(iContrast in 1:length(opt$firstGroupContrast)){ + posGroup=unlist(lapply(unlist(strsplit(opt$firstGroupContrast[iContrast],",")),function(x)paste(paste(opt$factorsContrast,unlist(strsplit(x,"\\*")),sep="_"),collapse="."))) + negGroup=unlist(lapply(unlist(strsplit(opt$secondGroupContrast[iContrast],",")),function(x)paste(paste(opt$factorsContrast,unlist(strsplit(x,"\\*")),sep="_"),collapse="."))) + #clear posGroup and negGroup from empty groups + emptyPosGroups=which(!(posGroup%in%coeffMeaning)) + if(length(emptyPosGroups)>0){ + addComment(c("[WARNING]The group(s)",posGroup[emptyPosGroups],"is/are removed from contrast as it/they is/are empty"),T,opt$log,display=FALSE) + posGroup=posGroup[-emptyPosGroups] + currentHumanContrast=paste(unlist(strsplit(opt$firstGroupContrast[iContrast],","))[-emptyPosGroups],collapse="+") + }else{ + currentHumanContrast=paste(unlist(strsplit(opt$firstGroupContrast[iContrast],",")),collapse="+") + } + emptyNegGroups=which(!(negGroup%in%coeffMeaning)) + if(length(emptyNegGroups)>0){ + addComment(c("[WARNING]The group(s)",negGroup[emptyNegGroups],"is/are removed from contrast as it/they is/are empty"),T,opt$log,display=FALSE) + negGroup=negGroup[-emptyNegGroups] + currentHumanContrast=paste(c(currentHumanContrast,unlist(strsplit(opt$secondGroupContrast[iContrast],","))[-emptyNegGroups]),collapse="-") + }else{ + currentHumanContrast=paste(c(currentHumanContrast,unlist(strsplit(opt$secondGroupContrast[iContrast],","))),collapse="-") + } + if(length(posGroup)==0 || length(negGroup)==0 ){ + addComment(c("[WARNING]Contrast",currentHumanContrast,"cannot be estimated due to empty group"),T,opt$log,display=FALSE) + }else{ + if(all(posGroup%in%negGroup) && all(negGroup%in%posGroup)){ + addComment(c("[WARNING]Contrast",currentHumanContrast,"cannot be estimated due to null contrast"),T,opt$log,display=FALSE) + }else{ + #get coefficients required for first group added as positive + positiveCoeffWeights=sampleSizeFactor[posGroup]/sum(sampleSizeFactor[posGroup]) + #positiveCoeffWeights=rep(1,length(posGroup)) + #names(positiveCoeffWeights)=names(sampleSizeFactor[posGroup]) + #get coefficients required for second group added as negative + negativeCoeffWeights=sampleSizeFactor[negGroup]/sum(sampleSizeFactor[negGroup]) + #negativeCoeffWeights=rep(1,length(negGroup)) + #names(negativeCoeffWeights)=names(sampleSizeFactor[negGroup]) + #build the resulting contrast + currentContrast=paste(paste(positiveCoeffWeights[posGroup],posGroup,sep="*"),collapse="+") + currentContrast=paste(c(currentContrast,paste(paste(negativeCoeffWeights[negGroup],negGroup,sep="*"),collapse="-")),collapse="-") + requiredContrasts=c(requiredContrasts,currentContrast) + + #build the human reading contrast + humanReadingContrasts=c(humanReadingContrasts,currentHumanContrast) + if(!is.null(opt$contrastNames) && nchar(opt$contrastNames[iContrast])>0){ + persoContrastName=c(persoContrastName,opt$contrastNames[iContrast]) + }else{ + persoContrastName=c(persoContrastName,"") + } + + addComment(c("[INFO]Contrast added : ",currentHumanContrast),T,opt$log,display=F) + addComment(c("with complete formula ",currentContrast),T,opt$log,display=F) + } + } + } +} + + + #define the true formula with interactions to get interaction coefficients + factorString=c("factorsList[['", names(factorsList)[1],"']]") + for(iFactor in names(factorsList)[-1]){ + factorString=c(factorString," * factorsList[['",iFactor,"']]") + } + + if(!is.null(blockinFactorsList)){ + for(blockingFact in names(blockinFactorsList)){ + factorString=c(factorString," + blockinFactorsList[['",blockingFact,"']]") + } + } + + #should not be null at the end + allFtestMeanSquare=NULL + #to get the F-test values + estimatedInteractions=rownames(anova(lm(as.formula(paste(c("expDataMatrix[1,] ~ ",factorString),collapse=""))))) + estimatedInteractions=c(unlist(lapply(estimatedInteractions[-length(estimatedInteractions)],function(x){temp=unlist(strsplit(x,"[ \" | : ]"));paste(temp[seq(2,length(temp),3)],collapse=":")})),estimatedInteractions[length(estimatedInteractions)]) + #rename estimated interaction terms using renamingDico + estimatedInteractions=c(unlist(lapply(estimatedInteractions[-length(estimatedInteractions)],function(x)paste(renamingDico[unlist(strsplit(x,":")),1],collapse=":"))),estimatedInteractions[length(estimatedInteractions)]) + t <- unlist(apply(expDataMatrix,1,function(x){temp=anova(lm(as.formula(paste(c("x ~ ",factorString),collapse=""))))$`Mean Sq`;temp/temp[length(temp)]})) + allFtestMeanSquare <- t(matrix(t,nrow=length(estimatedInteractions))) + #remove from allFtest rows containing NA + if(length(which(is.na(allFtestMeanSquare[,1])))>0)allFtestMeanSquare=allFtestMeanSquare[-(which(is.na(allFtestMeanSquare[,1]))),] + colnames(allFtestMeanSquare)=estimatedInteractions + + #add contrasts corresponding to interaction terms + if(!is.null(opt$controlGroups)){ + #first load user defined control group for each factor + controlGroup=rep(NA,length(factorsList)) + names(controlGroup)=names(factorsList) + for(iGroup in opt$controlGroups){ + splitGroup=unlist(strsplit(iGroup,":")) + splitGroup[2]=paste(splitGroup[1],splitGroup[2],sep = "_") + #check if defined control group is really a level of the corresponding factor + if(!splitGroup[1]%in%names(controlGroup) || !splitGroup[2]%in%factorsList[[splitGroup[1]]]){ + addComment(c("[ERROR]The factor name",splitGroup[1],"does not exist or group name",splitGroup[2]),T,opt$log) + q( "no", 1, F ) + } + if(!is.na(controlGroup[splitGroup[1]])){ + addComment("[ERROR]Several control groups are defined for the same factor, please select only one control group for each factor if you want to compute interaction contrasts",T,opt$log) + q( "no", 1, F ) + } + controlGroup[splitGroup[1]]=splitGroup[2] + } + + #check if all factor have a defined control group + if(any(is.na(controlGroup))){ + addComment("[ERROR]Missing control group for some factors, please check them if you want to compute interaction contrasts",T,opt$log) + q( "no", 1, F ) + } + + nbFactors=length(factorsList) + interactionContrasts=c() + contrastClass=c() + #initialize list for the first level + newPreviousLoopContrast=list() + for(iFactorA in 1:(nbFactors-1)){ + nameFactorA=names(factorsList)[iFactorA] + compA=c() + for(levelA in setdiff(levels(factorsList[[iFactorA]]),controlGroup[nameFactorA])){ + compA=c(compA,paste(levelA,controlGroup[nameFactorA],sep="-")) + } + newPreviousLoopContrast[[as.character(iFactorA)]]=compA + } + #make a loop for growing interaction set + for(globalIfactor in 1:(nbFactors-1)){ + previousLoopContrast=newPreviousLoopContrast + newPreviousLoopContrast=list() + #factor A reuse contrasts made at previsous loop + for(iFactorA in names(previousLoopContrast)){ + compA=previousLoopContrast[[iFactorA]] + + if(max(as.integer(unlist(strsplit(iFactorA,"\\."))))<nbFactors){ + #factor B is the new factor to include in intreraction set + for(iFactorB in (max(as.integer(unlist(strsplit(iFactorA,"\\."))))+1):nbFactors){ + nameFactorB=names(factorsList)[iFactorB] + #keep contrasts identified trough interacting factors set + newPreviousLoopContrast[[paste(iFactorA,iFactorB,sep=".")]]=c() + for(iCompA in compA){ + for(levelB in setdiff(levels(factorsList[[iFactorB]]),controlGroup[nameFactorB])){ + #decompose the contrast compA to apply the new level of factor B on each term + temp=unlist(strsplit(iCompA,"[ + ]")) + splitCompA=temp[1] + for(iTemp in temp[-1])splitCompA=c(splitCompA,"+",iTemp) + splitCompA=unlist(lapply(splitCompA,function(x){temp=unlist(strsplit(x,"-"));splitCompB=temp[1];for(iTemp in temp[-1])splitCompB=c(splitCompB,"-",iTemp);splitCompB})) + #apply on each contrast term the new level of factor B + firstTerm=paste(unlist(lapply(splitCompA,function(x)if(x!="+" && x!="-"){paste(x,levelB,sep=".")}else{x})),collapse="") + secondTerm=negativeExpression(paste(unlist(lapply(splitCompA,function(x)if(x!="+" && x!="-"){paste(x,controlGroup[nameFactorB],sep=".")}else{x})),collapse="")) + currentContrast=paste(c(firstTerm,secondTerm),collapse="") + + newPreviousLoopContrast[[paste(iFactorA,iFactorB,sep=".")]]=c(newPreviousLoopContrast[[paste(iFactorA,iFactorB,sep=".")]],currentContrast) + } + } + } + } + } + for(iContrast in names(newPreviousLoopContrast)){ + contrastClass=c(contrastClass,rep(iContrast,length(newPreviousLoopContrast[[iContrast]]))) + } + interactionContrasts=c(interactionContrasts,unlist(newPreviousLoopContrast)) + } + #make human title for interactions + names(interactionContrasts)=contrastClass + humanReadingInteraction=unlist(lapply(interactionContrasts,function(x)gsub("\\.",":",unlist(strsplit(x,"[+-]"))[1]))) + + contrastToIgnore=c() + + #complete with control groups and order to match to coeffs + for(iContrast in 1:length(interactionContrasts)){ + missingFactor=setdiff(1:nbFactors,as.integer(unlist(strsplit(names(interactionContrasts[iContrast]),"\\.")))) + #decompose the contrast + temp=unlist(strsplit(interactionContrasts[iContrast],"[ + ]")) + splitContrast=temp[1] + for(iTemp in temp[-1])splitContrast=c(splitContrast,"+",iTemp) + splitContrast=unlist(lapply(splitContrast,function(x){temp=unlist(strsplit(x,"-"));splitCompB=temp[1];for(iTemp in temp[-1])splitCompB=c(splitCompB,"-",iTemp);splitCompB})) + for(iFactor in missingFactor){ + for(iTerm in 1:length(splitContrast)){ + if(splitContrast[iTerm]!="+" && splitContrast[iTerm]!="-"){ + splitTerm=unlist(strsplit(splitContrast[iTerm],"\\.")) + if(iFactor==1)splitContrast[iTerm]=paste(c(controlGroup[names(factorsList)[iFactor]],splitTerm),collapse=".") + if(iFactor==nbFactors)splitContrast[iTerm]=paste(c(splitTerm,controlGroup[names(factorsList)[iFactor]]),collapse=".") + if(iFactor>1 && iFactor<nbFactors)splitContrast[iTerm]=paste(c(splitTerm[1:(iFactor-1)],controlGroup[names(factorsList)[iFactor]],splitTerm[iFactor:length(splitTerm)]),collapse=".") + } + } + } + interactionContrasts[iContrast]=paste(splitContrast,collapse="") + if(all(splitContrast[seq(1,length(splitContrast),2)]%in%coeffMeaning)){ + addComment(c("[INFO]Interaction contrast added : ",humanReadingInteraction[iContrast]),T,opt$log,display=F) + addComment(c("with complete formula ",interactionContrasts[iContrast]),T,opt$log,display=F) + }else{ + contrastToIgnore=c(contrastToIgnore,iContrast) + addComment(c("[WARNING]Interaction contrast",humanReadingInteraction[iContrast],"is removed due to empty group"),T,opt$log,display=F) + } + } + + #add interaction contrasts to global contrast list + if(length(contrastToIgnore)>0){ + requiredContrasts=c(requiredContrasts,interactionContrasts[-contrastToIgnore]) + humanReadingContrasts=c(humanReadingContrasts,humanReadingInteraction[-contrastToIgnore]) + persoContrastName=c(persoContrastName,rep("",length(humanReadingInteraction[-contrastToIgnore]))) + }else{ + requiredContrasts=c(requiredContrasts,interactionContrasts) + humanReadingContrasts=c(humanReadingContrasts,humanReadingInteraction) + persoContrastName=c(persoContrastName,rep("",length(humanReadingInteraction))) + } + }#end of intreaction contrasts + + + #remove from requiredContrasts contrasts that cannot be estimated + toRemove=unique(unlist(lapply(setdiff(coeffMeaning,names(estimatedCoeff)),function(x)grep(x,requiredContrasts)))) + if(length(toRemove)>0){ + addComment(c("[WARNING]",length(toRemove)," contrasts are removed, due to missing coefficients"),T,opt$log,display=FALSE) + requiredContrasts=requiredContrasts[-toRemove] + humanReadingContrasts=humanReadingContrasts[-toRemove] + persoContrastName=persoContrastName[-toRemove] + } + + if(length(requiredContrasts)==0){ + addComment("[ERROR]No contrast to compute, please check your contrast definition.",T,opt$log) + q( "no", 1, F ) + } + + #compute for each contrast mean of coefficients in posGroup and negGroup for FC computation of log(FC) with LSmean as in Partek + meanPosGroup=list() + meanNegGroup=list() + for(iContrast in 1:length(requiredContrasts)){ + #define posGroup and negGroup + #first split contrast + temp=unlist(strsplit(requiredContrasts[iContrast],"[ + ]")) + splitContrast=temp[1] + for(iTemp in temp[-1])splitContrast=c(splitContrast,"+",iTemp) + splitContrast=unlist(lapply(splitContrast,function(x){temp=unlist(strsplit(x,"-"));splitCompB=temp[1];for(iTemp in temp[-1])splitCompB=c(splitCompB,"-",iTemp);splitCompB})) + #and then put each term in good group + posGroup=c() + negGroup=c() + nextIsPos=TRUE + for(iSplit in splitContrast){ + if(iSplit=="+")nextIsPos=TRUE + if(iSplit=="-")nextIsPos=FALSE + if(iSplit!="-" && iSplit!="+"){ + #remove weights of contrast terms + iSplitBis=unlist(strsplit(iSplit,"[*]")) + iSplitBis=iSplitBis[length(iSplitBis)] + if(nextIsPos)posGroup=c(posGroup,iSplitBis) + else negGroup=c(negGroup,iSplitBis) + } + } + #compute means for each group + meanPosGroup[[iContrast]]=apply(as.matrix(data.fit$coefficients[,posGroup],ncol=length(posGroup)),1,mean) + meanNegGroup[[iContrast]]=apply(as.matrix(data.fit$coefficients[,negGroup],ncol=length(negGroup)),1,mean) + } + + + contrast.matrix = makeContrasts(contrasts=requiredContrasts,levels=design) + data.fit.con = contrasts.fit(data.fit,contrast.matrix) + + addComment("[INFO]Contrast definition done",T,opt$log,T,display=FALSE) + + #compute LIMMA statistics + data.fit.eb = eBayes(data.fit.con) + + addComment("[INFO]Estimation done",T,opt$log,T,display=FALSE) + + #adjust p.value through FDR + data.fit.eb$adj_p.value=data.fit.eb$p.value + for(iComparison in 1:ncol(data.fit.eb$adj_p.value)){ + data.fit.eb$adj_p.value[,iComparison]=p.adjust(data.fit.eb$p.value[,iComparison],"fdr") + } + + #add a new field based on LS-means for each contrast instead of global mean like they were calculated in coefficients field + data.fit.eb$coefficientsLS=data.fit.eb$coefficients + if(ncol(data.fit.eb$coefficients)!=length(meanPosGroup)){ + addComment("[ERROR]Estimated contrasts number unexpected",T,opt$log) + q( "no", 1, F ) + } + for(iContrast in 1:length(meanPosGroup)){ + data.fit.eb$coefficientsLS[,iContrast]=meanPosGroup[[iContrast]][rownames(data.fit.eb$coefficientsLS)]-meanNegGroup[[iContrast]][rownames(data.fit.eb$coefficientsLS)] + } + + #if requested replace coefficient computed as global mean by LS-means values + if(useLSmean)data.fit.eb$coefficients=data.fit.eb$coefficientsLS + +addComment("[INFO]Core treatment done",T,opt$log,T,display=FALSE) + + +##convert humanReadingContrasts with namingDictionary to create humanReadingContrastsRenamed and keep original humanReadingContrasts names for file names +humanReadingContrastsRenamed=rep("",length(humanReadingContrasts)) +for(iContrast in 1:length(humanReadingContrasts)){ + if(persoContrastName[iContrast]==""){ + #if(verbose)addComment(humanReadingContrasts[iContrast]) + specialCharacters=str_extract_all(humanReadingContrasts[iContrast],"[+|*|_|:|-]")[[1]] + #if(verbose)addComment(specialCharacters) + nameConverted=unlist(lapply(strsplit(humanReadingContrasts[iContrast],"[+|*|_|:|-]")[[1]],function(x)renamingDico[x,1])) + #if(verbose)addComment(nameConverted) + humanReadingContrastsRenamed[iContrast]=paste(nameConverted,specialCharacters,collapse="",sep="") + #if(verbose)addComment(humanReadingContrastsRenamed[iContrast]) + humanReadingContrastsRenamed[iContrast]=substr(humanReadingContrastsRenamed[iContrast],1,nchar(humanReadingContrastsRenamed[iContrast])-1) + }else{ + humanReadingContrastsRenamed[iContrast]=persoContrastName[iContrast] + } +} + +#write correspondances between plot file names (humanReadingContrasts) and displayed names in figure legends (humanReadingContrastsRenamed), usefull to define html items in xml file +correspondanceTable=matrix("",ncol=2,nrow=ncol(data.fit.eb$p.value)) +correspondanceTable[,1]=unlist(lapply(humanReadingContrasts,function(x)gsub(":","_INT_",gsub("\\+","_PLUS_",gsub("\\*","_AND_",x))))) +correspondanceTable[,2]=humanReadingContrastsRenamed +rownames(correspondanceTable)=correspondanceTable[,2] +write.table(correspondanceTable,file=file.path(getwd(), "correspondanceFileNames.csv"),quote=FALSE,sep="\t",col.names = F,row.names = F) + +#plot nominal p-val histograms for selected comparisons +histogramPerPage=6 +if (!is.null(opt$histo)) { + iToPlot=1 + plotVector=list() + nbComparisons=ncol(data.fit.eb$p.value) + for (iComparison in 1:nbComparisons){ + dataToPlot=data.frame(pval=data.fit.eb$p.value[,iComparison],id=rownames(data.fit.eb$p.value)) + p <- ggplot(data=dataToPlot, aes(x=pval)) + geom_histogram(colour="red", fill="salmon") + + theme_bw() + ggtitle(humanReadingContrastsRenamed[iComparison]) + ylab(label="Frequencies") + xlab(label="Nominal p-val") + + theme(panel.border=element_blank(),plot.title = element_text(hjust = 0.5)) + plotVector[[length(plotVector)+1]]=p + + pp <- ggplotly(p) + htmlwidgets::saveWidget(as_widget(pp), paste(c(file.path(getwd(), "plotLyDir"),"/",opt$histo,"_",correspondanceTable[humanReadingContrastsRenamed[iComparison],1],".html"),collapse=""),selfcontained = F) + + if(iComparison==nbComparisons || length(plotVector)==histogramPerPage){ + #plot and close the actual plot + if(opt$format=="pdf"){ + pdf(paste(c("./plotDir/",opt$histo,iToPlot,".pdf"),collapse=""))}else{ + png(paste(c("./plotDir/",opt$histo,iToPlot,".png"),collapse="")) + } + multiplot(plotlist=plotVector,cols=2) + dev.off() + if(iComparison<nbComparisons){ + #prepare for a new plotting file if necessary + plotVector=list() + iToPlot=iToPlot+1 + } + } + } + addComment("[INFO]Histograms drawn",T,opt$log,T,display=FALSE) + +} + +#plot F-test sum square barplot +if(!is.null(allFtestMeanSquare)){ + dataToPlot=data.frame(Fratio=apply(allFtestMeanSquare,2,mean),Factors=factor(colnames(allFtestMeanSquare),levels = colnames(allFtestMeanSquare))) + + p <- ggplot(data=dataToPlot, aes(x=Factors, y=Fratio, fill=Factors)) + + geom_bar(stat="identity") + scale_fill_manual(values = colorRampPalette(brewer.pal(9,"Set1"))(ncol(allFtestMeanSquare))[sample(ncol(allFtestMeanSquare))]) + ylab(label="mean F-ratio") + + theme_bw() + theme(panel.border=element_blank(),plot.title = element_text(hjust = 0.5)) + ggtitle("Source of variation") + + if(opt$format=="pdf"){ + pdf(paste(c("./plotDir/",opt$fratioFile,".pdf"),collapse=""))}else{ + png(paste(c("./plotDir/",opt$fratioFile,".png"),collapse="")) + } + plot(p) + dev.off() + + pp <- ggplotly(p) + htmlwidgets::saveWidget(as_widget(pp), paste(c(file.path(getwd(), "plotLyDir"),"/",opt$fratioFile,".html"),collapse=""),selfcontained = F) + + addComment("[INFO]SumSquareTest drawn",T,opt$log,T,display=FALSE) +} + +#plot VOLCANO plot +#volcanoplot(data.fit.eb,coef=1,highlight=10) +volcanoPerPage=1 +logFCthreshold=log2(opt$thresholdFC) +if (!is.null(opt$volcano)) { + iToPlot=1 + plotVector=list() + nbComparisons=ncol(data.fit.eb$adj_p.value) + for (iComparison in 1:nbComparisons){ + + #define the log10(p-val) threshold corresponding to FDR threshold fixed by user + probeWithLowFDR=-log10(data.fit.eb$p.value[which(data.fit.eb$adj_p.value[,iComparison]<=opt$fdrThreshold),iComparison]) + pvalThresholdFDR=NULL + if(length(probeWithLowFDR)>0)pvalThresholdFDR=min(probeWithLowFDR) + + #get significant points over FC and FDR thresholds + significativePoints=intersect(which(abs(data.fit.eb$coefficients[,iComparison])>=logFCthreshold),which(data.fit.eb$adj_p.value[,iComparison]<=opt$fdrThreshold)) + + #to reduce size of html plot, we keep 20000 points maximum sampled amongst genes with pval>=33%(pval) and abs(log2(FC))<=66%(abs(log2(FC))) + htmlPointsToRemove=intersect(which(abs(data.fit.eb$coefficients[,iComparison])<=quantile(abs(data.fit.eb$coefficients[,iComparison]),c(0.66))),which(data.fit.eb$p.value[,iComparison]>=quantile(abs(data.fit.eb$p.value[,iComparison]),c(0.33)))) + if(length(htmlPointsToRemove)>20000){ + htmlPointsToRemove=setdiff(htmlPointsToRemove,sample(htmlPointsToRemove,20000)) + }else{ + htmlPointsToRemove=c() + } + + xMinLimPlot=min(data.fit.eb$coefficients[,iComparison])-0.2 + xMaxLimPlot=max(data.fit.eb$coefficients[,iComparison])+0.2 + yMaxLimPlot= max(-log10(data.fit.eb$p.value[,iComparison]))+0.2 + + if(length(significativePoints)>0){ + dataSignifToPlot=data.frame(pval=-log10(data.fit.eb$p.value[significativePoints,iComparison]),FC=data.fit.eb$coefficients[significativePoints,iComparison],description=paste(names(data.fit.eb$coefficients[significativePoints,iComparison]),"\n","FC: " , round(2^data.fit.eb$coefficients[significativePoints,iComparison],2) , " | FDR p-val: ",prettyNum(data.fit.eb$adj_p.value[significativePoints,iComparison],digits=4), sep="")) + #to test if remains any normal points to draw + if(length(significativePoints)<nrow(data.fit.eb$p.value)){ + dataToPlot=data.frame(pval=-log10(data.fit.eb$p.value[-significativePoints,iComparison]),FC=data.fit.eb$coefficients[-significativePoints,iComparison],description=paste("FC: " , round(2^data.fit.eb$coefficients[-significativePoints,iComparison],2) , " | FDR p-val: ",prettyNum(data.fit.eb$adj_p.value[-significativePoints,iComparison],digits=4), sep="")) + }else{ + dataToPlot=data.frame(pval=0,FC=0,description="null") + } + }else{ + dataToPlot=data.frame(pval=-log10(data.fit.eb$p.value[,iComparison]),FC=data.fit.eb$coefficients[,iComparison],description=paste("FC: " , round(2^data.fit.eb$coefficients[,iComparison],2) , " | FDR p-val: ",prettyNum(data.fit.eb$adj_p.value[,iComparison],digits=4), sep="")) + } + + ##traditional plot + p <- ggplot(data=dataToPlot, aes(x=FC, y=pval)) + geom_point() + + theme_bw() + ggtitle(humanReadingContrastsRenamed[iComparison]) + ylab(label="-log10(p-val)") + xlab(label="Log2 Fold Change") + + theme(panel.border=element_blank(),plot.title = element_text(hjust = 0.5),legend.position="none") + if(logFCthreshold!=0) p <- p + geom_vline(xintercept=-logFCthreshold, color="salmon",linetype="dotted", size=1) + geom_vline(xintercept=logFCthreshold, color="salmon",linetype="dotted", size=1) + geom_text(data.frame(text=c(paste(c("log2(1/FC=",opt$thresholdFC,")"),collapse=""),paste(c("log2(FC=",opt$thresholdFC,")"),collapse="")),x=c(-logFCthreshold,logFCthreshold),y=c(0,0)),mapping=aes(x=x, y=y, label=text), size=4, angle=90, vjust=-0.4, hjust=0, color="salmon") + if(!is.null(pvalThresholdFDR)) p <- p + geom_hline(yintercept=pvalThresholdFDR, color="skyblue1",linetype="dotted", size=0.5) + geom_text(data.frame(text=c(paste(c("FDR pval limit(",opt$fdrThreshold,")"),collapse="")),x=c(xMinLimPlot),y=c(pvalThresholdFDR)),mapping=aes(x=x, y=y, label=text), size=4, vjust=0, hjust=0, color="skyblue3") + if(length(significativePoints)>0)p <- p + geom_point(data=dataSignifToPlot,aes(colour=description)) + + ##interactive plot + if(length(htmlPointsToRemove)>0){ + pointToRemove=union(htmlPointsToRemove,significativePoints) + #to test if remains any normal points to draw + if(length(pointToRemove)<nrow(data.fit.eb$p.value)){ + dataToPlot=data.frame(pval=-log10(data.fit.eb$p.value[-pointToRemove,iComparison]),FC=data.fit.eb$coefficients[-pointToRemove,iComparison],description=paste("FC: " , round(2^data.fit.eb$coefficients[-pointToRemove,iComparison],2) , " | FDR p-val: ", prettyNum(data.fit.eb$adj_p.value[-pointToRemove,iComparison],digits=4), sep="")) + }else{ + dataToPlot=data.frame(pval=0,FC=0,description="null") + } + } + + if((nrow(dataToPlot)+nrow(dataSignifToPlot))>40000)addComment(c("[WARNING]For",humanReadingContrastsRenamed[iComparison],"volcano, numerous points to plot(",nrow(dataToPlot)+nrow(dataSignifToPlot),"), resulting volcano could be heavy, using more stringent thresholds could be helpful."),T,opt$log) + + phtml <- plot_ly(data=dataToPlot, x=~FC, y=~pval,type="scatter", mode="markers",showlegend = FALSE, marker = list(color="gray",opacity=0.5), text=~description, hoverinfo="text") %>% + layout(title = humanReadingContrastsRenamed[iComparison],xaxis=list(title="Log2 Fold Change",showgrid=TRUE, zeroline=FALSE),yaxis=list(title="-log10(p-val)", showgrid=TRUE, zeroline=FALSE)) + if(length(significativePoints)>0) phtml=add_markers(phtml,data=dataSignifToPlot, x=~FC, y=~pval, mode="markers" , marker=list( color=log10(abs(dataSignifToPlot$FC)*dataSignifToPlot$pval),colorscale='Rainbow'), text=~description, hoverinfo="text", inherit = FALSE) %>% hide_colorbar() + if(logFCthreshold!=0){ + phtml=add_trace(phtml,x=c(-logFCthreshold,-logFCthreshold), y=c(0,yMaxLimPlot), type="scatter", mode = "lines", line=list(color="coral",dash="dash"), hoverinfo='none', showlegend = FALSE,inherit = FALSE) + phtml=add_annotations(phtml,x=-logFCthreshold,y=0,xref = "x",yref = "y",text = paste(c("log2(1/FC=",opt$thresholdFC,")"),collapse=""),xanchor = 'right',showarrow = F,textangle=270,font=list(color="coral")) + phtml=add_trace(phtml,x=c(logFCthreshold,logFCthreshold), y=c(0, yMaxLimPlot), type="scatter", mode = "lines", line=list(color="coral",dash="dash"), hoverinfo='none', showlegend = FALSE,inherit = FALSE) + phtml=add_annotations(phtml,x=logFCthreshold,y=0,xref = "x",yref = "y",text = paste(c("log2(FC=",opt$thresholdFC,")"),collapse=""),xanchor = 'right',showarrow = F,textangle=270,font=list(color="coral")) + } + if(!is.null(pvalThresholdFDR)){ + phtml=add_trace(phtml,x=c(xMinLimPlot,xMaxLimPlot), y=c(pvalThresholdFDR,pvalThresholdFDR), type="scatter", mode = "lines", line=list(color="cornflowerblue",dash="dash"), hoverinfo='none', showlegend = FALSE,inherit = FALSE) + phtml=add_annotations(phtml,x=xMinLimPlot,y=pvalThresholdFDR+0.1,xref = "x",yref = "y",text = paste(c("FDR pval limit(",opt$fdrThreshold,")"),collapse=""),xanchor = 'left',showarrow = F,font=list(color="cornflowerblue")) + } + plotVector[[length(plotVector)+1]]=p + + #save plotly files + pp <- ggplotly(phtml) + htmlwidgets::saveWidget(as_widget(pp), paste(c(file.path(getwd(), "plotLyDir"),"/",opt$volcano,"_",correspondanceTable[humanReadingContrastsRenamed[iComparison],1],".html"),collapse=""),selfcontained = F) + + + if(iComparison==nbComparisons || length(plotVector)==volcanoPerPage){ + #plot and close the actual plot + if(opt$format=="pdf"){ + pdf(paste(c("./plotDir/",opt$volcano,"_",correspondanceTable[humanReadingContrastsRenamed[iComparison],1],".pdf"),collapse=""))}else{ + png(paste(c("./plotDir/",opt$volcano,"_",correspondanceTable[humanReadingContrastsRenamed[iComparison],1],".png"),collapse="")) + } + multiplot(plotlist=plotVector,cols=1) + dev.off() + if(iComparison<nbComparisons){ + #prepare for a new ploting file if necessary + plotVector=list() + iToPlot=iToPlot+1 + } + } + } + remove(dataToPlot,dataSignifToPlot) + addComment("[INFO]Volcanos drawn",T,opt$log,T,display=FALSE) +} + +rowItemInfo=NULL +if(!is.null(opt$rowNameType) && !is.null(opt$organismID)){ +##get gene information from BioMart +#if(!require("biomaRt")){ +# source("https://bioconductor.org/biocLite.R") +# biocLite("biomaRt") +#} + +ensembl_hs_mart <- useMart(biomart="ensembl", dataset=opt$organismID) +ensembl_df <- getBM(attributes=c(opt$rowNameType,"description"),mart=ensembl_hs_mart) +rowItemInfo=ensembl_df[which(ensembl_df[,1]!=""),2] +rowItemInfo=unlist(lapply(rowItemInfo,function(x)substr(unlist(strsplit(x," \\[Source"))[1],1,30))) +names(rowItemInfo)=ensembl_df[which(ensembl_df[,1]!=""),1] +} + +#write(unlist(dimnames(data.fit.eb$adj_p.value)),opt$log,append = T) + +#prepare additional output containing df informations +dfMatrix=matrix(0,ncol=3,nrow = nrow(data.fit.eb$coefficients),dimnames = list(rownames(data.fit.eb$coefficients),c("df.residual","df.prior","df.total"))) +dfMatrix[,"df.residual"]=data.fit.eb$df.residual +dfMatrix[,"df.prior"]=data.fit.eb$df.prior +dfMatrix[,"df.total"]=data.fit.eb$df.total + +#filter out genes with higher p-values for all comparisons +genesToKeep=names(which(apply(data.fit.eb$adj_p.value,1,function(x)length(which(x<=opt$fdrThreshold))>0))) +#filter out genes with lower FC for all comparisons +genesToKeep=intersect(genesToKeep,names(which(apply(data.fit.eb$coefficients,1,function(x)length(which(abs(x)>=logFCthreshold))>0)))) + +if(length(genesToKeep)>0){ + data.fit.eb$adj_p.value=matrix(data.fit.eb$adj_p.value[genesToKeep,],ncol=ncol(data.fit.eb$adj_p.value)) + rownames(data.fit.eb$adj_p.value)=genesToKeep + colnames(data.fit.eb$adj_p.value)=colnames(data.fit.eb$p.value) + + data.fit.eb$p.value=matrix(data.fit.eb$p.value[genesToKeep,],ncol=ncol(data.fit.eb$p.value)) + rownames(data.fit.eb$p.value)=genesToKeep + colnames(data.fit.eb$p.value)=colnames(data.fit.eb$adj_p.value) + + data.fit.eb$coefficients=matrix(data.fit.eb$coefficients[genesToKeep,],ncol=ncol(data.fit.eb$coefficients)) + rownames(data.fit.eb$coefficients)=genesToKeep + colnames(data.fit.eb$coefficients)=colnames(data.fit.eb$adj_p.value) + + data.fit.eb$t=matrix(data.fit.eb$t[genesToKeep,],ncol=ncol(data.fit.eb$t)) + rownames(data.fit.eb$t)=genesToKeep + colnames(data.fit.eb$t)=colnames(data.fit.eb$adj_p.value) + + dfMatrix=dfMatrix[genesToKeep,,drop=FALSE] + +}else{ + addComment(c("[WARNING]No significative genes considering the given FDR threshold : ",opt$fdrThreshold),T,opt$log,display=FALSE) +} + +addComment("[INFO]Significant genes filtering done",T,opt$log,T,display=FALSE) + + +#plot VennDiagramm for genes below threshold between comparisons +#t=apply(data.fit.eb$adj_p.value[,1:4],2,function(x)names(which(x<=opt$threshold))) +#get.venn.partitions(t) +#vennCounts(data.fit.eb$adj_p.value[,1:4]<=opt$threshold) + +#make a simple sort genes based only on the first comparison +#newOrder=order(data.fit.eb$adj_p.value[,1]) +#data.fit.eb$adj_p.value=data.fit.eb$adj_p.value[newOrder,] + +#alternative sorting strategy based on the mean gene rank over all comparisons +if(length(genesToKeep)>1){ + currentRank=rep(0,nrow(data.fit.eb$adj_p.value)) + for(iComparison in 1:ncol(data.fit.eb$adj_p.value)){ + currentRank=currentRank+rank(data.fit.eb$adj_p.value[,iComparison]) + } + currentRank=currentRank/ncol(data.fit.eb$adj_p.value) + newOrder=order(currentRank) + + data.fit.eb$adj_p.value=matrix(data.fit.eb$adj_p.value[newOrder,],ncol=ncol(data.fit.eb$adj_p.value)) + rownames(data.fit.eb$adj_p.value)=rownames(data.fit.eb$p.value)[newOrder] + colnames(data.fit.eb$adj_p.value)=colnames(data.fit.eb$p.value) + + data.fit.eb$p.value=matrix(data.fit.eb$p.value[newOrder,],ncol=ncol(data.fit.eb$p.value)) + rownames(data.fit.eb$p.value)=rownames(data.fit.eb$adj_p.value) + colnames(data.fit.eb$p.value)=colnames(data.fit.eb$adj_p.value) + + data.fit.eb$coefficients=matrix(data.fit.eb$coefficients[newOrder,],ncol=ncol(data.fit.eb$coefficients)) + rownames(data.fit.eb$coefficients)=rownames(data.fit.eb$adj_p.value) + colnames(data.fit.eb$coefficients)=colnames(data.fit.eb$adj_p.value) + + data.fit.eb$t=matrix(data.fit.eb$t[newOrder,],ncol=ncol(data.fit.eb$t)) + rownames(data.fit.eb$t)=rownames(data.fit.eb$adj_p.value) + colnames(data.fit.eb$t)=colnames(data.fit.eb$adj_p.value) + + dfMatrix=dfMatrix[newOrder,,drop=FALSE] +} + + +#formating output matrices depending on genes to keep +if(length(genesToKeep)==0){ + outputData=matrix(0,ncol=ncol(data.fit.eb$adj_p.value)*5+2,nrow=3) + outputData[1,]=c("X","X",rep(humanReadingContrastsRenamed,each=5)) + outputData[2,]=c("X","X",rep(c("p-val","FDR.p-val","FC","log2(FC)","t-stat"),ncol(data.fit.eb$adj_p.value))) + outputData[,1]=c("LIMMA","Gene","noGene") + outputData[,2]=c("Comparison","Info","noInfo") + + outputDfData=matrix(0,ncol=3+1,nrow=2) + outputDfData[1,]=c("X","df.residual","df.prior","df.total") + outputDfData[,1]=c("Statistics","noGene") +}else{ + if(length(genesToKeep)==1){ + outputData=matrix(0,ncol=ncol(data.fit.eb$adj_p.value)*5+2,nrow=3) + outputData[1,]=c("X","X",rep(humanReadingContrastsRenamed,each=5)) + outputData[2,]=c("X","X",rep(c("p-val","FDR.p-val","FC","log2(FC)","t-stat"),ncol(data.fit.eb$adj_p.value))) + outputData[,1]=c("LIMMA","Gene",genesToKeep) + outputData[,2]=c("Comparison","Info","na") + if(!is.null(rowItemInfo))outputData[3,2]=rowItemInfo[genesToKeep] + outputData[3,seq(3,ncol(outputData),5)]=prettyNum(data.fit.eb$p.value,digits=4) + outputData[3,seq(4,ncol(outputData),5)]=prettyNum(data.fit.eb$adj_p.value,digits=4) + outputData[3,seq(5,ncol(outputData),5)]=prettyNum(2^data.fit.eb$coefficients,digits=4) + outputData[3,seq(6,ncol(outputData),5)]=prettyNum(data.fit.eb$coefficients,digits=4) + outputData[3,seq(7,ncol(outputData),5)]=prettyNum(data.fit.eb$t,digits=4) + + outputDfData=matrix(0,ncol=3+1,nrow=1+nrow(dfMatrix)) + outputDfData[1,]=c("Statistics","df.residual","df.prior","df.total") + outputDfData[2,]=c(rownames(dfMatrix),prettyNum(dfMatrix[,c("df.residual","df.prior","df.total")],digits=4)) + }else{ + #format matrix to be correctly read by galaxy (move headers in first column and row) + outputData=matrix(0,ncol=ncol(data.fit.eb$adj_p.value)*5+2,nrow=nrow(data.fit.eb$adj_p.value)+2) + outputData[1,]=c("X","X",rep(humanReadingContrastsRenamed,each=5)) + outputData[2,]=c("X","X",rep(c("p-val","FDR.p-val","FC","log2(FC)","t-stat"),ncol(data.fit.eb$adj_p.value))) + outputData[,1]=c("LIMMA","Gene",rownames(data.fit.eb$adj_p.value)) + outputData[,2]=c("Comparison","Info",rep("na",nrow(data.fit.eb$adj_p.value))) + if(!is.null(rowItemInfo))outputData[3:nrow(outputData),2]=rowItemInfo[rownames(data.fit.eb$adj_p.value)] + outputData[3:nrow(outputData),seq(3,ncol(outputData),5)]=prettyNum(data.fit.eb$p.value,digits=4) + outputData[3:nrow(outputData),seq(4,ncol(outputData),5)]=prettyNum(data.fit.eb$adj_p.value,digits=4) + outputData[3:nrow(outputData),seq(5,ncol(outputData),5)]=prettyNum(2^data.fit.eb$coefficients,digits=4) + outputData[3:nrow(outputData),seq(6,ncol(outputData),5)]=prettyNum(data.fit.eb$coefficients,digits=4) + outputData[3:nrow(outputData),seq(7,ncol(outputData),5)]=prettyNum(data.fit.eb$t,digits=4) + + outputDfData=matrix(0,ncol=3+1,nrow=1+nrow(dfMatrix)) + outputDfData[1,]=c("Statistics","df.residual","df.prior","df.total") + outputDfData[2:(1+nrow(dfMatrix)),]=cbind(rownames(dfMatrix),prettyNum(dfMatrix[,c("df.residual")],digits=4),prettyNum(dfMatrix[,c("df.prior")],digits=4),prettyNum(dfMatrix[,c("df.total")],digits=4)) + } +} +addComment("[INFO]Formated output",T,opt$log,display=FALSE) + +#write output results +write.table(outputData,file=opt$outputFile,quote=FALSE,sep="\t",col.names = F,row.names = F) + +#write df info file +write.table(outputDfData,file=opt$outputDfFile,quote=FALSE,sep="\t",col.names = F,row.names = F) + +end.time <- Sys.time() +addComment(c("[INFO]Total execution time for R script:",as.numeric(end.time - start.time,units="mins"),"mins"),T,opt$log,display=FALSE) + +addComment("[INFO]End of R script",T,opt$log,display=FALSE) + +printSessionInfo(opt$log) +#sessionInfo() + + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/VolcanoPlotsScript.R Fri Jun 26 09:36:46 2020 -0400 @@ -0,0 +1,426 @@ +# R script to plot volcanos through Galaxy based GIANT tool +# written by Jimmy Vandel +# +# +initial.options <- commandArgs(trailingOnly = FALSE) +file.arg.name <- "--file=" +script.name <- sub(file.arg.name, "", initial.options[grep(file.arg.name, initial.options)]) +script.basename <- dirname(script.name) +source(file.path(script.basename, "utils.R")) +source(file.path(script.basename, "getopt.R")) + +#addComment("Welcome R!") + +# setup R error handling to go to stderr +options( show.error.messages=F, error = function () { cat(geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) + +# we need that to not crash galaxy with an UTF8 error on German LC settings. +loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") +loc <- Sys.setlocale("LC_NUMERIC", "C") + +#get starting time +start.time <- Sys.time() + +options(stringAsfactors = FALSE, useFancyQuotes = FALSE) +args <- commandArgs() + +# get options, using the spec as defined by the enclosed list. +# we read the options from the default: commandArgs(TRUE). +spec <- matrix(c( + "statisticsFile", "i", 1, "character", + "volcanoName" , "n", 1, "character", + "pvalColumnName" , "p", 1, "character", + "fdrColumnName" , "m", 1, "character", + "fcColumnName" , "c", 1, "character", + "fcKind","d", 1, "character", + "fdrThreshold","s", 1, "double", + "fcThreshold","e", 1, "double", + "organismID","x",1,"character", + "rowNameType","y",1,"character", + "log", "l", 1, "character", + "outputFile" , "o", 1, "character", + "format", "f", 1, "character", + "quiet", "q", 0, "logical"), + byrow=TRUE, ncol=4) +opt <- getopt(spec) + +# enforce the following required arguments +if (is.null(opt$log)) { + addComment("[ERROR]'log file' is required\n") + q( "no", 1, F ) +} +addComment("[INFO]Start of R script",T,opt$log,display=FALSE) +if (is.null(opt$statisticsFile)) { + addComment("[ERROR]'statisticsFile' is required",T,opt$log) + q( "no", 1, F ) +} +if (length(opt$pvalColumnName)==0 || length(opt$fdrColumnName)==0 || length(opt$fcColumnName)==0) { + addComment("[ERROR]no selected columns",T,opt$log) + q( "no", 1, F ) +} +if (length(opt$pvalColumnName)!=length(opt$fcColumnName) || length(opt$pvalColumnName)!=length(opt$fdrColumnName)) { + addComment("[ERROR]different number of selected columns between p.val, adj-p.val and FC ",T,opt$log) + q( "no", 1, F ) +} +if (is.null(opt$fcKind)) { + addComment("[ERROR]'fcKind' is required",T,opt$log) + q( "no", 1, F ) +} +if (is.null(opt$fdrThreshold)) { + addComment("[ERROR]'FDR threshold' is required",T,opt$log) + q( "no", 1, F ) +} +if (is.null(opt$fcThreshold)) { + addComment("[ERROR]'FC threshold' is required",T,opt$log) + q( "no", 1, F ) +} +if (is.null(opt$outputFile)) { + addComment("[ERROR]'output file' is required",T,opt$log) + q( "no", 1, F ) +} +if (is.null(opt$format)) { + addComment("[ERROR]'output format' is required",T,opt$log) + q( "no", 1, F ) +} + +#demande si le script sera bavard +verbose <- if (is.null(opt$quiet)) { + TRUE +}else{ + FALSE +} + +#paramètres internes +addComment("[INFO]Parameters checked test mode !",T,opt$log,display=FALSE) + +addComment(c("[INFO]Working directory: ",getwd()),TRUE,opt$log,display=FALSE) +addComment(c("[INFO]Command line: ",args),TRUE,opt$log,display=FALSE) + +#directory for plots +dir.create(file.path(getwd(), "plotDir")) +dir.create(file.path(getwd(), "plotLyDir")) + +#charge des packages silencieusement +suppressPackageStartupMessages({ + library("methods") + library("biomaRt") + library("ggplot2") + library("plotly") + library("stringr") +}) + +#define some usefull variable +nbVolcanosToPlot=length(opt$pvalColumnName) + +#load input file +statDataMatrix=read.csv(file=file.path(getwd(), opt$statisticsFile),header=F,sep="\t",colClasses="character") +#remove first colum to convert it as rownames +rownames(statDataMatrix)=statDataMatrix[,1] +statDataMatrix=statDataMatrix[,-1] + +#identify lines without adjusted p-value info (should contain the same content as rownames) and replace them with NA values +FDRinfo=rep(TRUE,nbVolcanosToPlot) +for(iVolcano in 1:nbVolcanosToPlot){ + #input parameter should be None when adjusted p-val are not available + if(opt$fdrColumnName[iVolcano]=="None"){ + #content of the corresponding column should also be the same as rownames + if(!all(statDataMatrix[,(iVolcano-1)*3+2]==rownames(statDataMatrix))){ + addComment(c("[ERROR]It seems that input stat matrix contains adjusted p-values for volcano",iVolcano,"whereas input parameter indicates that not."),T,opt$log) + q( "no", 1, F ) + } + FDRinfo[iVolcano]=FALSE + statDataMatrix[,(iVolcano-1)*3+2]=NA + } +} + +if(is.data.frame(statDataMatrix)){ + statDataMatrix=data.matrix(statDataMatrix) +}else{ + statDataMatrix=data.matrix(as.numeric(statDataMatrix)) +} + +#check if available column number match with volcano requested number +if(ncol(statDataMatrix)!=3*nbVolcanosToPlot){ + addComment("[ERROR]Input file column number is different from requested volcano number",T,opt$log) + q( "no", 1, F ) +} + +#build global dataFrame with data and fill with p.val and log2(FC) and FDR +dataFrame=data.frame(row.names = rownames(statDataMatrix)) +#start with p-value +dataFrame$p.value=statDataMatrix[,seq(1,nbVolcanosToPlot*3,3),drop=FALSE] +#compute FDR if needed or just get available info +dataFrame$adj_p.value=dataFrame$p.value +for(iVolcano in 1:nbVolcanosToPlot){ + #adjusted p-value are already computed + if(FDRinfo[iVolcano]){ + dataFrame$adj_p.value[,iVolcano]=statDataMatrix[,(iVolcano-1)*3+2,drop=FALSE] + }else{ + #adjusted p-value should be computed based on p-val using FDR + dataFrame$adj_p.value[,iVolcano]=p.adjust(dataFrame$p.value[,iVolcano,drop=FALSE],"fdr") + addComment(c("[INFO]Adjusted p-values are not available in input for volcano",iVolcano,", FDR approach will be used on available raw p-values"),T,opt$log) + } +} +if(opt$fcKind=="FC"){ + #we should transform as Log2FC + dataFrame$coefficients=log2(statDataMatrix[,seq(3,nbVolcanosToPlot*3,3),drop=FALSE]) + addComment(c("[INFO]FC are converted in log2(FC) for plotting"),T,opt$log) +}else{ + dataFrame$coefficients=statDataMatrix[,seq(3,nbVolcanosToPlot*3,3),drop=FALSE] +} + +addComment(c("[INFO]Input data available for",nbVolcanosToPlot,"volcano(s) with",nrow(statDataMatrix),"rows"),T,opt$log) + + +#plot VOLCANOs +volcanoPerPage=1 +logFCthreshold=log2(opt$fcThreshold) +iToPlot=1 +plotVector=list() +volcanoNameList=c() +for (iVolcano in 1:nbVolcanosToPlot){ + + if(nchar(opt$volcanoName[iVolcano])>0){ + curentVolcanoName=opt$volcanoName[iVolcano] + }else{ + curentVolcanoName=paste(iVolcano,opt$pvalColumnName[iVolcano],sep="_") + } + + #keep only rows without NA for p-val, adjusted p-val and coeff + pValToPlot=dataFrame$p.value[,iVolcano] + fdrToPlot=dataFrame$adj_p.value[,iVolcano] + coeffToPlot=dataFrame$coefficients[,iVolcano] + + rowToRemove=unique(c(which(is.na(pValToPlot)),which(is.na(fdrToPlot)),which(is.na(coeffToPlot)))) + if(length(rowToRemove)>0){ + pValToPlot=pValToPlot[-rowToRemove] + fdrToPlot=fdrToPlot[-rowToRemove] + coeffToPlot=coeffToPlot[-rowToRemove] + } + addComment(c("[INFO]For",curentVolcanoName,"volcano,",length(rowToRemove),"rows are discarded due to NA values,",length(pValToPlot),"remaining rows."),T,opt$log) + + #save volcano name + volcanoNameList=c(volcanoNameList,curentVolcanoName) + + #remove characters possibly troubling + volcanoFileName=iVolcano + + #define the log10(p-val) threshold corresponding to FDR threshold fixed by user + probeWithLowFDR=-log10(pValToPlot[which(fdrToPlot<=opt$fdrThreshold)]) + pvalThresholdFDR=NULL + if(length(probeWithLowFDR)>0)pvalThresholdFDR=min(probeWithLowFDR) + + #get significant points over FC and FDR thresholds + significativePoints=intersect(which(abs(coeffToPlot)>=logFCthreshold),which(fdrToPlot<=opt$fdrThreshold)) + + #to reduce size of html plot, we keep 20000 points maximum sampled amongst genes with pval>=33%(pval) and abs(log2(FC))<=66%(abs(log2(FC))) + htmlPointsToRemove=intersect(which(abs(coeffToPlot)<=quantile(abs(coeffToPlot),c(0.66))),which(pValToPlot>=quantile(abs(pValToPlot),c(0.33)))) + if(length(htmlPointsToRemove)>20000){ + htmlPointsToRemove=setdiff(htmlPointsToRemove,sample(htmlPointsToRemove,20000)) + }else{ + htmlPointsToRemove=c() + } + + xMinLimPlot=min(coeffToPlot)-0.2 + xMaxLimPlot=max(coeffToPlot)+0.2 + yMaxLimPlot= max(-log10(pValToPlot))+0.2 + + if(length(significativePoints)>0){ + dataSignifToPlot=data.frame(pval=-log10(pValToPlot[significativePoints]),FC=coeffToPlot[significativePoints],description=paste(names(coeffToPlot[significativePoints]),"\n","FC: " , round(2^coeffToPlot[significativePoints],2) , " | Adjusted p-val: ",prettyNum(fdrToPlot[significativePoints],digits=4), sep="")) + #to test if remains any normal points to draw + if(length(significativePoints)<length(pValToPlot)){ + dataToPlot=data.frame(pval=-log10(pValToPlot[-significativePoints]),FC=coeffToPlot[-significativePoints],description=paste("FC: " , round(2^coeffToPlot[-significativePoints],2) , " | Adjusted p-val: ",prettyNum(fdrToPlot[-significativePoints],digits=4), sep="")) + }else{ + dataToPlot=data.frame(pval=0,FC=0,description="null") + } + }else{ + dataToPlot=data.frame(pval=-log10(pValToPlot),FC=coeffToPlot,description=paste("FC: " , round(2^coeffToPlot,2) , " | Adjusted p-val: ",prettyNum(fdrToPlot,digits=4), sep="")) + } + + ##traditional plot + + p <- ggplot(data=dataToPlot, aes(x=FC, y=pval)) + geom_point() + + theme_bw() + ggtitle(curentVolcanoName) + ylab(label="-Log10(p-val)") + xlab(label="Log2 Fold Change") + + theme(panel.border=element_blank(),plot.title = element_text(hjust = 0.5),legend.position="none") + if(logFCthreshold!=0) p <- p + geom_vline(xintercept=-logFCthreshold, color="salmon",linetype="dotted", size=1) + geom_vline(xintercept=logFCthreshold, color="salmon",linetype="dotted", size=1) + geom_text(data.frame(text=c(paste(c("log2(1/FC=",opt$fcThreshold,")"),collapse=""),paste(c("log2(FC=",opt$fcThreshold,")"),collapse="")),x=c(-logFCthreshold,logFCthreshold),y=c(0,0)),mapping=aes(x=x, y=y, label=text), size=4, angle=90, vjust=-0.4, hjust=0, color="salmon") + if(!is.null(pvalThresholdFDR)) p <- p + geom_hline(yintercept=pvalThresholdFDR, color="skyblue1",linetype="dotted", size=0.5) + geom_text(data.frame(text=c(paste(c("Adjusted pval limit(",opt$fdrThreshold,")"),collapse="")),x=c(xMinLimPlot),y=c(pvalThresholdFDR)),mapping=aes(x=x, y=y, label=text), size=4, vjust=0, hjust=0, color="skyblue3") + if(length(significativePoints)>0)p <- p + geom_point(data=dataSignifToPlot,aes(colour=description)) + + ##interactive plot + + if(length(htmlPointsToRemove)>0){ + pointToRemove=union(htmlPointsToRemove,significativePoints) + #to test if it remains any normal points to draw + if(length(pointToRemove)<length(pValToPlot)){ + dataToPlot=data.frame(pval=-log10(pValToPlot[-pointToRemove]),FC=coeffToPlot[-pointToRemove],description=paste("FC: " , round(2^coeffToPlot[-pointToRemove],2) , " | Adjusted p-val: ", prettyNum(fdrToPlot[-pointToRemove],digits=4), sep="")) + }else{ + dataToPlot=data.frame(pval=0,FC=0,description="null") + } + } + + if((nrow(dataToPlot)+length(significativePoints))>40000)addComment(c("[WARNING]For",curentVolcanoName,"volcano, numerous points to plot(",nrow(dataToPlot)+nrow(dataSignifToPlot),"), resulting volcano could be heavy, using more stringent thresholds could be helpful."),T,opt$log) + + phtml <- plot_ly(data=dataToPlot, x=~FC, y=~pval,type="scatter", mode="markers",showlegend = FALSE, marker = list(color="gray",opacity=0.5), text=~description, hoverinfo="text") %>% + layout(title = curentVolcanoName[iVolcano],xaxis=list(title="Log2 Fold Change",showgrid=TRUE, zeroline=FALSE),yaxis=list(title="-Log10(p-val)", showgrid=TRUE, zeroline=FALSE)) + if(length(significativePoints)>0) phtml=add_markers(phtml,data=dataSignifToPlot, x=~FC, y=~pval, mode="markers" , marker=list( color=log10(abs(dataSignifToPlot$FC)*dataSignifToPlot$pval),colorscale='Rainbow'), text=~description, hoverinfo="text", inherit = FALSE) %>% hide_colorbar() + if(logFCthreshold!=0){ + phtml=add_trace(phtml,x=c(-logFCthreshold,-logFCthreshold), y=c(0,yMaxLimPlot), type="scatter", mode = "lines", line=list(color="coral",dash="dash"), hoverinfo='none', showlegend = FALSE,inherit = FALSE) + phtml=add_annotations(phtml,x=-logFCthreshold,y=0,xref = "x",yref = "y",text = paste(c("log2(1/FC=",opt$fcThreshold,")"),collapse=""),xanchor = 'right',showarrow = F,textangle=270,font=list(color="coral")) + phtml=add_trace(phtml,x=c(logFCthreshold,logFCthreshold), y=c(0, yMaxLimPlot), type="scatter", mode = "lines", line=list(color="coral",dash="dash"), hoverinfo='none', showlegend = FALSE,inherit = FALSE) + phtml=add_annotations(phtml,x=logFCthreshold,y=0,xref = "x",yref = "y",text = paste(c("log2(FC=",opt$fcThreshold,")"),collapse=""),xanchor = 'right',showarrow = F,textangle=270,font=list(color="coral")) + } + if(!is.null(pvalThresholdFDR)){ + phtml=add_trace(phtml,x=c(xMinLimPlot,xMaxLimPlot), y=c(pvalThresholdFDR,pvalThresholdFDR), type="scatter", mode = "lines", line=list(color="cornflowerblue",dash="dash"), hoverinfo='none', showlegend = FALSE,inherit = FALSE) + phtml=add_annotations(phtml,x=xMinLimPlot,y=pvalThresholdFDR+0.1,xref = "x",yref = "y",text = paste(c("Adjusted pval limit(",opt$fdrThreshold,")"),collapse=""),xanchor = 'left',showarrow = F,font=list(color="cornflowerblue")) + } + plotVector[[length(plotVector)+1]]=p + + #save plotly files + pp <- ggplotly(phtml) + htmlwidgets::saveWidget(as_widget(pp), paste(c(file.path(getwd(), "plotLyDir"),"/Volcanos_",volcanoFileName,".html"),collapse=""),selfcontained = F) + + + if(iVolcano==nbVolcanosToPlot || length(plotVector)==volcanoPerPage){ + #plot and close the actual plot + if(opt$format=="pdf"){ + pdf(paste(c("./plotDir/Volcanos_",volcanoFileName,".pdf"),collapse=""))}else{ + png(paste(c("./plotDir/Volcanos_",volcanoFileName,".png"),collapse="")) + } + multiplot(plotlist=plotVector,cols=1) + dev.off() + if(iVolcano<nbVolcanosToPlot){ + #prepare for a new ploting file if necessary + plotVector=list() + iToPlot=iToPlot+1 + } + } +} +remove(dataToPlot,dataSignifToPlot) +addComment("[INFO]Volcanos drawn",T,opt$log,T,display=FALSE) + + +#now add anotation infos about genes + +rowItemInfo=NULL +if(!is.null(opt$rowNameType) && !is.null(opt$organismID)){ + ##get gene information from BioMart + #if(!require("biomaRt")){ + # source("https://bioconductor.org/biocLite.R") + # biocLite("biomaRt") + #} + + ensembl_hs_mart <- useMart(biomart="ensembl", dataset=opt$organismID) + ensembl_df <- getBM(attributes=c(opt$rowNameType,"description"),mart=ensembl_hs_mart) + rowItemInfo=ensembl_df[which(ensembl_df[,1]!=""),2] + rowItemInfo=unlist(lapply(rowItemInfo,function(x)substr(unlist(strsplit(x," \\[Source"))[1],1,30))) + names(rowItemInfo)=ensembl_df[which(ensembl_df[,1]!=""),1] +} + +#filter out genes with higher p-values for all comparisons +genesToKeep=names(which(apply(dataFrame$adj_p.value,1,function(x)length(which(x<=opt$fdrThreshold))>0))) +#filter out genes with lower FC for all comparisons +genesToKeep=intersect(genesToKeep,names(which(apply(dataFrame$coefficients,1,function(x)length(which(abs(x)>=logFCthreshold))>0)))) + +if(length(genesToKeep)>0){ + dataFrameNew=data.frame(row.names=genesToKeep) + + dataFrameNew$adj_p.value=matrix(dataFrame$adj_p.value[genesToKeep,,drop=FALSE],ncol=ncol(dataFrame$adj_p.value)) + rownames(dataFrameNew$adj_p.value)=genesToKeep + colnames(dataFrameNew$adj_p.value)=colnames(dataFrame$p.value) + + dataFrameNew$p.value=matrix(dataFrame$p.value[genesToKeep,,drop=FALSE],ncol=ncol(dataFrame$p.value)) + rownames(dataFrameNew$p.value)=genesToKeep + colnames(dataFrameNew$p.value)=colnames(dataFrame$adj_p.value) + + dataFrameNew$coefficients=matrix(dataFrame$coefficients[genesToKeep,,drop=FALSE],ncol=ncol(dataFrame$coefficients)) + rownames(dataFrameNew$coefficients)=genesToKeep + colnames(dataFrameNew$coefficients)=colnames(dataFrame$adj_p.value) + + dataFrame=dataFrameNew + rm(dataFrameNew) +}else{ + addComment("[WARNING]No significative genes",T,opt$log,display=FALSE) +} + +addComment("[INFO]Significant genes filtering done",T,opt$log,T,display=FALSE) + + +#plot VennDiagramm for genes below threshold between comparisons +#t=apply(dataFrame$adj_p.value[,1:4],2,function(x)names(which(x<=opt$threshold))) +#get.venn.partitions(t) +#vennCounts(dataFrame$adj_p.value[,1:4]<=opt$threshold) + +#make a simple sort genes based only on the first comparison +#newOrder=order(dataFrame$adj_p.value[,1]) +#dataFrame$adj_p.value=dataFrame$adj_p.value[newOrder,] + +#alternative sorting strategy based on the mean gene rank over all comparisons +if(length(genesToKeep)>1){ + currentRank=rep(0,nrow(dataFrame$adj_p.value)) + for(iVolcano in 1:ncol(dataFrame$adj_p.value)){ + currentRank=currentRank+rank(dataFrame$adj_p.value[,iVolcano]) + } + currentRank=currentRank/ncol(dataFrame$adj_p.value) + newOrder=order(currentRank) + rownames(dataFrame)=rownames(dataFrame)[newOrder] + + dataFrame$adj_p.value=matrix(dataFrame$adj_p.value[newOrder,],ncol=ncol(dataFrame$adj_p.value)) + rownames(dataFrame$adj_p.value)=rownames(dataFrame$p.value)[newOrder] + colnames(dataFrame$adj_p.value)=colnames(dataFrame$p.value) + + dataFrame$p.value=matrix(dataFrame$p.value[newOrder,],ncol=ncol(dataFrame$p.value)) + rownames(dataFrame$p.value)=rownames(dataFrame$adj_p.value) + colnames(dataFrame$p.value)=colnames(dataFrame$adj_p.value) + + dataFrame$coefficients=matrix(dataFrame$coefficients[newOrder,],ncol=ncol(dataFrame$coefficients)) + rownames(dataFrame$coefficients)=rownames(dataFrame$adj_p.value) + colnames(dataFrame$coefficients)=colnames(dataFrame$adj_p.value) +} + +#formating output matrix depending on genes to keep +if(length(genesToKeep)==0){ + outputData=matrix(0,ncol=ncol(dataFrame$adj_p.value)*4+2,nrow=3) + outputData[1,]=c("X","X",rep(volcanoNameList,each=4)) + outputData[2,]=c("X","X",rep(c("p-val","Adjusted.p-val","FC","log2(FC)"),ncol(dataFrame$adj_p.value))) + outputData[,1]=c("Volcano","Gene","noGene") + outputData[,2]=c("Comparison","Info","noInfo") +}else{ + if(length(genesToKeep)==1){ + outputData=matrix(0,ncol=ncol(dataFrame$adj_p.value)*4+2,nrow=3) + outputData[1,]=c("X","X",rep(volcanoNameList,each=4)) + outputData[2,]=c("X","X",rep(c("p-val","Adjusted.p-val","FC","log2(FC)"),ncol(dataFrame$adj_p.value))) + outputData[,1]=c("Volcano","Gene",genesToKeep) + outputData[,2]=c("Comparison","Info","na") + if(!is.null(rowItemInfo))outputData[3,2]=rowItemInfo[genesToKeep] + outputData[3,seq(3,ncol(outputData),4)]=prettyNum(dataFrame$p.value,digits=4) + outputData[3,seq(4,ncol(outputData),4)]=prettyNum(dataFrame$adj_p.value,digits=4) + outputData[3,seq(5,ncol(outputData),4)]=prettyNum(2^dataFrame$coefficients,digits=4) + outputData[3,seq(6,ncol(outputData),4)]=prettyNum(dataFrame$coefficients,digits=4) + }else{ + #format matrix to be correctly read by galaxy (move headers in first column and row) + outputData=matrix(0,ncol=ncol(dataFrame$adj_p.value)*4+2,nrow=nrow(dataFrame$adj_p.value)+2) + outputData[1,]=c("X","X",rep(volcanoNameList,each=4)) + outputData[2,]=c("X","X",rep(c("p-val","Adjusted.p-val","FC","log2(FC)"),ncol(dataFrame$adj_p.value))) + outputData[,1]=c("Volcano","Gene",rownames(dataFrame$adj_p.value)) + outputData[,2]=c("Comparison","Info",rep("na",nrow(dataFrame$adj_p.value))) + if(!is.null(rowItemInfo))outputData[3:nrow(outputData),2]=rowItemInfo[rownames(dataFrame$adj_p.value)] + outputData[3:nrow(outputData),seq(3,ncol(outputData),4)]=prettyNum(dataFrame$p.value,digits=4) + outputData[3:nrow(outputData),seq(4,ncol(outputData),4)]=prettyNum(dataFrame$adj_p.value,digits=4) + outputData[3:nrow(outputData),seq(5,ncol(outputData),4)]=prettyNum(2^dataFrame$coefficients,digits=4) + outputData[3:nrow(outputData),seq(6,ncol(outputData),4)]=prettyNum(dataFrame$coefficients,digits=4) + } +} +addComment("[INFO]Formated output",T,opt$log,display=FALSE) + +#write output results +write.table(outputData,file=opt$outputFile,quote=FALSE,sep="\t",col.names = F,row.names = F) + + +end.time <- Sys.time() +addComment(c("[INFO]Total execution time for R script:",as.numeric(end.time - start.time,units="mins"),"mins"),T,opt$log,display=FALSE) + +addComment("[INFO]End of R script",T,opt$log,display=FALSE) + +printSessionInfo(opt$log) + +#sessionInfo() \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/getopt.R Fri Jun 26 09:36:46 2020 -0400 @@ -0,0 +1,773 @@ +# Copyright (c) 2008-2010 Allen Day +# Copyright (c) 2011-2013 Trevor L. Davis <trevor.l.davis@stanford.edu> +# +# Modified by J.Vandel 2017 to consider situation of multiple identical flag +# and concatenate as a vector the set of parameter for the same flag instead of +# keeping only the last value as done by the previous version. +# +# This file is free software: you may copy, redistribute and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation, either version 2 of the License, or (at your +# option) any later version. +# +# This file is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +#' C-like getopt behavior +#' +#' getopt is primarily intended to be used with ``\link{Rscript}''. It +#' facilitates writing ``\#!'' shebang scripts that accept short and long +#' flags/options. It can also be used from ``R'' directly, but is probably less +#' useful in this context. +#' +#' getopt() returns a \link{list} data structure containing \link{names} of the +#' flags that were present in the \link{character} \link{vector} passed in under +#' the \emph{opt} argument. Each value of the \link{list} is coerced to the +#' data type specified according to the value of the \emph{spec} argument. See +#' below for details. +#' +#' Notes on naming convention: +#' +#' 1. An \emph{option} is one of the shell-split input strings. +#' +#' 2. A \emph{flag} is a type of \emph{option}. a \emph{flag} can be defined as +#' having no \emph{argument} (defined below), a required \emph{argument}, or an +#' optional \emph{argument}. +#' +#' 3. An \emph{argument} is a type of \emph{option}, and is the value associated +#' with a flag. +#' +#' 4. A \emph{long flag} is a type of \emph{flag}, and begins with the string +#' ``--''. If the \emph{long flag} has an associated \emph{argument}, it may be +#' delimited from the \emph{long flag} by either a trailing \emph{=}, or may be +#' the subsequent \emph{option}. +#' +#' 5. A \emph{short flag} is a type of \emph{flag}, and begins with the string +#' ``-''. If a \emph{short flag} has an associated \emph{argument}, it is the +#' subsequent \emph{option}. \emph{short flags} may be bundled together, +#' sharing a single leading ``-'', but only the final \emph{short flag} is able +#' to have a corresponding \emph{argument}. +#' +#' Many users wonder whether they should use the getopt package, optparse package, +#' or argparse package. +#' Here is some of the major differences: +#' +#' Features available in \code{getopt} unavailable in \code{optparse} +#' +#' 1. As well as allowing one to specify options that take either +#' no argument or a required argument like \code{optparse}, +#' \code{getopt} also allows one to specify option with an optional argument. +#' +#' Some features implemented in \code{optparse} package unavailable in \code{getopt} +#' +#' 1. Limited support for capturing positional arguments after the optional arguments +#' when \code{positional_arguments} set to TRUE in \code{parse_args} +#' +#' 2. Automatic generation of an help option and printing of help text when encounters an "-h" +#' +#' 3. Option to specify default arguments for options as well the +#' variable name to store option values +#' +#' There is also new package \code{argparse} introduced in 2012 which contains +#' all the features of both getopt and optparse but which has a dependency on +#' Python 2.7 or 3.2+ and has not been used in production since 2008 or 2009 +#' like the getopt and optparse packages. +#' +#' Some Features unlikely to be implemented in \code{getopt}: +#' +#' 1. Support for multiple, identical flags, e.g. for "-m 3 -v 5 -v", the +#' trailing "-v" overrides the preceding "-v 5", result is v=TRUE (or equivalent +#' typecast). +#' +#' 2. Support for multi-valued flags, e.g. "--libpath=/usr/local/lib +#' --libpath=/tmp/foo". +#' +#' 3. Support for lists, e.g. "--define os=linux --define os=redhat" would +#' set result$os$linux=TRUE and result$os$redhat=TRUE. +#' +#' 4. Support for incremental, argument-less flags, e.g. "/path/to/script +#' -vvv" should set v=3. +#' +#' 5. Support partial-but-unique string match on options, e.g. "--verb" and +#' "--verbose" both match long flag "--verbose". +#' +#' 6. No support for mixing in positional arguments or extra arguments that +#' don't match any options. For example, you can't do "my.R --arg1 1 foo bar +#' baz" and recover "foo", "bar", "baz" as a list. Likewise for "my.R foo +#' --arg1 1 bar baz". +#' +#' @aliases getopt getopt-package +#' @param spec The getopt specification, or spec of what options are considered +#' valid. The specification must be either a 4-5 column \link{matrix}, or a +#' \link{character} \link{vector} coercible into a 4 column \link{matrix} using +#' \link{matrix}(x,ncol=4,byrow=TRUE) command. The \link{matrix}/\link{vector} +#' contains: +#' +#' Column 1: the \emph{long flag} name. A multi-\link{character} string. +#' +#' Column 2: \emph{short flag} alias of Column 1. A single-\link{character} +#' string. +#' +#' Column 3: \emph{Argument} mask of the \emph{flag}. An \link{integer}. +#' Possible values: 0=no argument, 1=required argument, 2=optional argument. +#' +#' Column 4: Data type to which the \emph{flag}'s argument shall be cast using +#' \link{storage.mode}. A multi-\link{character} string. This only considered +#' for same-row Column 3 values of 1,2. Possible values: \link{logical}, +#' \link{integer}, \link{double}, \link{complex}, \link{character}. +#' If \link{numeric} is encountered then it will be converted to double. +#' +#' Column 5 (optional): A brief description of the purpose of the option. +#' +#' The terms \emph{option}, \emph{flag}, \emph{long flag}, \emph{short flag}, +#' and \emph{argument} have very specific meanings in the context of this +#' document. Read the ``Description'' section for definitions. +#' @param opt This defaults to the return value of \link{commandArgs}(TRUE). +#' +#' If R was invoked directly via the ``R'' command, this corresponds to all +#' arguments passed to R after the ``--args'' flag. +#' +#' If R was invoked via the ``\link{Rscript}'' command, this corresponds to all +#' arguments after the name of the R script file. +#' +#' Read about \link{commandArgs} and \link{Rscript} to learn more. +#' @param command The string to use in the usage message as the name of the +#' script. See argument \emph{usage}. +#' @param usage If TRUE, argument \emph{opt} will be ignored and a usage +#' statement (character string) will be generated and returned from \emph{spec}. +#' @param debug This is used internally to debug the getopt() function itself. +#' @author Allen Day +#' @seealso \code{\link{getopt}} +#' @keywords data +#' @export +#' @examples +#' +#' #!/path/to/Rscript +#' library('getopt'); +#' #get options, using the spec as defined by the enclosed list. +#' #we read the options from the default: commandArgs(TRUE). +#' spec = matrix(c( +#' 'verbose', 'v', 2, "integer", +#' 'help' , 'h', 0, "logical", +#' 'count' , 'c', 1, "integer", +#' 'mean' , 'm', 1, "double", +#' 'sd' , 's', 1, "double" +#' ), byrow=TRUE, ncol=4); +#' opt = getopt(spec); +#' +#' # if help was asked for print a friendly message +#' # and exit with a non-zero error code +#' if ( !is.null(opt$help) ) { +#' cat(getopt(spec, usage=TRUE)); +#' q(status=1); +#' } +#' +#' #set some reasonable defaults for the options that are needed, +#' #but were not specified. +#' if ( is.null(opt$mean ) ) { opt$mean = 0 } +#' if ( is.null(opt$sd ) ) { opt$sd = 1 } +#' if ( is.null(opt$count ) ) { opt$count = 10 } +#' if ( is.null(opt$verbose ) ) { opt$verbose = FALSE } +#' +#' #print some progress messages to stderr, if requested. +#' if ( opt$verbose ) { write("writing...",stderr()); } +#' +#' #do some operation based on user input. +#' cat(paste(rnorm(opt$count,mean=opt$mean,sd=opt$sd),collapse="\n")); +#' cat("\n"); +#' +#' #signal success and exit. +#' #q(status=0); +getopt = function (spec=NULL,opt=commandArgs(TRUE),command=get_Rscript_filename(),usage=FALSE,debug=FALSE) { + + # littler compatibility - map argv vector to opt + if (exists("argv", where = .GlobalEnv, inherits = FALSE)) { + opt = get("argv", envir = .GlobalEnv); + } + + ncol=4; + maxcol=6; + col.long.name = 1; + col.short.name = 2; + col.has.argument = 3; + col.mode = 4; + col.description = 5; + + flag.no.argument = 0; + flag.required.argument = 1; + flag.optional.argument = 2; + + result = list(); + result$ARGS = vector(mode="character"); + + #no spec. fail. + if ( is.null(spec) ) { + stop('argument "spec" must be non-null.'); + + #spec is not a matrix. attempt to coerce, if possible. issue a warning. + } else if ( !is.matrix(spec) ) { + if ( length(spec)/4 == as.integer(length(spec)/4) ) { + warning('argument "spec" was coerced to a 4-column (row-major) matrix. use a matrix to prevent the coercion'); + spec = matrix( spec, ncol=ncol, byrow=TRUE ); + } else { + stop('argument "spec" must be a matrix, or a character vector with length divisible by 4, rtfm.'); + } + + #spec is a matrix, but it has too few columns. + } else if ( dim(spec)[2] < ncol ) { + stop(paste('"spec" should have at least ",ncol," columns.',sep='')); + + #spec is a matrix, but it has too many columns. + } else if ( dim(spec)[2] > maxcol ) { + stop(paste('"spec" should have no more than ",maxcol," columns.',sep='')); + + #spec is a matrix, and it has some optional columns. + } else if ( dim(spec)[2] != ncol ) { + ncol = dim(spec)[2]; + } + + #sanity check. make sure long names are unique, and short names are unique. + if ( length(unique(spec[,col.long.name])) != length(spec[,col.long.name]) ) { + stop(paste('redundant long names for flags (column ',col.long.name,').',sep='')); + } + if ( length(na.omit(unique(spec[,col.short.name]))) != length(na.omit(spec[,col.short.name])) ) { + stop(paste('redundant short names for flags (column ',col.short.name,').',sep='')); + } + # convert numeric type to double type + spec[,4] <- gsub("numeric", "double", spec[,4]) + + # if usage=TRUE, don't process opt, but generate a usage string from the data in spec + if ( usage ) { + ret = ''; + ret = paste(ret,"Usage: ",command,sep=''); + for ( j in 1:(dim(spec))[1] ) { + ret = paste(ret,' [-[-',spec[j,col.long.name],'|',spec[j,col.short.name],']',sep=''); + if (spec[j,col.has.argument] == flag.no.argument) { + ret = paste(ret,']',sep=''); + } else if (spec[j,col.has.argument] == flag.required.argument) { + ret = paste(ret,' <',spec[j,col.mode],'>]',sep=''); + } else if (spec[j,col.has.argument] == flag.optional.argument) { + ret = paste(ret,' [<',spec[j,col.mode],'>]]',sep=''); + } + } + # include usage strings + if ( ncol >= 5 ) { + max.long = max(apply(cbind(spec[,col.long.name]),1,function(x)length(strsplit(x,'')[[1]]))); + ret = paste(ret,"\n",sep=''); + for (j in 1:(dim(spec))[1] ) { + ret = paste(ret,sprintf(paste(" -%s|--%-",max.long,"s %s\n",sep=''), + spec[j,col.short.name],spec[j,col.long.name],spec[j,col.description] + ),sep=''); + } + } + else { + ret = paste(ret,"\n",sep=''); + } + return(ret); + } + + #XXX check spec validity here. e.g. column three should be convertible to integer + + i = 1; + + while ( i <= length(opt) ) { + if ( debug ) print(paste("processing",opt[i])); + + current.flag = 0; #XXX use NA + optstring = opt[i]; + + + #long flag + if ( substr(optstring, 1, 2) == '--' ) { + if ( debug ) print(paste(" long option:",opt[i])); + + optstring = substring(optstring,3); + + this.flag = NA; + this.argument = NA; + kv = strsplit(optstring, '=')[[1]]; + if ( !is.na(kv[2]) ) { + this.flag = kv[1]; + this.argument = paste(kv[-1], collapse="="); + } else { + this.flag = optstring; + } + + rowmatch = grep( this.flag, spec[,col.long.name],fixed=TRUE ); + + #long flag is invalid, matches no options + if ( length(rowmatch) == 0 ) { + stop(paste('long flag "', this.flag, '" is invalid', sep='')); + + #long flag is ambiguous, matches too many options + } else if ( length(rowmatch) > 1 ) { + # check if there is an exact match and use that + rowmatch = which(this.flag == spec[,col.long.name]) + if(length(rowmatch) == 0) { + stop(paste('long flag "', this.flag, '" is ambiguous', sep='')); + } + } + + #if we have an argument + if ( !is.na(this.argument) ) { + #if we can't accept the argument, bail out + if ( spec[rowmatch, col.has.argument] == flag.no.argument ) { + stop(paste('long flag "', this.flag, '" accepts no arguments', sep='')); + + #otherwise assign the argument to the flag + } else { + storage.mode(this.argument) = spec[rowmatch, col.mode]; + #don't need here to remove the last value of the vector as argument is in the same string as + #the flag name "--flag=argument" so no spurious TRUE was added + result[[spec[rowmatch, col.long.name]]] = c(result[[spec[rowmatch, col.long.name]]],this.argument); + i = i + 1; + next; + } + + #otherwise, we don't have an argument + } else { + #if we require an argument, bail out + ###if ( spec[rowmatch, col.has.argument] == flag.required.argument ) { + ### stop(paste('long flag "', this.flag, '" requires an argument', sep='')); + + #long flag has no attached argument. set flag as present. set current.flag so we can peek ahead later and consume the argument if it's there + ###} else { + result[[spec[rowmatch, col.long.name]]] = c(result[[spec[rowmatch, col.long.name]]],TRUE); + current.flag = rowmatch; + ###} + } + + #short flag(s) + } else if ( substr(optstring, 1, 1) == '-' ) { + if ( debug ) print(paste(" short option:",opt[i])); + + these.flags = strsplit(optstring,'')[[1]]; + + done = FALSE; + for ( j in 2:length(these.flags) ) { + this.flag = these.flags[j]; + rowmatch = grep( this.flag, spec[,col.short.name],fixed=TRUE ); + + #short flag is invalid, matches no options + if ( length(rowmatch) == 0 ) { + stop(paste('short flag "', this.flag, '" is invalid', sep='')); + + #short flag is ambiguous, matches too many options + } else if ( length(rowmatch) > 1 ) { + stop(paste('short flag "', this.flag, '" is ambiguous', sep='')); + + #short flag has an argument, but is not the last in a compound flag string + } else if ( j < length(these.flags) & spec[rowmatch,col.has.argument] == flag.required.argument ) { + stop(paste('short flag "', this.flag, '" requires an argument, but has none', sep='')); + + #short flag has no argument, flag it as present + } else if ( spec[rowmatch,col.has.argument] == flag.no.argument ) { + result[[spec[rowmatch, col.long.name]]] = c(result[[spec[rowmatch, col.long.name]]],TRUE); + done = TRUE; + + #can't definitively process this flag yet, need to see if next option is an argument or not + } else { + result[[spec[rowmatch, col.long.name]]] = c(result[[spec[rowmatch, col.long.name]]],TRUE); + current.flag = rowmatch; + done = FALSE; + } + } + if ( done ) { + i = i + 1; + next; + } + } + + #invalid opt + if ( current.flag == 0 ) { + stop(paste('"', optstring, '" is not a valid option, or does not support an argument', sep='')); + #TBD support for positional args + #if ( debug ) print(paste('"', optstring, '" not a valid option. It is appended to getopt(...)$ARGS', sep='')); + #result$ARGS = append(result$ARGS, optstring); + + # some dangling flag, handle it + } else if ( current.flag > 0 ) { + if ( debug ) print(' dangling flag'); + if ( length(opt) > i ) { + peek.optstring = opt[i + 1]; + if ( debug ) print(paste(' peeking ahead at: "',peek.optstring,'"',sep='')); + + #got an argument. attach it, increment the index, and move on to the next option. we don't allow arguments beginning with '-' UNLESS + #specfile indicates the value is an "integer" or "double", in which case we allow a leading dash (and verify trailing digits/decimals). + if ( substr(peek.optstring, 1, 1) != '-' | + #match negative double + ( substr(peek.optstring, 1, 1) == '-' + & regexpr('^-[0123456789]*\\.?[0123456789]+$',peek.optstring) > 0 + & spec[current.flag, col.mode]== 'double' + ) | + #match negative integer + ( substr(peek.optstring, 1, 1) == '-' + & regexpr('^-[0123456789]+$',peek.optstring) > 0 + & spec[current.flag, col.mode]== 'integer' + ) + ) { + if ( debug ) print(paste(' consuming argument *',peek.optstring,'*',sep='')); + storage.mode(peek.optstring) = spec[current.flag, col.mode]; + #remove the last argument put in result for current.flag that should be a TRUE and concatenate argument with previous ones + result[[spec[current.flag, col.long.name]]] = c(result[[spec[current.flag, col.long.name]]][-length(result[[spec[current.flag, col.long.name]]])],peek.optstring); + i = i + 1; + + #a lone dash + } else if ( substr(peek.optstring, 1, 1) == '-' & length(strsplit(peek.optstring,'')[[1]]) == 1 ) { + if ( debug ) print(' consuming "lone dash" argument'); + storage.mode(peek.optstring) = spec[current.flag, col.mode]; + #remove the last argument put in result for current.flag that should be a TRUE and concatenate argument with previous ones + result[[spec[current.flag, col.long.name]]] =c(result[[spec[current.flag, col.long.name]]][-length(result[[spec[current.flag, col.long.name]]])],peek.optstring); + i = i + 1; + + #no argument + } else { + if ( debug ) print(' no argument!'); + + #if we require an argument, bail out + if ( spec[current.flag, col.has.argument] == flag.required.argument ) { + stop(paste('flag "', this.flag, '" requires an argument', sep='')); + + #otherwise set flag as present. + } else if ( + spec[current.flag, col.has.argument] == flag.optional.argument | + spec[current.flag, col.has.argument] == flag.no.argument + ) { + x = TRUE; + storage.mode(x) = spec[current.flag, col.mode]; + result[[spec[current.flag, col.long.name]]] = c(result[[spec[current.flag, col.long.name]]],x); + } else { + stop(paste("This should never happen.", + "Is your spec argument correct? Maybe you forgot to set", + "ncol=4, byrow=TRUE in your matrix call?")); + } + } + #trailing flag without required argument + } else if ( spec[current.flag, col.has.argument] == flag.required.argument ) { + stop(paste('flag "', this.flag, '" requires an argument', sep='')); + + #trailing flag without optional argument + } else if ( spec[current.flag, col.has.argument] == flag.optional.argument ) { + x = TRUE; + storage.mode(x) = spec[current.flag, col.mode]; + result[[spec[current.flag, col.long.name]]] = c(result[[spec[current.flag, col.long.name]]],x); + + #trailing flag without argument + } else if ( spec[current.flag, col.has.argument] == flag.no.argument ) { + x = TRUE; + storage.mode(x) = spec[current.flag, col.mode]; + result[[spec[current.flag, col.long.name]]] = c(result[[spec[current.flag, col.long.name]]],x); + } else { + stop("this should never happen (2). please inform the author."); + } + #no dangling flag, nothing to do. + } else { + } + + i = i+1; + } + return(result); +} + + + +######################### +#set a modified version using only long named parameters + +getoptLong = function (spec=NULL,opt=commandArgs(TRUE),command=get_Rscript_filename(),usage=FALSE,debug=FALSE) { + + # littler compatibility - map argv vector to opt + if (exists("argv", where = .GlobalEnv, inherits = FALSE)) { + opt = get("argv", envir = .GlobalEnv); + } + + ncol=4; + maxcol=6; + col.long.name = 1; + #col.short.name = 2; + col.has.argument = 3; + col.mode = 4; + col.description = 5; + + flag.no.argument = 0; + flag.required.argument = 1; + flag.optional.argument = 2; + + result = list(); + result$ARGS = vector(mode="character"); + + #no spec. fail. + if ( is.null(spec) ) { + stop('argument "spec" must be non-null.'); + + #spec is not a matrix. attempt to coerce, if possible. issue a warning. + } else if ( !is.matrix(spec) ) { + if ( length(spec)/4 == as.integer(length(spec)/4) ) { + warning('argument "spec" was coerced to a 4-column (row-major) matrix. use a matrix to prevent the coercion'); + spec = matrix( spec, ncol=ncol, byrow=TRUE ); + } else { + stop('argument "spec" must be a matrix, or a character vector with length divisible by 4, rtfm.'); + } + + #spec is a matrix, but it has too few columns. + } else if ( dim(spec)[2] < ncol ) { + stop(paste('"spec" should have at least ",ncol," columns.',sep='')); + + #spec is a matrix, but it has too many columns. + } else if ( dim(spec)[2] > maxcol ) { + stop(paste('"spec" should have no more than ",maxcol," columns.',sep='')); + + #spec is a matrix, and it has some optional columns. + } else if ( dim(spec)[2] != ncol ) { + ncol = dim(spec)[2]; + } + + #sanity check. make sure long names are unique, and short names are unique. + if ( length(unique(spec[,col.long.name])) != length(spec[,col.long.name]) ) { + stop(paste('redundant long names for flags (column ',col.long.name,').',sep='')); + } + # if ( length(na.omit(unique(spec[,col.short.name]))) != length(na.omit(spec[,col.short.name])) ) { + # stop(paste('redundant short names for flags (column ',col.short.name,').',sep='')); + # } + # convert numeric type to double type + spec[,4] <- gsub("numeric", "double", spec[,4]) + + # if usage=TRUE, don't process opt, but generate a usage string from the data in spec + if ( usage ) { + ret = ''; + ret = paste(ret,"Usage: ",command,sep=''); + for ( j in 1:(dim(spec))[1] ) { + ret = paste(ret,' [-[-',spec[j,col.long.name],']',sep=''); + if (spec[j,col.has.argument] == flag.no.argument) { + ret = paste(ret,']',sep=''); + } else if (spec[j,col.has.argument] == flag.required.argument) { + ret = paste(ret,' <',spec[j,col.mode],'>]',sep=''); + } else if (spec[j,col.has.argument] == flag.optional.argument) { + ret = paste(ret,' [<',spec[j,col.mode],'>]]',sep=''); + } + } + # include usage strings + if ( ncol >= 5 ) { + max.long = max(apply(cbind(spec[,col.long.name]),1,function(x)length(strsplit(x,'')[[1]]))); + ret = paste(ret,"\n",sep=''); + for (j in 1:(dim(spec))[1] ) { + ret = paste(ret,sprintf(paste("--%-",max.long,"s %s\n",sep='') + ,spec[j,col.long.name],spec[j,col.description] + ),sep=''); + } + } + else { + ret = paste(ret,"\n",sep=''); + } + return(ret); + } + + #XXX check spec validity here. e.g. column three should be convertible to integer + + i = 1; + + while ( i <= length(opt) ) { + if ( debug ) print(paste("processing",opt[i])); + + current.flag = 0; #XXX use NA + optstring = opt[i]; + + + #long flag + if ( substr(optstring, 1, 2) == '--' ) { + if ( debug ) print(paste(" long option:",opt[i])); + + optstring = substring(optstring,3); + + this.flag = NA; + this.argument = NA; + kv = strsplit(optstring, '=')[[1]]; + if ( !is.na(kv[2]) ) { + this.flag = kv[1]; + this.argument = paste(kv[-1], collapse="="); + } else { + this.flag = optstring; + } + + rowmatch = grep( this.flag, spec[,col.long.name],fixed=TRUE ); + + #long flag is invalid, matches no options + if ( length(rowmatch) == 0 ) { + stop(paste('long flag "', this.flag, '" is invalid', sep='')); + + #long flag is ambiguous, matches too many options + } else if ( length(rowmatch) > 1 ) { + # check if there is an exact match and use that + rowmatch = which(this.flag == spec[,col.long.name]) + if(length(rowmatch) == 0) { + stop(paste('long flag "', this.flag, '" is ambiguous', sep='')); + } + } + + #if we have an argument + if ( !is.na(this.argument) ) { + #if we can't accept the argument, bail out + if ( spec[rowmatch, col.has.argument] == flag.no.argument ) { + stop(paste('long flag "', this.flag, '" accepts no arguments', sep='')); + + #otherwise assign the argument to the flag + } else { + storage.mode(this.argument) = spec[rowmatch, col.mode]; + #don't need here to remove the last value of the vector as argument is in the same string as + #the flag name "--flag=argument" so no spurious TRUE was added + result[[spec[rowmatch, col.long.name]]] = c(result[[spec[rowmatch, col.long.name]]],this.argument); + i = i + 1; + next; + } + + #otherwise, we don't have an argument + } else { + #if we require an argument, bail out + ###if ( spec[rowmatch, col.has.argument] == flag.required.argument ) { + ### stop(paste('long flag "', this.flag, '" requires an argument', sep='')); + + #long flag has no attached argument. set flag as present. set current.flag so we can peek ahead later and consume the argument if it's there + ###} else { + result[[spec[rowmatch, col.long.name]]] = c(result[[spec[rowmatch, col.long.name]]],TRUE); + current.flag = rowmatch; + ###} + } + + #short flag(s) + } + #else if ( substr(optstring, 1, 1) == '-' ) { + # if ( debug ) print(paste(" short option:",opt[i])); + # + # these.flags = strsplit(optstring,'')[[1]]; + # + # done = FALSE; + # for ( j in 2:length(these.flags) ) { + # this.flag = these.flags[j]; + # rowmatch = grep( this.flag, spec[,col.short.name],fixed=TRUE ); + # + # #short flag is invalid, matches no options + # if ( length(rowmatch) == 0 ) { + # stop(paste('short flag "', this.flag, '" is invalid', sep='')); + # + # #short flag is ambiguous, matches too many options + # } else if ( length(rowmatch) > 1 ) { + # stop(paste('short flag "', this.flag, '" is ambiguous', sep='')); + # + # #short flag has an argument, but is not the last in a compound flag string + # } else if ( j < length(these.flags) & spec[rowmatch,col.has.argument] == flag.required.argument ) { + # stop(paste('short flag "', this.flag, '" requires an argument, but has none', sep='')); + # + # #short flag has no argument, flag it as present + # } else if ( spec[rowmatch,col.has.argument] == flag.no.argument ) { + # result[[spec[rowmatch, col.long.name]]] = c(result[[spec[rowmatch, col.long.name]]],TRUE); + # done = TRUE; + # + # #can't definitively process this flag yet, need to see if next option is an argument or not + # } else { + # result[[spec[rowmatch, col.long.name]]] = c(result[[spec[rowmatch, col.long.name]]],TRUE); + # current.flag = rowmatch; + # done = FALSE; + # } + # } + # if ( done ) { + # i = i + 1; + # next; + # } + # } + + #invalid opt + if ( current.flag == 0 ) { + stop(paste('"', optstring, '" is not a valid option, or does not support an argument', sep='')); + #TBD support for positional args + #if ( debug ) print(paste('"', optstring, '" not a valid option. It is appended to getopt(...)$ARGS', sep='')); + #result$ARGS = append(result$ARGS, optstring); + + # some dangling flag, handle it + } else if ( current.flag > 0 ) { + if ( debug ) print(' dangling flag'); + if ( length(opt) > i ) { + peek.optstring = opt[i + 1]; + if ( debug ) print(paste(' peeking ahead at: "',peek.optstring,'"',sep='')); + + #got an argument. attach it, increment the index, and move on to the next option. we don't allow arguments beginning with '-' UNLESS + #specfile indicates the value is an "integer" or "double", in which case we allow a leading dash (and verify trailing digits/decimals). + if ( substr(peek.optstring, 1, 1) != '-' | + #match negative double + ( substr(peek.optstring, 1, 1) == '-' + & regexpr('^-[0123456789]*\\.?[0123456789]+$',peek.optstring) > 0 + & spec[current.flag, col.mode]== 'double' + ) | + #match negative integer + ( substr(peek.optstring, 1, 1) == '-' + & regexpr('^-[0123456789]+$',peek.optstring) > 0 + & spec[current.flag, col.mode]== 'integer' + ) + ) { + if ( debug ) print(paste(' consuming argument *',peek.optstring,'*',sep='')); + storage.mode(peek.optstring) = spec[current.flag, col.mode]; + #remove the last argument put in result for current.flag that should be a TRUE and concatenate argument with previous ones + result[[spec[current.flag, col.long.name]]] = c(result[[spec[current.flag, col.long.name]]][-length(result[[spec[current.flag, col.long.name]]])],peek.optstring); + i = i + 1; + + #a lone dash + } else if ( substr(peek.optstring, 1, 1) == '-' & length(strsplit(peek.optstring,'')[[1]]) == 1 ) { + if ( debug ) print(' consuming "lone dash" argument'); + storage.mode(peek.optstring) = spec[current.flag, col.mode]; + #remove the last argument put in result for current.flag that should be a TRUE and concatenate argument with previous ones + result[[spec[current.flag, col.long.name]]] =c(result[[spec[current.flag, col.long.name]]][-length(result[[spec[current.flag, col.long.name]]])],peek.optstring); + i = i + 1; + + #no argument + } else { + if ( debug ) print(' no argument!'); + + #if we require an argument, bail out + if ( spec[current.flag, col.has.argument] == flag.required.argument ) { + stop(paste('flag "', this.flag, '" requires an argument', sep='')); + + #otherwise set flag as present. + } else if ( + spec[current.flag, col.has.argument] == flag.optional.argument | + spec[current.flag, col.has.argument] == flag.no.argument + ) { + x = TRUE; + storage.mode(x) = spec[current.flag, col.mode]; + result[[spec[current.flag, col.long.name]]] = c(result[[spec[current.flag, col.long.name]]],x); + } else { + stop(paste("This should never happen.", + "Is your spec argument correct? Maybe you forgot to set", + "ncol=4, byrow=TRUE in your matrix call?")); + } + } + #trailing flag without required argument + } else if ( spec[current.flag, col.has.argument] == flag.required.argument ) { + stop(paste('flag "', this.flag, '" requires an argument', sep='')); + + #trailing flag without optional argument + } else if ( spec[current.flag, col.has.argument] == flag.optional.argument ) { + x = TRUE; + storage.mode(x) = spec[current.flag, col.mode]; + result[[spec[current.flag, col.long.name]]] = c(result[[spec[current.flag, col.long.name]]],x); + + #trailing flag without argument + } else if ( spec[current.flag, col.has.argument] == flag.no.argument ) { + x = TRUE; + storage.mode(x) = spec[current.flag, col.mode]; + result[[spec[current.flag, col.long.name]]] = c(result[[spec[current.flag, col.long.name]]],x); + } else { + stop("this should never happen (2). please inform the author."); + } + #no dangling flag, nothing to do. + } else { + } + + i = i+1; + } + return(result); +} +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/heatMapClustering.R Fri Jun 26 09:36:46 2020 -0400 @@ -0,0 +1,896 @@ +# A command-line interface to plot heatmap based on expression or diff. exp. analysis +# written by Jimmy Vandel +# one of these arguments is required: +# +# +initial.options <- commandArgs(trailingOnly = FALSE) +file.arg.name <- "--file=" +script.name <- sub(file.arg.name, "", initial.options[grep(file.arg.name, initial.options)]) +script.basename <- dirname(script.name) +source(file.path(script.basename, "utils.R")) +source(file.path(script.basename, "getopt.R")) + +#addComment("Welcome R!") + +# setup R error handling to go to stderr +options( show.error.messages=F, error = function () { cat(geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) + +# we need that to not crash galaxy with an UTF8 error on German LC settings. +loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") +loc <- Sys.setlocale("LC_NUMERIC", "C") + +#get starting time +start.time <- Sys.time() + + +options(stringAsfactors = FALSE, useFancyQuotes = FALSE, OutDec=".") + +#get options +args <- commandArgs() + +# get options, using the spec as defined by the enclosed list. +# we read the options from the default: commandArgs(TRUE). +spec <- matrix(c( + "expressionFile", "x", 1, "character", + "diffAnalyseFile", "x", 1, "character", + "factorInfo","x", 1, "character", + "genericData","x", 0, "logical", + "comparisonName","x",1,"character", + "comparisonNameLow","x",1,"character", + "comparisonNameHigh","x",1,"character", + "filterInputOutput","x", 1, "character", + "FCthreshold","x", 1, "double", + "pvalThreshold","x", 1, "double", + "geneListFiltering","x",1,"character", + "clusterNumber","x",1,"integer", + "maxRows","x",1,"integer", + "sampleClusterNumber","x",1,"integer", + "dataTransformation","x",1,"character", + "distanceMeasure","x",1,"character", + "aggloMethod","x",1,"character", + "personalColors","x",1,"character", + "sideBarColorPalette","x",1,"character", + "format", "x", 1, "character", + "quiet", "x", 0, "logical", + "log", "x", 1, "character", + "outputFile" , "x", 1, "character"), + byrow=TRUE, ncol=4) +opt <- getoptLong(spec) + +# enforce the following required arguments +if (is.null(opt$log)) { + addComment("[ERROR]'log file' is required") + q( "no", 1, F ) +} +addComment("[INFO]Start of R script",T,opt$log,display=FALSE) +if (is.null(opt$format)) { + addComment("[ERROR]'output format' is required",T,opt$log) + q( "no", 1, F ) +} +if (is.null(opt$outputFile)) { + addComment("[ERROR]'output file' is required",T,opt$log) + q( "no", 1, F ) +} + +if(is.null(opt$expressionFile) && !is.null(opt$genericData)){ + addComment("[ERROR]generic data clustering is based on expression clustering",T,opt$log) + q( "no", 1, F ) +} + +if (is.null(opt$clusterNumber) || opt$clusterNumber<2) { + addComment("[ERROR]valid genes clusters number is required",T,opt$log) + q( "no", 1, F ) +} + +if (is.null(opt$sampleClusterNumber) || opt$sampleClusterNumber<1) { + addComment("[ERROR]valid samples clusters number is required",T,opt$log) + q( "no", 1, F ) +} + +if (is.null(opt$dataTransformation)) { + addComment("[ERROR]data transformation option is required",T,opt$log) + q( "no", 1, F ) +} + +if (is.null(opt$distanceMeasure)) { + addComment("[ERROR]distance measure option is required",T,opt$log) + q( "no", 1, F ) +} + +if (is.null(opt$aggloMethod)) { + addComment("[ERROR]agglomeration method option is required",T,opt$log) + q( "no", 1, F ) +} + +if (is.null(opt$maxRows) || opt$maxRows<2) { + addComment("[ERROR]valid plotted row number is required",T,opt$log) + q( "no", 1, F ) +} + +if (!is.null(opt[["comparisonName"]]) && nchar(opt[["comparisonName"]])==0){ + addComment("[ERROR]you have to specify comparison",T,opt$log) + q( "no", 1, F ) +} + +if (!is.null(opt$comparisonNameLow) && nchar(opt$comparisonNameLow)==0){ + addComment("[ERROR]you have to specify comparisonLow",T,opt$log) + q( "no", 1, F ) +} + +if (!is.null(opt$comparisonNameHigh) && nchar(opt$comparisonNameHigh)==0){ + addComment("[ERROR]you have to specify comparisonHigh",T,opt$log) + q( "no", 1, F ) +} + +if (is.null(opt$genericData) && (!is.null(opt$comparisonNameLow) || !is.null(opt$comparisonNameHigh))){ + addComment("[ERROR]comparisonLow and comparisonHigh can be specified only with generic data",T,opt$log) + q( "no", 1, F ) +} + +if (!is.null(opt$genericData) && !is.null(opt[["comparisonName"]])){ + addComment("[ERROR]basic comparison cannot be specified for generic data",T,opt$log) + q( "no", 1, F ) +} + +if ((!is.null(opt[["comparisonName"]]) || !is.null(opt$comparisonNameLow) || !is.null(opt$comparisonNameHigh)) && is.null(opt$diffAnalyseFile)) { + addComment("[ERROR]'diff. exp. analysis file' is required",T,opt$log) + q( "no", 1, F ) +} + +if (!is.null(opt$genericData) && !is.null(opt$diffAnalyseFile) && is.null(opt$comparisonNameLow) && is.null(opt$comparisonNameHigh)){ + addComment("[ERROR]Missing comparison information for filtering",T,opt$log) + q( "no", 1, F ) +} + +if ((!is.null(opt$FCthreshold) || !is.null(opt$pvalThreshold)) && (is.null(opt[["comparisonName"]]) && is.null(opt$comparisonNameLow) && is.null(opt$comparisonNameHigh))) { + addComment("[ERROR]'comparisons' are missing for filtering",T,opt$log) + q( "no", 1, F ) +} + +if ((!is.null(opt$FCthreshold) || !is.null(opt$pvalThreshold)) && !is.null(opt$geneListFiltering)) { + addComment("[ERROR]Cannot have two filtering strategies",T,opt$log) + q( "no", 1, F ) +} + +verbose <- if (is.null(opt$quiet)) { + TRUE +}else{ + FALSE} + +addComment("[INFO]Parameters checked!",T,opt$log,display=FALSE) + +addComment(c("[INFO]Working directory: ",getwd()),TRUE,opt$log,display=FALSE) +addComment(c("[INFO]Command line: ",args),TRUE,opt$log,display=FALSE) + +#directory for plots and HTML +dir.create(file.path(getwd(), "plotDir")) +dir.create(file.path(getwd(), "plotLyDir")) + +#silent package loading +suppressPackageStartupMessages({ + library("plotly") + library("dendextend") + #library("ggdendro") + #library("plyr") + library("ggplot2") + library("heatmaply") + library("circlize") + #library("RColorBrewer") + #source("https://bioconductor.org/biocLite.R") + #biocLite("ComplexHeatmap") + library("ComplexHeatmap") + #library("processx") +}) + +expressionToCluster=!is.null(opt$expressionFile) + +#load input data files +if(expressionToCluster){ + #first expression data + expressionMatrix=read.csv(file=opt$expressionFile,header=F,sep="\t",colClasses="character") + #remove first row to convert it as colnames (to avoid X before colnames with header=T) + colNamesData=expressionMatrix[1,-1] + expressionMatrix=expressionMatrix[-1,] + #remove first colum to convert it as rownames + rowNamesData=expressionMatrix[,1] + expressionMatrix=expressionMatrix[,-1] + if(is.data.frame(expressionMatrix)){ + expressionMatrix=data.matrix(expressionMatrix) + }else{ + expressionMatrix=data.matrix(as.numeric(expressionMatrix)) + } + dimnames(expressionMatrix)=list(rowNamesData,colNamesData) + + #check input files + if (!is.numeric(expressionMatrix)) { + addComment("[ERROR]Expression data is not fully numeric!",T,opt$log,display=FALSE) + q( "no", 1, F ) + } + + addComment("[INFO]Expression data loaded and checked") + addComment(c("[INFO]Dim of expression matrix:",dim(expressionMatrix)),T,opt$log,display=FALSE) +} + +nbComparisons=0 +nbColPerContrast=5 +comparisonMatrix=NULL +comparisonMatrixInfoGene=NULL +#if available comparisons +if(!is.null(opt[["comparisonName"]])){ + #load results from differential expression analysis + #consider first row contains column names + comparisonMatrix=read.csv(file=opt$diffAnalyseFile,header=F,sep="\t") + colnames(comparisonMatrix)=as.character(unlist(comparisonMatrix[1,])) + #remove the second line also as it's information line (p-val,FDR.p-val,FC,logFC) + comparisonMatrix=comparisonMatrix[-c(1,2),] + #remove first and second colums, convert the first one as rownames + rownames(comparisonMatrix)=as.character(unlist(comparisonMatrix[,1])) + #and save second column content that contain geneInfo + comparisonMatrixInfoGene=as.character(unlist(comparisonMatrix[,2])) + names(comparisonMatrixInfoGene)=as.character(unlist(comparisonMatrix[,1])) + comparisonMatrix=comparisonMatrix[,-c(1,2)] + + comparisonMatrix=matrix(as.numeric(as.matrix(comparisonMatrix)),ncol=ncol(comparisonMatrix),dimnames = dimnames(comparisonMatrix)) + + if (ncol(comparisonMatrix)%%nbColPerContrast != 0) { + addComment("[ERROR]Diff. exp. data does not contain good number of columns per contrast, should contains in this order:p-val,FDR.p-val,FC,log2(FC) and t-stat",T,opt$log,display=FALSE) + q( "no", 1, F ) + } + + if(max(comparisonMatrix[,c(seq(1,ncol(comparisonMatrix),nbColPerContrast),seq(2,ncol(comparisonMatrix),nbColPerContrast))])>1 || min(comparisonMatrix[,c(seq(1,ncol(comparisonMatrix),nbColPerContrast),seq(2,ncol(comparisonMatrix),nbColPerContrast))])<0){ + addComment("[ERROR]Seem that diff. exp. data does not contain correct values for p-val and FDR.p-val columns, should be including in [0,1] interval",T,opt$log,display=FALSE) + q( "no", 1, F ) + } + + if (!is.numeric(comparisonMatrix)) { + addComment("[ERROR]Diff. exp. data is not fully numeric!",T,opt$log,display=FALSE) + q( "no", 1, F ) + } + + if(expressionToCluster && length(setdiff(rownames(comparisonMatrix),rownames(expressionMatrix)))!=0){ + addComment("[WARNING]All genes from diff. exp. file are not included in expression file",T,opt$log,display=FALSE) + } + + if(expressionToCluster && length(setdiff(rownames(expressionMatrix),rownames(comparisonMatrix)))!=0){ + addComment("[WARNING]All genes from expression file are not included in diff. exp. file",T,opt$log,display=FALSE) + } + + addComment("[INFO]Diff. exp. analysis loaded and checked",T,opt$log,display=FALSE) + addComment(c("[INFO]Dim of original comparison matrix:",dim(comparisonMatrix)),T,opt$log,display=FALSE) + + #restrict to user specified comparisons + restrictedComparisons=unlist(strsplit(opt[["comparisonName"]],",")) + #should be improved to avoid selection of column names starting too similarly + colToKeep=which(unlist(lapply(colnames(comparisonMatrix),function(x)any(startsWith(x,restrictedComparisons))))) + comparisonMatrix=matrix(comparisonMatrix[,colToKeep],ncol=length(colToKeep),dimnames = list(rownames(comparisonMatrix),colnames(comparisonMatrix)[colToKeep])) + + #get number of required comparisons + nbComparisons=ncol(comparisonMatrix)/nbColPerContrast + + addComment(c("[INFO]Dim of effective filtering matrix:",dim(comparisonMatrix)),T,opt$log,display=FALSE) +} + +#should be only the case with generic data +if(!is.null(opt$comparisonNameLow) || !is.null(opt$comparisonNameHigh)){ + #load generic data used for filtering + nbColPerContrast=1 + #consider first row contains column names + comparisonMatrix=read.csv(file=opt$diffAnalyseFile,header=F,sep="\t") + colnames(comparisonMatrix)=as.character(unlist(comparisonMatrix[1,])) + #remove first colum, convert the first one as rownames + rownames(comparisonMatrix)=as.character(unlist(comparisonMatrix[,1])) + comparisonMatrix=comparisonMatrix[-1,-1] + + comparisonMatrix=matrix(as.numeric(as.matrix(comparisonMatrix)),ncol=ncol(comparisonMatrix),dimnames = dimnames(comparisonMatrix)) + + if (!is.numeric(comparisonMatrix)) { + addComment("[ERROR]Filtering matrix is not fully numeric!",T,opt$log,display=FALSE) + q( "no", 1, F ) + } + + if(expressionToCluster && length(setdiff(rownames(comparisonMatrix),rownames(expressionMatrix)))!=0){ + addComment("[WARNING]All genes from filtering file are not included in expression file",T,opt$log,display=FALSE) + } + + if(expressionToCluster && length(setdiff(rownames(expressionMatrix),rownames(comparisonMatrix)))!=0){ + addComment("[WARNING]All genes from expression file are not included in filtering file",T,opt$log,display=FALSE) + } + + addComment("[INFO]Filtering file loaded and checked",T,opt$log,display=FALSE) + addComment(c("[INFO]Dim of original filtering matrix:",dim(comparisonMatrix)),T,opt$log,display=FALSE) + + #restrict to user specified comparisons + restrictedComparisons=c() + if(!is.null(opt$comparisonNameLow))restrictedComparisons=unique(c(restrictedComparisons,unlist(strsplit(opt$comparisonNameLow,",")))) + if(!is.null(opt$comparisonNameHigh))restrictedComparisons=unique(c(restrictedComparisons,unlist(strsplit(opt$comparisonNameHigh,",")))) + + if (!all(restrictedComparisons%in%colnames(comparisonMatrix))){ + addComment("[ERROR]Selected columns in filtering file are not present in filtering matrix!",T,opt$log,display=FALSE) + q( "no", 1, F ) + } + comparisonMatrix=matrix(comparisonMatrix[,restrictedComparisons],ncol=length(restrictedComparisons),dimnames = list(rownames(comparisonMatrix),restrictedComparisons)) + + #get number of required comparisons + nbComparisons=ncol(comparisonMatrix) + + addComment(c("[INFO]Dim of effective filtering matrix:",dim(comparisonMatrix)),T,opt$log,display=FALSE) +} + + + +factorInfoMatrix=NULL +if(!is.null(opt$factorInfo)){ + #get group information + #load factors file + factorInfoMatrix=read.csv(file=opt$factorInfo,header=F,sep="\t",colClasses="character") + #remove first row to convert it as colnames + colnames(factorInfoMatrix)=factorInfoMatrix[1,] + factorInfoMatrix=factorInfoMatrix[-1,] + #use first colum to convert it as rownames but not removing it to avoid conversion as vector in unique factor case + rownames(factorInfoMatrix)=factorInfoMatrix[,1] + + factorBarColor=colnames(factorInfoMatrix)[2] + + if(ncol(factorInfoMatrix)>2){ + addComment("[ERROR]Factors file should not contain more than 2 columns",T,opt$log,display=FALSE) + q( "no", 1, F ) + } + + #factor file is used for color band on heatmap, so all expression matrix column should be in the factor file + if(expressionToCluster && length(setdiff(colnames(expressionMatrix),rownames(factorInfoMatrix)))!=0){ + addComment("[ERROR]Missing samples in factor file",T,opt$log,display=FALSE) + q( "no", 1, F ) + } + + #factor file is used for color band on heatmap, so all comparison matrix column should be in the factor file + if(!expressionToCluster && length(setdiff(colnames(comparisonMatrix),rownames(factorInfoMatrix)))!=0){ + addComment("[ERROR]Missing differential contrasts in factor file",T,opt$log,display=FALSE) + q( "no", 1, F ) + } + + addComment("[INFO]Factors OK",T,opt$log,display=FALSE) + addComment(c("[INFO]Dim of factorInfo matrix:",dim(factorInfoMatrix)),T,opt$log,display=FALSE) +} + +if(!is.null(opt$personalColors)){ + ##parse personal colors + personalColors=unlist(strsplit(opt$personalColors,",")) + if(length(personalColors)==2){ + ##add medium color between two to get three colors + personalColors=c(personalColors[1],paste(c("#",as.character(as.hexmode(floor(apply(col2rgb(personalColors),1,mean))))),collapse=""),personalColors[2]) + } + if(length(personalColors)!=3){ + addComment("[ERROR]Personalized colors doesn't contain enough colors",T,opt$log,display=FALSE) + q( "no", 1, F ) + } + +} + + +if(!is.null(opt$filterInputOutput) && opt$filterInputOutput=="input"){ + #filter input data + + if(is.null(opt$geneListFiltering)){ + #filtering using stat thresholds + #rowToKeep=intersect(which(comparisonMatrix[,seq(2,ncol(comparisonMatrix),4)]<=opt$pvalThreshold),which(abs(comparisonMatrix[,seq(4,ncol(comparisonMatrix),4)])>=log2(opt$FCthreshold))) + if(is.null(opt$genericData)){ + #diff. expression matrix + rowToKeep=names(which(unlist(apply(comparisonMatrix,1,function(x)length(intersect(which(x[seq(2,length(x),nbColPerContrast)]<opt$pvalThreshold),which(abs(x[seq(4,length(x),nbColPerContrast)])>log2(opt$FCthreshold))))!=0)))) + }else{ + #generic filtering matrix + rowToKeep=rownames(comparisonMatrix) + if(!is.null(opt$comparisonNameLow)){ + restrictedLowComparisons=unlist(strsplit(opt$comparisonNameLow,",")) + rowToKeep=intersect(rowToKeep,names(which(unlist(apply(comparisonMatrix,1,function(x)length(which(x[restrictedLowComparisons]>opt$FCthreshold))!=0))))) + } + if(!is.null(opt$comparisonNameHigh)){ + restrictedHighComparisons=unlist(strsplit(opt$comparisonNameHigh,",")) + rowToKeep=intersect(rowToKeep,names(which(unlist(apply(comparisonMatrix,1,function(x)length(which(x[restrictedHighComparisons]<opt$pvalThreshold))!=0))))) + } + } + }else{ + #filtering using user gene list + geneListFiltering=read.csv(opt$geneListFiltering,as.is = 1,header=F) + rowToKeep=unlist(c(geneListFiltering)) + } + + if(!is.null(comparisonMatrix) && !all(rowToKeep%in%rownames(comparisonMatrix))){ + #should arrive only with user gene list filtering with diff.exp. results clustering + addComment("[WARNING] some genes of the user defined list are not in the diff. exp. input file",T,opt$log) + rowToKeep=intersect(rowToKeep,rownames(comparisonMatrix)) + } + + if(expressionToCluster && !all(rowToKeep%in%rownames(expressionMatrix))){ + addComment("[WARNING] some genes selected by the input filter are not in the expression file",T,opt$log) + rowToKeep=intersect(rowToKeep,rownames(expressionMatrix)) + } + + if(length(rowToKeep)==0){ + addComment("[ERROR]No gene survived to the input filtering thresholds, execution will be aborted. + Please consider to change threshold values and re-run the tool.",T,opt$log) + q( "no", 1, F ) + } + + #filter comparison matrix + if(!is.null(comparisonMatrix)){ + comparisonMatrix=matrix(comparisonMatrix[rowToKeep,],ncol=ncol(comparisonMatrix),dimnames = list(rowToKeep,colnames(comparisonMatrix))) + if(!is.null(comparisonMatrixInfoGene))comparisonMatrixInfoGene=comparisonMatrixInfoGene[rowToKeep] + } + #then expression matrix + if(expressionToCluster)expressionMatrix=matrix(expressionMatrix[rowToKeep,],ncol=ncol(expressionMatrix),dimnames = list(rowToKeep,colnames(expressionMatrix))) + + if(!is.null(comparisonMatrix) && expressionToCluster && nrow(comparisonMatrix)!=nrow(expressionMatrix)){ + addComment("[ERROR]Problem during input filtering, please check code",T,opt$log,display=FALSE) + q( "no", 1, F ) + } + + addComment("[INFO]Filtering step done",T,opt$log,display=FALSE) + addComment(c("[INFO]Input filtering step:",length(rowToKeep),"remaining rows"),T,opt$log,display=FALSE) +} + + +addComment("[INFO]Ready to plot",T,opt$log,display=FALSE) + +##--------------------- + +#plot heatmap +if(expressionToCluster){ + #will make clustering based on expression value or generic value + dataToHeatMap=expressionMatrix + valueMeaning="Intensity" + if(!is.null(opt$genericData))valueMeaning="Value" +}else{ + #will make clustering on log2(FC) values + dataToHeatMap=matrix(comparisonMatrix[,seq(4,ncol(comparisonMatrix),nbColPerContrast)],ncol=nbComparisons,dimnames = list(rownames(comparisonMatrix),colnames(comparisonMatrix)[seq(1,ncol(comparisonMatrix),nbColPerContrast)])) + valueMeaning="Log2(FC)" +} +addComment(c("[INFO]Dim of heatmap matrix:",dim(dataToHeatMap)),T,opt$log,display=FALSE) + +if(nrow(dataToHeatMap)==1 && ncol(dataToHeatMap)==1){ + addComment("[ERROR]Cannot make clustering with unique cell tab",T,opt$log,display=FALSE) + q( "no", 1, F ) +} + + +#apply data transformation if needed +if(opt$dataTransformation=="log"){ + dataToHeatMap=log(dataToHeatMap) + valueMeaning=paste(c("log(",valueMeaning,")"),collapse="") + addComment("[INFO]Data to cluster and to display in the heatmap are log transformed",T,opt$log,display=FALSE) +} +if(opt$dataTransformation=="log2"){ + dataToHeatMap=log2(dataToHeatMap) + valueMeaning=paste(c("log2(",valueMeaning,")"),collapse="") + addComment("[INFO]Data to cluster and to display in the heatmap are log2 transformed",T,opt$log,display=FALSE) +} + +maxRowsToDisplay=opt$maxRows + +nbClusters=opt$clusterNumber +if(nbClusters>nrow(dataToHeatMap)){ + #correct number of clusters if needed + nbClusters=nrow(dataToHeatMap) + addComment(c("[WARNING]Not enough rows to reach required clusters number, it is reduced to number of rows:",nbClusters),T,opt$log,display=FALSE) +} + +nbSampleClusters=opt$sampleClusterNumber +if(nbSampleClusters>ncol(dataToHeatMap)){ + #correct number of clusters if needed + nbSampleClusters=ncol(dataToHeatMap) + addComment(c("[WARNING]Not enough columns to reach required conditions clusters number, it is reduced to number of columns:",nbSampleClusters),T,opt$log,display=FALSE) +} + +colClust=FALSE +rowClust=FALSE +effectiveRowClust=FALSE + +#make appropriate clustering if needed +if(nrow(dataToHeatMap)>1 && nbClusters>1)rowClust=hclust(distExtended(dataToHeatMap,method = opt$distanceMeasure),method = opt$aggloMethod) +if(ncol(dataToHeatMap)>1 && nbSampleClusters>1)colClust=hclust(distExtended(t(dataToHeatMap),method = opt$distanceMeasure),method = opt$aggloMethod) + +if(nrow(dataToHeatMap)>maxRowsToDisplay){ + #make subsampling based on preliminary global clustering + #clusteringResults=cutree(rowClust,nbClusters) + #heatMapGenesToKeep=unlist(lapply(seq(1,nbClusters),function(x)sample(which(clusteringResults==x),min(length(which(clusteringResults==x)),round(maxRowsToDisplay/nbClusters))))) + ##OR + #basic subsampling + heatMapGenesToKeep=sample(rownames(dataToHeatMap),maxRowsToDisplay) + effectiveDataToHeatMap=matrix(dataToHeatMap[heatMapGenesToKeep,],ncol=ncol(dataToHeatMap),dimnames=list(heatMapGenesToKeep,colnames(dataToHeatMap))) + effectiveNbClusters=min(nbClusters,maxRowsToDisplay) + if(nrow(effectiveDataToHeatMap)>1 && effectiveNbClusters>1)effectiveRowClust=hclust(distExtended(effectiveDataToHeatMap, method = opt$distanceMeasure),method = opt$aggloMethod) + addComment(c("[WARNING]Too many rows for efficient heatmap drawing",maxRowsToDisplay,"subsampling is done for vizualization only"),T,opt$log,display=FALSE) + rm(heatMapGenesToKeep) +}else{ + effectiveDataToHeatMap=dataToHeatMap + effectiveRowClust=rowClust + effectiveNbClusters=nbClusters +} + +addComment(c("[INFO]Dim of plotted heatmap matrix:",dim(effectiveDataToHeatMap)),T,opt$log,display=FALSE) + +personalized_hoverinfo=matrix("",ncol = ncol(effectiveDataToHeatMap),nrow = nrow(effectiveDataToHeatMap),dimnames = dimnames(effectiveDataToHeatMap)) +if(expressionToCluster){ + for(iCol in colnames(effectiveDataToHeatMap)){for(iRow in rownames(effectiveDataToHeatMap)){personalized_hoverinfo[iRow,iCol]=paste(c("Probe: ",iRow,"\nCondition: ",iCol,"\n",valueMeaning,": ",effectiveDataToHeatMap[iRow,iCol]),collapse="")}} +}else{ + for(iCol in colnames(effectiveDataToHeatMap)){for(iRow in rownames(effectiveDataToHeatMap)){personalized_hoverinfo[iRow,iCol]=paste(c("Probe: ",iRow,"\nCondition: ",iCol,"\nFC: ",round(2^effectiveDataToHeatMap[iRow,iCol],2)),collapse="")}} +} + +#trying to overcome limitation of heatmaply package to modify xtick and ytick label, using directly plotly functions, but for now plotly do not permit to have personalized color for each x/y tick separately +test=FALSE +if(test==TRUE){ + + #define dendogram shapes + dd.row <- as.dendrogram(effectiveRowClust) + dd.col <- as.dendrogram(colClust) + + #and color them + dd.row=color_branches(dd.row, k = effectiveNbClusters, groupLabels = T) + dd.col=color_branches(dd.col, k = nbSampleClusters, groupLabels = T) + + #generating function for dendogram from segment list + ggdend <- function(df) { + ggplot() + + geom_segment(data = df, aes(x=x, y=y, xend=xend, yend=yend)) + + labs(x = "", y = "") + theme_minimal() + + theme(axis.text = element_blank(), axis.ticks = element_blank(), + panel.grid = element_blank()) + } + + # generate x/y dendogram plots + px <- ggdend(dendro_data(dd.col)$segments) + py <- ggdend(dendro_data(dd.row)$segments) + coord_flip() + + # reshape data matrix + col.ord <- order.dendrogram(dd.col) + row.ord <- order.dendrogram(dd.row) + xx <- effectiveDataToHeatMap[row.ord, col.ord] + # and also personalized_hoverinfo + personalized_hoverinfo=personalized_hoverinfo[row.ord, col.ord] + + # hide axis ticks and grid lines + eaxis <- list( + showticklabels = FALSE, + showgrid = FALSE, + zeroline = FALSE + ) + + #make the empty plot + p_empty <- plot_ly() %>% + layout(margin = list(l = 200), + xaxis = eaxis, + yaxis = eaxis) + + heatmap.plotly <- plot_ly( + z = xx, x = 1:ncol(xx), y = 1:nrow(xx), colors = viridis(n = 101, alpha = 1, begin = 0, end = 1, option = "inferno"), + type = "heatmap", showlegend = FALSE, text = personalized_hoverinfo, hoverinfo = "text", + colorbar = list( + # Capitalise first letter + title = valueMeaning, + tickmode = "array", + len = 0.3 + ) + ) %>% + layout( + xaxis = list( + tickfont = list(size = 10,color=get_leaves_branches_col(dd.row)), + tickangle = 45, + tickvals = 1:ncol(xx), ticktext = colnames(xx), + linecolor = "#ffffff", + range = c(0.5, ncol(xx) + 0.5), + showticklabels = TRUE + ), + yaxis = list( + tickfont = list(size = 10, color=get_leaves_branches_col(dd.col)), + tickangle = 0, + tickvals = 1:nrow(xx), ticktext = rownames(xx), + linecolor = "#ffffff", + range = c(0.5, nrow(xx) + 0.5), + showticklabels = TRUE + ) + ) + + #generate plotly + pp <- subplot(px, p_empty, heatmap.plotly, py, nrows = 2, margin = 0,widths = c(0.8,0.2),heights = c(0.2,0.8), shareX = TRUE, + shareY = TRUE) + + #save image file + export(pp, file = paste(c(file.path(getwd(), "plotDir"),"/Heatmap.",opt$format),collapse="")) + #rise a bug due to token stuf + #orca(pp, file = paste(c(file.path(getwd(), "plotDir"),"/Heatmap.",opt$format),collapse="")) + + + #save plotLy file + htmlwidgets::saveWidget(as_widget(pp), paste(c(file.path(getwd(), "plotLyDir"),"/Heatmap.html"),collapse=""),selfcontained = F) + + #htmlwidgets::saveWidget(as_widget(pp),"~/Bureau/test.html",selfcontained = F) + +}else{ #test + label_names=c("Probe","Condition",valueMeaning) + + # #color hclust objects + # dd.row=color_branches(effectiveRowClust, k = effectiveNbClusters) + # #rowColors=get_leaves_branches_col(dd.row) + # #rowColors[order.dendrogram(dd.row)]=rowColors + # rowGroup=cutree(effectiveRowClust, k = effectiveNbClusters) + # + # #get order of class as they will be displayed on the dendogram + # rowGroupRenamed=data.frame(cluster=mapvalues(rowGroup, unique(rowGroup[order.dendrogram(dd.row)[nleaves(dd.row):1]]), 1:effectiveNbClusters)) + # + # dd.col=color_branches(colClust, k = nbSampleClusters) + # #colColors=get_leaves_branches_col(dd.col) + # #colColors[order.dendrogram(dd.col)]=colColors + # colGroup=cutree(colClust, k = nbSampleClusters) + # + # # #get order of class as they will be displayed on the dendogram + # colGroupRenamed=data.frame(sampleCluster=mapvalues(colGroup, unique(colGroup[order.dendrogram(dd.col)[nleaves(dd.col):1]]), 1:nbSampleClusters)) + + + #while option is not correctly managed by heatmap apply, put personalized_hoverinfo to NULL + personalized_hoverinfo=NULL + + if(is.null(opt$personalColors)){ + heatmapColors=viridis(n = 101, alpha = 1, begin = 0, end = 1, option = "inferno") + }else{ + heatmapColors=personalColors + } + + colGroupRenamed=NULL + if(!is.null(factorInfoMatrix)){ + colGroupRenamed=eval(parse(text=(paste("data.frame(",factorBarColor,"=factorInfoMatrix[colnames(effectiveDataToHeatMap),2])",sep="")))) + sideBarGroupNb=length(table(factorInfoMatrix[colnames(effectiveDataToHeatMap),2])) + sideBarColorPaletteName="Spectral" + if(!is.null(opt$sideBarColorPalette) && opt$sideBarColorPalette%in%rownames(RColorBrewer::brewer.pal.info)){ + sideBarColorPaletteName=opt$sideBarColorPalette + } + sideBarColorPalette=setNames(colorRampPalette(RColorBrewer::brewer.pal(RColorBrewer::brewer.pal.info[sideBarColorPaletteName,"maxcolors"], sideBarColorPaletteName))(sideBarGroupNb),unique(factorInfoMatrix[colnames(effectiveDataToHeatMap),2])) + } + + if(!is.null(colGroupRenamed)){ + pp <- heatmaply(effectiveDataToHeatMap,key.title = valueMeaning,k_row=effectiveNbClusters,k_col=nbSampleClusters,col_side_colors=colGroupRenamed,col_side_palette=sideBarColorPalette,Rowv=effectiveRowClust,Colv=colClust,label_names=label_names,custom_hovertext=personalized_hoverinfo,plot_method = "plotly",colors = heatmapColors) + }else{ + pp <- heatmaply(effectiveDataToHeatMap,key.title = valueMeaning,k_row=effectiveNbClusters,k_col=nbSampleClusters,Rowv=effectiveRowClust,Colv=colClust,label_names=label_names,custom_hovertext=personalized_hoverinfo,plot_method = "plotly",colors = heatmapColors) + } + + + #save image file + export(pp, file = paste(c(file.path(getwd(), "plotDir"),"/Heatmap.",opt$format),collapse="")) + #rise a bug due to token stuf + #orca(pp, file = paste(c(file.path(getwd(), "plotDir"),"/Heatmap.",opt$format),collapse="")) + + + #save plotLy file + htmlwidgets::saveWidget(as_widget(pp), paste(c(file.path(getwd(), "plotLyDir"),"/Heatmap.html"),collapse=""),selfcontained = F) + +} +addComment("[INFO]Heatmap drawn",T,opt$log,display=FALSE) + + +#plot circular heatmap +if(!class(effectiveRowClust)=="logical"){ + dendo=as.dendrogram(effectiveRowClust) + + if(is.null(opt$personalColors)){ + col_fun = colorRamp2(quantile(effectiveDataToHeatMap,probs = seq(0,1,0.01)), viridis(101,option = "inferno")) + }else{ + col_fun = colorRamp2(quantile(effectiveDataToHeatMap,probs = seq(0,1,0.5)), personalColors) + } + + if(opt$format=="pdf"){ + pdf(paste(c("./plotDir/circularPlot.pdf"),collapse=""))}else{ + png(paste(c("./plotDir/circularPlot.png"),collapse="")) + } + + circos.par(cell.padding = c(0, 0, 0, 0), gap.degree = 5) + circos.initialize(c(rep("a",nrow(effectiveDataToHeatMap)),"b"),xlim=cbind(c(0,0),c(nrow(effectiveDataToHeatMap),5))) + circos.track(ylim = c(0, 1), bg.border = NA, panel.fun = function(x, y) { + if(CELL_META$sector.index=="a"){ + nr = ncol(effectiveDataToHeatMap) + nc = nrow(effectiveDataToHeatMap) + circos.text(1:nc- 0.5, rep(0,nc), adj = c(0, 0), + rownames(effectiveDataToHeatMap)[order.dendrogram(dendo)], facing = "clockwise", niceFacing = TRUE, cex = 0.3) + } + }) + + circos.track(ylim = c(0, ncol(effectiveDataToHeatMap)), bg.border = NA, panel.fun = function(x, y) { + + m = t(matrix(effectiveDataToHeatMap[order.dendrogram(dendo),],ncol=ncol(effectiveDataToHeatMap))) + col_mat = col_fun(m) + nr = nrow(m) + nc = ncol(m) + if(CELL_META$sector.index=="a"){ + for(i in 1:nr) { + circos.rect(1:nc - 1, rep(nr - i, nc), + 1:nc, rep(nr - i + 1, nc), + border = col_mat[i, ], col = col_mat[i, ]) + } + }else{ + circos.text(rep(1,nr), seq(nr,1,-1) , colnames(effectiveDataToHeatMap),cex = 0.3) + } + }) + + #dendo = color_branches(dendo, k = effectiveNbClusters, col = colorRampPalette(brewer.pal(12,"Set3"))(effectiveNbClusters)) + dendo = color_branches(dendo, k = effectiveNbClusters, col = rev(colorspace::rainbow_hcl(effectiveNbClusters))) + + + circos.track(ylim = c(0, attributes(dendo)$height), bg.border = NA, track.height = 0.25, + panel.fun = function(x, y) { + if(CELL_META$sector.index=="a")circos.dendrogram(dendo)} ) + + circos.clear() + ##add legend + lgd_links = Legend(at = seq(ceiling(min(effectiveDataToHeatMap)),floor(max(effectiveDataToHeatMap)),ceiling((floor(max(effectiveDataToHeatMap))-ceiling(min(effectiveDataToHeatMap)))/4)), col_fun = col_fun, + title_position = "topleft", grid_width = unit(5, "mm") ,title = valueMeaning) + + pushViewport(viewport(x = 0.85, y = 0.80, + width = 0.1, + height = 0.1, + just = c("left", "bottom"))) + grid.draw(lgd_links) + upViewport() + + + dev.off() + + addComment("[INFO]Circular heatmap drawn",T,opt$log,display=FALSE) + loc <- Sys.setlocale("LC_NUMERIC","C") +}else{ + addComment(c("[WARNING]Circular plot will not be plotted considering row or cluster number < 2"),T,opt$log,display=FALSE) +} +rm(effectiveDataToHeatMap,effectiveRowClust,effectiveNbClusters) + +#plot screeplot +if(class(rowClust)!="logical" && nrow(dataToHeatMap)>2){ + screePlotData=c() + for(iNbClusters in 2:(nbClusters+min(10,max(0,nrow(dataToHeatMap)-nbClusters)))){ + clusteringResults=cutree(rowClust,iNbClusters) + #clusteringResults=kmeans(dataToHeatMap,iNbClusters)$cluster + + #compute variance between each intra-class points amongst themselves (need at least 3 points by cluster) + #screePlotData=c(screePlotData,sum(unlist(lapply(seq(1,iNbClusters),function(x){temp=which(clusteringResults==x);if(length(temp)>2){var(dist(dataToHeatMap[temp,]))}else{0}}))) ) + #compute variance between each intra-class points and fictive mean point (need at least 2 points by cluster) + #screePlotData=c(screePlotData,sum(unlist(lapply(seq(1,iNbClusters),function(x){temp=which(clusteringResults==x);if(length(temp)>1){ var(dist(rbind(apply(dataToHeatMap[temp,],2,mean),dataToHeatMap[temp,]))[1:length(temp)]) }else{0}}))) ) + if(ncol(dataToHeatMap)>1)screePlotData=c(screePlotData,sum(unlist(lapply(seq(1,iNbClusters),function(x){temp=which(clusteringResults==x);if(length(temp)>1){ sum((distExtended(rbind(apply(dataToHeatMap[temp,],2,mean),dataToHeatMap[temp,]),method = opt$distanceMeasure)[1:length(temp)])^2) }else{0}}))) ) + else screePlotData=c(screePlotData,sum(unlist(lapply(seq(1,iNbClusters),function(x){temp=which(clusteringResults==x);if(length(temp)>1){ sum((dataToHeatMap[temp,]-mean(dataToHeatMap[temp,]))^2) }else{0}}))) ) + } + + dataToPlot=data.frame(clusterNb=seq(2,length(screePlotData)+1),wcss=screePlotData) + p <- ggplot(data=dataToPlot, aes(clusterNb,wcss)) + geom_point(colour="#EE4444") + geom_line(colour="#DD9999") + + ggtitle("Scree plot") + theme_bw() + xlab(label="Cluster number") + ylab(label="Within cluster sum of squares") + + theme(panel.border=element_blank(),plot.title = element_text(hjust = 0.5),legend.position = "none") + + scale_x_continuous(breaks=seq(min(dataToPlot$clusterNb), max(dataToPlot$clusterNb), 1)) + + #save plotly files + pp <- ggplotly(p) + + if(opt$format=="pdf"){ + pdf(paste(c("./plotDir/screePlot.pdf"),collapse=""))}else{ + png(paste(c("./plotDir/screePlot.png"),collapse="")) + } + plot(p) + dev.off() + + #save plotly files + htmlwidgets::saveWidget(as_widget(pp), paste(c(file.path(getwd(), "plotLyDir"),"/screePlot.html"),collapse=""),selfcontained = F) + + addComment("[INFO]Scree plot drawn",T,opt$log,display=FALSE) +}else{ + addComment(c("[WARNING]Scree plot will not be plotted considering row number <= 2"),T,opt$log,display=FALSE) +} + +##---------------------- + +#filter output based on parameters + +rowToKeep=rownames(dataToHeatMap) +if(!is.null(opt$filterInputOutput) && opt$filterInputOutput=="output"){ + #rowToKeep=intersect(which(comparisonMatrix[,seq(2,ncol(comparisonMatrix),4)]<=opt$pvalThreshold),which(abs(comparisonMatrix[,seq(4,ncol(comparisonMatrix),4)])>=log2(opt$FCthreshold))) + if(is.null(opt$geneListFiltering)){ + if(is.null(opt$genericData)){ + #diff. expression matrix + rowToKeep=names(which(unlist(apply(comparisonMatrix,1,function(x)length(intersect(which(x[seq(2,length(x),nbColPerContrast)]<=opt$pvalThreshold),which(abs(x[seq(4,length(x),nbColPerContrast)])>=log2(opt$FCthreshold))))!=0)))) + }else{ + #generic filtering matrix + rowToKeep=rownames(comparisonMatrix) + if(!is.null(opt$comparisonNameLow)){ + restrictedLowComparisons=unlist(strsplit(opt$comparisonNameLow,",")) + rowToKeep=intersect(rowToKeep,names(which(unlist(apply(comparisonMatrix,1,function(x)length(which(x[restrictedLowComparisons]>opt$FCthreshold))!=0))))) + } + if(!is.null(opt$comparisonNameHigh)){ + restrictedHighComparisons=unlist(strsplit(opt$comparisonNameHigh,",")) + rowToKeep=intersect(rowToKeep,names(which(unlist(apply(comparisonMatrix,1,function(x)length(which(x[restrictedHighComparisons]<opt$pvalThreshold))!=0))))) + } + } + }else{ + geneListFiltering=read.csv(opt$geneListFiltering,as.is = 1,header=F) + rowToKeep=unlist(c(geneListFiltering)) + } + if(!is.null(comparisonMatrix) && !all(rowToKeep%in%rownames(comparisonMatrix))){ + #should arrive only with user gene list filtering with diff.exp. results clustering + addComment("[WARNING] some genes of the user defined list are not in the diff. exp. input file",T,opt$log) + rowToKeep=intersect(rowToKeep,rownames(comparisonMatrix)) + } + + if(expressionToCluster && !all(rowToKeep%in%rownames(expressionMatrix))){ + addComment("[WARNING] some genes selected by the output filter are not in the expression file",T,opt$log) + rowToKeep=intersect(rowToKeep,rownames(expressionMatrix)) + } + addComment(c("[INFO]Output filtering step:",length(rowToKeep),"remaining rows"),T,opt$log,display=FALSE) +} + +#we add differential analysis info in output if it was directly used for clustering or when it was used for filtering with expression + +#in case of expression or generic data clustering without filtering based on external stats +if(expressionToCluster && is.null(comparisonMatrix)){ + if(length(rowToKeep)==0){ + addComment("[WARNING]No more gene after output filtering step, tabular output will be empty",T,opt$log,display=FALSE) + outputData=matrix(c("Gene","Cluster","noGene","noClustering"),ncol=2,nrow=2,byrow = TRUE) + }else{ + outputData=matrix(0,ncol=2,nrow=length(rowToKeep)+1) + outputData[1,]=c("Gene","Cluster") + outputData[2:(length(rowToKeep)+1),1]=rowToKeep + if(class(rowClust)!="logical" ){ + outputData[2:(length(rowToKeep)+1),2]=cutree(rowClust,nbClusters)[rowToKeep] + }else{ + outputData[2:(length(rowToKeep)+1),2]=0 + } + } +} + +#in case of generic data clustering with filtering based on generic external data +if(!is.null(opt$genericData) && !is.null(comparisonMatrix)){ + if(length(rowToKeep)==0){ + addComment("[WARNING]No more gene after output filtering step, tabular output will be empty",T,opt$log,display=FALSE) + outputData=matrix(c("Gene","Cluster","noGene","noClustering"),ncol=2,nrow=2,byrow = TRUE) + }else{ + outputData=matrix(0,ncol=2+nbComparisons,nrow=length(rowToKeep)+1) + outputData[1,]=c("Gene","Cluster",colnames(comparisonMatrix)) + outputData[2:(length(rowToKeep)+1),1]=rowToKeep + if(class(rowClust)!="logical" ){ + outputData[2:(length(rowToKeep)+1),2]=cutree(rowClust,nbClusters)[rowToKeep] + }else{ + outputData[2:(length(rowToKeep)+1),2]=0 + } + outputData[2:(length(rowToKeep)+1),3:(ncol(comparisonMatrix)+2)]=prettyNum(comparisonMatrix[rowToKeep,],digits=4) + } +} + +#in case of expression data clustering with filtering based on diff. exp. results or diff. exp. results clustering +if(is.null(opt$genericData) && !is.null(comparisonMatrix)){ + if(length(rowToKeep)==0){ + addComment("[WARNING]No more gene after output filtering step, tabular output will be empty",T,opt$log,display=FALSE) + outputData=matrix(0,ncol=3,nrow=3) + outputData[1,]=c("","","Comparison") + outputData[2,]=c("Gene","Info","Cluster") + outputData[3,]=c("noGene","noInfo","noClustering") + }else{ + outputData=matrix(0,ncol=3+nbComparisons*nbColPerContrast,nrow=length(rowToKeep)+2) + outputData[1,]=c("","","Comparison",rep(colnames(comparisonMatrix)[seq(1,ncol(comparisonMatrix),nbColPerContrast)],each=nbColPerContrast)) + outputData[2,]=c("Gene","Info","Cluster",rep(c("p-val","FDR.p-val","FC","log2(FC)","t-stat"),nbComparisons)) + outputData[3:(length(rowToKeep)+2),1]=rowToKeep + outputData[3:(length(rowToKeep)+2),2]=comparisonMatrixInfoGene[rowToKeep] + if(class(rowClust)!="logical" ){ + outputData[3:(length(rowToKeep)+2),3]=cutree(rowClust,nbClusters)[rowToKeep] + }else{ + outputData[3:(length(rowToKeep)+2),3]=0 + } + outputData[3:(length(rowToKeep)+2),4:(ncol(comparisonMatrix)+3)]=prettyNum(comparisonMatrix[rowToKeep,],digits=4) + } +} + +addComment("[INFO]Formated output",T,opt$log,display=FALSE) +write.table(outputData,file=opt$outputFile,quote=FALSE,sep="\t",col.names = F,row.names = F) + +##---------------------- + +end.time <- Sys.time() +addComment(c("[INFO]Total execution time for R script:",as.numeric(end.time - start.time,units="mins"),"mins"),T,opt$log,display=FALSE) + + +addComment("[INFO]End of R script",T,opt$log,display=FALSE) + +printSessionInfo(opt$log) + +#sessionInfo() + + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/utils.R Fri Jun 26 09:36:46 2020 -0400 @@ -0,0 +1,143 @@ +# Copyright (c) 2011-2013 Trevor L. Davis <trevor.l.davis@stanford.edu> +# +# This file is free software: you may copy, redistribute and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation, either version 2 of the License, or (at your +# option) any later version. +# +# This file is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + + +#extendedDist function to correlation measure +distExtended <- function(x,method) { + if(method %in% c("euclidean", "maximum", "manhattan", "canberra", "binary", "minkowski"))return(dist(x,method = method)) + if(method %in% c("pearson", "spearman", "kendall"))return(as.dist(1-cor(t(x),method=method))/2) + if(method %in% c("absPearson", "absSpearman", "absKendall"))return(as.dist(1-abs(cor(t(x),method=method)))) + return(NULL) +} + +##comment function to display message and optionnaly add it to log file + +addComment <- function(text,addToFile=FALSE,fileName=NULL,append=TRUE,display=TRUE){ + if(display)cat(paste(c(text,"\n"),collapse = " ")) + if(addToFile)write(paste(text,collapse = " "),fileName,append=append) +} + +printSessionInfo <- function(fileName=NULL,append=TRUE){ + addComment("[INFO]R session info :",T,fileName,display=FALSE) + tempInfo=sessionInfo() + write(paste(tempInfo$R.version$version.string),fileName,append=append) + write(paste("Platform",tempInfo$platform,sep = " : "),fileName,append=append) + write(paste("Running under",tempInfo$running,sep = " : "),fileName,append=append) + write(paste("Local variables",tempInfo$locale,sep = " : "),fileName,append=append) + write(paste("Attached base packages",paste(tempInfo$basePkgs,collapse = "; "),sep = " : "),fileName,append=append) + if(length(tempInfo$otherPkgs)>0){ + lineToPrint="" + for(iPack in tempInfo$otherPkgs){ + lineToPrint=paste(lineToPrint,iPack$Package," ",iPack$Version,"; ",sep = "") + } + write(paste("Other attached packages",lineToPrint,sep = " : "),fileName,append=append) + } + if(length(tempInfo$loadedOnly)>0){ + lineToPrint="" + for(iPack in tempInfo$loadedOnly){ + lineToPrint=paste(lineToPrint,iPack$Package," ",iPack$Version,"; ",sep = "") + } + write(paste("Loaded packages",lineToPrint,sep = " : "),fileName,append=append) + } +} + +##negative of a mathematical expression +negativeExpression <- function(expression){ + expression=gsub("\\+","_toMinus_",expression) + expression=gsub("\\-","+",expression) + expression=gsub("_toMinus_","-",expression) + if(substr(expression,1,1)!="-" && substr(expression,1,1)!="+"){ + expression=paste(c("-",expression),collapse="") + } + + return(expression) +} + +#' Returns file name of calling Rscript +#' +#' \code{get_Rscript_filename} returns the file name of calling Rscript +#' @return A string with the filename of the calling script. +#' If not found (i.e. you are in a interactive session) returns NA. +#' +#' @export +get_Rscript_filename <- function() { + prog <- sub("--file=", "", grep("--file=", commandArgs(), value=TRUE)[1]) + if( .Platform$OS.type == "windows") { + prog <- gsub("\\\\", "\\\\\\\\", prog) + } + prog +} + +#' Recursively sorts a list +#' +#' \code{sort_list} returns a sorted list +#' @param unsorted_list A list. +#' @return A sorted list. +#' @export +sort_list <- function(unsorted_list) { + for(ii in seq(along=unsorted_list)) { + if(is.list(unsorted_list[[ii]])) { + unsorted_list[[ii]] <- sort_list(unsorted_list[[ii]]) + } + } + unsorted_list[sort(names(unsorted_list))] +} + + +# Multiple plot function +# +# ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects) +# - cols: Number of columns in layout +# - layout: A matrix specifying the layout. If present, 'cols' is ignored. +# +# If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE), +# then plot 1 will go in the upper left, 2 will go in the upper right, and +# 3 will go all the way across the bottom. +# +multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) { + library(grid) + + # Make a list from the ... arguments and plotlist + plots <- c(list(...), plotlist) + + numPlots = length(plots) + + # If layout is NULL, then use 'cols' to determine layout + if (is.null(layout)) { + # Make the panel + # ncol: Number of columns of plots + # nrow: Number of rows needed, calculated from # of cols + layout <- matrix(seq(1, cols * ceiling(numPlots/cols)), + ncol = cols, nrow = ceiling(numPlots/cols)) + } + + if (numPlots==1) { + print(plots[[1]]) + + } else { + # Set up the page + grid.newpage() + pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout)))) + + # Make each plot, in the correct location + for (i in 1:numPlots) { + # Get the i,j matrix positions of the regions that contain this subplot + matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE)) + + print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row, + layout.pos.col = matchidx$col)) + } + } +}