view lib_metams.r @ 5:b8d4129dd2a6 draft

planemo upload for repository https://github.com/workflow4metabolomics/metaMS commit c7a518686137f6d62b7415715152e8d5a9953ed7
author yguitton
date Fri, 06 Sep 2019 06:09:10 -0400
parents c10824185547
children d1ce2634135f
line wrap: on
line source

# lib_metams.r version 2.1.1
# R function for metaMS runGC under W4M
# author Yann GUITTON CNRS IRISA/LINA Idealg project 2014-2015
# author Yann GUITTON Oniris Laberca 2015-2017


#@author G. Le Corguille
# This function will
# - load the packages
# - display the sessionInfo
loadAndDisplayPackages <- function(pkgs) {
    for(pkg in pkgs) suppressPackageStartupMessages( stopifnot( library(pkg, quietly=TRUE, logical.return=TRUE, character.only=TRUE)))

    sessioninfo = sessionInfo()
    cat(sessioninfo$R.version$version.string,"\n")
    cat("Main packages:\n")
    for (pkg in names(sessioninfo$otherPkgs)) { cat(paste(pkg,packageVersion(pkg)),"\t") }; cat("\n")
    cat("Other loaded packages:\n")
    for (pkg in names(sessioninfo$loadedOnly)) { cat(paste(pkg,packageVersion(pkg)),"\t") }; cat("\n")
}

#This function list the compatible files within the directory as xcms did
#@author Gildas Le Corguille lecorguille@sb-roscoff.fr ABiMS TEAM
getMSFiles <- function (directory) {
    filepattern <- c("[Cc][Dd][Ff]", "[Nn][Cc]", "([Mm][Zz])?[Xx][Mm][Ll]","[Mm][Zz][Dd][Aa][Tt][Aa]", "[Mm][Zz][Mm][Ll]")
    filepattern <- paste(paste("\\.", filepattern, "$", sep=""),collapse="|")
    info <- file.info(directory)
    listed <- list.files(directory[info$isdir], pattern=filepattern,recursive=TRUE, full.names=TRUE)
    files <- c(directory[!info$isdir], listed)
    exists <- file.exists(files)
    files <- files[exists]
    return(files)
}

# This function retrieve a xset like object
#@author Gildas Le Corguille lecorguille@sb-roscoff.fr
getxcmsSetObject <- function(xobject) {
    # XCMS 1.x
    if (class(xobject) == "xcmsSet")
        return (xobject)
    # XCMS 3.x
    if (class(xobject) == "XCMSnExp") {
        # Get the legacy xcmsSet object
        suppressWarnings(xset <- as(xobject, 'xcmsSet'))
        if (!is.null(xset@phenoData$sample_group))
            sampclass(xset) <- xset@phenoData$sample_group
        else
            sampclass(xset) <- "."
        return (xset)
    }
}

#J.Saint-Vanne
#Function to correct the file names which can be written like "..alg8.mzData" and we just want "alg8"
getCorrectFileName <- function(peaktable,sampleMetadata){

    #Try to start for the first sample, avoid description of line with colnamesdontwant
    i <- 1
    while(!(sampleMetadata[1,1] %in% strsplit(colnames(peaktable)[i],"\\.")[[1]])) {
        if(i < length(peaktable)) {
            i <- i + 1
        } else {
            break
        }
    }
    #i now correspond to the first column with a sample
    for(j in 1:(nrow(sampleMetadata))) {
        col <- j + i-1 #minus 1 cause i is the good column to start and j start at 1
        if(col <= length(colnames(peaktable))) {
            newname <- gsub("(^.*)(\\..*$)","\\1",colnames(peaktable)[col])
            if(newname != sampleMetadata[j,1]){
                #Correction for 2 points starting the name (I don't know why they are here...)
                if(".." == gsub("(^\\.+)(.*)","\\1",newname)){
                    newname <- sub("(^\\.+)(.*)","\\2",newname)
                }
            }
            colnames(peaktable)[col] <- newname
        }
    }
    return(peaktable)
}


#@author Gildas Le Corguille lecorguille@sb-roscoff.fr
# This function get the raw file path from the arguments
getRawfilePathFromArguments <- function(singlefile, zipfile, listArguments) {
    if (!is.null(listArguments[["zipfile"]]))           zipfile = listArguments[["zipfile"]]
    if (!is.null(listArguments[["zipfilePositive"]]))   zipfile = listArguments[["zipfilePositive"]]
    if (!is.null(listArguments[["zipfileNegative"]]))   zipfile = listArguments[["zipfileNegative"]]

    if (!is.null(listArguments[["singlefile_galaxyPath"]])) {
        singlefile_galaxyPaths = listArguments[["singlefile_galaxyPath"]];
        singlefile_sampleNames = listArguments[["singlefile_sampleName"]]
    }
    if (!is.null(listArguments[["singlefile_galaxyPathPositive"]])) {
        singlefile_galaxyPaths = listArguments[["singlefile_galaxyPathPositive"]];
        singlefile_sampleNames = listArguments[["singlefile_sampleNamePositive"]]
    }
    if (!is.null(listArguments[["singlefile_galaxyPathNegative"]])) {
        singlefile_galaxyPaths = listArguments[["singlefile_galaxyPathNegative"]];
        singlefile_sampleNames = listArguments[["singlefile_sampleNameNegative"]]
    }
    if (exists("singlefile_galaxyPaths")){
        singlefile_galaxyPaths = unlist(strsplit(singlefile_galaxyPaths,","))
        singlefile_sampleNames = unlist(strsplit(singlefile_sampleNames,","))

        singlefile=NULL
        for (singlefile_galaxyPath_i in seq(1:length(singlefile_galaxyPaths))) {
            singlefile_galaxyPath=singlefile_galaxyPaths[singlefile_galaxyPath_i]
            singlefile_sampleName=singlefile_sampleNames[singlefile_galaxyPath_i]
            singlefile[[singlefile_sampleName]] = singlefile_galaxyPath
        }
    }
    for (argument in c("zipfile","zipfilePositive","zipfileNegative","singlefile_galaxyPath","singlefile_sampleName","singlefile_galaxyPathPositive","singlefile_sampleNamePositive","singlefile_galaxyPathNegative","singlefile_sampleNameNegative")) {
        listArguments[[argument]]=NULL
    }
    return(list(zipfile=zipfile, singlefile=singlefile, listArguments=listArguments))
}


#@author Gildas Le Corguille lecorguille@sb-roscoff.fr
# This function retrieve the raw file in the working directory
#   - if zipfile: unzip the file with its directory tree
#   - if singlefiles: set symlink with the good filename
retrieveRawfileInTheWorkingDirectory <- function(singlefile, zipfile) {
    if(!is.null(singlefile) && (length("singlefile")>0)) {
        for (singlefile_sampleName in names(singlefile)) {
            singlefile_galaxyPath = singlefile[[singlefile_sampleName]]
            if(!file.exists(singlefile_galaxyPath)){
                error_message=paste("Cannot access the sample:",singlefile_sampleName,"located:",singlefile_galaxyPath,". Please, contact your administrator ... if you have one!")
                print(error_message); stop(error_message)
            }
            file.symlink(singlefile_galaxyPath,singlefile_sampleName)
        }
        directory = "."
    }
    if(!is.null(zipfile) && (zipfile!="")) {
        if(!file.exists(zipfile)){
            error_message=paste("Cannot access the Zip file:",zipfile,". Please, contact your administrator ... if you have one!")
            print(error_message)
            stop(error_message)
        }

        #list all file in the zip file
        #zip_files=unzip(zipfile,list=T)[,"Name"]

        #unzip
        suppressWarnings(unzip(zipfile, unzip="unzip"))

        #get the directory name
        filesInZip=unzip(zipfile, list=T);
        directories=unique(unlist(lapply(strsplit(filesInZip$Name,"/"), function(x) x[1])));
        directories=directories[!(directories %in% c("__MACOSX")) & file.info(directories)$isdir]
        directory = "."
        if (length(directories) == 1) directory = directories

        cat("files_root_directory\t",directory,"\n")
    }
    return (directory)
}

##ADDITIONS FROM Y. Guitton
getBPC <- function(file,rtcor=NULL, ...) {
    object <- xcmsRaw(file)
    sel <- profRange(object, ...)
    cbind(if (is.null(rtcor)) object@scantime[sel$scanidx] else rtcor ,xcms:::colMax(object@env$profile[sel$massidx,sel$scanidx,drop=FALSE]))
}

getBPC2s <- function (files, xset = NULL, pdfname="BPCs.pdf", rt = c("raw","corrected"), scanrange=NULL) {
    require(xcms)

    #create sampleMetadata, get sampleMetadata and class
    if(!is.null(xset)) {
    	#When files come from XCMS3 directly before metaMS
    	sampleMetadata <- xset@phenoData
    } else {
    	#When files come from a zip directory with raw files before metaMS
	    sampleMetadata<-xcms:::phenoDataFromPaths(files)
    }
    class<-unique(sampleMetadata[,"class"]) #create phenoData like table
    classnames<-vector("list",length(class))
    for (i in 1:length(class)){
        classnames[[i]]<-which( sampleMetadata[,"class"]==class[i])
    }

    N <- dim(sampleMetadata)[1]
    BPC <- vector("list",N)

    for (j in 1:N) {
        BPC[[j]] <- getBPC(files[j])
        #good for raw 
        # seems strange for corrected
        #errors if scanrange used in xcmsSetgeneration
        if (!is.null(xcmsSet) && rt == "corrected"){
            rtcor <- xcmsSet@rt$corrected[[j]] 
        }else{
            rtcor <- NULL
        }
        BPC[[j]] <- getBPC(files[j],rtcor=rtcor)
    }

    pdf(pdfname,w=16,h=10)
    cols <- rainbow(N)
    lty = 1:N
    pch = 1:N
    #search for max x and max y in BPCs

    xlim = range(sapply(BPC, function(x) range(x[,1])))
    ylim = range(sapply(BPC, function(x) range(x[,2])))

    ylim = c(-ylim[2], ylim[2])

    ##plot start
    if (length(class)>2){
        for (k in 1:(length(class)-1)){
            for (l in (k+1):length(class)){
                cat(paste(class[k],"vs",class[l],sep=" ","\n")) 
                plot(0, 0, type="n", xlim = xlim/60, ylim = ylim, main = paste("Base Peak Chromatograms \n","BPCs_",class[k]," vs ",class[l], sep=""), xlab = "Retention Time (min)", ylab = "BPC")
                colvect<-NULL
                for (j in 1:length(classnames[[k]])) {
                    bpc <- BPC[[classnames[[k]][j]]]
                    # points(bpc[,1]/60, bpc[,2], col = cols[i], pch = pch[i], type="l")
                    points(bpc[,1]/60, bpc[,2], col = cols[classnames[[k]][j]], pch = pch[classnames[[k]][j]], type="l")
                    colvect<-append(colvect,cols[classnames[[k]][j]])
                }
                for (j in 1:length(classnames[[l]])) {
                    # i=class2names[j]
                    bpc <- BPC[[classnames[[l]][j]]]
                    points(bpc[,1]/60, -bpc[,2], col = cols[classnames[[l]][j]], pch = pch[classnames[[l]][j]], type="l")
                    colvect<-append(colvect,cols[classnames[[l]][j]])
                }
                legend("topright",paste(gsub("(^.+)\\..*$","\\1",basename(files[c(classnames[[k]],classnames[[l]])]))), col = colvect, lty = lty, pch = pch)
            }
        }
    }#end if length >2

    if (length(class)==2){
        k=1
		l=2
        colvect<-NULL
        plot(0, 0, type="n", xlim = xlim/60, ylim = ylim, main = paste("Base Peak Chromatograms \n","BPCs_",class[k],"vs",class[l], sep=""), xlab = "Retention Time (min)", ylab = "BPC")

        for (j in 1:length(classnames[[k]])) {
            bpc <- BPC[[classnames[[k]][j]]]
            # points(bpc[,1]/60, bpc[,2], col = cols[i], pch = pch[i], type="l")
            points(bpc[,1]/60, bpc[,2], col = cols[classnames[[k]][j]], pch = pch[classnames[[k]][j]], type="l")
            colvect<-append(colvect,cols[classnames[[k]][j]])
        }
        for (j in 1:length(classnames[[l]])) {
            # i=class2names[j]
            bpc <- BPC[[classnames[[l]][j]]]
            points(bpc[,1]/60, -bpc[,2], col = cols[classnames[[l]][j]], pch = pch[classnames[[l]][j]], type="l")
            colvect<-append(colvect,cols[classnames[[l]][j]])
        }
        legend("topright",paste(gsub("(^.+)\\..*$","\\1",basename(files[c(classnames[[k]],classnames[[l]])]))), col = colvect, lty = lty, pch = pch)
    }#end length ==2
    
    if (length(class)==1){
        k=1

		ylim = range(sapply(BPC, function(x) range(x[,2])))

        colvect<-NULL
        plot(0, 0, type="n", xlim = xlim/60, ylim = ylim, main = paste("Base Peak Chromatograms \n","BPCs_",class[k], sep=""), xlab = "Retention Time (min)", ylab = "BPC")

        for (j in 1:length(classnames[[k]])) {
            bpc <- BPC[[classnames[[k]][j]]]
            # points(bpc[,1]/60, bpc[,2], col = cols[i], pch = pch[i], type="l")
            points(bpc[,1]/60, bpc[,2], col = cols[classnames[[k]][j]], pch = pch[classnames[[k]][j]], type="l")
            colvect<-append(colvect,cols[classnames[[k]][j]])
        }
        legend("topright",paste(gsub("(^.+)\\..*$","\\1",basename(files[c(classnames[[k]])]))), col = colvect, lty = lty, pch = pch)
    }#end length ==1
    dev.off()
}

getTIC <- function(file,rtcor=NULL) {
    object <- xcmsRaw(file)
    cbind(if (is.null(rtcor)) object@scantime else rtcor, rawEIC(object,mzrange=range(object@env$mz))$intensity)
}

##  overlay TIC from all files in current folder or from xcmsSet, create pdf
getTIC2s <- function(files, xset=NULL, pdfname="TICs.pdf", rt=c("raw","corrected")) {
    require(xcms)

    #create sampleMetadata, get sampleMetadata and class
    if(!is.null(xset)){
		#When files come from XCMS3 before metaMS treatment
    	sampleMetadata<-xset@phenoData
    } else {
		#When files come from a zip directory with raw files before metaMS
    	sampleMetadata<-xcms:::phenoDataFromPaths(files)
    }
    class<-as.vector(levels(sampleMetadata[,"class"])) #create phenoData like table
    classnames<-vector("list",length(class))
    for (i in 1:length(class)){
        classnames[[i]]<-which( sampleMetadata[,"class"]==class[i])
    }
        
    N <- dim(sampleMetadata)[1]
    TIC <- vector("list",N)

    for (i in 1:N) {
        if (!is.null(xcmsSet) && rt == "corrected")
            rtcor <- xcmsSet@rt$corrected[[i]]
        else
            rtcor <- NULL
        TIC[[i]] <- getTIC(files[i],rtcor=rtcor)
    }
 
    pdf(pdfname,w=16,h=10)
    cols <- rainbow(N)
    lty = 1:N
    pch = 1:N
    #search for max x and max y in TICs
    xlim = range(sapply(TIC, function(x) range(x[,1])))
    ylim = range(sapply(TIC, function(x) range(x[,2])))
    ylim = c(-ylim[2], ylim[2])  
	  
    ##plot start
    if (length(class)>2){
        for (k in 1:(length(class)-1)){
            for (l in (k+1):length(class)){
                cat(paste(class[k],"vs",class[l],"\n",sep=" ")) 
                plot(0, 0, type="n", xlim = xlim/60, ylim = ylim, main = paste("Total Ion Chromatograms \n","TICs_",class[k]," vs ",class[l], sep=""), xlab = "Retention Time (min)", ylab = "TIC")
                colvect<-NULL
                for (j in 1:length(classnames[[k]])) {
                    tic <- TIC[[classnames[[k]][j]]]
                    # points(tic[,1]/60, tic[,2], col = cols[i], pch = pch[i], type="l")
                    points(tic[,1]/60, tic[,2], col = cols[classnames[[k]][j]], pch = pch[classnames[[k]][j]], type="l")
                    colvect<-append(colvect,cols[classnames[[k]][j]])
                }
                for (j in 1:length(classnames[[l]])) {
                    # i=class2names[j]
                    tic <- TIC[[classnames[[l]][j]]]
                    points(tic[,1]/60, -tic[,2], col = cols[classnames[[l]][j]], pch = pch[classnames[[l]][j]], type="l")
                    colvect<-append(colvect,cols[classnames[[l]][j]])
                }
                legend("topright",paste(gsub("(^.+)\\..*$","\\1",basename(files[c(classnames[[k]],classnames[[l]])]))), col = colvect, lty = lty, pch = pch)
            }
        }
    }#end if length >2

    if (length(class)==2){
        k=1
        l=2
        plot(0, 0, type="n", xlim = xlim/60, ylim = ylim, main = paste("Total Ion Chromatograms \n","TICs_",class[k],"vs",class[l], sep=""), xlab = "Retention Time (min)", ylab = "TIC")
        colvect<-NULL
        for (j in 1:length(classnames[[k]])) {
            tic <- TIC[[classnames[[k]][j]]]
            # points(tic[,1]/60, tic[,2], col = cols[i], pch = pch[i], type="l")
            points(tic[,1]/60, tic[,2], col = cols[classnames[[k]][j]], pch = pch[classnames[[k]][j]], type="l")
            colvect<-append(colvect,cols[classnames[[k]][j]])
        }
        for (j in 1:length(classnames[[l]])) {
            # i=class2names[j]
            tic <- TIC[[classnames[[l]][j]]]
            points(tic[,1]/60, -tic[,2], col = cols[classnames[[l]][j]], pch = pch[classnames[[l]][j]], type="l")
            colvect<-append(colvect,cols[classnames[[l]][j]])
        }
        legend("topright",paste(gsub("(^.+)\\..*$","\\1",basename(files[c(classnames[[k]],classnames[[l]])]))), col = colvect, lty = lty, pch = pch)
    }#end length ==2

    if (length(class)==1){
        k=1
        ylim = range(sapply(TIC, function(x) range(x[,2])))
        plot(0, 0, type="n", xlim = xlim/60, ylim = ylim, main = paste("Total Ion Chromatograms \n","TICs_",class[k], sep=""), xlab = "Retention Time (min)", ylab = "TIC")
        colvect<-NULL
        for (j in 1:length(classnames[[k]])) {
            tic <- TIC[[classnames[[k]][j]]]
            # points(tic[,1]/60, tic[,2], col = cols[i], pch = pch[i], type="l")
            points(tic[,1]/60, tic[,2], col = cols[classnames[[k]][j]], pch = pch[classnames[[k]][j]], type="l")
            colvect<-append(colvect,cols[classnames[[k]][j]])
        }
        legend("topright",paste(gsub("(^.+)\\..*$","\\1",basename(files[c(classnames[[k]])]))), col = colvect, lty = lty, pch = pch)
    }#end length ==1
    dev.off()
}


#Update by J.Saint-Vanne
##addition for quality control of peak picking
#metaMS EIC and pspectra plotting option
#version 20190520
#only for Galaxy 
plotUnknowns<-function(resGC, unkn="", DB=NULL, fileFrom=NULL){

    ##Annotation table each value is a pcgrp associated to the unknown 
    ##NOTE pcgrp index are different between xcmsSet and resGC due to filtering steps in metaMS
    ##R. Wehrens give me some clues on that and we found a correction
    
	#correction of annotation matrix due to pcgrp removal by quality check in runGCresult
	#matrix of correspondance between an@pspectra and filtered pspectra from runGC
    #Select only pspectra which correpond to them select in metaMS
    # col1 = filtered spectra from runGC and col2 = an@spectra
	allPCGRPs <-lapply(1:length(resGC$xset),
				function(i) {
					an <- resGC$xset[[i]]
					huhn <- an@pspectra[which(sapply(an@pspectra, length) >=
					metaSetting(resGC$settings,
					"DBconstruction.minfeat"))]
					matCORR<-cbind(1:length(huhn), match(huhn, an@pspectra))
				})
    #Build a new annotation list with sampnames and pseudospectra number from xset
    helpannotation <- list()
    for(j in 1:length(resGC$xset)){
        helpannotation[[j]] <- resGC$annotation[[j]][1:2]
        pspvector <- vector()
        for(i in 1: nrow(helpannotation[[j]])){
            #Find corresponding pspec
            psplink <- allPCGRPs[[j]][match(helpannotation[[j]][i,1],allPCGRPs[[j]]),2]
            pspvector <- c(pspvector,psplink)
            #Change the annotation column into sampname column
            if(helpannotation[[j]][i,2] < 0){
                #It's an unknown
                new_name <- paste("Unknown",abs(as.integer(helpannotation[[j]][i,2])))
                helpannotation[[j]][i,2] <- new_name
            }else{
                #It has been found in local database
                for(k in 1:length(DB)){
                    if(helpannotation[[j]][i,2] == k){
                        helpannotation[[j]][i,2] <- DB[[k]]$Name
                        break
                    }
                }
            }
        }
        helpannotation[[j]] <- cbind(helpannotation[[j]],pspvector)
        names(helpannotation)[j] <- names(resGC$annotation[j])
    }
    peaktable <- resGC$PeakTable
		
	par (mar=c(5, 4, 4, 2) + 0.1)
	#For each unknown
	for (l in 1:length(unkn)){
		#recordPlot
		perpage=3 #if change change layout also!
		dev.new(width=21/2.54, height=29.7/2.54, file=paste("Unknown_",unkn[l],".pdf", sep="")) #A4 pdf
		# par(mfrow=c(perpage,2))
		layout(matrix(c(1,1,2,3,4,4,5,6,7,7,8,9), 6, 2, byrow = TRUE), widths=rep(c(1,1),perpage), heights=rep(c(1,5),perpage))
		# layout.show(6)
		oma.saved <- par("oma")
		par(oma = rep.int(0, 4))
		par(oma = oma.saved)
		o.par <- par(mar = rep.int(0, 4))
		on.exit(par(o.par))
		#For each sample
		for (c in 1:length(resGC$xset)) {	
			#get sample name
			sampname<-basename(resGC$xset[[c]]@xcmsSet@filepaths)
			#remove .cdf, .mzXML filepattern
			filepattern <- c("[Cc][Dd][Ff]", "[Nn][Cc]", "([Mm][Zz])?[Xx][Mm][Ll]", 
									"[Mm][Zz][Dd][Aa][Tt][Aa]", "[Mm][Zz][Mm][Ll]")
			filepattern <- paste(paste("\\.", filepattern, "$", sep = ""), collapse = "|")
			sampname<-gsub(filepattern, "",sampname)					 
			title1<-paste(peaktable[unkn[l],1],"from",sampname, sep = " ")
			an<-resGC$xset[[c]]
    		if(fileFrom == "zipfile") {
				an@xcmsSet@filepaths <- paste0("./",an@xcmsSet@phenoData[,"class"],"/",basename(an@xcmsSet@filepaths))
				}#else {
        			#print(an@xcmsSet@filepaths)
					#an@xcmsSet@filepaths <- paste0("./",basename(an@xcmsSet@filepaths))
				#}
			#Find the good annotation for this sample
            for(a in 1:length(helpannotation)){    
                if(gsub(filepattern, "", names(helpannotation)[a]) == paste0("./",sampname)){
                    #Find the unkn or the matched std in this sample
                    findunkn <- FALSE
                    for(r in 1:nrow(helpannotation[[a]])){
                        if(helpannotation[[a]][r,"annotation"] == peaktable[unkn[l],1]){
                            findunkn <- TRUE
                            pcgrp <- helpannotation[[a]][r,"pspvector"]
							par (mar=c(0, 0, 0, 0) + 0.1)
							#Write title
							plot.new()
							box()
							text(0.5, 0.5, title1, cex=2)						
							par (mar=c(3, 2.5, 3, 1.5) + 0.1)
							#Window for EIC
							plotEICs(an, pspec=pcgrp, maxlabel=2)
							#Window for pseudospectra
							plotPsSpectrum(an, pspec=pcgrp, maxlabel=2)
						}
					}
					if(!findunkn) {
						par (mar=c(0, 0, 0, 0) + 0.1)
                   		#Write title
						plot.new()
						box()
						text(0.5, 0.5, title1, cex=2)
                   		#Window for EIC
                   		plot.new()
                   		box()
						text(0.5, 0.5, "NOT FOUND", cex=2)
						#Window for pseudospectra
						plot.new()
						box()
						text(0.5, 0.5, "NOT FOUND", cex=2)
					}
					break
				}
    		}
		}
		graphics.off()
	}#end  for unkn[l]
}#end function