view lib.r @ 11:91311aa08cdc draft

planemo upload for repository https://github.com/workflow4metabolomics/xcms commit 08e7f269a5c59687a7768be8db5fcb4e4d736093
author lecorguille
date Mon, 30 Jan 2017 08:52:59 -0500
parents 0888f7ef739a
children 15646e937936
line wrap: on
line source

#Authors ABiMS TEAM
#Lib.r for Galaxy Workflow4Metabolomics xcms tools
#
#version 2.4: lecorguille
#   add getPeaklistW4M
#version 2.3: yguitton
#   correction for empty PDF when only 1 class
#version 2.2
#   correct bug in Base Peak Chromatogram (BPC) option, not only TIC when scanrange used in xcmsSet
#   Note: if scanrange is used, a warning is printed in the R console but it does not stop the PDF generation
#version 2.1: yguitton
#   Modifications made by Guitton Yann


#@author G. Le Corguille
# Convert, when requested, the retention times from seconds to minutes.
#   variableMetadata: table holding the "rt", "rtmin" and "rtmax" columns
#   convertRTMinute:  when true, these three columns are divided by 60
# value: the table, converted or returned as received
RTSecondToMinute <- function(variableMetadata, convertRTMinute) {
    if (convertRTMinute) {
        print("converting the retention times into minutes in the variableMetadata")
        # same order as the columns are usually listed: apex, then lower and upper bounds
        for (rtColumn in c("rt", "rtmin", "rtmax")) {
            variableMetadata[, rtColumn] <- variableMetadata[, rtColumn] / 60
        }
    }
    variableMetadata
}

#@author G. Le Corguille
# Build one identifier per ion, shaped as M<rounded mz>T<rounded rt>.
#   dataData:    table holding the "mz" and "rt" columns
#   numDigitsRT: number of decimals kept for the retention time
#   numDigitsMZ: number of decimals kept for the m/z
# value: a character vector; repeated identifiers are made distinct by make.unique()
formatIonIdentifiers <- function(dataData, numDigitsRT=0, numDigitsMZ=0) {
    mzPart <- round(dataData[,"mz"], numDigitsMZ)
    rtPart <- round(dataData[,"rt"], numDigitsRT)
    ionNames <- paste0("M", mzPart, "T", rtPart)
    make.unique(ionNames)
}

#@author G. Le Corguille
# Write the variableMetadata and dataMatrix tabular files of an xcmsSet.
#   xset:                   xcmsSet object; its @groups slot and groupval() are used
#   intval:                 intensity values to be used: into, maxo or intb
#   convertRTMinute:        TRUE to write the retention times in minutes
#   numDigitsMZ:            number of decimals of the m/z in the ion identifiers
#   numDigitsRT:            number of decimals of the retention time in the ion identifiers
#   variableMetadataOutput: path of the variableMetadata file to write
#   dataMatrixOutput:       path of the dataMatrix file to write
getPeaklistW4M <- function(xset, intval="into",convertRTMinute=FALSE,numDigitsMZ=4,numDigitsRT=0,variableMetadataOutput,dataMatrixOutput) {
    groups <- xset@groups
    values <- groupval(xset, "medret", value=intval)

    # renaming of the columns rtmed and mzmed to rt and mz to fit with camera peaklist function output
    colnames(groups)[colnames(groups)=="rtmed"] <- "rt"
    colnames(groups)[colnames(groups)=="mzmed"] <- "mz"

    # the identifiers are computed before the optional conversion of the retention times
    ids <- formatIonIdentifiers(groups, numDigitsRT=numDigitsRT, numDigitsMZ=numDigitsMZ)
    groups <- RTSecondToMinute(groups, convertRTMinute)

    # both tables share the same row names
    rownames(groups) <- ids
    rownames(values) <- ids

    #@TODO: add "name" as the first column name
    #colnames(groups)[1] = "name"
    #colnames(values)[1] = "name"

    # TRUE/FALSE are spelled out: the T and F shortcuts are plain variables that can be reassigned
    write.table(groups, file=variableMetadataOutput, sep="\t", quote=FALSE, row.names=TRUE, col.names=NA)
    write.table(values, file=dataMatrixOutput, sep="\t", quote=FALSE, row.names=TRUE, col.names=NA)
}

#@author Y. Guitton
# Build the Base Peak Chromatogram of one raw file.
#   file:  path of the raw file, read with xcmsRaw()
#   rtcor: optional retention times, used as first column instead of the scan times of the file
#   ...:   forwarded to profRange() to select scans and masses
# value: a two-column matrix: the retention times, then the result of xcms:::colMax()
#        on the selected part of the profile matrix
getBPC <- function(file,rtcor=NULL, ...) {
  rawFile <- xcmsRaw(file)
  selection <- profRange(rawFile, ...)

  times <- if (is.null(rtcor)) rawFile@scantime[selection$scanidx] else rtcor
  basePeaks <- xcms:::colMax(rawFile@env$profile[selection$massidx, selection$scanidx, drop=FALSE])

  # deparse.level=0 keeps the two columns unnamed
  cbind(times, basePeaks, deparse.level=0)
  #plotChrom(xcmsRaw(file), base=T)
}

#@author Y. Guitton
# Overlay the Base Peak Chromatograms (BPC) of all the samples of an xcmsSet in a PDF.
# With several classes, one page is drawn per pair of classes: the first class above
# the x axis and the second one mirrored below it. With a single class, all the
# samples are drawn on one page.
#   xcmsSet:   xcmsSet object (mandatory); its file paths and the first phenoData column are used
#   pdfname:   path of the PDF file to create
#   rt:        "raw" (default) to use the scan times of each file,
#              "corrected" to use xcmsSet@rt$corrected
#   scanrange: not used by this function
# value: (invisibly) a list with one matrix (retention time, intensity) per sample
getBPCs <- function (xcmsSet=NULL, pdfname="BPCs.pdf",rt=c("raw","corrected"), scanrange=NULL) {
  cat("Creating BPC pdf...\n")

  if (is.null(xcmsSet)) {
    stop("Enter an xcmsSet")
  }

  # the default value is the whole c("raw","corrected") vector: reduce it to one
  # value, a length-2 condition in && is an error with recent R versions
  rt <- match.arg(rt)

  files <- filepaths(xcmsSet)

  # sometime phenoData have more than 1 column: use the first one as class.
  # factor() also accepts a character column and drops the classes without any sample
  sampleClass <- xcmsSet@phenoData[,1]
  classes <- levels(factor(sampleClass))
  classnames <- lapply(classes, function(cl) which(sampleClass == cl))

  N <- dim(phenoData(xcmsSet))[1]

  # each raw file is read only once
  TIC <- vector("list",N)
  for (j in seq_len(N)) {
    rtcor <- if (rt == "corrected") xcmsSet@rt$corrected[[j]] else NULL
    TIC[[j]] <- getBPC(files[j],rtcor=rtcor)
  }

  pdf(pdfname,width=16,height=10)
  # the device is closed even when a plot fails
  on.exit(dev.off(), add=TRUE)

  cols <- rainbow(N)
  lty <- seq_len(N)
  pch <- seq_len(N)
  #search for max x and max y in BPCs
  xlim <- range(vapply(TIC, function(x) range(x[,1]), numeric(2)))
  intensityRange <- range(vapply(TIC, function(x) range(x[,2]), numeric(2)))
  mirroredYlim <- c(-intensityRange[2], intensityRange[2])

  # Draw one page: the samples `above` as they are, the samples `below` mirrored
  # under the x axis, then the legend listing the files in that order
  drawPage <- function(title, ylim, above, below=integer(0)) {
    plot(0, 0, type="n", xlim = xlim/60, ylim = ylim, main = title, xlab = "Retention Time (min)", ylab = "BPC")
    for (s in above) {
      points(TIC[[s]][,1]/60, TIC[[s]][,2], col = cols[s], pch = pch[s], type="l")
    }
    for (s in below) {
      points(TIC[[s]][,1]/60, -TIC[[s]][,2], col = cols[s], pch = pch[s], type="l")
    }
    shown <- c(above, below)
    legend("topright",paste(basename(files[shown])), col = cols[shown], lty = lty, pch = pch)
  }

  ##plot start
  nClasses <- length(classes)

  # two classes or more: one mirrored page per pair of classes
  for (k in seq_len(max(nClasses - 1, 0))) {
    for (l in (k+1):nClasses) {
      drawPage(paste("Base Peak Chromatograms \n","BPCs_",classes[k]," vs ",classes[l], sep=""),
               mirroredYlim, classnames[[k]], classnames[[l]])
    }
  }

  #case where only one class: nothing is mirrored, the y axis only covers the observed intensities
  if (nClasses == 1) {
    drawPage(paste("Base Peak Chromatograms \n","BPCs_",classes[1], sep=""),
             intensityRange, classnames[[1]])
  }

  invisible(TIC)
}



#@author Y. Guitton
# Build the Total Ion Chromatogram of one raw file.
#   file:  path of the raw file, read with xcmsRaw()
#   rtcor: optional retention times, used as first column instead of the scan times of the file
# value: a two-column matrix: the retention times, then the intensities returned by
#        rawEIC() over the whole m/z range of the file
getTIC <- function(file,rtcor=NULL) {
  rawFile <- xcmsRaw(file)

  times <- if (is.null(rtcor)) rawFile@scantime else rtcor
  wholeMzRange <- range(rawFile@env$mz)
  intensities <- rawEIC(rawFile, mzrange=wholeMzRange)$intensity

  # deparse.level=0 keeps the two columns unnamed
  cbind(times, intensities, deparse.level=0)
}

##
##  overlay TIC from all files in current folder or from xcmsSet, create pdf
##
#@author Y. Guitton
# With several classes, one page is drawn per pair of classes: the first class above
# the x axis and the second one mirrored below it. With a single class, all the
# samples are drawn on one page.
#   xcmsSet: xcmsSet object; its file paths and the first phenoData column are used.
#            When NULL, the raw files are searched in `files` and drawn as a single class
#   files:   files and/or directories to use when xcmsSet is NULL (default: working directory)
#   pdfname: path of the PDF file to create
#   rt:      "raw" (default) to use the scan times of each file,
#            "corrected" to use xcmsSet@rt$corrected (only with an xcmsSet)
# value: (invisibly) a list with one matrix (retention time, intensity) per file
getTICs <- function(xcmsSet=NULL,files=NULL, pdfname="TICs.pdf",rt=c("raw","corrected")) {
  cat("Creating TIC pdf...\n")

  # the default value is the whole c("raw","corrected") vector: reduce it to one
  # value, a length-2 condition in && is an error with recent R versions
  rt <- match.arg(rt)

  if (is.null(xcmsSet)) {
    filepattern <- c("[Cc][Dd][Ff]", "[Nn][Cc]", "([Mm][Zz])?[Xx][Mm][Ll]", "[Mm][Zz][Dd][Aa][Tt][Aa]", "[Mm][Zz][Mm][Ll]")
    filepattern <- paste(paste("\\.", filepattern, "$", sep = ""), collapse = "|")
    if (is.null(files))
      files <- getwd()
    info <- file.info(files)
    listed <- list.files(files[info$isdir], pattern = filepattern, recursive = TRUE, full.names = TRUE)
    files <- c(files[!info$isdir], listed)
    # there is no phenoData without an xcmsSet: all the files form one single class
    sampleClass <- rep("all", length(files))
  } else {
    files <- filepaths(xcmsSet)
    # sometime phenoData have more than 1 column: use the first one as class
    sampleClass <- xcmsSet@phenoData[,1]
  }

  N <- length(files)
  if (N == 0) {
    stop("No file to compute the TIC from")
  }

  # factor() also accepts a character column and drops the classes without any sample
  classes <- levels(factor(sampleClass))
  classnames <- lapply(classes, function(cl) which(sampleClass == cl))

  TIC <- vector("list",N)
  for (i in seq_len(N)) {
    rtcor <- if (!is.null(xcmsSet) && rt == "corrected") xcmsSet@rt$corrected[[i]] else NULL
    TIC[[i]] <- getTIC(files[i],rtcor=rtcor)
  }

  pdf(pdfname,width=16,height=10)
  # the device is closed even when a plot fails
  on.exit(dev.off(), add=TRUE)

  cols <- rainbow(N)
  lty <- seq_len(N)
  pch <- seq_len(N)
  #search for max x and max y in TICs
  xlim <- range(vapply(TIC, function(x) range(x[,1]), numeric(2)))
  intensityRange <- range(vapply(TIC, function(x) range(x[,2]), numeric(2)))
  mirroredYlim <- c(-intensityRange[2], intensityRange[2])

  # Draw one page: the samples `above` as they are, the samples `below` mirrored
  # under the x axis, then the legend listing the files in that order
  drawPage <- function(title, ylim, above, below=integer(0)) {
    plot(0, 0, type="n", xlim = xlim/60, ylim = ylim, main = title, xlab = "Retention Time (min)", ylab = "TIC")
    for (s in above) {
      points(TIC[[s]][,1]/60, TIC[[s]][,2], col = cols[s], pch = pch[s], type="l")
    }
    for (s in below) {
      points(TIC[[s]][,1]/60, -TIC[[s]][,2], col = cols[s], pch = pch[s], type="l")
    }
    shown <- c(above, below)
    legend("topright",paste(basename(files[shown])), col = cols[shown], lty = lty, pch = pch)
  }

  ##plot start
  nClasses <- length(classes)

  # two classes or more: one mirrored page per pair of classes
  for (k in seq_len(max(nClasses - 1, 0))) {
    for (l in (k+1):nClasses) {
      drawPage(paste("Total Ion Chromatograms \n","TICs_",classes[k]," vs ",classes[l], sep=""),
               mirroredYlim, classnames[[k]], classnames[[l]])
    }
  }

  #case where only one class: nothing is mirrored, the y axis only covers the observed intensities
  if (nClasses == 1) {
    drawPage(paste("Total Ion Chromatograms \n","TICs_",classes[1], sep=""),
             intensityRange, classnames[[1]])
  }

  invisible(TIC)
}



##
##  Write the sampleMetadata file (sample name, class and, when available, polarity)
##  and report the sample names changed by make.names()
#@author Misharl Monsoor misharl.monsoor@sb-roscoff.fr ABiMS TEAM
#@author Gildas Le Corguille lecorguille@sb-roscoff.fr ABiMS TEAM
#   xcmsSet:              xcmsSet object; its @phenoData and @filepaths slots are used
#   sampleMetadataOutput: path of the tabular file to write
# value: a list with the original sample names ("sampleNamesOrigin") and their
#        make.names() version ("sampleNamesMakeNames")
# The function stops when two samples get the same name after make.names().
getSampleMetadata <- function(xcmsSet=NULL, sampleMetadataOutput="sampleMetadata.tsv") {
  cat("Creating the sampleMetadata file...\n")

  if (is.null(xcmsSet)) {
    # this function used to ignore its argument and read a variable named `xset`
    # from the calling environment: keep that as a fallback for existing callers
    if (!exists("xset")) {
      stop("\n\nERROR: getSampleMetadata needs an xcmsSet object")
    }
    xcmsSet <- get("xset")
  }

  #Create the sampleMetada dataframe
  sampleMetadata <- xcmsSet@phenoData
  sampleNamesOrigin <- rownames(sampleMetadata)
  sampleNamesMakeNames <- make.names(sampleNamesOrigin)

  if (any(duplicated(sampleNamesMakeNames))) {
    write("\n\nERROR: Usually, R has trouble to deal with special characters in its column names, so it rename them using make.names().\nIn your case, at least two columns after the renaming obtain the same name, thus XCMS will collapse those columns per name.", stderr())
    for (sampleName in sampleNamesOrigin) {
      write(paste(sampleName,"\t->\t",make.names(sampleName)),stderr())
    }
    stop("\n\nERROR: One or more of your files will not be import by xcmsSet. It may due to bad characters in their filenames.")
  }

  if (!all(sampleNamesOrigin == sampleNamesMakeNames)) {
    cat("\n\nWARNING: Usually, R has trouble to deal with special characters in its column names, so it rename them using make.names()\nIn your case, one or more sample names will be renamed in the sampleMetadata and dataMatrix files:\n")
    for (sampleName in sampleNamesOrigin) {
      cat(paste(sampleName,"\t->\t",make.names(sampleName),"\n"))
    }
  }

  sampleMetadata$sampleMetadata <- sampleNamesMakeNames
  sampleMetadata <- cbind(sampleMetadata["sampleMetadata"],sampleMetadata["class"]) #Reorder columns
  rownames(sampleMetadata) <- NULL

  # attached once instead of once per file; the attachment itself is kept because
  # code run after this function may rely on it -- TODO confirm and switch to tools::
  library(tools)

  #For each sample file, the following actions are done
  for (file in xcmsSet@filepaths){
    #Check if the file is in the CDF format: the polarity is only read from the other formats
    if (!mzR:::netCDFIsFile(file)){

      # If the column doesn't exist, we add one filled with NA
      if (is.null(sampleMetadata$polarity)) sampleMetadata$polarity <- NA

      #Create a simple xcmsRaw object for each sample
      rawFile <- xcmsRaw(file)
      #Verify if all the scans have the same polarity
      polarities <- unique(rawFile@polarity)
      polarity <- if (length(polarities) > 1) "mixed" else as.character(polarities)

      #Keep only the sample name. The sampleMetadata column holds the names after
      #make.names(), so the same transformation is applied before the comparison;
      #otherwise the renamed samples never match and keep NA
      samplename <- make.names(file_path_sans_ext(basename(file)))

      #Set the polarity attribute
      sampleMetadata$polarity[sampleMetadata$sampleMetadata==samplename] <- polarity

      #Delete xcmsRaw object because it creates a bug for the fillpeaks step
      rm(rawFile)
    }

  }

  write.table(sampleMetadata, sep="\t", quote=FALSE, row.names=FALSE, file=sampleMetadataOutput)

  return(list("sampleNamesOrigin"=sampleNamesOrigin,"sampleNamesMakeNames"=sampleNamesMakeNames))

}


##
## This function checks that xcms will find all the raw files present on the filesystem
##
#@author Gildas Le Corguille lecorguille@sb-roscoff.fr ABiMS TEAM
#   directory: files and/or directories, relative to the working directory
# The files listed by R (the way xcms searches them) are compared with the files
# listed by the `find` command; the function stops when `find` sees raw files
# that R does not list.
checkFilesCompatibilityWithXcms <- function(directory) {
  cat("Checking files filenames compatibilities with xcms...\n")
  # WHAT XCMS WILL FIND
  filepattern <- c("[Cc][Dd][Ff]", "[Nn][Cc]", "([Mm][Zz])?[Xx][Mm][Ll]","[Mm][Zz][Dd][Aa][Tt][Aa]", "[Mm][Zz][Mm][Ll]")
  filepattern <- paste(paste("\\.", filepattern, "$", sep = ""),collapse = "|")
  info <- file.info(directory)
  listed <- list.files(directory[info$isdir], pattern = filepattern,recursive = TRUE, full.names = TRUE)
  files <- c(directory[!info$isdir], listed)
  files_abs <- file.path(getwd(), files)
  found <- file.exists(files_abs)
  files[found] <- files_abs[found]
  files[found] <- sub("//","/",files[found])

  # WHAT IS ON THE FILESYSTEM
  # the search starts from the same base directory as files_abs above, and the
  # paths are quoted so that spaces or shell special characters do not break the command
  searchRoots <- paste(shQuote(paste0(getwd(), "/", directory)), collapse=" ")
  filesystem_filepaths <- system(paste0("find ", searchRoots, " -not -name '\\.*' -not -path '*conda-env*' -type f -name \"*\""), intern=TRUE)
  filesystem_filepaths <- filesystem_filepaths[grepl(filepattern, filesystem_filepaths, perl=TRUE)]

  # COMPARISON
  notImported <- !(filesystem_filepaths %in% files)
  if (any(notImported)) {
    write("\n\nERROR: List of the files which will not be imported by xcmsSet",stderr())
    write(filesystem_filepaths[notImported],stderr())
    stop("\n\nERROR: One or more of your files will not be import by xcmsSet. It may due to bad characters in their filenames.")

  }
}



##
## This function checks the integrity and completeness of the XML files with xmllint.
##
#@author Misharl Monsoor misharl.monsoor@sb-roscoff.fr ABiMS TEAM
#   directory: directory searched for files whose name matches *.*ml*
# The function stops, after listing them on stderr, when xmllint returns a non-zero
# status for at least one file.
checkXmlStructure <- function (directory) {
  cat("Checking XML structure...\n")

  # - IFS is set to a newline written between plain single quotes, which every
  #   POSIX shell understands, so that the file list is only split on newlines
  # - the directory and the reported file name are quoted to survive spaces
  cmd <- paste("IFS='\n'; for xml in $(find",shQuote(directory),"-not -name '\\.*' -not -path '*conda-env*' -type f -iname '*.*ml*'); do if [ $(xmllint --nonet --noout \"$xml\" 2> /dev/null; echo $?) -gt 0 ]; then echo \"$xml\";fi; done;")
  capture <- system(cmd,intern=TRUE)

  if (length(capture)>0){
    #message=paste("The following mzXML or mzML file is incorrect, please check these files first:",capture)
    write("\n\nERROR: The following mzXML or mzML file(s) are incorrect, please check these files first:", stderr())
    write(capture, stderr())
    stop("ERROR: xcmsSet cannot continue with incorrect mzXML or mzML files")
  }

}


##
## This function removes the non ASCII characters from the XML files
##
#@author Misharl Monsoor misharl.monsoor@sb-roscoff.fr ABiMS TEAM
#   directory: directory searched for files whose name matches *.*ml*
# value: TRUE when at least one file has been rewritten, FALSE otherwise
# The files are modified in place.
deleteXmlBadCharacters<- function (directory) {
  cat("Checking Non ASCII characters in the XML...\n")

  processed <- FALSE
  xmlFiles <- system( paste("find",shQuote(directory), "-not -name '\\.*' -not -path '*conda-env*' -type f -iname '*.*ml*'"),intern=TRUE)
  for (xmlFile in xmlFiles){
    # the same quoted path is given to grep and perl: file names holding spaces or
    # shell special characters are then handled by both commands
    quotedFile <- shQuote(xmlFile)

    # -l only prints the file name on a match instead of every matching line;
    # grep exits with 1 when nothing matches, hence suppressWarnings()
    hits <- suppressWarnings(system(paste("LC_ALL=C grep -l '[^ -~]'", quotedFile),intern=TRUE))
    if (length(hits)>0){
      print( paste("WARNING: Non ASCII characters have been removed from the ",xmlFile,"file") )
      system(paste("perl -i -pe 's/[^[:ascii:]]//g;'", quotedFile),intern=TRUE)
      processed <- TRUE
    }
  }
  if (processed) cat("\n\n")
  return(processed)
}


##
## This function computes the MD5 checksum of the raw files to check the data integrity
##
#@author Gildas Le Corguille lecorguille@sb-roscoff.fr
#   directory: files and/or directories; the directories are searched recursively
#              for the file extensions listed below
# value: a one-column character matrix of checksums, with the file paths as row names
getMd5sum <- function (directory) {
  cat("Compute md5 checksum...\n")

  # WHAT XCMS WILL FIND
  extensions <- c("[Cc][Dd][Ff]", "[Nn][Cc]", "([Mm][Zz])?[Xx][Mm][Ll]","[Mm][Zz][Dd][Aa][Tt][Aa]", "[Mm][Zz][Mm][Ll]")
  filepattern <- paste(paste0("\\.", extensions, "$"), collapse = "|")

  info <- file.info(directory)
  insideDirectories <- list.files(directory[info$isdir], pattern = filepattern, recursive = TRUE, full.names = TRUE)
  candidates <- c(directory[!info$isdir], insideDirectories)

  # only the files really present are hashed
  present <- candidates[file.exists(candidates)]

  library(tools)

  #cat("\n\n")

  checksums <- md5sum(present)
  return(as.matrix(checksums))
}