Repository 'intensity_checks'
hg clone https://toolshed.g2.bx.psu.edu/repos/melpetera/intensity_checks

Changeset 0:c2c2e1be904a (2018-10-11)
Next changeset 1:4973a2104cfd (2018-12-05)
Commit message:
Uploaded
added:
Intchecks/RcheckLibrary.R
Intchecks/Script_intensity_check.R
Intchecks/wrapper_intensity_check.R
Intchecks/xml_intensity_check.xml
b
diff -r 000000000000 -r c2c2e1be904a Intchecks/RcheckLibrary.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Intchecks/RcheckLibrary.R Thu Oct 11 05:33:19 2018 -0400
[
@@ -0,0 +1,124 @@
+######################################################
+# R check library
+# Coded by: M.Petera, 
+# - -
+# R functions to use in R scripts  
+# (management of various generic subroutines)
+# - -
+# V0: script structure + first functions
+# V1: More detailed error messages in match functions
+######################################################
+
+
+# Generic function to return an error if problems have been encountered - - - -
+
+check.err <- function(err.stock){
+
+ # err.stock = vector of results returned by check functions
+
+ if(length(err.stock)!=0){ stop("\n- - - - - - - - -\n",err.stock,"\n- - - - - - - - -\n") }
+
+}
+
+
+
+
+# Table match check functions - - - - - - - - - - - - - - - - - - - - - - - - -
+
+# To check if dataMatrix and (variable or sample)Metadata match regarding identifiers
+match2 <- function(dataMatrix, Metadata, Mtype){
+
+ # dataMatrix = data.frame containing dataMatrix
+ # Metadata = data.frame containing sampleMetadata or variableMetadata
+ # Mtype = "sample" or "variable" depending on Metadata content
+
+ err.stock <- NULL # error vector
+
+ id2 <- Metadata[,1]
+ if(Mtype=="sample"){ id1 <- colnames(dataMatrix)[-1] }
+ if(Mtype=="variable"){ id1 <- dataMatrix[,1] }
+
+ if( length(which(id1%in%id2))!=length(id1) || length(which(id2%in%id1))!=length(id2) ){
+ err.stock <- c("\nData matrix and ",Mtype," metadata do not match regarding ",Mtype," identifiers.")
+ if(length(which(id1%in%id2))!=length(id1)){
+ if(length(which(!(id1%in%id2)))<4){ err.stock <- c(err.stock,"\n    The ")
+ }else{ err.stock <- c(err.stock,"\n    For example, the ") }
+ err.stock <- c(err.stock,"following identifiers found in the data matrix\n",
+ "    do not appear in the ",Mtype," metadata file:\n")
+ identif <- id1[which(!(id1%in%id2))][1:min(3,length(which(!(id1%in%id2))))]
+ err.stock <- c(err.stock,"    ",paste(identif,collapse="\n    "),"\n")
+ }
+ if(length(which(id2%in%id1))!=length(id2)){
+ if(length(which(!(id2%in%id1)))<4){ err.stock <- c(err.stock,"\n    The ")
+ }else{ err.stock <- c(err.stock,"\n    For example, the ") }
+ err.stock <- c(err.stock,"following identifiers found in the ",Mtype," metadata file\n",
+ "    do not appear in the data matrix:\n")
+ identif <- id2[which(!(id2%in%id1))][1:min(3,length(which(!(id2%in%id1))))]
+ err.stock <- c(err.stock,"    ",paste(identif,collapse="\n    "),"\n")
+ }
+ err.stock <- c(err.stock,"\nPlease check your data.\n")
+ }
+
+ return(err.stock)
+
+}
+
+# To check if the 3 standard tables match regarding identifiers
+match3 <- function(dataMatrix, sampleMetadata, variableMetadata){
+
+ # dataMatrix = data.frame containing dataMatrix
+ # sampleMetadata = data.frame containing sampleMetadata
+ # variableMetadata = data.frame containing variableMetadata
+
+ err.stock <- NULL # error vector
+
+ id1 <- colnames(dataMatrix)[-1]
+ id2 <- sampleMetadata[,1]
+ id3 <- dataMatrix[,1]
+ id4 <- variableMetadata[,1]
+
+ if( length(which(id1%in%id2))!=length(id1) || length(which(id2%in%id1))!=length(id2) ){
+ err.stock <- c(err.stock,"\nData matrix and sample metadata do not match regarding sample identifiers.")
+ if(length(which(id1%in%id2))!=length(id1)){
+ if(length(which(!(id1%in%id2)))<4){ err.stock <- c(err.stock,"\n    The ")
+ }else{ err.stock <- c(err.stock,"\n    For example, the ") }
+ err.stock <- c(err.stock,"following identifiers found in the data matrix\n",
+ "    do not appear in the sample metadata file:\n")
+ identif <- id1[which(!(id1%in%id2))][1:min(3,length(which(!(id1%in%id2))))]
+ err.stock <- c(err.stock,"    ",paste(identif,collapse="\n    "),"\n")
+ }
+ if(length(which(id2%in%id1))!=length(id2)){
+ if(length(which(!(id2%in%id1)))<4){ err.stock <- c(err.stock,"\n    The ")
+ }else{ err.stock <- c(err.stock,"\n    For example, the ") }
+ err.stock <- c(err.stock,"following identifiers found in the sample metadata file\n",
+ "    do not appear in the data matrix:\n")
+ identif <- id2[which(!(id2%in%id1))][1:min(3,length(which(!(id2%in%id1))))]
+ err.stock <- c(err.stock,"    ",paste(identif,collapse="\n    "),"\n")
+ }
+ }
+
+ if( length(which(id3%in%id4))!=length(id3) || length(which(id4%in%id3))!=length(id4) ){
+ err.stock <- c(err.stock,"\nData matrix and variable metadata do not match regarding variable identifiers.")
+ if(length(which(id3%in%id4))!=length(id3)){
+ if(length(which(!(id3%in%id4)))<4){ err.stock <- c(err.stock,"\n    The ")
+ }else{ err.stock <- c(err.stock,"\n    For example, the ") }
+ err.stock <- c(err.stock,"following identifiers found in the data matrix\n",
+ "    do not appear in the variable metadata file:\n")
+ identif <- id3[which(!(id3%in%id4))][1:min(3,length(which(!(id3%in%id4))))]
+ err.stock <- c(err.stock,"    ",paste(identif,collapse="\n    "),"\n")
+ }
+ if(length(which(id4%in%id3))!=length(id4)){
+ if(length(which(!(id4%in%id3)))<4){ err.stock <- c(err.stock,"\n    The ")
+ }else{ err.stock <- c(err.stock,"\n    For example, the ") }
+ err.stock <- c(err.stock,"following identifiers found in the variable metadata file\n",
+ "    do not appear in the data matrix:\n")
+ identif <- id4[which(!(id4%in%id3))][1:min(3,length(which(!(id4%in%id3))))]
+ err.stock <- c(err.stock,"    ",paste(identif,collapse="\n    "),"\n")
+ }
+ }
+
+ if(length(err.stock)!=0){ err.stock <- c(err.stock,"\nPlease check your data.\n") }
+
+ return(err.stock)
+
+}
\ No newline at end of file
b
diff -r 000000000000 -r c2c2e1be904a Intchecks/Script_intensity_check.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Intchecks/Script_intensity_check.R Thu Oct 11 05:33:19 2018 -0400
[
@@ -0,0 +1,264 @@
+####################################################################
+# SCRIPT INTENSITY CHECK
+# V1: Fold and NA
+#
+# Input: Data Matrix, VariableMetadata, SampleMetadata
+# Output: VariableMetadata, Graphics (barplots and boxplots)
+#
+# Dependencies: RcheckLibrary.R 
+#
+####################################################################
+
+
+# Parameters (for dev)
+if(FALSE){
+  
+  rm(list = ls())
+  setwd("Y:\\Developpement\\Intensity check\\Pour tests")
+  
+  DM.name <- "DM_NA.tabular"
+  SM.name <- "SM_NA.tabular"
+  VM.name <- "vM_NA.tabular"
+  type <- "One_class"
+  class.col <- "2"
+  class1 <- "Blanks"
+  VM.output <- "new_VM.txt"
+  graphs.output <- "Barplots_and_Boxplots.pdf"
+}
+
+intens_check <- function(DM.name, SM.name, VM.name, type, class.col, class1, VM.output, graphs.output){
+  # This function allows to check the intensities considering classes with a fold calculation, the number and
+  # the proportion of NA
+  #
+  # Two options: 
+  # - one class (selected by the user) against all the remaining samples ("One_class")
+  # - tests on each class ("Each_class")
+  #
+  # Parameters:
+  # DM.name, SM.name, VM.name: dataMatrix, sampleMetadata, variableMetadata files access
+  # type: "One_class" or "Each_class"
+  # class.col: number of the sampleMetadata's column with classes
+  # class1: name of the class if type="One_class"
+  # VM.output: output file's access (VM with the new columns)
+  # graphs.output: pdf with barplots for the proportion of NA and boxplots with the folds values
+  
+  
+  
+  
+  # Input ---------------------------------------------------------
+  
+  DM <- read.table(DM.name, header=TRUE, sep="\t", check.names=FALSE)
+  SM <- read.table(SM.name, header=TRUE, sep="\t", check.names=FALSE)
+  VM <- read.table(VM.name, header=TRUE, sep="\t", check.names=FALSE)
+  
+  
+  
+  # Table match check with Rchecklibrary
+  table.check <- match3(DM, SM, VM)
+  check.err(table.check)
+  
+  
+  rownames(DM) <- DM[,1]
+  var_names <- DM[,1]
+  DM <- DM[,-1]
+  DM <- data.frame(t(DM))
+  
+  class.col <- colnames(SM)[as.numeric(class.col)]
+  
+  # check class.col, class1 and the number of classes---------------
+  
+  if(!(class.col %in% colnames(SM))){
+    stop("\n- - - - - - - - -\n", "The column ",class.col, " is not a part of the specify sample Metadata","\n- - - - - - - - -\n")
+  }
+  
+  c_class <- SM[,class.col]
+  c_class <- as.factor(c_class)
+  nb_class <- nlevels(c_class)
+  classnames <- levels(c_class)
+  
+  
+  if((nb_class > (nrow(SM))/3)){
+    class.err <- c("\n There are too many classes, think about reducing the number of classes and excluding those
+                   with few samples \n")
+    cat(class.err)
+  }
+
+  if(type == "One_class"){  
+    if(!(class1 %in% classnames)){
+      list.class1 <- c("\n Classes:",classnames,"\n")
+      cat(list.class1)
+      err.class1 <- c("The class ",class1, " does not appear in the column ", class.col)
+      stop("\n- - - - - - - - -\n", err.class1,"\n- - - - - - - - -\n")
+    }
+  }
+  
+  #If type is "one_class", change others classes in "other"
+  if(type == "One_class"){
+    for(i in 1:length(c_class)){
+      if(c_class[i]!=class1){
+        c_class <- as.character(c_class)
+        c_class[i] <- "Other"
+        c_class <- as.factor(c_class)
+        nb_class <- nlevels(c_class)
+        classnames <- levels(c_class)
+        
+      }
+    }
+  }
+
+  DM <- cbind(DM,c_class)
+  
+  # fold -------------------------------------------------------
+  n <- 1
+  fold <- data.frame()
+  for(j in 1:(nb_class-1)){
+    for(k in (j+1):nb_class) {
+      for (i in 1:(length(DM)-1)){
+        fold[i,n] <- mean(DM[which(DM$c_class==classnames[k]),i], na.rm=TRUE)/
+          mean(DM[which(DM$c_class==classnames[j]),i], na.rm=TRUE)}
+      names(fold)[n] <- paste("fold",classnames[k],"VS", classnames[j], sep="_")
+      n <- n + 1}
+  }
+  
+  # NA ---------------------------------------------------------
+  calcul_NA <- data.frame()
+  pct_NA <- data.frame()
+  for (i in 1:(length(DM)-1)){
+    for (j in 1:nb_class){
+      n <- 0
+      new_DM <- DM[which(DM$c_class==classnames[j]),i]
+      for(k in 1:length(new_DM)){
+        if (is.na(new_DM[k])){
+          n <- n + 1}
+        calcul_NA[i,j] <- n
+        pct_NA[i,j] <- (calcul_NA[i,j]/length(new_DM))*100}
+    }
+  }
+  names(calcul_NA) <- paste("Nb_NA",classnames, sep="_")
+  names(pct_NA) <- paste("Pct_NA", classnames, sep="_")
+  
+  # Alert message if there is no NA
+  
+  sumNA <- colSums(calcul_NA)
+  sum_total <- sum(sumNA)
+  
+  alerte <- NULL
+  if(sum_total==0){
+    alerte <- c(alerte, "Data Matrix contains no NA.\n")
+  }
+  
+  if(length(alerte) != 0){ 
+    cat(alerte,"\n")
+  }
+  
+  table_NA <- cbind(calcul_NA, pct_NA)
+  
+  # check columns names ----------------------------------------
+  
+  # Fold
+  VM.names <- colnames(VM)
+  fold.names <- colnames(fold)
+  
+  for (i in 1:length(VM.names)){
+    for (j in 1:length(fold.names)){
+      if (VM.names[i]==fold.names[j]){
+        fold.names[j] <- paste(fold.names[j],"2", sep="_")
+      }
+    }
+  }
+  colnames(fold) <- fold.names
+  
+  # NA
+  NA.names <- colnames(table_NA)
+  
+  for (i in 1:length(VM.names)){
+    for (j in 1:length(NA.names)){
+      if (VM.names[i]==NA.names[j]){
+        NA.names[j] <- paste(NA.names[j],"2", sep="_")
+      }
+    }
+  }
+  colnames(table_NA) <- NA.names
+  
+  #for NA barplots ---------------------------------------------
+  
+  data_bp <- data.frame()
+  
+  for (j in 1:ncol(pct_NA)){
+    Nb_NA_0_20 <- 0
+    Nb_NA_20_40 <- 0
+    Nb_NA_40_60 <- 0
+    Nb_NA_60_80 <- 0
+    Nb_NA_80_100 <- 0
+    for (i in 1:nrow(pct_NA)){
+      
+      if ((0<=pct_NA[i,j])&(pct_NA[i,j]<20)){
+        Nb_NA_0_20=Nb_NA_0_20+1
+      }
+      
+      if ((20<=pct_NA[i,j])&(pct_NA[i,j]<40)){
+        Nb_NA_20_40=Nb_NA_20_40+1}
+      
+      if ((40<=pct_NA[i,j])&(pct_NA[i,j]<60)){
+        Nb_NA_40_60=Nb_NA_40_60+1}
+      
+      if ((60<=pct_NA[i,j])&(pct_NA[i,j]<80)){
+        Nb_NA_60_80=Nb_NA_60_80+1}   
+      
+      if ((80<=pct_NA[i,j])&(pct_NA[i,j]<=100)){
+        Nb_NA_80_100=Nb_NA_80_100+1}   
+    }
+    data_bp[1,j] <- Nb_NA_0_20
+    data_bp[2,j] <- Nb_NA_20_40
+    data_bp[3,j] <- Nb_NA_40_60
+    data_bp[4,j] <- Nb_NA_60_80
+    data_bp[5,j] <- Nb_NA_80_100
+  }
+  rownames(data_bp) <- c("0-20%", "20-40%", "40-60%", "60-80%", "80-100%")
+  colnames(data_bp) <- classnames
+  data_bp <- as.matrix(data_bp)
+  
+  
+  # Output--------------------------------------------------------
+  
+  VM <- cbind(VM,fold,table_NA)
+  
+  write.table(VM, VM.output,sep="\t", quote=FALSE, row.names=FALSE)
+  
+  #graphics pdf
+  
+  pdf(graphs.output)
+  
+  #Boxplots for NA
+  
+  par(mar=c(5.1, 4.1, 4.1, 8.1), xpd=TRUE)
+  
+  bp=barplot(data_bp, col=rainbow(nrow(data_bp)), main="Proportion of NA", xlab="Classes", ylab="Variables")
+  legend("topright", fill=rainbow(nrow(data_bp)),rownames(data_bp), inset=c(-0.3,0))
+  
+  
+  stock=0
+  for (i in 1:nrow(data_bp)){
+    text(bp, stock+data_bp[i,]/2, data_bp[i,], col="white")
+    stock <- stock+data_bp[i,]
+  }
+  
+  
+  #Boxplots for fold test
+  for (j in 1:ncol(fold)){
+    title=paste(fold.names[j])
+    boxplot(fold[j], main=title)
+  }
+  
+  dev.off()
+
+}  
+
+
+# Function call---------------
+
+#setwd("Y:\\Developpement\\Intensity check\\Pour tests")
+#intens_check("DM_NA.tabular", "SM_NA.tabular", "VM_NA.tabular", "One_class", "class", "Blanks", "VM_oneclass_NA.txt", 
+#"Barplots_and_Boxplots")
+
+  
\ No newline at end of file
b
diff -r 000000000000 -r c2c2e1be904a Intchecks/wrapper_intensity_check.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Intchecks/wrapper_intensity_check.R Thu Oct 11 05:33:19 2018 -0400
[
@@ -0,0 +1,49 @@
+#!/usr/bin/Rscript --vanilla --slave --no-site-file
+
+####################################################################
+# WRAPPER for INTENSITY CHECK
+#Script: Script_intensity_check_v1.R
+#Xml: xml_intensity_check_v1.xml
+#
+# V1: Fold and NA
+#
+# Input: Data Matrix, VariableMetadata, SampleMetadata
+# Output: VariableMetadata, Graphics (barplots and boxplots) 
+#
+#
+####################################################################
+
+
+library(batch) #necessary for parseCommandArgs function
+args = parseCommandArgs(evaluate=FALSE) #interpretation of arguments given in command line as an R list of objects
+
+source_local <- function(...){
+  argv <- commandArgs(trailingOnly = FALSE)
+  base_dir <- dirname(substring(argv[grep("--file=", argv)], 8))
+  for(i in 1:length(list(...))){source(paste(base_dir, list(...)[[i]], sep="/"))}
+}
+#Import the different functions
+source_local("Script_intensity_check.R", "RcheckLibrary.R")
+
+
+if(length(args) < 7){ stop("NOT enough argument !!!") }
+
+class1 <- NULL
+if(args$type == "One_class"){
+  class1 <- args$class1
+}
+
+#print args
+cat("\n-------------------------------\nIntensity Check parameters:\n\n")
+print(args)
+cat("--------------------------------\n")
+
+
+intens_check(args$dataMatrix_in, args$sampleMetadata_in, args$variableMetadata_in, args$type,
+          args$class_col, class1, args$variableMetadata_out, args$graphs_out)
+
+sessionInfo()
+cat("--------------------------------\n")
+
+#delete the parameters to avoid the passage to the next tool in .RData image
+rm(args)
b
diff -r 000000000000 -r c2c2e1be904a Intchecks/xml_intensity_check.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Intchecks/xml_intensity_check.xml Thu Oct 11 05:33:19 2018 -0400
b
@@ -0,0 +1,125 @@
+<tool id="intens_check" name="Intensity Check" version="1.0.1">
+ <description>Adding informations about intensities in the Variable metadata</description>
+ <requirements>
+ <requirement type="package" version="1.1_4">r-batch</requirement>
+ </requirements>
+ <command interpreter="Rscript">
+  
+ wrapper_intensity_check.R
+
+ dataMatrix_in "$dataMatrix_in"
+ sampleMetadata_in "$sampleMetadata_in"
+ variableMetadata_in "$variableMetadata_in"
+
+ class_col "$class_col"
+
+ type "${type_cond.type}"
+ #if $type_cond.type == "One_class" :
+ class1 "${type_cond.class1}"
+ #end if
+
+ variableMetadata_out "$variableMetadata_out"
+ graphs_out "$graphs_out"
+ </command>
+
+ <inputs>
+ <param name="dataMatrix_in" type="data" label="Data Matrix file" help="" format="tabular" />
+ <param name="sampleMetadata_in" type="data" label="Sample metadata file" help="" format="tabular" />
+ <param name="variableMetadata_in" type="data" label="Variable metadata file" help="" format="tabular" />
+
+ <param name="class_col" type="data_column" data_ref="sampleMetadata_in" use_header_names="true" label="Class column" help="Class column in Sample metadata" />
+
+ <conditional name="type_cond">
+ <param name="type" type="select" label="Type" display="radio" help="Which class do you want to test ?">
+ <option value="One_class">Tests between one class and the remaining samples </option>
+ <option value="Each_class">Tests for each class </option>
+ </param>
+ <when value="One_class">
+ <param name="class1" type="text" label="Selected class" help="This class is the numerator for the fold test" />
+ </when>
+ <when value="Each_class">
+ </when>
+ </conditional>
+
+ </inputs>
+
+ <outputs>
+ <data name="variableMetadata_out" label="IC_${variableMetadata_in.name}" format="tabular" />
+ <data name="graphs_out" label="IC_Graphs" format="pdf" />
+ </outputs>
+
+ <help>
+
+.. class:: infomark
+
+**Authors** 
+  | Anthony Fernandes - PFEM ; INRA 
+
+---------------------------------------------------
+
+========================
+Intensity Check
+========================
+
+-----------
+Description
+-----------
+
+This tool performs two tests: the fold calculation, the number and the proportion of missing values. 
+
+**Fold:**
+The test calculates the ratio between two classes.
+In the column name, the first class specified is the one used like numerator for the ratio.
+
+**Missing values:**
+This tool calculates the number and the proportion of missing values considering the class. 
+Missing values in numerical columns of data must be coded NA.
+
+**Two types of tests:**
+ | - Between **one class** and the remaining samples: if you have only two classes or if you want to test only one class.
+ | - **Each class**: if the column class contains at least three classes and you want to test each of them.
+
+-----------
+Input files
+-----------
+
++----------------------------+------------+
+| Parameter                  |   Format   |
++============================+============+
+| 1 : Data Matrix file       |   tabular  |
++----------------------------+------------+
+| 2 : Sample metadata file   |   tabular  |
++----------------------------+------------+
+| 3 : Variable metadata file |   tabular  |
++----------------------------+------------+
+
+----------
+Parameters
+----------
+
+**Class column**
+ | Select the class column in Sample metadata.
+
+**Type**
+ |  Two options:
+ |     - "One class" allows to perform tests on one class against the remaining samples.
+ |     - "Each class" allows to add several columns with the ratio and the number of missing values for each class.
+
+**Selected class**
+ | If the type is "one class", specify it to calculate the ratio and the number of missing values. 
+
+------------
+Output file
+------------
+
+**Variable metadata file**
+ | Contains the previous columns in variable metadata and the new ones with fold tests, numbers and proportion of missing values.
+
+**Graphs file**
+ | Contains barplots with the proportion of NA considering classes and boxplots with the folds values
+
+ </help>
+
+
+</tool>
+
\ No newline at end of file