Mercurial > repos > melpetera > intensity_checks
changeset 0:c2c2e1be904a draft
Uploaded
author | melpetera |
---|---|
date | Thu, 11 Oct 2018 05:33:19 -0400 |
parents | |
children | 4973a2104cfd |
files | Intchecks/RcheckLibrary.R Intchecks/Script_intensity_check.R Intchecks/wrapper_intensity_check.R Intchecks/xml_intensity_check.xml |
diffstat | 4 files changed, 562 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Intchecks/RcheckLibrary.R Thu Oct 11 05:33:19 2018 -0400 @@ -0,0 +1,124 @@ +###################################################### +# R check library +# Coded by: M.Petera, +# - - +# R functions to use in R scripts +# (management of various generic subroutines) +# - - +# V0: script structure + first functions +# V1: More detailed error messages in match functions +###################################################### + + +# Generic function to return an error if problems have been encountered - - - - + +check.err <- function(err.stock){ + + # err.stock = vector of results returned by check functions + + if(length(err.stock)!=0){ stop("\n- - - - - - - - -\n",err.stock,"\n- - - - - - - - -\n") } + +} + + + + +# Table match check functions - - - - - - - - - - - - - - - - - - - - - - - - - + +# To check if dataMatrix and (variable or sample)Metadata match regarding identifiers +match2 <- function(dataMatrix, Metadata, Mtype){ + + # dataMatrix = data.frame containing dataMatrix + # Metadata = data.frame containing sampleMetadata or variableMetadata + # Mtype = "sample" or "variable" depending on Metadata content + + err.stock <- NULL # error vector + + id2 <- Metadata[,1] + if(Mtype=="sample"){ id1 <- colnames(dataMatrix)[-1] } + if(Mtype=="variable"){ id1 <- dataMatrix[,1] } + + if( length(which(id1%in%id2))!=length(id1) || length(which(id2%in%id1))!=length(id2) ){ + err.stock <- c("\nData matrix and ",Mtype," metadata do not match regarding ",Mtype," identifiers.") + if(length(which(id1%in%id2))!=length(id1)){ + if(length(which(!(id1%in%id2)))<4){ err.stock <- c(err.stock,"\n The ") + }else{ err.stock <- c(err.stock,"\n For example, the ") } + err.stock <- c(err.stock,"following identifiers found in the data matrix\n", + " do not appear in the ",Mtype," metadata file:\n") + identif <- id1[which(!(id1%in%id2))][1:min(3,length(which(!(id1%in%id2))))] + err.stock <- c(err.stock," ",paste(identif,collapse="\n "),"\n") + } + if(length(which(id2%in%id1))!=length(id2)){ + if(length(which(!(id2%in%id1)))<4){ err.stock <- c(err.stock,"\n The ") + }else{ err.stock <- c(err.stock,"\n For example, the ") } + err.stock <- c(err.stock,"following identifiers found in the ",Mtype," metadata file\n", + " do not appear in the data matrix:\n") + identif <- id2[which(!(id2%in%id1))][1:min(3,length(which(!(id2%in%id1))))] + err.stock <- c(err.stock," ",paste(identif,collapse="\n "),"\n") + } + err.stock <- c(err.stock,"\nPlease check your data.\n") + } + + return(err.stock) + +} + +# To check if the 3 standard tables match regarding identifiers +match3 <- function(dataMatrix, sampleMetadata, variableMetadata){ + + # dataMatrix = data.frame containing dataMatrix + # sampleMetadata = data.frame containing sampleMetadata + # variableMetadata = data.frame containing variableMetadata + + err.stock <- NULL # error vector + + id1 <- colnames(dataMatrix)[-1] + id2 <- sampleMetadata[,1] + id3 <- dataMatrix[,1] + id4 <- variableMetadata[,1] + + if( length(which(id1%in%id2))!=length(id1) || length(which(id2%in%id1))!=length(id2) ){ + err.stock <- c(err.stock,"\nData matrix and sample metadata do not match regarding sample identifiers.") + if(length(which(id1%in%id2))!=length(id1)){ + if(length(which(!(id1%in%id2)))<4){ err.stock <- c(err.stock,"\n The ") + }else{ err.stock <- c(err.stock,"\n For example, the ") } + err.stock <- c(err.stock,"following identifiers found in the data matrix\n", + " do not appear in the sample metadata file:\n") + identif <- id1[which(!(id1%in%id2))][1:min(3,length(which(!(id1%in%id2))))] + err.stock <- c(err.stock," ",paste(identif,collapse="\n "),"\n") + } + if(length(which(id2%in%id1))!=length(id2)){ + if(length(which(!(id2%in%id1)))<4){ err.stock <- c(err.stock,"\n The ") + }else{ err.stock <- c(err.stock,"\n For example, the ") } + err.stock <- c(err.stock,"following identifiers found in the sample metadata file\n", + " do not appear in the data matrix:\n") + identif <- id2[which(!(id2%in%id1))][1:min(3,length(which(!(id2%in%id1))))] + err.stock <- c(err.stock," ",paste(identif,collapse="\n "),"\n") + } + } + + if( length(which(id3%in%id4))!=length(id3) || length(which(id4%in%id3))!=length(id4) ){ + err.stock <- c(err.stock,"\nData matrix and variable metadata do not match regarding variable identifiers.") + if(length(which(id3%in%id4))!=length(id3)){ + if(length(which(!(id3%in%id4)))<4){ err.stock <- c(err.stock,"\n The ") + }else{ err.stock <- c(err.stock,"\n For example, the ") } + err.stock <- c(err.stock,"following identifiers found in the data matrix\n", + " do not appear in the variable metadata file:\n") + identif <- id3[which(!(id3%in%id4))][1:min(3,length(which(!(id3%in%id4))))] + err.stock <- c(err.stock," ",paste(identif,collapse="\n "),"\n") + } + if(length(which(id4%in%id3))!=length(id4)){ + if(length(which(!(id4%in%id3)))<4){ err.stock <- c(err.stock,"\n The ") + }else{ err.stock <- c(err.stock,"\n For example, the ") } + err.stock <- c(err.stock,"following identifiers found in the variable metadata file\n", + " do not appear in the data matrix:\n") + identif <- id4[which(!(id4%in%id3))][1:min(3,length(which(!(id4%in%id3))))] + err.stock <- c(err.stock," ",paste(identif,collapse="\n "),"\n") + } + } + + if(length(err.stock)!=0){ err.stock <- c(err.stock,"\nPlease check your data.\n") } + + return(err.stock) + +} \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Intchecks/Script_intensity_check.R Thu Oct 11 05:33:19 2018 -0400 @@ -0,0 +1,264 @@ +#################################################################### +# SCRIPT INTENSITY CHECK +# V1: Fold and NA +# +# Input: Data Matrix, VariableMetadata, SampleMetadata +# Output: VariableMetadata, Graphics (barplots and boxplots) +# +# Dependencies: RcheckLibrary.R +# +#################################################################### + + +# Parameters (for dev) +if(FALSE){ + + rm(list = ls()) + setwd("Y:\\Developpement\\Intensity check\\Pour tests") + + DM.name <- "DM_NA.tabular" + SM.name <- "SM_NA.tabular" + VM.name <- "vM_NA.tabular" + type <- "One_class" + class.col <- "2" + class1 <- "Blanks" + VM.output <- "new_VM.txt" + graphs.output <- "Barplots_and_Boxplots.pdf" +} + +intens_check <- function(DM.name, SM.name, VM.name, type, class.col, class1, VM.output, graphs.output){ + # This function allows to check the intensities considering classes with a fold calculation, the number and + # the proportion of NA + # + # Two options: + # - one class (selected by the user) against all the remaining samples ("One_class") + # - tests on each class ("Each_class") + # + # Parameters: + # DM.name, SM.name, VM.name: dataMatrix, sampleMetadata, variableMetadata files access + # type: "One_class" or "Each_class" + # class.col: number of the sampleMetadata's column with classes + # class1: name of the class if type="One_class" + # VM.output: output file's access (VM with the new columns) + # graphs.output: pdf with barplots for the proportion of NA and boxplots with the folds values + + + + + # Input --------------------------------------------------------- + + DM <- read.table(DM.name, header=TRUE, sep="\t", check.names=FALSE) + SM <- read.table(SM.name, header=TRUE, sep="\t", check.names=FALSE) + VM <- read.table(VM.name, header=TRUE, sep="\t", check.names=FALSE) + + + + # Table match check with Rchecklibrary + table.check <- match3(DM, SM, VM) + check.err(table.check) + + + rownames(DM) <- DM[,1] + var_names <- DM[,1] + DM <- DM[,-1] + DM <- data.frame(t(DM)) + + class.col <- colnames(SM)[as.numeric(class.col)] + + # check class.col, class1 and the number of classes--------------- + + if(!(class.col %in% colnames(SM))){ + stop("\n- - - - - - - - -\n", "The column ",class.col, " is not a part of the specify sample Metadata","\n- - - - - - - - -\n") + } + + c_class <- SM[,class.col] + c_class <- as.factor(c_class) + nb_class <- nlevels(c_class) + classnames <- levels(c_class) + + + if((nb_class > (nrow(SM))/3)){ + class.err <- c("\n There are too many classes, think about reducing the number of classes and excluding those + with few samples \n") + cat(class.err) + } + + if(type == "One_class"){ + if(!(class1 %in% classnames)){ + list.class1 <- c("\n Classes:",classnames,"\n") + cat(list.class1) + err.class1 <- c("The class ",class1, " does not appear in the column ", class.col) + stop("\n- - - - - - - - -\n", err.class1,"\n- - - - - - - - -\n") + } + } + + #If type is "one_class", change others classes in "other" + if(type == "One_class"){ + for(i in 1:length(c_class)){ + if(c_class[i]!=class1){ + c_class <- as.character(c_class) + c_class[i] <- "Other" + c_class <- as.factor(c_class) + nb_class <- nlevels(c_class) + classnames <- levels(c_class) + + } + } + } + + DM <- cbind(DM,c_class) + + # fold ------------------------------------------------------- + n <- 1 + fold <- data.frame() + for(j in 1:(nb_class-1)){ + for(k in (j+1):nb_class) { + for (i in 1:(length(DM)-1)){ + fold[i,n] <- mean(DM[which(DM$c_class==classnames[k]),i], na.rm=TRUE)/ + mean(DM[which(DM$c_class==classnames[j]),i], na.rm=TRUE)} + names(fold)[n] <- paste("fold",classnames[k],"VS", classnames[j], sep="_") + n <- n + 1} + } + + # NA --------------------------------------------------------- + calcul_NA <- data.frame() + pct_NA <- data.frame() + for (i in 1:(length(DM)-1)){ + for (j in 1:nb_class){ + n <- 0 + new_DM <- DM[which(DM$c_class==classnames[j]),i] + for(k in 1:length(new_DM)){ + if (is.na(new_DM[k])){ + n <- n + 1} + calcul_NA[i,j] <- n + pct_NA[i,j] <- (calcul_NA[i,j]/length(new_DM))*100} + } + } + names(calcul_NA) <- paste("Nb_NA",classnames, sep="_") + names(pct_NA) <- paste("Pct_NA", classnames, sep="_") + + # Alert message if there is no NA + + sumNA <- colSums(calcul_NA) + sum_total <- sum(sumNA) + + alerte <- NULL + if(sum_total==0){ + alerte <- c(alerte, "Data Matrix contains no NA.\n") + } + + if(length(alerte) != 0){ + cat(alerte,"\n") + } + + table_NA <- cbind(calcul_NA, pct_NA) + + # check columns names ---------------------------------------- + + # Fold + VM.names <- colnames(VM) + fold.names <- colnames(fold) + + for (i in 1:length(VM.names)){ + for (j in 1:length(fold.names)){ + if (VM.names[i]==fold.names[j]){ + fold.names[j] <- paste(fold.names[j],"2", sep="_") + } + } + } + colnames(fold) <- fold.names + + # NA + NA.names <- colnames(table_NA) + + for (i in 1:length(VM.names)){ + for (j in 1:length(NA.names)){ + if (VM.names[i]==NA.names[j]){ + NA.names[j] <- paste(NA.names[j],"2", sep="_") + } + } + } + colnames(table_NA) <- NA.names + + #for NA barplots --------------------------------------------- + + data_bp <- data.frame() + + for (j in 1:ncol(pct_NA)){ + Nb_NA_0_20 <- 0 + Nb_NA_20_40 <- 0 + Nb_NA_40_60 <- 0 + Nb_NA_60_80 <- 0 + Nb_NA_80_100 <- 0 + for (i in 1:nrow(pct_NA)){ + + if ((0<=pct_NA[i,j])&(pct_NA[i,j]<20)){ + Nb_NA_0_20=Nb_NA_0_20+1 + } + + if ((20<=pct_NA[i,j])&(pct_NA[i,j]<40)){ + Nb_NA_20_40=Nb_NA_20_40+1} + + if ((40<=pct_NA[i,j])&(pct_NA[i,j]<60)){ + Nb_NA_40_60=Nb_NA_40_60+1} + + if ((60<=pct_NA[i,j])&(pct_NA[i,j]<80)){ + Nb_NA_60_80=Nb_NA_60_80+1} + + if ((80<=pct_NA[i,j])&(pct_NA[i,j]<=100)){ + Nb_NA_80_100=Nb_NA_80_100+1} + } + data_bp[1,j] <- Nb_NA_0_20 + data_bp[2,j] <- Nb_NA_20_40 + data_bp[3,j] <- Nb_NA_40_60 + data_bp[4,j] <- Nb_NA_60_80 + data_bp[5,j] <- Nb_NA_80_100 + } + rownames(data_bp) <- c("0-20%", "20-40%", "40-60%", "60-80%", "80-100%") + colnames(data_bp) <- classnames + data_bp <- as.matrix(data_bp) + + + # Output-------------------------------------------------------- + + VM <- cbind(VM,fold,table_NA) + + write.table(VM, VM.output,sep="\t", quote=FALSE, row.names=FALSE) + + #graphics pdf + + pdf(graphs.output) + + #Boxplots for NA + + par(mar=c(5.1, 4.1, 4.1, 8.1), xpd=TRUE) + + bp=barplot(data_bp, col=rainbow(nrow(data_bp)), main="Proportion of NA", xlab="Classes", ylab="Variables") + legend("topright", fill=rainbow(nrow(data_bp)),rownames(data_bp), inset=c(-0.3,0)) + + + stock=0 + for (i in 1:nrow(data_bp)){ + text(bp, stock+data_bp[i,]/2, data_bp[i,], col="white") + stock <- stock+data_bp[i,] + } + + + #Boxplots for fold test + for (j in 1:ncol(fold)){ + title=paste(fold.names[j]) + boxplot(fold[j], main=title) + } + + dev.off() + +} + + +# Function call--------------- + +#setwd("Y:\\Developpement\\Intensity check\\Pour tests") +#intens_check("DM_NA.tabular", "SM_NA.tabular", "VM_NA.tabular", "One_class", "class", "Blanks", "VM_oneclass_NA.txt", +#"Barplots_and_Boxplots") + + \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Intchecks/wrapper_intensity_check.R Thu Oct 11 05:33:19 2018 -0400 @@ -0,0 +1,49 @@ +#!/usr/bin/Rscript --vanilla --slave --no-site-file + +#################################################################### +# WRAPPER for INTENSITY CHECK +#Script: Script_intensity_check_v1.R +#Xml: xml_intensity_check_v1.xml +# +# V1: Fold and NA +# +# Input: Data Matrix, VariableMetadata, SampleMetadata +# Output: VariableMetadata, Graphics (barplots and boxplots) +# +# +#################################################################### + + +library(batch) #necessary for parseCommandArgs function +args = parseCommandArgs(evaluate=FALSE) #interpretation of arguments given in command line as an R list of objects + +source_local <- function(...){ + argv <- commandArgs(trailingOnly = FALSE) + base_dir <- dirname(substring(argv[grep("--file=", argv)], 8)) + for(i in 1:length(list(...))){source(paste(base_dir, list(...)[[i]], sep="/"))} +} +#Import the different functions +source_local("Script_intensity_check.R", "RcheckLibrary.R") + + +if(length(args) < 7){ stop("NOT enough argument !!!") } + +class1 <- NULL +if(args$type == "One_class"){ + class1 <- args$class1 +} + +#print args +cat("\n-------------------------------\nIntensity Check parameters:\n\n") +print(args) +cat("--------------------------------\n") + + +intens_check(args$dataMatrix_in, args$sampleMetadata_in, args$variableMetadata_in, args$type, + args$class_col, class1, args$variableMetadata_out, args$graphs_out) + +sessionInfo() +cat("--------------------------------\n") + +#delete the parameters to avoid the passage to the next tool in .RData image +rm(args)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Intchecks/xml_intensity_check.xml Thu Oct 11 05:33:19 2018 -0400 @@ -0,0 +1,125 @@ +<tool id="intens_check" name="Intensity Check" version="1.0.1"> + <description>Adding informations about intensities in the Variable metadata</description> + <requirements> + <requirement type="package" version="1.1_4">r-batch</requirement> + </requirements> + <command interpreter="Rscript"> + + wrapper_intensity_check.R + + dataMatrix_in "$dataMatrix_in" + sampleMetadata_in "$sampleMetadata_in" + variableMetadata_in "$variableMetadata_in" + + class_col "$class_col" + + type "${type_cond.type}" + #if $type_cond.type == "One_class" : + class1 "${type_cond.class1}" + #end if + + variableMetadata_out "$variableMetadata_out" + graphs_out "$graphs_out" + </command> + + <inputs> + <param name="dataMatrix_in" type="data" label="Data Matrix file" help="" format="tabular" /> + <param name="sampleMetadata_in" type="data" label="Sample metadata file" help="" format="tabular" /> + <param name="variableMetadata_in" type="data" label="Variable metadata file" help="" format="tabular" /> + + <param name="class_col" type="data_column" data_ref="sampleMetadata_in" use_header_names="true" label="Class column" help="Class column in Sample metadata" /> + + <conditional name="type_cond"> + <param name="type" type="select" label="Type" display="radio" help="Which class do you want to test ?"> + <option value="One_class">Tests between one class and the remaining samples </option> + <option value="Each_class">Tests for each class </option> + </param> + <when value="One_class"> + <param name="class1" type="text" label="Selected class" help="This class is the numerator for the fold test" /> + </when> + <when value="Each_class"> + </when> + </conditional> + + </inputs> + + <outputs> + <data name="variableMetadata_out" label="IC_${variableMetadata_in.name}" format="tabular" /> + <data name="graphs_out" label="IC_Graphs" format="pdf" /> + </outputs> + + <help> + +.. class:: infomark + +**Authors** + | Anthony Fernandes - PFEM ; INRA + +--------------------------------------------------- + +======================== +Intensity Check +======================== + +----------- +Description +----------- + +This tool performs two tests: the fold calculation, the number and the proportion of missing values. + +**Fold:** +The test calculates the ratio between two classes. +In the column name, the first class specified is the one used like numerator for the ratio. + +**Missing values:** +This tool calculates the number and the proportion of missing values considering the class. +Missing values in numerical columns of data must be coded NA. + +**Two types of tests:** + | - Between **one class** and the remaining samples: if you have only two classes or if you want to test only one class. + | - **Each class**: if the column class contains at least three classes and you want to test each of them. + +----------- +Input files +----------- + ++----------------------------+------------+ +| Parameter | Format | ++============================+============+ +| 1 : Data Matrix file | tabular | ++----------------------------+------------+ +| 2 : Sample metadata file | tabular | ++----------------------------+------------+ +| 3 : Variable metadata file | tabular | ++----------------------------+------------+ + +---------- +Parameters +---------- + +**Class column** + | Select the class column in Sample metadata. + +**Type** + | Two options: + | - "One class" allows to perform tests on one class against the remaining samples. + | - "Each class" allows to add several columns with the ratio and the number of missing values for each class. + +**Selected class** + | If the type is "one class", specify it to calculate the ratio and the number of missing values. + +------------ +Output file +------------ + +**Variable metadata file** + | Contains the previous columns in variable metadata and the new ones with fold tests, numbers and proportion of missing values. + +**Graphs file** + | Contains barplots with the proportion of NA considering classes and boxplots with the folds values + + </help> + + +</tool> + \ No newline at end of file