diff R/chi2test_MutSpecStat_Galaxy.r @ 7:eda59b985b1c draft default tip

Uploaded
author iarc
date Mon, 13 Mar 2017 08:21:19 -0400 (2017-03-13)
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/R/chi2test_MutSpecStat_Galaxy.r	Mon Mar 13 08:21:19 2017 -0400
@@ -0,0 +1,173 @@
+#!/usr/bin/Rscript
+
+#---------------------------------------#
+# Author: Maude                         #
+# Script: chi2test_MutSpecStat_Galaxy.r #
+# Last update: 18/10/16                 #
+#---------------------------------------#
+
+
+#########################################################################################################################################
+#                                  					 					Calculate the chi2 test for the strand bias            		          		  				#
+#########################################################################################################################################
+
+#-------------------------------------------------------------------------------
+# Load library for recovering the arguments
+#-------------------------------------------------------------------------------
+suppressMessages(suppressWarnings(require("getopt")))
+
+
+#-------------------------------------------------------------------------------
+# Recover the arguments
+#-------------------------------------------------------------------------------
+spec = matrix(c(
+                "folderChi2", "folderChi2", 1, "character",
+                "help",       "h",          0, "logical"
+               ),
+               byrow=TRUE, ncol=4
+            )
+
+opt = getopt(spec)
+
+# No argument is pass to the command line
+if(length(opt) == 1)
+{
+  cat(paste("Usage:\n chi2test_MutSpecStat_Galaxy.r --folderChi2 <path_to_folder> \n",sep=""))
+  q(status=1)
+}
+
+# Help was asked for.
+if ( !is.null(opt$help) )
+{
+  # print a friendly message and exit with a non-zero error code
+  cat(paste("Usage:\n  chi2test_MutSpecStat_Galaxy.r --folderChi2 <path_to_folder> \n",sep=""))
+  q(status=1)
+}
+
+
+
+
+
+## Load the data. There is one column with the mutation type and the sample name but it's just for knowing what is corresponding to each line. The two columns with the number of variant per strand would be sufficient.
+inputChi2 <- paste0(opt$folderChi2, "/Input_chi2_strandBias.txt")
+strBias<-read.delim(inputChi2, dec=".")
+
+# Chi2
+pValChi2       <- c() # First I create an empty vector and then I apply a for on the data load
+pValChi2_round <- c() # Empty vector with the rounded p-values
+confInt        <- c() # Empty vector for the confident interval
+proportion     <- c() # Empty vector for the proportion of NonTr compared to the (NonTr+Tr)
+sampleSize     <- c() # Empty vector for the count of samples in NonTr and Tr
+# For Pool_Data save the p-values in a different vector for not having them for the FDR
+pValChi2_PoolData       <- c()
+pValChi2_PoolData_Round <- c()
+
+j = 1 # Timer for pValChi2_PoolData vector
+k = 1 # Timer for pValChi2
+
+for(i in 1:nrow(strBias))
+{
+	if(! sum(strBias[i,2:3]) == 0)
+	{
+		# For Pool_Data
+		if(strBias[i,1] == "Pool_Data")
+		{
+			pValChi2_PoolData[j] <- prop.test(x=strBias[i,2],n=sum(strBias[i,2:3]),p=0.5)$p.value
+			j <- j+1
+		}
+		# For the other sample(s)
+		else
+		{
+			# Calculate the p-value
+			pValChi2[k] <- prop.test(x=strBias[i,2],n=sum(strBias[i,2:3]),p=0.5)$p.value
+			k <- k+1
+		}
+
+		# Calculate the confidence interval
+		temp       <- prop.test(x=strBias[i,2],n=sum(strBias[i,2:3]),p=0.5)$conf.int
+		confInt[i] <- paste0("[", round(temp[1],2), "-", round(temp[2],2), "]") # Same as paste(sep="")
+
+		# Save the proportion
+		proportion[i] <- strBias[i,2] / sum(strBias[i,2:3])
+
+		# Save the sample size (count on NonTr and Tr)
+		sampleSize[i]  <- paste(strBias[i,2], strBias[i,3], sep="-")
+	} else
+	{
+		if(strBias[i,1] == "Pool_Data")
+		{
+			pValChi2_PoolData[j]       <- NA
+			pValChi2_PoolData_Round[j] <- NA
+			j <- j+1
+		}
+		else
+		{
+			# Not enough effective for the test
+			pValChi2[k]       <- NA
+			pValChi2_round[k] <- NA
+			k <- k+1
+		}
+
+		confInt[i]        <- NA
+		proportion[i]     <- NA
+		sampleSize[i]     <- NA
+	}
+}
+
+# Adjust with FDR
+FDR<-p.adjust(pValChi2, method="BH")
+
+# Rount the p-value
+for(i in 1:nrow(strBias))
+{
+	pValChi2_round[i] <- format(pValChi2[i], scientific=T, digits=3)
+}
+
+# The option for the pool is specified
+if(!is.null(pValChi2_PoolData))
+{
+	# Round the p-value for Pool_Data
+	for(i in 1:6)
+	{
+		pValChi2_PoolData_Round[i] <- format(pValChi2_PoolData[i], scientific=T, digits=3)
+	}
+}
+
+
+# I create a dataframe for add what I want
+outputChi2 <- data.frame(round(strBias[,2]/strBias[,3], digits=2), sampleSize, round(proportion, 3), confInt)
+outputChi2$Mut.type   <- strBias$Alteration
+outputChi2$SampleName <- strBias$SampleName
+colnames(outputChi2)[1:6]<-c("Strand_Bias", "NonTr-Tr", "Proportion", "Confidence Interval", "Mutation_Type", "SampleName")
+
+# Transform the data frame into a matrix for adding the p-value for the samples and Pool_Data
+matrix         <- as.matrix(outputChi2)
+tempColPValFDR <- matrix(, nrow=length(sampleSize), ncol = 2) # Create an empty matrix with 2 columns for adding the p-value and the FDR
+matrix         <- cbind(matrix, tempColPValFDR)
+j = 1 # Timer for all the sample
+k = 1 # Timer for Pool_Data
+for(i in 1:nrow(matrix))
+{
+	if(matrix[i,6] == "Pool_Data")
+	{
+	  matrix[i,7] <- pValChi2_PoolData_Round[k]
+	  matrix[i,8] <- "NA" # No FDR for Pool_Data
+	  k = k+1
+	}
+	else
+	{
+	  matrix[i,7] <- pValChi2_round[j]
+	  matrix[i,8] <- round(FDR[j], 3)
+	  j = j+1
+	}
+}
+
+# Reorder the columns
+matrix <- cbind(matrix[,1:3], matrix[,7], matrix[,8], matrix[,4:6])
+colnames(matrix)[4] <- "P-val-Chi2"
+colnames(matrix)[5] <- "FDR"
+
+# Export the file
+# dec=".": Set the separator for the decimal by "."
+outputFileChi2 <- paste0(opt$folderChi2, "/Output_chi2_strandBias.txt")
+write.table(matrix,file=outputFileChi2,quote = FALSE,sep="\t",row.names = FALSE,dec=".")