view R/chi2test_MutSpecStat_Galaxy.r @ 7:eda59b985b1c draft default tip

Uploaded
author iarc
date Mon, 13 Mar 2017 08:21:19 -0400
parents
children
line wrap: on
line source

#!/usr/bin/Rscript

#---------------------------------------#
# Author: Maude                         #
# Script: chi2test_MutSpecStat_Galaxy.r #
# Last update: 18/10/16                 #
#---------------------------------------#


#########################################################################################################################################
#                                  					 					Calculate the chi2 test for the strand bias            		          		  				#
#########################################################################################################################################

#-------------------------------------------------------------------------------
# Load library for recovering the arguments
#-------------------------------------------------------------------------------
suppressMessages(suppressWarnings(require("getopt")))


#-------------------------------------------------------------------------------
# Recover the arguments
#-------------------------------------------------------------------------------
spec = matrix(c(
                "folderChi2", "folderChi2", 1, "character",
                "help",       "h",          0, "logical"
               ),
               byrow=TRUE, ncol=4
            )

opt = getopt(spec)

# No argument is pass to the command line
if(length(opt) == 1)
{
  cat(paste("Usage:\n chi2test_MutSpecStat_Galaxy.r --folderChi2 <path_to_folder> \n",sep=""))
  q(status=1)
}

# Help was asked for.
if ( !is.null(opt$help) )
{
  # print a friendly message and exit with a non-zero error code
  cat(paste("Usage:\n  chi2test_MutSpecStat_Galaxy.r --folderChi2 <path_to_folder> \n",sep=""))
  q(status=1)
}





## Load the data. There is one column with the mutation type and the sample name but it's just for knowing what is corresponding to each line. The two columns with the number of variant per strand would be sufficient.
inputChi2 <- paste0(opt$folderChi2, "/Input_chi2_strandBias.txt")
strBias<-read.delim(inputChi2, dec=".")

# Chi2
pValChi2       <- c() # First I create an empty vector and then I apply a for on the data load
pValChi2_round <- c() # Empty vector with the rounded p-values
confInt        <- c() # Empty vector for the confident interval
proportion     <- c() # Empty vector for the proportion of NonTr compared to the (NonTr+Tr)
sampleSize     <- c() # Empty vector for the count of samples in NonTr and Tr
# For Pool_Data save the p-values in a different vector for not having them for the FDR
pValChi2_PoolData       <- c()
pValChi2_PoolData_Round <- c()

j = 1 # Timer for pValChi2_PoolData vector
k = 1 # Timer for pValChi2

for(i in 1:nrow(strBias))
{
	if(! sum(strBias[i,2:3]) == 0)
	{
		# For Pool_Data
		if(strBias[i,1] == "Pool_Data")
		{
			pValChi2_PoolData[j] <- prop.test(x=strBias[i,2],n=sum(strBias[i,2:3]),p=0.5)$p.value
			j <- j+1
		}
		# For the other sample(s)
		else
		{
			# Calculate the p-value
			pValChi2[k] <- prop.test(x=strBias[i,2],n=sum(strBias[i,2:3]),p=0.5)$p.value
			k <- k+1
		}

		# Calculate the confidence interval
		temp       <- prop.test(x=strBias[i,2],n=sum(strBias[i,2:3]),p=0.5)$conf.int
		confInt[i] <- paste0("[", round(temp[1],2), "-", round(temp[2],2), "]") # Same as paste(sep="")

		# Save the proportion
		proportion[i] <- strBias[i,2] / sum(strBias[i,2:3])

		# Save the sample size (count on NonTr and Tr)
		sampleSize[i]  <- paste(strBias[i,2], strBias[i,3], sep="-")
	} else
	{
		if(strBias[i,1] == "Pool_Data")
		{
			pValChi2_PoolData[j]       <- NA
			pValChi2_PoolData_Round[j] <- NA
			j <- j+1
		}
		else
		{
			# Not enough effective for the test
			pValChi2[k]       <- NA
			pValChi2_round[k] <- NA
			k <- k+1
		}

		confInt[i]        <- NA
		proportion[i]     <- NA
		sampleSize[i]     <- NA
	}
}

# Adjust with FDR
FDR<-p.adjust(pValChi2, method="BH")

# Rount the p-value
for(i in 1:nrow(strBias))
{
	pValChi2_round[i] <- format(pValChi2[i], scientific=T, digits=3)
}

# The option for the pool is specified
if(!is.null(pValChi2_PoolData))
{
	# Round the p-value for Pool_Data
	for(i in 1:6)
	{
		pValChi2_PoolData_Round[i] <- format(pValChi2_PoolData[i], scientific=T, digits=3)
	}
}


# I create a dataframe for add what I want
outputChi2 <- data.frame(round(strBias[,2]/strBias[,3], digits=2), sampleSize, round(proportion, 3), confInt)
outputChi2$Mut.type   <- strBias$Alteration
outputChi2$SampleName <- strBias$SampleName
colnames(outputChi2)[1:6]<-c("Strand_Bias", "NonTr-Tr", "Proportion", "Confidence Interval", "Mutation_Type", "SampleName")

# Transform the data frame into a matrix for adding the p-value for the samples and Pool_Data
matrix         <- as.matrix(outputChi2)
tempColPValFDR <- matrix(, nrow=length(sampleSize), ncol = 2) # Create an empty matrix with 2 columns for adding the p-value and the FDR
matrix         <- cbind(matrix, tempColPValFDR)
j = 1 # Timer for all the sample
k = 1 # Timer for Pool_Data
for(i in 1:nrow(matrix))
{
	if(matrix[i,6] == "Pool_Data")
	{
	  matrix[i,7] <- pValChi2_PoolData_Round[k]
	  matrix[i,8] <- "NA" # No FDR for Pool_Data
	  k = k+1
	}
	else
	{
	  matrix[i,7] <- pValChi2_round[j]
	  matrix[i,8] <- round(FDR[j], 3)
	  j = j+1
	}
}

# Reorder the columns
matrix <- cbind(matrix[,1:3], matrix[,7], matrix[,8], matrix[,4:6])
colnames(matrix)[4] <- "P-val-Chi2"
colnames(matrix)[5] <- "FDR"

# Export the file
# dec=".": Set the separator for the decimal by "."
outputFileChi2 <- paste0(opt$folderChi2, "/Output_chi2_strandBias.txt")
write.table(matrix,file=outputFileChi2,quote = FALSE,sep="\t",row.names = FALSE,dec=".")