view R/compareSignature_Galaxy.r @ 0:8c682b3a7c5b draft

Uploaded
author iarc
date Tue, 19 Apr 2016 03:07:11 -0400
parents
children 916846f73e25
line wrap: on
line source

#!/usr/bin/Rscript

#-----------------------------------#
# Author: Maude                     #
# Script: compareSignature_Galaxy.r #
# Last update: 29/10/15             #
#-----------------------------------#


#########################################################################################################################################
#                                   Compare new signatures with published one using the cosine similarity method                        #
#########################################################################################################################################


#-------------------------------------------------------------------------------
# Print a usage message if there is no argument pass to the command line
#-------------------------------------------------------------------------------
args <- commandArgs(TRUE)
usage <- function() 
{
  msg <- paste0('Usage:\n',
               ' compareSignature_Galaxy.r Published_Signature New_Signature Output_Folder\n'
              )
  cat(msg, '\n', file="/dev/stderr")
  quit(status=1)
}

input = args[length(args)]

if (length(args) == 0) { usage() }


#-------------------------------------------------------------------------------
# Load library
#-------------------------------------------------------------------------------
suppressMessages(suppressWarnings(library(lsa)))
suppressMessages(suppressWarnings(library(ggplot2)))
suppressMessages(suppressWarnings(library(reshape)))

#-------------------------------------------------------------------------------
# Recover the arguments
#-------------------------------------------------------------------------------
published_signature_file <- args[1] # The matrix with the published signatures
unknown_signature_file   <- args[2] # The matrix W from NMF from which we want to compare the signatures
dir								       <- args[3] # html directory


#-------------------------------------------------------------------------------
# Set the variables
#-------------------------------------------------------------------------------
# Create the outputs
output_cosineRes <- paste0(dir, "/Similarity_Matrix.txt")
output_png       <- paste0(dir, "/Similarity_Matrix.png")


#-------------------------------------------------------------------------------
# Calculate the cosine similarity and represent it with a heatmap
#-------------------------------------------------------------------------------
# Published signatures
dataFrame1 <- read.table(published_signature_file, header=T, sep="\t")
# Remove the first three colmumns (Substitution Type, Trinucleotide  Somatic, Mutation Type)
dataFrame1 <- dataFrame1[,4:ncol(dataFrame1)]
matrix1    <- as.matrix(dataFrame1)
  
# Unkown signatures
dataFrame2 <- read.table(unknown_signature_file, header=T, sep="\t")
# Remove the first two columns (alteration, context)
dataFrame2 <- dataFrame2[,3:ncol(dataFrame2)]
matrix2    <- as.matrix(dataFrame2)
# Recover the number of new signatures
NbNewSignature   <- ncol(dataFrame2) - 1

# Combined the two matrices (published and unknown signatures)
input_matrix_cos <- cbind(matrix1, matrix2)
# Calculate the cosine similarity
cosine_res <- cosine(input_matrix_cos)

# Keep only the comparison between the two matrices
nbSign            <- ncol(matrix1)+1 # +1 for havng the first signature of the matrix1
cosine_res_subset <- cosine_res[nbSign:nrow(cosine_res), 1:ncol(matrix1)]
  
# Save the matrix
write.table(cosine_res_subset, file=output_cosineRes, quote=F, sep="\t", col.names=T, row.names=T)

# Transform the matrix in a suitable format for ggplot2
cosineRes_subset_melt <- melt(cosine_res_subset)
# Rename the columns
colnames(cosineRes_subset_melt) <- c("Unknown_Signatures", "Published_Signatures", "Similarity")
# Reorder the Signature for having the same order as in the matrix. Turn your 'signature' column into a character vector
cosineRes_subset_melt$Published_Signatures <- as.character(cosineRes_subset_melt$Published_Signatures)
#Then turn it back into an ordered factor
cosineRes_subset_melt$Published_Signatures <- factor(cosineRes_subset_melt$Published_Signatures, levels=rev(unique(cosineRes_subset_melt$Published_Signature)))
  
# Base plot: heatmap
p1 <- ggplot(cosineRes_subset_melt, aes(x=Published_Signatures, y=Unknown_Signatures, fill=Similarity)) + geom_tile(colour="yellow") +scale_fill_gradientn(colours=c("yellow", "red")) + theme_classic()
  
# Rename the signatures
if(basename(published_signature_file) == "Frequency-COSMICv72-Hupki.txt")
{
  p1 <- p1 + scale_x_discrete(breaks = c("Signature.1", "Signature.2", "Signature.3", "Signature.4", "Signature.5", "Signature.6", "Signature.7", "Signature.8", "Signature.9",
                                         "Signature.10", "Signature.11", "Signature.12", "Signature.13", "Signature.14", "Signature.15", "Signature.16", "Signature.17",
                                         "Signature.18", "Signature.19", "Signature.20", "Signature.21", "Signature.22", "Signature.23", "Signature.24", "Signature.25",
                                         "Signature.26", "Signature.27", "Signature.28", "Signature.29", "Signature.30",
                                         "Signature.1.MEF", "Signature.2.MEF", "Signature.3.MEF", "Signature.5.MEF"),
                              labels = c("(Age) Sign 1", "(AID/APOBEC) Sign 2", "(BRCA1/2) Sign 3", "(Smoking) Sign 4", "Sign 5", "(DNA MMR deficiency) Sign 6", "(UV) Sign 7",
                                         "Sign 8", "(IgG) Sign 9", "(pol e) Sign 10", "(temozolomide) Sign 11", "Sign 12", "(AID/APOBEC) Sign 13", "Sign 14",
                                         "(DNA MMR deficiency) Sign 15", "Sign 16", "Sign 17", "Sign 18", "Sign 19", "(DNA MMR deficiency) Sign 20", "Sign 21", "(AA) Sign 22",
                                         "Sign 23", "(Aflatoxin) Sign 24", "Sign 25", "(DNA MMR deficiency) Sign 26", "Sign 27", "Sign 28", "(Tobacco chewing) Sign 29", "Sign 30",
                                         "(AA) Sign 1 MEF", "(AID) Sign 2 MEF", "(BaP) Sign 3 MEF", "(MNNG) Sign 5 MEF")
                              )
}

# Flipped cartesian coordinates so that horizontal becomes vertical, and vertical, horizontal
p1 <- p1 +  coord_flip()
# Remove the x axis line
p1 <- p1 + theme(axis.line.x=element_blank(), axis.line.y=element_blank())
# Add the cosine value only if >= 0.9
cosResLabel <- subset(cosineRes_subset_melt, round(cosineRes_subset_melt$Similarity, digits=2) >= 0.9) # Subset the data for keeping only the values greater thant 0.9
p1 <- p1 + geom_text(data = cosResLabel, aes(x = Published_Signatures, y = Unknown_Signatures, label = round(cosResLabel$Similarity, 2)))

graphics.off()
options(bitmapType='cairo')
png(output_png, width=3000, height=2000, res=300)
plot(p1)
invisible( dev.off() )