annotate R/compareSignature_Galaxy.r @ 0:8c682b3a7c5b draft

Uploaded
author iarc
date Tue, 19 Apr 2016 03:07:11 -0400
parents
children 916846f73e25
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
1 #!/usr/bin/Rscript
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
2
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
3 #-----------------------------------#
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
4 # Author: Maude #
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
5 # Script: compareSignature_Galaxy.r #
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
6 # Last update: 29/10/15 #
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
7 #-----------------------------------#
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
8
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
9
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
10 #########################################################################################################################################
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
11 # Compare new signatures with published one using the cosine similarity method #
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
12 #########################################################################################################################################
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
13
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
14
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
#-------------------------------------------------------------------------------
# Print a usage message and exit if the required arguments are missing
#-------------------------------------------------------------------------------
# Only the trailing (user-supplied) command-line arguments are kept.
args <- commandArgs(trailingOnly = TRUE)

# Write the expected invocation to standard error, then terminate the script
# with exit status 1. Takes no arguments and never returns.
usage <- function() {
  msg <- paste0('Usage:\n',
                ' compareSignature_Galaxy.r Published_Signature New_Signature Output_Folder\n'
  )
  # stderr() is the portable connection; the "/dev/stderr" path only exists on
  # Unix-like systems.
  cat(msg, '\n', file = stderr())
  quit(status = 1)
}

# The script reads args[1], args[2] and args[3]; with fewer than three
# arguments the missing ones would silently become NA paths and fail much
# later, so stop here with the usage text instead.
if (length(args) < 3) {
  usage()
}
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
31
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
32
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
33 #-------------------------------------------------------------------------------
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
34 # Load library
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
35 #-------------------------------------------------------------------------------
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
36 suppressMessages(suppressWarnings(library(lsa)))
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
37 suppressMessages(suppressWarnings(library(ggplot2)))
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
38 suppressMessages(suppressWarnings(library(reshape)))
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
39
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
40 #-------------------------------------------------------------------------------
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
41 # Recover the arguments
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
42 #-------------------------------------------------------------------------------
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
# Positional arguments
published_signature_file <- args[1]  # matrix with the published signatures
unknown_signature_file <- args[2]    # matrix W from NMF whose signatures are compared
dir <- args[3]                       # html directory


#-------------------------------------------------------------------------------
# Set the variables
#-------------------------------------------------------------------------------
# Both output files are written inside the html directory
output_cosineRes <- paste0(dir, "/Similarity_Matrix.txt")  # similarity table
output_png <- paste0(dir, "/Similarity_Matrix.png")        # heatmap image
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
54
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
55
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
56 #-------------------------------------------------------------------------------
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
57 # Calculate the cosine similarity and represent it with a heatmap
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
58 #-------------------------------------------------------------------------------
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
# Published signatures
dataFrame1 <- read.table(published_signature_file, header = TRUE, sep = "\t")
# Remove the first three columns (Substitution Type, Trinucleotide Somatic, Mutation Type).
# drop = FALSE keeps a data frame (and the column name) even when only one
# signature column remains.
dataFrame1 <- dataFrame1[, 4:ncol(dataFrame1), drop = FALSE]
matrix1 <- as.matrix(dataFrame1)

# Unknown signatures
dataFrame2 <- read.table(unknown_signature_file, header = TRUE, sep = "\t")
# Remove the first two columns (alteration, context); drop = FALSE for the same
# reason as above, so a single new signature still yields a named column.
dataFrame2 <- dataFrame2[, 3:ncol(dataFrame2), drop = FALSE]
matrix2 <- as.matrix(dataFrame2)

# The two matrices are bound column-wise, so they must have the same number of
# rows; fail with an explicit message rather than the generic cbind() error.
if (nrow(matrix1) != nrow(matrix2)) {
  stop("Published and unknown signature matrices must have the same number of rows (",
       nrow(matrix1), " vs ", nrow(matrix2), ")", call. = FALSE)
}

# Combine the two matrices (published and unknown signatures)
input_matrix_cos <- cbind(matrix1, matrix2)
# Calculate the cosine similarity between every pair of columns
cosine_res <- cosine(input_matrix_cos)

# Keep only the comparison between the two matrices:
# rows = unknown signatures, columns = published signatures.
nbSign <- ncol(matrix1) + 1 # index of the first unknown signature in the combined matrix
# drop = FALSE keeps a matrix even when there is a single unknown (or published)
# signature, so the table below and the later reshaping see the same layout.
cosine_res_subset <- cosine_res[nbSign:nrow(cosine_res), seq_len(ncol(matrix1)), drop = FALSE]

# Save the matrix
write.table(cosine_res_subset, file = output_cosineRes, quote = FALSE, sep = "\t",
            col.names = TRUE, row.names = TRUE)
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
84
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
# Transform the matrix into the long format expected by ggplot2
cosineRes_subset_melt <- melt(cosine_res_subset)
# Rename the columns
colnames(cosineRes_subset_melt) <- c("Unknown_Signatures", "Published_Signatures", "Similarity")
# Reorder the published signatures so that they follow the matrix order:
# first turn the column into a character vector...
cosineRes_subset_melt$Published_Signatures <- as.character(cosineRes_subset_melt$Published_Signatures)
# ...then back into a factor whose levels are the reversed order of first
# appearance. The full column name is used on purpose: the shorter
# "Published_Signature" only resolves through `$` partial matching.
cosineRes_subset_melt$Published_Signatures <- factor(
  cosineRes_subset_melt$Published_Signatures,
  levels = rev(unique(cosineRes_subset_melt$Published_Signatures))
)
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
93
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
# Heatmap skeleton: one tile per (published, unknown) signature pair, filled
# according to the similarity on a yellow-to-red gradient.
p1 <- ggplot(cosineRes_subset_melt,
             aes(x = Published_Signatures, y = Unknown_Signatures, fill = Similarity)) +
  geom_tile(colour = "yellow") +
  scale_fill_gradientn(colours = c("yellow", "red")) +
  theme_classic()

# Descriptive axis labels are applied only for this particular reference file
if (basename(published_signature_file) == "Frequency-COSMICv72-Hupki.txt") {
  # Column name in the published matrix -> label shown on the axis
  signature_labels <- c(
    "Signature.1" = "(Age) Sign 1",
    "Signature.2" = "(AID/APOBEC) Sign 2",
    "Signature.3" = "(BRCA1/2) Sign 3",
    "Signature.4" = "(Smoking) Sign 4",
    "Signature.5" = "Sign 5",
    "Signature.6" = "(DNA MMR deficiency) Sign 6",
    "Signature.7" = "(UV) Sign 7",
    "Signature.8" = "Sign 8",
    "Signature.9" = "(IgG) Sign 9",
    "Signature.10" = "(pol e) Sign 10",
    "Signature.11" = "(temozolomide) Sign 11",
    "Signature.12" = "Sign 12",
    "Signature.13" = "(AID/APOBEC) Sign 13",
    "Signature.14" = "Sign 14",
    "Signature.15" = "(DNA MMR deficiency) Sign 15",
    "Signature.16" = "Sign 16",
    "Signature.17" = "Sign 17",
    "Signature.18" = "Sign 18",
    "Signature.19" = "Sign 19",
    "Signature.20" = "(DNA MMR deficiency) Sign 20",
    "Signature.21" = "Sign 21",
    "Signature.22" = "(AA) Sign 22",
    "Signature.23" = "Sign 23",
    "Signature.24" = "(Aflatoxin) Sign 24",
    "Signature.25" = "Sign 25",
    "Signature.26" = "(DNA MMR deficiency) Sign 26",
    "Signature.27" = "Sign 27",
    "Signature.28" = "Sign 28",
    "Signature.29" = "(Tobacco chewing) Sign 29",
    "Signature.30" = "Sign 30",
    "Signature.1.MEF" = "(AA) Sign 1 MEF",
    "Signature.2.MEF" = "(AID) Sign 2 MEF",
    "Signature.3.MEF" = "(BaP) Sign 3 MEF",
    "Signature.5.MEF" = "(MNNG) Sign 5 MEF"
  )
  p1 <- p1 + scale_x_discrete(breaks = names(signature_labels),
                              labels = unname(signature_labels))
}

# Swap the axes (horizontal becomes vertical and vice versa), then hide both
# axis lines.
p1 <- p1 +
  coord_flip() +
  theme(axis.line.x = element_blank(), axis.line.y = element_blank())
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
# Print the cosine value on a tile only if it is >= 0.9 once rounded to two decimals
cosResLabel <- subset(cosineRes_subset_melt, round(Similarity, digits = 2) >= 0.9)
# Skip the text layer entirely when no pair reaches the threshold
if (nrow(cosResLabel) > 0) {
  # The label refers to the Similarity column of the layer's own data instead
  # of pulling it out with `$`, which ggplot2 discourages inside aes().
  p1 <- p1 + geom_text(data = cosResLabel,
                       aes(x = Published_Signatures, y = Unknown_Signatures,
                           label = round(Similarity, 2)))
}

# Close any device left open, then render the heatmap to the PNG file using
# the cairo bitmap backend.
graphics.off()
options(bitmapType = 'cairo')
png(output_png, width = 3000, height = 2000, res = 300)
plot(p1)
invisible(dev.off())