Mercurial > repos > bornea > saint_bubblebeam
comparison bubbles_v9_NSAF_natural_log.R @ 25:ab602bbf4ac5 draft
Uploaded
author | bornea |
---|---|
date | Fri, 29 Jan 2016 09:33:58 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
24:64b822045467 | 25:ab602bbf4ac5 |
---|---|
1 rm(list = ls()) | |
2 ################################################################################################### | |
3 # R-code: Multi-bubble graph generation from SAINTexpress output | |
4 # Author: Brent Kuenzi | |
5 ################################################################################################### | |
6 # This Script generates the bubble graphs based upon Saint output. | |
7 ################################################################################################### | |
8 # Copyright (C) Brent Kuenzi. | |
9 # Permission is granted to copy, distribute and/or modify this document | |
10 # under the terms of the GNU Free Documentation License, Version 1.3 | |
11 # or any later version published by the Free Software Foundation; | |
12 # with no Invariant Sections, no Front-Cover Texts, and no Back-Cover Texts. | |
13 # A copy of the license is included in the section entitled "GNU | |
14 # Free Documentation License". | |
15 ################################################################################################### | |
16 ## REQUIRED INPUT ## | |
17 | |
18 # 1) listfile: SAINTexpress generated "list.txt" file | |
19 # 2) preyfile: SAINT pre-processing generated "prey.txt" file used to run SAINTexpress | |
20 | |
21 ## OPTIONAL INPUT ## | |
22 | |
23 # 3) crapome: raw output from crapome Workflow 1 query (http://www.crapome.org) | |
24 # 4) color: bubble color (default = "red") | |
25 # - color = "crapome": color bubbles based on Crapome(%) | |
26 # - Also recognizes any color within R's built-in colors() vector | |
27 # 5) label: Adds gene name labels to bubbles within the "zoomed in" graphs (default = FALSE) | |
28 # 6) cutoff: Saintscore cutoff to be assigned for filtering the "zoomed in" graphs (default = 0.8) | |
29 # 7) type: Specifies if the data is MaxQuant (MQ) or Scaffold (SC) data (default = "SC") | |
30 # 8) inc_file: Selects only the uniprot ids in the provided list (default ="None") | |
31 # 9) exc_file: Removes the proteins in the list (default = "None") | |
32 ################################################################################################### | |
33 | |
34 | |
35 ins_check_run <- function(){ | |
36 if ('dplyr' %in% rownames(installed.packages())){} | |
37 else { | |
38 install.packages('dplyr', repos = 'http://cran.us.r-project.org') | |
39 } | |
40 if ('tidyr' %in% rownames(installed.packages())){} | |
41 else { | |
42 install.packages('tidyr', repos = 'http://cran.us.r-project.org') | |
43 } | |
44 if ('ggplot2' %in% rownames(installed.packages())){} | |
45 else { | |
46 install.packages('ggplot2', repos = 'http://cran.us.r-project.org') | |
47 } | |
48 } | |
49 | |
50 ins_check_run() | |
51 library(dplyr); library(tidyr); library(ggplot2) | |
52 | |
53 | |
54 main <- function(listfile, preyfile, crapome = FALSE, color = "red", label = FALSE, cutoff = 0.8, type = "SC", inc_file = "None", exc_file = "None" ) { | |
55 cutoff_check(cutoff) | |
56 listfile <- list_type(listfile, inc_file, exc_file) | |
57 if(type == "SC") { | |
58 df <- merge_files_sc(listfile, preyfile, crapome) | |
59 } | |
60 if(type == "MQ") { | |
61 df <- merge_files_mq(listfile, preyfile, crapome) | |
62 } | |
63 bubble_NSAF(df, color) | |
64 bubble_SAINT(df, color) | |
65 bubble_zoom_SAINT(df, color, label, cutoff) | |
66 bubble_zoom_NSAF(df, color, label, cutoff) | |
67 write.table(df, "output.txt", sep = "\t", quote = FALSE, row.names = FALSE) | |
68 } | |
69 ################################################################################################### | |
70 # Include and Exclude list filtering | |
71 ################################################################################################### | |
72 list_type <- function(df, inc_file, exc_file) { | |
73 Saint <- read.delim(df, stringsAsFactors = FALSE) | |
74 if (inc_file != "None") { | |
75 if (exc_file == "None") { | |
76 inc_prots <- read.delim(inc_file, sep = '\t', header = FALSE, stringsAsFactors = FALSE) | |
77 print(inc_prots[, 1]) | |
78 print(Saint$Prey) | |
79 filtered_df = subset(Saint, Saint$Prey == inc_prots[, 1]) | |
80 } | |
81 else { | |
82 inc_prots <- read.delim(inc_file, sep = '\t', header = FALSE, stringsAsFactors = FALSE) | |
83 exc_prots <- read.delim(exc_file, sep = '\t', header = FALSE, stringsAsFactors = FALSE) | |
84 filtered_df = subset(Saint, Saint$Prey == inc_prots[, 1]) | |
85 filtered_df = subset(filtered_df, filtered_df$Prey != exc_prots[, 1]) | |
86 } | |
87 } | |
88 else if (exc_file != "None") { | |
89 exc_prots <- read.delim(exc_file, sep = '\t', header = FALSE, stringsAsFactors = FALSE) | |
90 filtered_df = subset(Saint, Saint$Prey != exc_prots[, 1]) | |
91 } | |
92 else { | |
93 filtered_df = Saint | |
94 } | |
95 return(filtered_df) | |
96 | |
97 } | |
98 ################################################################################################### | |
99 # Merge input files and caculate Crapome(%) and NSAF for each protein for each bait | |
100 ################################################################################################### | |
101 merge_files_mq <- function(SAINT, prey_DF, crapome = FALSE) { | |
102 #SAINT <- read.table(SAINT_DF, sep = '\t', header = TRUE) | |
103 #Some of these read.table()'s don't use stringsAsFactors = FALSE. Is this on purpose? Factors give rise to some really weird and unpredictable behavior; suggest always using stringsAsFactors = FALSE | |
104 prey <- read.table(prey_DF, sep = '\t', header = FALSE); colnames(prey) <- c("Prey", "Length", "PreyGene") | |
105 DF <- merge(SAINT, prey) | |
106 DF$SpecSum <- log2(DF$SpecSum) | |
107 | |
108 if(crapome != FALSE) { | |
109 crapome <- read.table(crapome, sep = '\t', header = TRUE) | |
110 colnames(crapome) <- c("Prey", "Symbol", "Num.of.Exp", "Ave.SC", "Max.SC") | |
111 DF1 <- merge(DF, crapome); as.character(DF1$Num.of.Exp); DF1$Symbol <- NULL; | |
112 DF1$Ave.SC <- NULL; DF1$Max.SC <- NULL # Removes unnecessary columns. | |
113 DF1$Num.of.Exp <- sub("^$", "0 / 1", DF1$Num.of.Exp ) # Replace blank values with 0 / 1. | |
114 DF <- DF1 %>% separate(Num.of.Exp, c("NumExp", "TotalExp"), " / ") # Split into 2 columns. | |
115 DF$CrapomePCT <- 100 - (as.integer(DF$NumExp) / as.integer(DF$TotalExp) * 100) # Calculate the crapome %. | |
116 } | |
117 DF$SAF <- DF$AvgSpec / DF$Length | |
118 DF2 = DF %>% group_by(Bait) %>% mutate(NSAF = SAF/sum(SAF)) | |
119 DF$NSAF = DF2$NSAF | |
120 return(DF) | |
121 } | |
122 | |
123 merge_files_sc <- function(SAINT, prey_DF, crapome = FALSE) { | |
124 #SAINT <- read.table(SAINT_DF, sep = '\t', header = TRUE) | |
125 prey <- read.table(prey_DF, sep = '\t', header = FALSE); colnames(prey) <- c("Prey", "Length", "PreyGene") | |
126 DF <- merge(SAINT, prey) | |
127 | |
128 if(crapome != FALSE) { | |
129 crapome <- read.table(crapome, sep = '\t', header = TRUE) | |
130 colnames(crapome) <- c("Prey", "Symbol", "Num.of.Exp", "Ave.SC", "Max.SC") | |
131 DF1 <- merge(DF, crapome); as.character(DF1$Num.of.Exp); DF1$Symbol <- NULL; | |
132 DF1$Ave.SC <- NULL; DF1$Max.SC <- NULL # Removes unnecessary columns. | |
133 DF1$Num.of.Exp <- sub("^$", "0 / 1", DF1$Num.of.Exp ) # Replace blank values with 0 / 1. | |
134 DF <- DF1 %>% separate(Num.of.Exp, c("NumExp", "TotalExp"), " / ") # Split into 2 columns. | |
135 DF$CrapomePCT <- 100 - (as.integer(DF$NumExp) / as.integer(DF$TotalExp) * 100) # Calculate the crapome %. | |
136 } | |
137 DF$SAF <- DF$AvgSpec / DF$Length | |
138 DF2 = DF %>% group_by(Bait) %>% mutate(NSAF = SAF/sum(SAF)) | |
139 DF$NSAF = DF2$NSAF | |
140 return(DF) | |
141 } | |
142 ################################################################################################### | |
143 # Plot all proteins for each bait by x = ln(NSAF), y = Log2(FoldChange) | |
144 ################################################################################################### | |
145 bubble_NSAF <- function(data, color) { | |
146 if(color == "crapome") { | |
147 a <- subset(data, CrapomePCT < 80, select = c(NSAF, SpecSum, CrapomePCT, FoldChange, SaintScore, Bait)) | |
148 b <- subset(data, CrapomePCT >= 80, select = c(NSAF, SpecSum, CrapomePCT, FoldChange, SaintScore, Bait)) | |
149 p <- qplot(x = log(NSAF), y = log2(FoldChange), data = a, colour = I("tan"), size = SpecSum) + scale_size(range = c(1, 10)) + | |
150 geom_point(aes(x = log(NSAF), y = log2(FoldChange), size = SpecSum), colour = "black", shape = 21, data = a) | |
151 if(length(levels(a$Bait) > 1)) {p <- p + facet_wrap(~Bait, scales = "free_y")} # multiple graphs if multiple baits | |
152 #The text says ln() which is log base e, but the code uses log base 10. Fix code or the axis label. | |
153 p <- p + geom_point(aes(x = log(NSAF), y = log2(FoldChange), size = SpecSum, color = CrapomePCT), data = b) + | |
154 scale_colour_gradient(limits = c(80, 100), low = "tan", high = "red") + | |
155 labs(colour = "CRAPome Probability \nof Specific Interaction (%)", x = "ln(NSAF)") + | |
156 geom_point(aes(x = log(NSAF), y = log2(FoldChange), size = SpecSum), colour = "black", shape = 21, data = b) | |
157 return(ggsave(p, width = 8, height = 4, filename = "bubble_NSAF.png")) | |
158 } | |
159 if(color != "crapome") { | |
160 p <- qplot(x = log(NSAF), y = log2(FoldChange), data = data, colour = I(color), size = SpecSum) + scale_size(range = c(1, 10)) + | |
161 geom_point(aes(x = log(NSAF), y = log2(FoldChange), size = SpecSum), colour = "black", shape = 21, data = data) + # add bubble outlines | |
162 labs(x = "ln(NSAF)") | |
163 if(length(levels(data$Bait) > 1)) {p <- p + facet_wrap(~Bait, scales = "free_y")} | |
164 return(ggsave(p, width = 8, height = 4, filename = "bubble_NSAF.png")) | |
165 } | |
166 } | |
167 ################################################################################################### | |
168 # Plot all proteins for each bait by x = Saintscore, y = Log2(FoldChange) | |
169 ################################################################################################### | |
170 bubble_SAINT <- function(data, color) { | |
171 if(color == "crapome") { | |
172 a <- subset(data, CrapomePCT < 80, select = c(NSAF, SpecSum, CrapomePCT, FoldChange, SaintScore, Bait)) #filter on CRAPome | |
173 b <- subset(data, CrapomePCT >= 80, select = c(NSAF, SpecSum, CrapomePCT, FoldChange, SaintScore, Bait)) | |
174 p <- qplot(x = SaintScore, y = log2(FoldChange), data = a, colour = I("tan"), size = SpecSum) + | |
175 scale_size(range = c(1, 10)) + geom_point(aes(x = SaintScore, y = log2(FoldChange), size = SpecSum), colour = "black", shape = 21, data = a) | |
176 if(length(levels(a$Bait) > 1)) {p <- p + facet_wrap(~Bait, scales = "free_y")} | |
177 p <- p + geom_point(aes(x = SaintScore, y = log2(FoldChange), size = SpecSum, color = CrapomePCT), data = b) + | |
178 scale_colour_gradient(limits = c(80, 100), low = "tan", high = "red") + | |
179 labs(colour = "CRAPome Probability \nof Specific Interaction (%)") + | |
180 geom_point(aes(x = SaintScore, y = log2(FoldChange), size = SpecSum), colour = "black", shape = 21, data = b) | |
181 return(ggsave(p, width = 8, height = 4, filename = "bubble_SAINT.png")) | |
182 } | |
183 if(color != "crapome") { | |
184 p <- qplot(x = SaintScore, y = log2(FoldChange), data = data, colour = I(color), size = SpecSum) + | |
185 scale_size(range = c(1, 10)) + geom_point(aes(x = SaintScore, y = log2(FoldChange), size = SpecSum), colour = "black", shape = 21, data = data) | |
186 if(length(levels(data$Bait) > 1)) {p <- p + facet_wrap(~Bait, scales = "free_y")} | |
187 return(ggsave(p, width = 8, height = 4, filename = "bubble_SAINT.png")) | |
188 } | |
189 } | |
190 ################################################################################################### | |
191 # Filter proteins on Saintscore cutoff and plot for each bait x = Saintscore, y = Log2(FoldChange) | |
192 ################################################################################################### | |
193 bubble_zoom_SAINT <- function(data, color, label = FALSE, cutoff = 0.8) { | |
194 if(color == "crapome") { | |
195 a <- subset(data, CrapomePCT < 80 & SaintScore >= cutoff, select = c(NSAF, SpecSum, CrapomePCT, FoldChange, SaintScore, Bait, PreyGene)) | |
196 b <- subset(data, CrapomePCT >= 80 & SaintScore >= cutoff, select = c(NSAF, SpecSum, CrapomePCT, FoldChange, SaintScore, Bait, PreyGene)) | |
197 p <- qplot(x = SaintScore, y = log2(FoldChange), data = a, colour = I("tan"), size = SpecSum) + | |
198 scale_size(range = c(1, 10)) + ggtitle("Filtered on SAINT score")+geom_point(aes(x = SaintScore, y = log2(FoldChange), size = SpecSum), colour = "black", shape = 21, data = a) | |
199 if(label == TRUE & length(a$NSAF != 0)) { | |
200 p <- p + geom_text(data = a, aes(label = PreyGene, size = 10, vjust = 0, hjust = 0), colour = "black") | |
201 } | |
202 if(length(levels(a$Bait) > 1)) {p <- p + facet_wrap(~Bait, scales = "free_y")} | |
203 p <- p + geom_point(aes(x = SaintScore, y = log2(FoldChange), size = SpecSum, color = CrapomePCT), data = b) + | |
204 scale_colour_gradient(limits = c(80, 100), low = "tan", high = "red") + | |
205 labs(colour = "CRAPome Probability \nof Specific Interaction (%)") + | |
206 geom_point(aes(x = SaintScore, y = log2(FoldChange), size = SpecSum), colour = "black", shape = 21, data = b) | |
207 if(label == TRUE & length(b$NSAF != 0)) { | |
208 p <- p + geom_text(data = b, aes(label = PreyGene, size = 10, vjust = 0, hjust = 0), colour = "black", show_guide = FALSE) | |
209 } | |
210 return(ggsave(p, width = 8, height = 4, filename = "bubble_zoom_SAINT.png")) | |
211 } | |
212 if(color != "crapome") { | |
213 a <- subset(data, SaintScore >= cutoff, select = c(NSAF, SpecSum, FoldChange, SaintScore, Bait, PreyGene)) | |
214 p <- qplot(x = SaintScore, y = log2(FoldChange), data = a, colour = I(color), size = SpecSum) + | |
215 scale_size(range = c(1, 10)) + ggtitle("Filtered on SAINT score") + | |
216 geom_point(aes(x = SaintScore, y = log2(FoldChange), size = SpecSum), colour = "black", shape = 21, data = a) | |
217 if(label == TRUE & length(a$NSAF != 0)) { | |
218 p <- p + geom_text(data = a, aes(label = PreyGene, size = 10, vjust = 0, hjust = 0), colour = "black", show_guide = FALSE) | |
219 } | |
220 if(length(levels(data$Bait) > 1)) {p <- p + facet_wrap(~Bait, scales = "free_y")} | |
221 return(ggsave(p, width = 8, height = 4, filename = "bubble_zoom_SAINT.png")) | |
222 } | |
223 } | |
224 ################################################################################################### | |
225 # Filter proteins on Saintscore cutoff and plot for each bait x = log(NSAF), y = Log2(FoldChange) | |
226 ################################################################################################### | |
227 bubble_zoom_NSAF <- function(data, color, label = FALSE, cutoff = 0.8) { | |
228 if(color == "crapome") { | |
229 a <- subset(data, CrapomePCT < 80 & SaintScore >= cutoff, select = c(NSAF, SpecSum, CrapomePCT, FoldChange, SaintScore, Bait, PreyGene)) | |
230 b <- subset(data, CrapomePCT >= 80 & SaintScore >= cutoff, select = c(NSAF, SpecSum, CrapomePCT, FoldChange, SaintScore, Bait, PreyGene)) | |
231 p <- qplot(x = log(NSAF), y = log2(FoldChange), data = a, colour = I("tan"), size = SpecSum) + | |
232 scale_size(range = c(1, 10)) + ggtitle("Filtered on SAINT score") + | |
233 geom_point(aes(x = log(NSAF), y = log2(FoldChange), size = SpecSum), colour = "black", shape = 21, data = a) | |
234 if(label == TRUE & length(a$NSAF != 0)) { | |
235 p <- p + geom_text(data = a, aes(label = PreyGene, size = 10, vjust = 0, hjust = 0), colour = "black") | |
236 } | |
237 if(length(levels(a$Bait) > 1)) {p <- p + facet_wrap(~Bait, scales = "free_y")} | |
238 p <- p + geom_point(aes(x = log(NSAF), y = log2(FoldChange), size = SpecSum, color = CrapomePCT), data = b) + | |
239 scale_colour_gradient(limits = c(80, 100), low = "tan", high = "red") + | |
240 labs(colour = "CRAPome Probability \nof Specific Interaction (%)", x = "ln(NSAF)") + | |
241 geom_point(aes(x = log(NSAF), y = log2(FoldChange), size = SpecSum), colour = "black", shape = 21, data = b) | |
242 if(label == TRUE & length(b$NSAF != 0)) { | |
243 p <- p + geom_text(data = b, aes(label = PreyGene, size = 10, vjust = 0, hjust = 0), colour = "black", show_guide = FALSE) | |
244 } | |
245 return(ggsave(p, width = 8, height = 4, filename = "bubble_zoom_NSAF.png")) | |
246 } | |
247 if(color != "crapome") { | |
248 a <- subset(data, SaintScore >= cutoff, select = c(NSAF, SpecSum, FoldChange, SaintScore, Bait, PreyGene)) | |
249 p <- qplot(x = log(NSAF), y = log2(FoldChange), data = a, colour = I(color), size = SpecSum) + | |
250 scale_size(range = c(1, 10)) + ggtitle("Filtered on SAINT score") + | |
251 geom_point(aes(x = log(NSAF), y = log2(FoldChange), size = SpecSum), colour = "black", shape = 21, data = a) + | |
252 labs(x = "ln(NSAF)") | |
253 if(label == TRUE & length(a$NSAF != 0)) { | |
254 p <- p + geom_text(data = a, aes(label = PreyGene, size = 10, vjust = 0, hjust = 0), colour = "black", show_guide = FALSE) | |
255 } | |
256 if(length(levels(data$Bait) > 1)) {p <- p + facet_wrap(~Bait, scales = "free_y")} | |
257 return(ggsave(p, width = 8, height = 4, filename = "bubble_zoom_NSAF.png")) | |
258 } | |
259 } | |
260 ################################################################################################### | |
261 # Check Saintscore cutoff and stop program if not between 0 and 1 | |
262 ################################################################################################### | |
263 cutoff_check <- function(cutoff){ | |
264 if( any(cutoff < 0 | cutoff > 1) ) stop('SAINT score cutoff not between 0 and 1. Please correct and try again') | |
265 } | |
266 | |
267 args <- commandArgs(trailingOnly = TRUE) | |
268 main(args[1], args[2], args[3], args[4], args[5], args[6], args[7], args[8], args[9]) |