3
|
1 rm(list=ls())
|
|
2 ###################################################################################################
|
|
3 # R-code: Multi-bubble graph generation from SAINTexpress output
|
|
4 # Author: Brent Kuenzi
|
|
5 ###################################################################################################
|
10
|
6 ins_check_run()
|
3
|
7 library(dplyr); library(tidyr); library(ggplot2)
|
|
8 ###################################################################################################
|
|
9 ### Run program ###
|
|
10
|
|
11 ## REQUIRED INPUT ##
|
|
12 # 1) listfile: SAINTexpress generated "list.txt" file
|
|
13 # 2) preyfile: SAINT pre-processing generated "prey.txt" file used to run SAINTexpress
|
|
14 ## OPTIONAL INPUT ##
|
|
15 # 3) crapome: raw output from crapome Workflow 1 query (http://www.crapome.org)
|
|
16 # 4) color: bubble color (default = "red")
|
|
17 # - color= "crapome": color bubbles based on Crapome(%)
|
|
18 # - Also recognizes any color within R's built-in colors() vector
|
|
19 # 5) label: Adds gene name labels to bubbles within the "zoomed in" graphs (default = FALSE)
|
|
20 # 6) cutoff: Saintscore cutoff to be assigned for filtering the "zoomed in" graphs (default = 0.8)
|
|
21 ###################################################################################################
|
|
22 main <- function(listfile, preyfile , crapome=FALSE, color="red", label=FALSE, cutoff=0.8, type="SC", inc_file = "None", exc_file = "None" ) {
|
|
23 cutoff_check(cutoff)
|
|
24 listfile <- list_type(listfile, inc_file, exc_file)
|
|
25 if(type == "SC") {
|
|
26 df <- merge_files_sc(listfile, preyfile, crapome)
|
|
27 }
|
|
28 if(type == "MQ") {
|
|
29 df <- merge_files_mq(listfile, preyfile, crapome)
|
|
30 }
|
|
31 bubble_NSAF(df,color)
|
|
32 bubble_SAINT(df,color)
|
|
33 bubble_zoom_SAINT(df, color, label, cutoff)
|
|
34 bubble_zoom_NSAF(df, color, label, cutoff)
|
|
35 write.table(df,"output.txt",sep="\t",quote=FALSE, row.names=FALSE)
|
|
36 }
|
|
37
|
|
38 list_type <- function(df, inc_file, exc_file) {
|
|
39 Saint <- read.delim(df, stringsAsFactors=FALSE)
|
|
40 if (inc_file != "None") {
|
|
41 if (exc_file == "None"){
|
|
42 inc_prots <- read.delim(inc_file, sep='\t', header=FALSE, stringsAsFactors=FALSE)
|
|
43 print(inc_prots[,1])
|
|
44 print(Saint$Prey)
|
|
45 filtered_df = subset(Saint, Saint$Prey == inc_prots[,1])
|
|
46 }
|
|
47 else {
|
|
48 inc_prots <- read.delim(inc_file, sep='\t', header=FALSE, stringsAsFactors=FALSE)
|
|
49 exc_prots <- read.delim(exc_file, sep='\t', header=FALSE, stringsAsFactors=FALSE)
|
|
50 filtered_df = subset(Saint, Saint$Prey == inc_prots[,1])
|
|
51 filtered_df = subset(filtered_df, filtered_df$Prey != exc_prots[,1])
|
|
52 }
|
|
53 }
|
|
54 else if (exc_file != "None") {
|
|
55 exc_prots <- read.delim(exc_file, sep='\t', header=FALSE, stringsAsFactors=FALSE)
|
|
56 filtered_df = subset(Saint, Saint$Prey != exc_prots[,1])
|
|
57 }
|
|
58 else {
|
|
59 filtered_df = Saint
|
|
60 }
|
|
61 return(filtered_df)
|
|
62
|
|
63 }
|
|
64 ###################################################################################################
|
|
65 # Merge input files and caculate Crapome(%) and NSAF for each protein for each bait
|
|
66 ###################################################################################################
|
|
67 merge_files_mq <- function(SAINT, prey_DF, crapome=FALSE) {
|
|
68 #SAINT <- read.table(SAINT_DF, sep='\t', header=TRUE)
|
|
69 prey <- read.table(prey_DF, sep='\t', header=FALSE); colnames(prey) <- c("Prey", "Length", "PreyGene")
|
|
70 DF <- merge(SAINT,prey)
|
|
71 DF$SpecSum <- log2(DF$SpecSum)
|
|
72
|
|
73 if(crapome!=FALSE) {
|
|
74 crapome <- read.table(crapome, sep='\t', header=TRUE)
|
|
75 colnames(crapome) <- c("Prey", "Symbol", "Num.of.Exp", "Ave.SC", "Max.SC")
|
|
76 DF1 <- merge(DF, crapome); as.character(DF1$Num.of.Exp); DF1$Symbol <- NULL;
|
|
77 DF1$Ave.SC <- NULL; DF1$Max.SC <- NULL #remove unnecessary columns
|
|
78 DF1$Num.of.Exp <- sub("^$", "0 / 1", DF1$Num.of.Exp ) #replace blank values with 0 / 1
|
|
79 DF <- DF1 %>% separate(Num.of.Exp, c("NumExp", "TotalExp"), " / ") #split into 2 columns
|
|
80 DF$CrapomePCT <- 100 - (as.integer(DF$NumExp) / as.integer(DF$TotalExp) * 100) #calculate crapome %
|
|
81 }
|
|
82 DF$SAF <- DF$AvgSpec / DF$Length
|
|
83 DF2 = DF %>% group_by(Bait) %>% mutate(NSAF = SAF/sum(SAF))
|
|
84 DF$NSAF = DF2$NSAF
|
|
85 return(DF)
|
|
86 }
|
|
87
|
|
88 merge_files_sc <- function(SAINT, prey_DF, crapome=FALSE) {
|
|
89 #SAINT <- read.table(SAINT_DF, sep='\t', header=TRUE)
|
|
90 prey <- read.table(prey_DF, sep='\t', header=FALSE); colnames(prey) <- c("Prey", "Length", "PreyGene")
|
|
91 DF <- merge(SAINT,prey)
|
|
92
|
|
93 if(crapome!=FALSE) {
|
|
94 crapome <- read.table(crapome, sep='\t', header=TRUE)
|
|
95 colnames(crapome) <- c("Prey", "Symbol", "Num.of.Exp", "Ave.SC", "Max.SC")
|
|
96 DF1 <- merge(DF, crapome); as.character(DF1$Num.of.Exp); DF1$Symbol <- NULL;
|
|
97 DF1$Ave.SC <- NULL; DF1$Max.SC <- NULL #remove unnecessary columns
|
|
98 DF1$Num.of.Exp <- sub("^$", "0 / 1", DF1$Num.of.Exp ) #replace blank values with 0 / 1
|
|
99 DF <- DF1 %>% separate(Num.of.Exp, c("NumExp", "TotalExp"), " / ") #split into 2 columns
|
|
100 DF$CrapomePCT <- 100 - (as.integer(DF$NumExp) / as.integer(DF$TotalExp) * 100) #calculate crapome %
|
|
101 }
|
|
102 DF$SAF <- DF$AvgSpec / DF$Length
|
|
103 DF2 = DF %>% group_by(Bait) %>% mutate(NSAF = SAF/sum(SAF))
|
|
104 DF$NSAF = DF2$NSAF
|
|
105 return(DF)
|
|
106 }
|
|
107 ###################################################################################################
|
|
108 # Plot all proteins for each bait by x=ln(NSAF), y=Log2(FoldChange)
|
|
109 ###################################################################################################
|
|
110 bubble_NSAF <- function(data, color) {
|
|
111 if(color=="crapome") {
|
|
112 a <- subset(data, CrapomePCT <80, select = c(NSAF,SpecSum, CrapomePCT, FoldChange, SaintScore, Bait))
|
|
113 b <- subset(data, CrapomePCT>=80, select = c(NSAF,SpecSum, CrapomePCT, FoldChange, SaintScore, Bait))
|
|
114 p <- qplot(x=log(NSAF), y=log2(FoldChange), data=a, colour=I("tan"),size=SpecSum) + scale_size(range=c(1,10)) +
|
|
115 geom_point(aes(x=log(NSAF),y=log2(FoldChange), size=SpecSum), colour="black", shape=21, data=a)
|
|
116 if(length(levels(a$Bait) > 1)) {p <- p + facet_wrap(~Bait, scales="free_y")} # multiple graphs if multiple baits
|
|
117 p <- p + geom_point(aes(x=log(NSAF),y=log2(FoldChange), size=SpecSum, color=CrapomePCT), data=b) +
|
|
118 scale_colour_gradient(limits=c(80, 100), low="tan", high="red") +
|
|
119 labs(colour="CRAPome Probability \nof Specific Interaction (%)", x="ln(NSAF)") +
|
|
120 geom_point(aes(x=log(NSAF),y=log2(FoldChange), size=SpecSum), colour="black", shape=21, data=b)
|
|
121 return(ggsave(p, width=8,height=4,filename = "bubble_NSAF.png"))
|
|
122 }
|
|
123 if(color != "crapome") {
|
|
124 p <- qplot(x=log(NSAF), y=log2(FoldChange), data=data, colour=I(color),size=SpecSum) + scale_size(range=c(1,10)) +
|
|
125 geom_point(aes(x=log(NSAF),y=log2(FoldChange), size=SpecSum), colour="black", shape=21, data=data) + # add bubble outlines
|
|
126 labs(x="ln(NSAF)")
|
|
127 if(length(levels(data$Bait) > 1)) {p <- p + facet_wrap(~Bait, scales="free_y")}
|
|
128 return(ggsave(p, width=8,height=4,filename = "bubble_NSAF.png"))
|
|
129 }
|
|
130 }
|
|
131 ###################################################################################################
|
|
132 # Plot all proteins for each bait by x=Saintscore, y=Log2(FoldChange)
|
|
133 ###################################################################################################
|
|
134 bubble_SAINT <- function(data, color) {
|
|
135 if(color=="crapome") {
|
|
136 a <- subset(data, CrapomePCT <80, select = c(NSAF,SpecSum, CrapomePCT, FoldChange, SaintScore, Bait)) #filter on CRAPome
|
|
137 b <- subset(data, CrapomePCT >=80, select = c(NSAF,SpecSum, CrapomePCT, FoldChange, SaintScore, Bait))
|
|
138 p <- qplot(x=SaintScore, y=log2(FoldChange), data=a, colour=I("tan"),size=SpecSum) +
|
|
139 scale_size(range=c(1,10)) + geom_point(aes(x=SaintScore,y=log2(FoldChange), size=SpecSum), colour="black", shape=21, data=a)
|
|
140 if(length(levels(a$Bait) > 1)) {p <- p + facet_wrap(~Bait, scales="free_y")}
|
|
141 p <- p + geom_point(aes(x=SaintScore,y=log2(FoldChange), size=SpecSum, color=CrapomePCT), data=b) +
|
|
142 scale_colour_gradient(limits=c(80, 100), low="tan", high="red") +
|
|
143 labs(colour="CRAPome Probability \nof Specific Interaction (%)") +
|
|
144 geom_point(aes(x=SaintScore,y=log2(FoldChange), size=SpecSum), colour="black", shape=21, data=b)
|
|
145 return(ggsave(p, width=8,height=4,filename = "bubble_SAINT.png"))
|
|
146 }
|
|
147 if(color != "crapome") {
|
|
148 p <- qplot(x=SaintScore, y=log2(FoldChange), data=data, colour=I(color),size=SpecSum) +
|
|
149 scale_size(range=c(1,10)) + geom_point(aes(x=SaintScore,y=log2(FoldChange), size=SpecSum), colour="black", shape=21, data=data)
|
|
150 if(length(levels(data$Bait) > 1)) {p <- p + facet_wrap(~Bait, scales="free_y")}
|
|
151 return(ggsave(p, width=8,height=4,filename = "bubble_SAINT.png"))
|
|
152 }
|
|
153 }
|
|
154 ###################################################################################################
|
|
155 # Filter proteins on Saintscore cutoff and plot for each bait x=Saintscore, y=Log2(FoldChange)
|
|
156 ###################################################################################################
|
|
157 bubble_zoom_SAINT <- function(data, color, label=FALSE, cutoff=0.8) {
|
|
158 if(color=="crapome") {
|
|
159 a <- subset(data, CrapomePCT <80 & SaintScore>=cutoff, select = c(NSAF,SpecSum, CrapomePCT, FoldChange, SaintScore, Bait, PreyGene))
|
|
160 b <- subset(data, CrapomePCT >=80 & SaintScore >=cutoff, select = c(NSAF,SpecSum, CrapomePCT, FoldChange, SaintScore, Bait, PreyGene))
|
|
161 p <- qplot(x=SaintScore, y=log2(FoldChange), data=a, colour=I("tan"),size=SpecSum) +
|
|
162 scale_size(range=c(1,10)) + ggtitle("Filtered on SAINT score")+geom_point(aes(x=SaintScore,y=log2(FoldChange), size=SpecSum), colour="black", shape=21, data=a)
|
|
163 if(label==TRUE & length(a$NSAF!=0)) {
|
|
164 p <- p + geom_text(data=a, aes(label=PreyGene, size=10, vjust=0, hjust=0),colour="black")
|
|
165 }
|
|
166 if(length(levels(a$Bait) > 1)) {p <- p + facet_wrap(~Bait, scales="free_y")}
|
|
167 p <- p + geom_point(aes(x=SaintScore,y=log2(FoldChange), size=SpecSum, color=CrapomePCT), data=b) +
|
|
168 scale_colour_gradient(limits=c(80, 100), low="tan", high="red") +
|
|
169 labs(colour="CRAPome Probability \nof Specific Interaction (%)") +
|
|
170 geom_point(aes(x=SaintScore,y=log2(FoldChange), size=SpecSum), colour="black", shape=21, data=b)
|
|
171 if(label==TRUE & length(b$NSAF!=0)) {
|
|
172 p <- p + geom_text(data=b, aes(label=PreyGene, size=10, vjust=0, hjust=0),colour="black", show_guide=FALSE)
|
|
173 }
|
|
174 return(ggsave(p, width=8,height=4,filename = "bubble_zoom_SAINT.png"))
|
|
175 }
|
|
176 if(color != "crapome") {
|
|
177 a <- subset(data, SaintScore>=cutoff, select = c(NSAF,SpecSum, FoldChange, SaintScore, Bait, PreyGene))
|
|
178 p <- qplot(x=SaintScore, y=log2(FoldChange), data=a, colour=I(color),size=SpecSum) +
|
|
179 scale_size(range=c(1,10)) + ggtitle("Filtered on SAINT score") +
|
|
180 geom_point(aes(x=SaintScore,y=log2(FoldChange), size=SpecSum), colour="black", shape=21, data=a)
|
|
181 if(label==TRUE & length(a$NSAF!=0)) {
|
|
182 p <- p + geom_text(data=a, aes(label=PreyGene, size=10, vjust=0, hjust=0),colour="black", show_guide=FALSE)
|
|
183 }
|
|
184 if(length(levels(data$Bait) > 1)) {p <- p + facet_wrap(~Bait, scales="free_y")}
|
|
185 return(ggsave(p, width=8,height=4,filename = "bubble_zoom_SAINT.png"))
|
|
186 }
|
|
187 }
|
|
188 ###################################################################################################
|
|
189 # Filter proteins on Saintscore cutoff and plot for each bait x=log(NSAF), y=Log2(FoldChange)
|
|
190 ###################################################################################################
|
|
191 bubble_zoom_NSAF <- function(data, color, label=FALSE, cutoff=0.8) {
|
|
192 if(color=="crapome") {
|
|
193 a <- subset(data, CrapomePCT <80 & SaintScore>=cutoff, select = c(NSAF,SpecSum, CrapomePCT, FoldChange, SaintScore, Bait, PreyGene))
|
|
194 b <- subset(data, CrapomePCT >=80 & SaintScore >=cutoff, select = c(NSAF,SpecSum, CrapomePCT, FoldChange, SaintScore, Bait, PreyGene))
|
|
195 p <- qplot(x=log(NSAF), y=log2(FoldChange), data=a, colour=I("tan"),size=SpecSum) +
|
|
196 scale_size(range=c(1,10)) + ggtitle("Filtered on SAINT score") +
|
|
197 geom_point(aes(x=log(NSAF),y=log2(FoldChange), size=SpecSum), colour="black", shape=21, data=a)
|
|
198 if(label==TRUE & length(a$NSAF!=0)) {
|
|
199 p <- p + geom_text(data=a, aes(label=PreyGene, size=10, vjust=0, hjust=0),colour="black")
|
|
200 }
|
|
201 if(length(levels(a$Bait) > 1)) {p <- p + facet_wrap(~Bait, scales="free_y")}
|
|
202 p <- p + geom_point(aes(x=log(NSAF),y=log2(FoldChange), size=SpecSum, color=CrapomePCT), data=b) +
|
|
203 scale_colour_gradient(limits=c(80, 100), low="tan", high="red") +
|
|
204 labs(colour="CRAPome Probability \nof Specific Interaction (%)", x="ln(NSAF)") +
|
|
205 geom_point(aes(x=log(NSAF),y=log2(FoldChange), size=SpecSum), colour="black", shape=21, data=b)
|
|
206 if(label==TRUE & length(b$NSAF!=0)) {
|
|
207 p <- p + geom_text(data=b, aes(label=PreyGene, size=10, vjust=0, hjust=0),colour="black", show_guide=FALSE)
|
|
208 }
|
|
209 return(ggsave(p, width=8,height=4,filename = "bubble_zoom_NSAF.png"))
|
|
210 }
|
|
211 if(color != "crapome") {
|
|
212 a <- subset(data, SaintScore>=cutoff, select = c(NSAF,SpecSum, FoldChange, SaintScore, Bait, PreyGene))
|
|
213 p <- qplot(x=log(NSAF), y=log2(FoldChange), data=a, colour=I(color), size=SpecSum) +
|
|
214 scale_size(range=c(1,10)) + ggtitle("Filtered on SAINT score") +
|
|
215 geom_point(aes(x=log(NSAF),y=log2(FoldChange), size=SpecSum), colour="black", shape=21, data=a) +
|
|
216 labs(x="ln(NSAF)")
|
|
217 if(label==TRUE & length(a$NSAF!=0)) {
|
|
218 p <- p + geom_text(data=a, aes(label=PreyGene, size=10, vjust=0, hjust=0),colour="black", show_guide=FALSE)
|
|
219 }
|
|
220 if(length(levels(data$Bait) > 1)) {p <- p + facet_wrap(~Bait, scales="free_y")}
|
|
221 return(ggsave(p, width=8,height=4,filename = "bubble_zoom_NSAF.png"))
|
|
222 }
|
|
223 }
|
|
224 ###################################################################################################
|
|
225 # Check Saintscore cutoff and stop program if not between 0 and 1
|
|
226 ###################################################################################################
|
|
227 cutoff_check <- function(cutoff){
|
|
228 if( any(cutoff < 0 | cutoff > 1) ) stop('SAINT score cutoff not between 0 and 1. Please correct and try again')
|
|
229 }
|
10
|
230 ins_check_run <- function() {
|
|
231 if ('dplyr' %in% rownames(installed.packages())){}
|
|
232 else {
|
|
233 install.packages('dplyr', repos='http://cran.us.r-project.org')
|
|
234 }
|
|
235 if ('tidyr' %in% rownames(installed.packages())){}
|
|
236 else {
|
|
237 install.packages('tidyr', repos='http://cran.us.r-project.org')
|
|
238 }
|
|
239 if ('ggplot2' %in% rownames(installed.packages())){}
|
|
240 else {
|
|
241 install.packages('ggplot2', repos='http://cran.us.r-project.org')
|
|
242 }
|
|
243 }
|
3
|
244
|
10
|
245 ins_check_run()
|
4
|
246 args <- commandArgs(trailingOnly = TRUE)
|
|
247 main(args[1],args[2],args[3],args[4],args[5],args[6],args[7],args[8],args[9])
|
3
|
248
|
|
249 #main("test_list.txt", "preytest.txt", crapome="craptest.txt", color="crapome", label=TRUE)
|
|
250 #main("Crizo_list.txt", "prey_cr.txt", crapome = "crizo_crap.txt", color="crapome", label=TRUE, cutoff=0.7)
|
|
251 #main("test_list.txt", "preytest.txt", crapome=FALSE, color="magenta", label=FALSE, cutoff=1.1) |