25
+ − 1 rm(list = ls())
+ − 2 ###################################################################################################
+ − 3 # R-code: Multi-bubble graph generation from SAINTexpress output
+ − 4 # Author: Brent Kuenzi
+ − 5 ###################################################################################################
+ − 6 # This Script generates the bubble graphs based upon Saint output.
+ − 7 ###################################################################################################
+ − 8 # Copyright (C) Brent Kuenzi.
+ − 9 # Permission is granted to copy, distribute and/or modify this document
+ − 10 # under the terms of the GNU Free Documentation License, Version 1.3
+ − 11 # or any later version published by the Free Software Foundation;
+ − 12 # with no Invariant Sections, no Front-Cover Texts, and no Back-Cover Texts.
+ − 13 # A copy of the license is included in the section entitled "GNU
+ − 14 # Free Documentation License".
+ − 15 ###################################################################################################
+ − 16 ## REQUIRED INPUT ##
+ − 17
+ − 18 # 1) listfile: SAINTexpress generated "list.txt" file
+ − 19 # 2) preyfile: SAINT pre-processing generated "prey.txt" file used to run SAINTexpress
+ − 20
+ − 21 ## OPTIONAL INPUT ##
+ − 22
+ − 23 # 3) crapome: raw output from crapome Workflow 1 query (http://www.crapome.org)
+ − 24 # 4) color: bubble color (default = "red")
+ − 25 # - color = "crapome": color bubbles based on Crapome(%)
+ − 26 # - Also recognizes any color within R's built-in colors() vector
+ − 27 # 5) label: Adds gene name labels to bubbles within the "zoomed in" graphs (default = FALSE)
+ − 28 # 6) cutoff: Saintscore cutoff to be assigned for filtering the "zoomed in" graphs (default = 0.8)
+ − 29 # 7) type: Specifies if the data is MaxQuant (MQ) or Scaffold (SC) data (default = "SC")
+ − 30 # 8) inc_file: Selects only the uniprot ids in the provided list (default ="None")
+ − 31 # 9) exc_file: Removes the proteins in the list (default = "None")
+ − 32 ###################################################################################################
+ − 33
+ − 34
+ − 35 ins_check_run <- function(){
+ − 36 if ('dplyr' %in% rownames(installed.packages())){}
+ − 37 else {
+ − 38 install.packages('dplyr', repos = 'http://cran.us.r-project.org')
+ − 39 }
+ − 40 if ('tidyr' %in% rownames(installed.packages())){}
+ − 41 else {
+ − 42 install.packages('tidyr', repos = 'http://cran.us.r-project.org')
+ − 43 }
+ − 44 if ('ggplot2' %in% rownames(installed.packages())){}
+ − 45 else {
+ − 46 install.packages('ggplot2', repos = 'http://cran.us.r-project.org')
+ − 47 }
+ − 48 }
+ − 49
+ − 50 ins_check_run()
+ − 51 library(dplyr); library(tidyr); library(ggplot2)
+ − 52
+ − 53
+ − 54 main <- function(listfile, preyfile, crapome = FALSE, color = "red", label = FALSE, cutoff = 0.8, type = "SC", inc_file = "None", exc_file = "None" ) {
+ − 55 cutoff_check(cutoff)
+ − 56 listfile <- list_type(listfile, inc_file, exc_file)
+ − 57 if(type == "SC") {
+ − 58 df <- merge_files_sc(listfile, preyfile, crapome)
+ − 59 }
+ − 60 if(type == "MQ") {
+ − 61 df <- merge_files_mq(listfile, preyfile, crapome)
+ − 62 }
+ − 63 bubble_NSAF(df, color)
+ − 64 bubble_SAINT(df, color)
+ − 65 bubble_zoom_SAINT(df, color, label, cutoff)
+ − 66 bubble_zoom_NSAF(df, color, label, cutoff)
+ − 67 write.table(df, "output.txt", sep = "\t", quote = FALSE, row.names = FALSE)
+ − 68 }
+ − 69 ###################################################################################################
+ − 70 # Include and Exclude list filtering
+ − 71 ###################################################################################################
+ − 72 list_type <- function(df, inc_file, exc_file) {
+ − 73 Saint <- read.delim(df, stringsAsFactors = FALSE)
+ − 74 if (inc_file != "None") {
+ − 75 if (exc_file == "None") {
+ − 76 inc_prots <- read.delim(inc_file, sep = '\t', header = FALSE, stringsAsFactors = FALSE)
+ − 77 print(inc_prots[, 1])
+ − 78 print(Saint$Prey)
+ − 79 filtered_df = subset(Saint, Saint$Prey == inc_prots[, 1])
+ − 80 }
+ − 81 else {
+ − 82 inc_prots <- read.delim(inc_file, sep = '\t', header = FALSE, stringsAsFactors = FALSE)
+ − 83 exc_prots <- read.delim(exc_file, sep = '\t', header = FALSE, stringsAsFactors = FALSE)
+ − 84 filtered_df = subset(Saint, Saint$Prey == inc_prots[, 1])
+ − 85 filtered_df = subset(filtered_df, filtered_df$Prey != exc_prots[, 1])
+ − 86 }
+ − 87 }
+ − 88 else if (exc_file != "None") {
+ − 89 exc_prots <- read.delim(exc_file, sep = '\t', header = FALSE, stringsAsFactors = FALSE)
+ − 90 filtered_df = subset(Saint, Saint$Prey != exc_prots[, 1])
+ − 91 }
+ − 92 else {
+ − 93 filtered_df = Saint
+ − 94 }
+ − 95 return(filtered_df)
+ − 96
+ − 97 }
+ − 98 ###################################################################################################
+ − 99 # Merge input files and caculate Crapome(%) and NSAF for each protein for each bait
+ − 100 ###################################################################################################
+ − 101 merge_files_mq <- function(SAINT, prey_DF, crapome = FALSE) {
+ − 102 #SAINT <- read.table(SAINT_DF, sep = '\t', header = TRUE)
+ − 103 #Some of these read.table()'s don't use stringsAsFactors = FALSE. Is this on purpose? Factors give rise to some really weird and unpredictable behavior; suggest always using stringsAsFactors = FALSE
+ − 104 prey <- read.table(prey_DF, sep = '\t', header = FALSE); colnames(prey) <- c("Prey", "Length", "PreyGene")
+ − 105 DF <- merge(SAINT, prey)
+ − 106 DF$SpecSum <- log2(DF$SpecSum)
+ − 107
+ − 108 if(crapome != FALSE) {
+ − 109 crapome <- read.table(crapome, sep = '\t', header = TRUE)
+ − 110 colnames(crapome) <- c("Prey", "Symbol", "Num.of.Exp", "Ave.SC", "Max.SC")
+ − 111 DF1 <- merge(DF, crapome); as.character(DF1$Num.of.Exp); DF1$Symbol <- NULL;
+ − 112 DF1$Ave.SC <- NULL; DF1$Max.SC <- NULL # Removes unnecessary columns.
+ − 113 DF1$Num.of.Exp <- sub("^$", "0 / 1", DF1$Num.of.Exp ) # Replace blank values with 0 / 1.
+ − 114 DF <- DF1 %>% separate(Num.of.Exp, c("NumExp", "TotalExp"), " / ") # Split into 2 columns.
+ − 115 DF$CrapomePCT <- 100 - (as.integer(DF$NumExp) / as.integer(DF$TotalExp) * 100) # Calculate the crapome %.
+ − 116 }
+ − 117 DF$SAF <- DF$AvgSpec / DF$Length
+ − 118 DF2 = DF %>% group_by(Bait) %>% mutate(NSAF = SAF/sum(SAF))
+ − 119 DF$NSAF = DF2$NSAF
+ − 120 return(DF)
+ − 121 }
+ − 122
+ − 123 merge_files_sc <- function(SAINT, prey_DF, crapome = FALSE) {
+ − 124 #SAINT <- read.table(SAINT_DF, sep = '\t', header = TRUE)
+ − 125 prey <- read.table(prey_DF, sep = '\t', header = FALSE); colnames(prey) <- c("Prey", "Length", "PreyGene")
+ − 126 DF <- merge(SAINT, prey)
+ − 127
+ − 128 if(crapome != FALSE) {
+ − 129 crapome <- read.table(crapome, sep = '\t', header = TRUE)
+ − 130 colnames(crapome) <- c("Prey", "Symbol", "Num.of.Exp", "Ave.SC", "Max.SC")
+ − 131 DF1 <- merge(DF, crapome); as.character(DF1$Num.of.Exp); DF1$Symbol <- NULL;
+ − 132 DF1$Ave.SC <- NULL; DF1$Max.SC <- NULL # Removes unnecessary columns.
+ − 133 DF1$Num.of.Exp <- sub("^$", "0 / 1", DF1$Num.of.Exp ) # Replace blank values with 0 / 1.
+ − 134 DF <- DF1 %>% separate(Num.of.Exp, c("NumExp", "TotalExp"), " / ") # Split into 2 columns.
+ − 135 DF$CrapomePCT <- 100 - (as.integer(DF$NumExp) / as.integer(DF$TotalExp) * 100) # Calculate the crapome %.
+ − 136 }
+ − 137 DF$SAF <- DF$AvgSpec / DF$Length
+ − 138 DF2 = DF %>% group_by(Bait) %>% mutate(NSAF = SAF/sum(SAF))
+ − 139 DF$NSAF = DF2$NSAF
+ − 140 return(DF)
+ − 141 }
+ − 142 ###################################################################################################
+ − 143 # Plot all proteins for each bait by x = ln(NSAF), y = Log2(FoldChange)
+ − 144 ###################################################################################################
+ − 145 bubble_NSAF <- function(data, color) {
+ − 146 if(color == "crapome") {
+ − 147 a <- subset(data, CrapomePCT < 80, select = c(NSAF, SpecSum, CrapomePCT, FoldChange, SaintScore, Bait))
+ − 148 b <- subset(data, CrapomePCT >= 80, select = c(NSAF, SpecSum, CrapomePCT, FoldChange, SaintScore, Bait))
+ − 149 p <- qplot(x = log(NSAF), y = log2(FoldChange), data = a, colour = I("tan"), size = SpecSum) + scale_size(range = c(1, 10)) +
+ − 150 geom_point(aes(x = log(NSAF), y = log2(FoldChange), size = SpecSum), colour = "black", shape = 21, data = a)
+ − 151 if(length(levels(a$Bait) > 1)) {p <- p + facet_wrap(~Bait, scales = "free_y")} # multiple graphs if multiple baits
+ − 152 #The text says ln() which is log base e, but the code uses log base 10. Fix code or the axis label.
+ − 153 p <- p + geom_point(aes(x = log(NSAF), y = log2(FoldChange), size = SpecSum, color = CrapomePCT), data = b) +
+ − 154 scale_colour_gradient(limits = c(80, 100), low = "tan", high = "red") +
+ − 155 labs(colour = "CRAPome Probability \nof Specific Interaction (%)", x = "ln(NSAF)") +
+ − 156 geom_point(aes(x = log(NSAF), y = log2(FoldChange), size = SpecSum), colour = "black", shape = 21, data = b)
+ − 157 return(ggsave(p, width = 8, height = 4, filename = "bubble_NSAF.png"))
+ − 158 }
+ − 159 if(color != "crapome") {
+ − 160 p <- qplot(x = log(NSAF), y = log2(FoldChange), data = data, colour = I(color), size = SpecSum) + scale_size(range = c(1, 10)) +
+ − 161 geom_point(aes(x = log(NSAF), y = log2(FoldChange), size = SpecSum), colour = "black", shape = 21, data = data) + # add bubble outlines
+ − 162 labs(x = "ln(NSAF)")
+ − 163 if(length(levels(data$Bait) > 1)) {p <- p + facet_wrap(~Bait, scales = "free_y")}
+ − 164 return(ggsave(p, width = 8, height = 4, filename = "bubble_NSAF.png"))
+ − 165 }
+ − 166 }
+ − 167 ###################################################################################################
+ − 168 # Plot all proteins for each bait by x = Saintscore, y = Log2(FoldChange)
+ − 169 ###################################################################################################
+ − 170 bubble_SAINT <- function(data, color) {
+ − 171 if(color == "crapome") {
+ − 172 a <- subset(data, CrapomePCT < 80, select = c(NSAF, SpecSum, CrapomePCT, FoldChange, SaintScore, Bait)) #filter on CRAPome
+ − 173 b <- subset(data, CrapomePCT >= 80, select = c(NSAF, SpecSum, CrapomePCT, FoldChange, SaintScore, Bait))
+ − 174 p <- qplot(x = SaintScore, y = log2(FoldChange), data = a, colour = I("tan"), size = SpecSum) +
+ − 175 scale_size(range = c(1, 10)) + geom_point(aes(x = SaintScore, y = log2(FoldChange), size = SpecSum), colour = "black", shape = 21, data = a)
+ − 176 if(length(levels(a$Bait) > 1)) {p <- p + facet_wrap(~Bait, scales = "free_y")}
+ − 177 p <- p + geom_point(aes(x = SaintScore, y = log2(FoldChange), size = SpecSum, color = CrapomePCT), data = b) +
+ − 178 scale_colour_gradient(limits = c(80, 100), low = "tan", high = "red") +
+ − 179 labs(colour = "CRAPome Probability \nof Specific Interaction (%)") +
+ − 180 geom_point(aes(x = SaintScore, y = log2(FoldChange), size = SpecSum), colour = "black", shape = 21, data = b)
+ − 181 return(ggsave(p, width = 8, height = 4, filename = "bubble_SAINT.png"))
+ − 182 }
+ − 183 if(color != "crapome") {
+ − 184 p <- qplot(x = SaintScore, y = log2(FoldChange), data = data, colour = I(color), size = SpecSum) +
+ − 185 scale_size(range = c(1, 10)) + geom_point(aes(x = SaintScore, y = log2(FoldChange), size = SpecSum), colour = "black", shape = 21, data = data)
+ − 186 if(length(levels(data$Bait) > 1)) {p <- p + facet_wrap(~Bait, scales = "free_y")}
+ − 187 return(ggsave(p, width = 8, height = 4, filename = "bubble_SAINT.png"))
+ − 188 }
+ − 189 }
+ − 190 ###################################################################################################
+ − 191 # Filter proteins on Saintscore cutoff and plot for each bait x = Saintscore, y = Log2(FoldChange)
+ − 192 ###################################################################################################
+ − 193 bubble_zoom_SAINT <- function(data, color, label = FALSE, cutoff = 0.8) {
+ − 194 if(color == "crapome") {
+ − 195 a <- subset(data, CrapomePCT < 80 & SaintScore >= cutoff, select = c(NSAF, SpecSum, CrapomePCT, FoldChange, SaintScore, Bait, PreyGene))
+ − 196 b <- subset(data, CrapomePCT >= 80 & SaintScore >= cutoff, select = c(NSAF, SpecSum, CrapomePCT, FoldChange, SaintScore, Bait, PreyGene))
+ − 197 p <- qplot(x = SaintScore, y = log2(FoldChange), data = a, colour = I("tan"), size = SpecSum) +
+ − 198 scale_size(range = c(1, 10)) + ggtitle("Filtered on SAINT score")+geom_point(aes(x = SaintScore, y = log2(FoldChange), size = SpecSum), colour = "black", shape = 21, data = a)
+ − 199 if(label == TRUE & length(a$NSAF != 0)) {
+ − 200 p <- p + geom_text(data = a, aes(label = PreyGene, size = 10, vjust = 0, hjust = 0), colour = "black")
+ − 201 }
+ − 202 if(length(levels(a$Bait) > 1)) {p <- p + facet_wrap(~Bait, scales = "free_y")}
+ − 203 p <- p + geom_point(aes(x = SaintScore, y = log2(FoldChange), size = SpecSum, color = CrapomePCT), data = b) +
+ − 204 scale_colour_gradient(limits = c(80, 100), low = "tan", high = "red") +
+ − 205 labs(colour = "CRAPome Probability \nof Specific Interaction (%)") +
+ − 206 geom_point(aes(x = SaintScore, y = log2(FoldChange), size = SpecSum), colour = "black", shape = 21, data = b)
+ − 207 if(label == TRUE & length(b$NSAF != 0)) {
+ − 208 p <- p + geom_text(data = b, aes(label = PreyGene, size = 10, vjust = 0, hjust = 0), colour = "black", show_guide = FALSE)
+ − 209 }
+ − 210 return(ggsave(p, width = 8, height = 4, filename = "bubble_zoom_SAINT.png"))
+ − 211 }
+ − 212 if(color != "crapome") {
+ − 213 a <- subset(data, SaintScore >= cutoff, select = c(NSAF, SpecSum, FoldChange, SaintScore, Bait, PreyGene))
+ − 214 p <- qplot(x = SaintScore, y = log2(FoldChange), data = a, colour = I(color), size = SpecSum) +
+ − 215 scale_size(range = c(1, 10)) + ggtitle("Filtered on SAINT score") +
+ − 216 geom_point(aes(x = SaintScore, y = log2(FoldChange), size = SpecSum), colour = "black", shape = 21, data = a)
+ − 217 if(label == TRUE & length(a$NSAF != 0)) {
+ − 218 p <- p + geom_text(data = a, aes(label = PreyGene, size = 10, vjust = 0, hjust = 0), colour = "black", show_guide = FALSE)
+ − 219 }
+ − 220 if(length(levels(data$Bait) > 1)) {p <- p + facet_wrap(~Bait, scales = "free_y")}
+ − 221 return(ggsave(p, width = 8, height = 4, filename = "bubble_zoom_SAINT.png"))
+ − 222 }
+ − 223 }
+ − 224 ###################################################################################################
+ − 225 # Filter proteins on Saintscore cutoff and plot for each bait x = log(NSAF), y = Log2(FoldChange)
+ − 226 ###################################################################################################
+ − 227 bubble_zoom_NSAF <- function(data, color, label = FALSE, cutoff = 0.8) {
+ − 228 if(color == "crapome") {
+ − 229 a <- subset(data, CrapomePCT < 80 & SaintScore >= cutoff, select = c(NSAF, SpecSum, CrapomePCT, FoldChange, SaintScore, Bait, PreyGene))
+ − 230 b <- subset(data, CrapomePCT >= 80 & SaintScore >= cutoff, select = c(NSAF, SpecSum, CrapomePCT, FoldChange, SaintScore, Bait, PreyGene))
+ − 231 p <- qplot(x = log(NSAF), y = log2(FoldChange), data = a, colour = I("tan"), size = SpecSum) +
+ − 232 scale_size(range = c(1, 10)) + ggtitle("Filtered on SAINT score") +
+ − 233 geom_point(aes(x = log(NSAF), y = log2(FoldChange), size = SpecSum), colour = "black", shape = 21, data = a)
+ − 234 if(label == TRUE & length(a$NSAF != 0)) {
+ − 235 p <- p + geom_text(data = a, aes(label = PreyGene, size = 10, vjust = 0, hjust = 0), colour = "black")
+ − 236 }
+ − 237 if(length(levels(a$Bait) > 1)) {p <- p + facet_wrap(~Bait, scales = "free_y")}
+ − 238 p <- p + geom_point(aes(x = log(NSAF), y = log2(FoldChange), size = SpecSum, color = CrapomePCT), data = b) +
+ − 239 scale_colour_gradient(limits = c(80, 100), low = "tan", high = "red") +
+ − 240 labs(colour = "CRAPome Probability \nof Specific Interaction (%)", x = "ln(NSAF)") +
+ − 241 geom_point(aes(x = log(NSAF), y = log2(FoldChange), size = SpecSum), colour = "black", shape = 21, data = b)
+ − 242 if(label == TRUE & length(b$NSAF != 0)) {
+ − 243 p <- p + geom_text(data = b, aes(label = PreyGene, size = 10, vjust = 0, hjust = 0), colour = "black", show_guide = FALSE)
+ − 244 }
+ − 245 return(ggsave(p, width = 8, height = 4, filename = "bubble_zoom_NSAF.png"))
+ − 246 }
+ − 247 if(color != "crapome") {
+ − 248 a <- subset(data, SaintScore >= cutoff, select = c(NSAF, SpecSum, FoldChange, SaintScore, Bait, PreyGene))
+ − 249 p <- qplot(x = log(NSAF), y = log2(FoldChange), data = a, colour = I(color), size = SpecSum) +
+ − 250 scale_size(range = c(1, 10)) + ggtitle("Filtered on SAINT score") +
+ − 251 geom_point(aes(x = log(NSAF), y = log2(FoldChange), size = SpecSum), colour = "black", shape = 21, data = a) +
+ − 252 labs(x = "ln(NSAF)")
+ − 253 if(label == TRUE & length(a$NSAF != 0)) {
+ − 254 p <- p + geom_text(data = a, aes(label = PreyGene, size = 10, vjust = 0, hjust = 0), colour = "black", show_guide = FALSE)
+ − 255 }
+ − 256 if(length(levels(data$Bait) > 1)) {p <- p + facet_wrap(~Bait, scales = "free_y")}
+ − 257 return(ggsave(p, width = 8, height = 4, filename = "bubble_zoom_NSAF.png"))
+ − 258 }
+ − 259 }
+ − 260 ###################################################################################################
+ − 261 # Check Saintscore cutoff and stop program if not between 0 and 1
+ − 262 ###################################################################################################
+ − 263 cutoff_check <- function(cutoff){
+ − 264 if( any(cutoff < 0 | cutoff > 1) ) stop('SAINT score cutoff not between 0 and 1. Please correct and try again')
+ − 265 }
+ − 266
+ − 267 args <- commandArgs(trailingOnly = TRUE)
+ − 268 main(args[1], args[2], args[3], args[4], args[5], args[6], args[7], args[8], args[9])