Mercurial > repos > galaxyp > mqppep_preproc
diff MaxQuantProcessingScript.R @ 0:8dfd5d2b5903 draft
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit 3a7b3609d6e514c9e8f980ecb684960c6b2252fe
author | galaxyp |
---|---|
date | Mon, 11 Jul 2022 19:22:54 +0000 |
parents | |
children | b76c75521d91 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MaxQuantProcessingScript.R Mon Jul 11 19:22:54 2022 +0000 @@ -0,0 +1,705 @@ +#!/usr/bin/env Rscript + +# This is the implementation for the +# "MaxQuant Phosphopeptide Localization Probability Cutoff" +# Galaxy tool (mqppep_lclztn_filter) +# It is adapted from the MaxQuant Processing Script written by Larry Cheng. + +# libraries +library(optparse) +library(data.table) +library(stringr) +library(ggplot2) + +# title: "MaxQuant Processing Script" +# author: "Larry Cheng" +# date: "February 19, 2018" +# +# # MaxQuant Processing Script +# Takes MaxQuant Phospho (STY)sites.txt file as input +# and performs the following (in order): +# 1) Runs the Proteomics Quality Control software +# 2) Remove contaminant and reverse sequence rows +# 3) Filters rows based on localization probability +# 4) Extract the quantitative data +# 5) Sequences phosphopeptides +# 6) Merges multiply phosphorylated peptides +# 7) Filters out phosphopeptides based on enrichment +# The output file contains the phosphopeptide (first column) +# and the quantitative values for each sample. +# +# ## Revision History +# Rev. 2022-02-10 :wrap for inclusion in Galaxy +# Rev. 2018-02-19 :break up analysis script into "MaxQuant Processing Script" +# and "Phosphopeptide Processing Script" +# Rev. 2017-12-12 :added PTXQC +# added additional plots and table outputs for quality control +# allowed for more than 2 samples to be grouped together +# (up to 26 (eg, 1A, 1B, 1C, etc)) +# converted from .r to .rmd file to knit report +# for quality control +# Rev. 2016-09-11 :automated the FDR cutoffs; removed the option to data +# impute multiple times +# Rev. 2016-09-09 :added filter to eliminate contaminant & reverse sequence rows +# Rev. 2016-09-01 :moved the collapse step from after ANOVA filter to prior to +# preANOVA file output +# Rev. 2016-08-22 :use regexSampleNames <- "\\.(\\d + )[AB]$" +# so that it looks at the end of string +# Rev. 2016-08-05 :Removed vestigial line (ppeptides <- ....) +# Rev. 2016-07-03 :Removed row names from the write.table() output for +# ANOVA and PreANOVA +# Rev. 2016-06-25 :Set default Localization Probability cutoff to 0.75 +# Rev. 2016-06-23 :fixed a bug in filtering for pY enrichment by resetting +# the row numbers afterwards +# Rev. 2016-06-21 :test18 + standardized the regexpression in protocol + + +### FUNCTION DECLARATIONS begin ---------------------------------------------- + +# Read first line of file at filePath +# adapted from: https://stackoverflow.com/a/35761217/15509512 +read_first_line <- function(filepath) { + con <- file(filepath, "r") + line <- readLines(con, n = 1) + close(con) + return(line) +} + +# Move columns to the end of dataframe +# - data: the dataframe +# - move: a vector of column names, each of which is an element of names(data) +movetolast <- function(data, move) { + data[c(setdiff(names(data), move), move)] +} + +# Generate phosphopeptide and build list when applied +phosphopeptide_func <- function(df) { + # generate peptide sequence and list of phosphopositions + phosphoprobsequence <- + strsplit(as.character(df["Phospho (STY) Score diffs"]), "")[[1]] + output <- vector() + phosphopeptide <- "" + counter <- 0 # keep track of position in peptide + phosphopositions <- + vector() # keep track of phosphorylation positions in peptide + score_diff <- "" + for (chara in phosphoprobsequence) { + # build peptide sequence + if (!( + chara == " " || + chara == "(" || + chara == ")" || + chara == "." || + chara == "-" || + chara == "0" || + chara == "1" || + chara == "2" || + chara == "3" || + chara == "4" || + chara == "5" || + chara == "6" || + chara == "7" || + chara == "8" || + chara == "9") + ) { + phosphopeptide <- paste(phosphopeptide, chara, sep = "") + counter <- counter + 1 + } + # generate score_diff + if (chara == "-" || + chara == "." || + chara == "0" || + chara == "1" || + chara == "2" || + chara == "3" || + chara == "4" || + chara == "5" || + chara == "6" || + chara == "7" || + chara == "8" || + chara == "9" + ) { + score_diff <- paste(score_diff, chara, sep = "") + } + # evaluate score_diff + if (chara == ")") { + score_diff <- as.numeric(score_diff) + # only consider a phosphoresidue if score_diff > 0 + if (score_diff > 0) { + phosphopositions <- append(phosphopositions, counter) + } + score_diff <- "" + } + } + + # generate phosphopeptide sequence (ie, peptide sequence with "p"'s) + counter <- 1 + phosphoposition_correction1 <- + -1 # used to correct phosphosposition as "p"'s + # are inserted into the phosphopeptide string + phosphoposition_correction2 <- + 0 # used to correct phosphosposition as "p"'s + # are inserted into the phosphopeptide string + while (counter <= length(phosphopositions)) { + phosphopeptide <- + paste( + substr( + phosphopeptide, + 0, + phosphopositions[counter] + phosphoposition_correction1 + ), + "p", + substr( + phosphopeptide, + phosphopositions[counter] + phosphoposition_correction2, + nchar(phosphopeptide) + ), + sep = "" + ) + counter <- counter + 1 + phosphoposition_correction1 <- phosphoposition_correction1 + 1 + phosphoposition_correction2 <- phosphoposition_correction2 + 1 + } + # building phosphopeptide list + output <- append(output, phosphopeptide) + return(output) +} + +### FUNCTION DECLARATIONS end ------------------------------------------------ + + +### EXTRACT ARGUMENTS begin -------------------------------------------------- + +# parse options +option_list <- list( + make_option( + c("-i", "--input"), + action = "store", + type = "character", + help = "A MaxQuant Phospho (STY)Sites.txt" + ) + , + make_option( + c("-o", "--output"), + action = "store", + type = "character", + help = "path to output file" + ) + , + make_option( + c("-E", "--enrichGraph"), + action = "store", + type = "character", + help = "path to enrichment graph PDF" + ) + , + make_option( + c("-F", "--enrichGraph_svg"), + action = "store", + type = "character", + help = "path to enrichment graph SVG" + ) + , + make_option( + c("-L", "--locProbCutoffGraph"), + action = "store", + type = "character", + help = "path to location-proability cutoff graph PDF" + ) + , + make_option( + c("-M", "--locProbCutoffGraph_svg"), + action = "store", + type = "character", + help = "path to location-proability cutoff graph SVG" + ) + , + make_option( + c("-e", "--enriched"), + action = "store", + type = "character", + help = "pY or pST enriched samples (ie, 'Y' or 'ST')" + ) + # default = "^Number of Phospho [(]STY[)]$", + , + make_option( + c("-p", "--phosphoCol"), + action = "store", + type = "character", + help = paste0("PERL-compatible regular expression matching", + " header of column having number of 'Phospho (STY)'") + ) + # default = "^Intensity[^_]", + , + make_option( + c("-s", "--startCol"), + action = "store", + type = "character", + help = paste0("PERL-compatible regular expression matching", + " header of column having first sample intensity") + ) + # default = 1, + , + make_option( + c("-I", "--intervalCol"), + action = "store", + type = "integer", + help = paste0("Column interval between the Intensities of samples", + " (eg, 1 if subsequent column; 2 if every other column") + ) + # default = 0.75, + , + make_option( + c("-l", "--localProbCutoff"), + action = "store", + type = "double", + help = "Localization Probability Cutoff" + ) + # default = "sum", + , + make_option( + c("-f", "--collapse_func"), + action = "store", + type = "character", + help = paste0("merge identical phosphopeptides", + " by ('sum' or 'average') the intensities") + ) + # default = "filtered_data.txt", + , + make_option( + c("-r", "--filtered_data"), + action = "store", + type = "character", + help = "filtered_data.txt" + ) + # default = "quantData.txt", + , + make_option( + c("-q", "--quant_data"), + action = "store", + type = "character", + help = "quantData.txt" + ) +) +args <- parse_args(OptionParser(option_list = option_list)) +# Check parameter values + +### EXTRACT ARGUMENTS end ---------------------------------------------------- + + +### EXTRACT PARAMETERS from arguments begin ---------------------------------- + +if (!file.exists(args$input)) { + stop((paste("File", args$input, "does not exist"))) +} + +phospho_col_pattern <- "^Number of Phospho [(][STY][STY]*[)]$" +start_col_pattern <- "^Intensity[^_]" +phospho_col_pattern <- read_first_line(args$phosphoCol) +start_col_pattern <- read_first_line(args$startCol) + +sink(getConnection(2)) + +input_file_name <- args$input +filtered_filename <- args$filtered_data +quant_file_name <- args$quant_data +interval_col <- as.integer(args$intervalCol) + +first_line <- read_first_line(input_file_name) +col_headers <- + unlist(strsplit( + x = first_line, + split = c("\t"), + fixed = TRUE + )) +sink(getConnection(2)) +sink() + + +intensity_header_cols <- + grep(pattern = start_col_pattern, x = col_headers, perl = TRUE) +if (length(intensity_header_cols) == 0) { + err_msg <- + paste("Found no intensity columns matching pattern:", + start_col_pattern) + # Divert output to stderr + sink(getConnection(2)) + print(err_msg) + sink() + stop(err_msg) +} + + +phospho_col <- + grep(pattern = phospho_col_pattern, x = col_headers, perl = TRUE)[1] +if (is.na(phospho_col)) { + err_msg <- + paste("Found no 'number of phospho sites' columns matching pattern:", + phospho_col_pattern) + # Divert output to stderr + sink(getConnection(2)) + print(err_msg) + sink() + stop(err_msg) +} + + +i_count <- 0 +this_column <- 1 +last_value <- intensity_header_cols[1] +intensity_cols <- c(last_value) + +while (length(intensity_header_cols) >= interval_col * i_count) { + i_count <- 1 + i_count + this_column <- interval_col + this_column + if (last_value + interval_col != intensity_header_cols[this_column]) + break + last_value <- intensity_header_cols[this_column] + if (length(intensity_header_cols) < interval_col * i_count) + break + intensity_cols <- + c(intensity_cols, intensity_header_cols[this_column]) +} + +start_col <- intensity_cols[1] +num_samples <- i_count + +output_filename <- args$output +enrich_graph_filename <- args$enrichGraph +loc_prob_cutoff_graph_filename <- args$locProbCutoffGraph +enrich_graph_filename_svg <- args$enrichGraph_svg +loc_prob_cutoff_graph_fn_svg <- args$locProbCutoffGraph_svg + +local_prob_cutoff <- args$localProbCutoff +enriched <- args$enriched +collapse_fn <- args$collapse_func + +### EXTRACT PARAMETERS from arguments end ------------------------------------ + + +# Proteomics Quality Control for MaxQuant Results +# (Bielow C et al. J Proteome Res. 2016 PMID: 26653327) +# is run by the Galaxy MaxQuant wrapper and need not be invoked here. + + +# Read & filter out contaminants, reverse sequences, & localization probability +# --- +full_data <- + read.table( + file = input_file_name, + sep = "\t", + header = TRUE, + quote = "" + ) + +# Filter out contaminant rows and reverse rows +filtered_data <- subset(full_data, !grepl("CON__", Proteins)) +filtered_data <- + subset(filtered_data, !grepl("_MYCOPLASMA", Proteins)) +filtered_data <- + subset(filtered_data, !grepl("CONTAMINANT_", Proteins)) +filtered_data <- + subset(filtered_data, !grepl("REV__", Protein) + ) # since REV__ rows are blank in the first column (Proteins) +write.table( + filtered_data, + file = filtered_filename, + sep = "\t", + quote = FALSE, + col.names = TRUE, + row.names = FALSE +) +# ... + + +# Filter out data with localization probability below localProbCutoff +# --- +# Data filtered by localization probability +loc_prob_filtered_data <- + filtered_data[ + filtered_data$Localization.prob >= local_prob_cutoff, + ] +# ... + + +# Localization probability -- visualize locprob cutoff +# --- +loc_prob_graph_data <- + data.frame( + group = c(paste(">", toString(local_prob_cutoff), sep = ""), + paste("<", toString(local_prob_cutoff), sep = "")), + value = c( + nrow(loc_prob_filtered_data) / nrow(filtered_data) * 100, + (nrow(filtered_data) - nrow(loc_prob_filtered_data)) + / nrow(filtered_data) * 100 + ) + ) +gigi <- + ggplot(loc_prob_graph_data, aes(x = "", y = value, fill = group)) + + geom_bar(width = 0.5, + stat = "identity", + color = "black") + + labs(x = NULL, + y = "percent", + title = "Phosphopeptides partitioned by localization-probability cutoff" + ) + + scale_fill_discrete(name = "phosphopeptide\nlocalization-\nprobability") + + theme_minimal() + + theme( + legend.position = "right", + legend.title = element_text(), + plot.title = element_text(hjust = 0.5), + plot.subtitle = element_text(hjust = 0.5), + plot.title.position = "plot" + ) +pdf(loc_prob_cutoff_graph_filename) +print(gigi) +dev.off() +svg(loc_prob_cutoff_graph_fn_svg) +print(gigi) +dev.off() +# ... + + +# Extract quantitative values from filtered data +# --- +quant_data <- + loc_prob_filtered_data[, seq(from = start_col, + by = interval_col, + length.out = num_samples)] +# ... + + +# Generate Phosphopeptide Sequence +# for latest version of MaxQuant (Version 1.5.3.30) +# --- +metadata_df <- + data.frame( + loc_prob_filtered_data[, 1:8], + loc_prob_filtered_data[, phospho_col], + loc_prob_filtered_data[, phospho_col + 1], + loc_prob_filtered_data[, phospho_col + 2], + loc_prob_filtered_data[, phospho_col + 3], + loc_prob_filtered_data[, phospho_col + 4], + loc_prob_filtered_data[, phospho_col + 5], + loc_prob_filtered_data[, phospho_col + 6], + loc_prob_filtered_data[, phospho_col + 7], + quant_data + ) +colnames(metadata_df) <- + c( + "Proteins", + "Positions within proteins", + "Leading proteins", + "Protein", + "Protein names", + "Gene names", + "Fasta headers", + "Localization prob", + "Number of Phospho (STY)", + "Amino Acid", + "Sequence window", + "Modification window", + "Peptide window coverage", + "Phospho (STY) Probabilities", + "Phospho (STY) Score diffs", + "Position in peptide", + colnames(quant_data) + ) +# 'phosphopeptide_func' generates a phosphopeptide sequence +# for each row of data. +# for the 'apply' function: MARGIN 1 == rows, 2 == columns, c(1, 2) = both +metadata_df$phosphopeptide <- + apply(X = metadata_df, MARGIN = 1, FUN = phosphopeptide_func) +colnames(metadata_df)[1] <- "Phosphopeptide" +# Move the quant data columns to the right end of the data.frame +metadata_df <- movetolast(metadata_df, c(colnames(quant_data))) +# ... + + +# Write quantitative values for debugging purposes +# --- +quant_write <- cbind(metadata_df[, "Sequence window"], quant_data) +colnames(quant_write)[1] <- "Sequence.Window" +write.table( + quant_write, + file = quant_file_name, + sep = "\t", + quote = FALSE, + col.names = TRUE, + row.names = FALSE +) +# ... + + +# Make new data frame containing only Phosphopeptides +# that are to be mapped to quant data (merge_df) +# --- +metadata_df <- + setDT(metadata_df, keep.rownames = TRUE) # row name will be used to map +merge_df <- + data.frame( + as.integer(metadata_df$rn), + metadata_df$phosphopeptide # row index to merge data frames + ) +colnames(merge_df) <- c("rn", "Phosphopeptide") +# ... + + +# Add Phosphopeptide column to quant columns for quality control checking +# --- +quant_data_qc <- as.data.frame(quant_data) +setDT(quant_data_qc, keep.rownames = TRUE) # will use to match rowname to data +quant_data_qc$rn <- as.integer(quant_data_qc$rn) +quant_data_qc <- merge(merge_df, quant_data_qc, by = "rn") +quant_data_qc$rn <- NULL # remove rn column +# ... + + +# Collapse multiphosphorylated peptides +# --- +quant_data_qc_collapsed <- + data.table(quant_data_qc, key = "Phosphopeptide") +quant_data_qc_collapsed <- + aggregate(. ~ Phosphopeptide, quant_data_qc, FUN = collapse_fn) +# ... +print("quant_data_qc_collapsed") +head(quant_data_qc_collapsed) + +# Compute (as string) % of phosphopeptides that are multiphosphorylated +# (for use in next step) +# --- +pct_multiphos <- + ( + nrow(quant_data_qc) - nrow(quant_data_qc_collapsed) + ) / (2 * nrow(quant_data_qc)) +pct_multiphos <- sprintf("%0.1f%s", 100 * pct_multiphos, "%") +# ... + + +# Compute and visualize breakdown of pY, pS, and pT before enrichment filter +# --- +py_data <- + quant_data_qc_collapsed[ + str_detect(quant_data_qc_collapsed$Phosphopeptide, "pY"), + ] +ps_data <- + quant_data_qc_collapsed[ + str_detect(quant_data_qc_collapsed$Phosphopeptide, "pS"), + ] +pt_data <- + quant_data_qc_collapsed[ + str_detect(quant_data_qc_collapsed$Phosphopeptide, "pT"), + ] + +py_num <- nrow(py_data) +ps_num <- nrow(ps_data) +pt_num <- nrow(pt_data) + +# Visualize enrichment +enrich_graph_data <- data.frame(group = c("pY", "pS", "pT"), + value = c(py_num, ps_num, pt_num)) + +enrich_graph_data <- + enrich_graph_data[ + enrich_graph_data$value > 0, + ] + +# Plot pie chart with legend +# start: https://stackoverflow.com/a/62522478/15509512 +# refine: https://www.statology.org/ggplot-pie-chart/ +# colors: https://colorbrewer2.org/#type=diverging&scheme=BrBG&n=8 +slices <- enrich_graph_data$value +phosphoresidue <- enrich_graph_data$group +pct <- round(100 * slices / sum(slices)) +lbls <- + paste(enrich_graph_data$group, "\n", pct, "%\n(", slices, ")", sep = "") +slc_ctr <- c() +run_tot <- 0 +for (p in pct) { + slc_ctr <- c(slc_ctr, run_tot + p / 2.0) + run_tot <- run_tot + p +} +lbl_y <- 100 - slc_ctr +df <- + data.frame(slices, + pct, + lbls, + phosphoresidue = factor(phosphoresidue, levels = phosphoresidue)) +gigi <- ggplot(df + , aes(x = 1, y = pct, fill = phosphoresidue)) + + geom_col(position = "stack", orientation = "x") + + geom_text(aes(x = 1, y = lbl_y, label = lbls), col = "black") + + coord_polar(theta = "y", direction = -1) + + labs( + x = NULL + , + y = NULL + , + title = "Percentages (and counts) of phosphosites, by type of residue" + , + caption = sprintf( + "Roughly %s of peptides have multiple phosphosites.", + pct_multiphos + ) + ) + + labs(x = NULL, y = NULL, fill = NULL) + + theme_classic() + + theme( + legend.position = "right" + , + axis.line = element_blank() + , + axis.text = element_blank() + , + axis.ticks = element_blank() + , + plot.title = element_text(hjust = 0.5) + , + plot.subtitle = element_text(hjust = 0.5) + , + plot.caption = element_text(hjust = 0.5) + , + plot.title.position = "plot" + ) + + scale_fill_manual(breaks = phosphoresidue, + values = c("#c7eae5", "#f6e8c3", "#dfc27d")) + +pdf(enrich_graph_filename) +print(gigi) +dev.off() +svg(enrich_graph_filename_svg) +print(gigi) +dev.off() +# ... + + +# Filter phosphopeptides by enrichment +# -- +if (enriched == "Y") { + quant_data_qc_enrichment <- quant_data_qc_collapsed[ + str_detect(quant_data_qc_collapsed$Phosphopeptide, "pY"), + ] +} else if (enriched == "ST") { + quant_data_qc_enrichment <- quant_data_qc_collapsed[ + str_detect(quant_data_qc_collapsed$Phosphopeptide, "pS") | + str_detect(quant_data_qc_collapsed$Phosphopeptide, "pT"), + ] +} else { + print("Error in enriched variable. Set to either 'Y' or 'ST'") +} +# ... + +print("quant_data_qc_enrichment") +head(quant_data_qc_enrichment) + +# Write phosphopeptides filtered by enrichment +# -- +write.table( + quant_data_qc_enrichment, + file = output_filename, + sep = "\t", + quote = FALSE, + row.names = FALSE +) +# ...