Mercurial > repos > galaxyp > mqppep_anova
changeset 0:dbff53e6f75f draft
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit 3a7b3609d6e514c9e8f980ecb684960c6b2252fe
author | galaxyp |
---|---|
date | Mon, 11 Jul 2022 19:22:25 +0000 |
parents | |
children | 08678c931f5d |
files | MaxQuantProcessingScript.R PhosphoPeptide_Upstream_Kinase_Mapping.pl macros.xml mqppep_anova.R mqppep_anova.xml mqppep_anova_script.Rmd mqppep_mrgfltr.py search_ppep.py test-data/alpha_levels.tabular test-data/pSTY_motifs.tabular test-data/test_input_for_anova.sqlite test-data/test_input_for_anova.tabular test-data/test_input_for_preproc.tabular test-data/test_kinase_substrate.tabular test-data/test_networkin.tabular test-data/test_regulatory_sites.tabular test-data/test_swissprot.fasta workflow/ppenrich_suite_wf.ga |
diffstat | 18 files changed, 10783 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/env Rscript
# Source file: MaxQuantProcessingScript.R

# This is the implementation for the
#   "MaxQuant Phosphopeptide Localization Probability Cutoff"
#   Galaxy tool (mqppep_lclztn_filter)
# It is adapted from the MaxQuant Processing Script written by Larry Cheng.

# libraries
library(optparse)
library(data.table)
library(stringr)
library(ggplot2)

# title: "MaxQuant Processing Script"
# author: "Larry Cheng"
# date: "February 19, 2018"
#
# # MaxQuant Processing Script
# Takes MaxQuant Phospho (STY)sites.txt file as input
# and performs the following (in order):
# 1) Runs the Proteomics Quality Control software
# 2) Remove contaminant and reverse sequence rows
# 3) Filters rows based on localization probability
# 4) Extract the quantitative data
# 5) Sequences phosphopeptides
# 6) Merges multiply phosphorylated peptides
# 7) Filters out phosphopeptides based on enrichment
# The output file contains the phosphopeptide (first column)
# and the quantitative values for each sample.
#
# ## Revision History
# Rev. 2022-02-10 :wrap for inclusion in Galaxy
# Rev. 2018-02-19 :break up analysis script into "MaxQuant Processing Script"
#                  and "Phosphopeptide Processing Script"
# Rev. 2017-12-12 :added PTXQC
#                  added additional plots and table outputs for quality control
#                  allowed for more than 2 samples to be grouped together
#                  (up to 26 (eg, 1A, 1B, 1C, etc))
#                  converted from .r to .rmd file to knit report
#                  for quality control
# Rev. 2016-09-11 :automated the FDR cutoffs; removed the option to data
#                  impute multiple times
# Rev. 2016-09-09 :added filter to eliminate contaminant & reverse sequence rows
# Rev. 2016-09-01 :moved the collapse step from after ANOVA filter to prior to
#                  preANOVA file output
# Rev. 2016-08-22 :use regexSampleNames <- "\\.(\\d + )[AB]$"
#                  so that it looks at the end of string
# Rev. 2016-08-05 :Removed vestigial line (ppeptides <- ....)
# Rev. 2016-07-03 :Removed row names from the write.table() output for
#                  ANOVA and PreANOVA
# Rev. 2016-06-25 :Set default Localization Probability cutoff to 0.75
# Rev. 2016-06-23 :fixed a bug in filtering for pY enrichment by resetting
#                  the row numbers afterwards
# Rev. 2016-06-21 :test18 + standardized the regexpression in protocol


### FUNCTION DECLARATIONS begin ----------------------------------------------

# Read first line of file at filepath
# adapted from: https://stackoverflow.com/a/35761217/15509512
# - filepath: path of the file to read
# Returns the first line as a character vector (length zero for an empty file).
read_first_line <- function(filepath) {
  con <- file(filepath, "r")
  # close the connection even when readLines() signals an error
  on.exit(close(con), add = TRUE)
  readLines(con, n = 1)
}

# Move columns to the end of dataframe
# - data: the dataframe
# - move: a vector of column names, each of which is an element of names(data)
movetolast <- function(data, move) {
  data[c(setdiff(names(data), move), move)]
}

# Generate phosphopeptide and build list when applied
# - df: one row of the metadata table (anything that can be indexed by the
#       name "Phospho (STY) Score diffs"), e.g. "AAS(12.3)PT(-5.1)K"
# Returns a length-one character vector holding the peptide sequence with a
# "p" inserted before every residue whose parenthesized score diff is > 0.
phosphopeptide_func <- function(df) {
  # characters that make up a score diff, e.g. "-12.34"
  score_chars <- c("-", ".", as.character(0:9))
  # characters that are not part of the peptide sequence
  non_residue_chars <- c(" ", "(", ")", score_chars)

  # generate peptide sequence and list of phosphopositions
  phosphoprobsequence <-
    strsplit(as.character(df["Phospho (STY) Score diffs"]), "")[[1]]
  phosphopeptide <- ""
  counter <- 0 # keep track of position in peptide
  phosphopositions <-
    vector() # keep track of phosphorylation positions in peptide
  score_diff <- ""
  for (chara in phosphoprobsequence) {
    # build peptide sequence
    if (!(chara %in% non_residue_chars)) {
      phosphopeptide <- paste0(phosphopeptide, chara)
      counter <- counter + 1
    }
    # generate score_diff
    if (chara %in% score_chars) {
      score_diff <- paste0(score_diff, chara)
    }
    # evaluate score_diff
    if (chara == ")") {
      score_diff <- as.numeric(score_diff)
      # only consider a phosphoresidue if score_diff > 0
      if (score_diff > 0) {
        phosphopositions <- append(phosphopositions, counter)
      }
      score_diff <- ""
    }
  }

  # generate phosphopeptide sequence (ie, peptide sequence with "p"'s)
  for (k in seq_along(phosphopositions)) {
    # every "p" inserted so far shifted the remaining residues right by one
    insert_at <- phosphopositions[k] + (k - 1)
    phosphopeptide <-
      paste0(
        substr(phosphopeptide, 0, insert_at - 1),
        "p",
        substr(phosphopeptide, insert_at, nchar(phosphopeptide))
      )
  }
  phosphopeptide
}

### FUNCTION DECLARATIONS end ------------------------------------------------


### EXTRACT ARGUMENTS begin --------------------------------------------------

# parse options
# Note: --phosphoCol and --startCol are paths to files whose first line holds
#       the regular expression (see read_first_line calls below).
option_list <- list(
  make_option(
    c("-i", "--input"),
    action = "store",
    type = "character",
    help = "A MaxQuant Phospho (STY)Sites.txt"
  ),
  make_option(
    c("-o", "--output"),
    action = "store",
    type = "character",
    help = "path to output file"
  ),
  make_option(
    c("-E", "--enrichGraph"),
    action = "store",
    type = "character",
    help = "path to enrichment graph PDF"
  ),
  make_option(
    c("-F", "--enrichGraph_svg"),
    action = "store",
    type = "character",
    help = "path to enrichment graph SVG"
  ),
  make_option(
    c("-L", "--locProbCutoffGraph"),
    action = "store",
    type = "character",
    help = "path to location-proability cutoff graph PDF"
  ),
  make_option(
    c("-M", "--locProbCutoffGraph_svg"),
    action = "store",
    type = "character",
    help = "path to location-proability cutoff graph SVG"
  ),
  make_option(
    c("-e", "--enriched"),
    action = "store",
    type = "character",
    help = "pY or pST enriched samples (ie, 'Y' or 'ST')"
  ),
  # default = "^Number of Phospho [(]STY[)]$",
  make_option(
    c("-p", "--phosphoCol"),
    action = "store",
    type = "character",
    help = paste0("PERL-compatible regular expression matching",
                  " header of column having number of 'Phospho (STY)'")
  ),
  # default = "^Intensity[^_]",
  make_option(
    c("-s", "--startCol"),
    action = "store",
    type = "character",
    help = paste0("PERL-compatible regular expression matching",
                  " header of column having first sample intensity")
  ),
  # default = 1,
  make_option(
    c("-I", "--intervalCol"),
    action = "store",
    type = "integer",
    help = paste0("Column interval between the Intensities of samples",
                  " (eg, 1 if subsequent column; 2 if every other column")
  ),
  # default = 0.75,
  make_option(
    c("-l", "--localProbCutoff"),
    action = "store",
    type = "double",
    help = "Localization Probability Cutoff"
  ),
  # default = "sum",
  make_option(
    c("-f", "--collapse_func"),
    action = "store",
    type = "character",
    help = paste0("merge identical phosphopeptides",
                  " by ('sum' or 'average') the intensities")
  ),
  # default = "filtered_data.txt",
  make_option(
    c("-r", "--filtered_data"),
    action = "store",
    type = "character",
    help = "filtered_data.txt"
  ),
  # default = "quantData.txt",
  make_option(
    c("-q", "--quant_data"),
    action = "store",
    type = "character",
    help = "quantData.txt"
  )
)
args <- parse_args(OptionParser(option_list = option_list))
# Check parameter values

### EXTRACT ARGUMENTS end ----------------------------------------------------


### EXTRACT PARAMETERS from arguments begin ----------------------------------

# is.null() guard: file.exists(NULL) would fail before reaching stop()
if (is.null(args$input) || !file.exists(args$input)) {
  stop((paste("File", args$input, "does not exist")))
}

# Built-in patterns are used only when the corresponding option is absent;
# otherwise the pattern is the first line of the file named by the option.
phospho_col_pattern <- "^Number of Phospho [(][STY][STY]*[)]$"
start_col_pattern <- "^Intensity[^_]"
if (!is.null(args$phosphoCol)) {
  phospho_col_pattern <- read_first_line(args$phosphoCol)
}
if (!is.null(args$startCol)) {
  start_col_pattern <- read_first_line(args$startCol)
}

input_file_name <- args$input
filtered_filename <- args$filtered_data
quant_file_name <- args$quant_data
interval_col <- as.integer(args$intervalCol)

first_line <- read_first_line(input_file_name)
col_headers <-
  unlist(strsplit(
    x = first_line,
    split = c("\t"),
    fixed = TRUE
  ))

# Divert standard output to stderr for the remainder of the script.
# (Same net effect as the original sequence, which pushed this diversion
#  twice and popped it once.)
sink(getConnection(2))


intensity_header_cols <-
  grep(pattern = start_col_pattern, x = col_headers, perl = TRUE)
if (length(intensity_header_cols) == 0) {
  err_msg <-
    paste("Found no intensity columns matching pattern:",
          start_col_pattern)
  # Divert output to stderr
  sink(getConnection(2))
  print(err_msg)
  sink()
  stop(err_msg)
}


phospho_col <-
  grep(pattern = phospho_col_pattern, x = col_headers, perl = TRUE)[1]
if (is.na(phospho_col)) {
  err_msg <-
    paste("Found no 'number of phospho sites' columns matching pattern:",
          phospho_col_pattern)
  # Divert output to stderr
  sink(getConnection(2))
  print(err_msg)
  sink()
  stop(err_msg)
}


# Walk the matched header positions in steps of interval_col, collecting
# sample columns for as long as they stay evenly spaced in the input table.
i_count <- 0
this_column <- 1
last_value <- intensity_header_cols[1]
intensity_cols <- c(last_value)

while (length(intensity_header_cols) >= interval_col * i_count) {
  i_count <- 1 + i_count
  this_column <- interval_col + this_column
  # Ran off the end of the matched headers: every match has been consumed.
  # (Without this guard the comparison below yields NA and `if` aborts.)
  if (this_column > length(intensity_header_cols)) {
    break
  }
  if (last_value + interval_col != intensity_header_cols[this_column]) {
    break
  }
  last_value <- intensity_header_cols[this_column]
  if (length(intensity_header_cols) < interval_col * i_count) {
    break
  }
  intensity_cols <-
    c(intensity_cols, intensity_header_cols[this_column])
}

start_col <- intensity_cols[1]
num_samples <- i_count

output_filename <- args$output
enrich_graph_filename <- args$enrichGraph
loc_prob_cutoff_graph_filename <- args$locProbCutoffGraph
enrich_graph_filename_svg <- args$enrichGraph_svg
loc_prob_cutoff_graph_fn_svg <- args$locProbCutoffGraph_svg

local_prob_cutoff <- args$localProbCutoff
enriched <- args$enriched
collapse_fn <- args$collapse_func

### EXTRACT PARAMETERS from arguments end ------------------------------------


# Proteomics Quality Control for MaxQuant Results
#  (Bielow C et al. J Proteome Res. 2016 PMID: 26653327)
# is run by the Galaxy MaxQuant wrapper and need not be invoked here.


# Read & filter out contaminants, reverse sequences, & localization probability
# ---
full_data <-
  read.table(
    file = input_file_name,
    sep = "\t",
    header = TRUE,
    quote = ""
  )

# Filter out contaminant rows and reverse rows
filtered_data <- subset(full_data, !grepl("CON__", Proteins))
filtered_data <-
  subset(filtered_data, !grepl("_MYCOPLASMA", Proteins))
filtered_data <-
  subset(filtered_data, !grepl("CONTAMINANT_", Proteins))
filtered_data <-
  subset(filtered_data, !grepl("REV__", Protein)
  ) # since REV__ rows are blank in the first column (Proteins)
write.table(
  filtered_data,
  file = filtered_filename,
  sep = "\t",
  quote = FALSE,
  col.names = TRUE,
  row.names = FALSE
)
# ...


# Filter out data with localization probability below localProbCutoff
# ---
# Data filtered by localization probability
loc_prob_filtered_data <-
  filtered_data[
    filtered_data$Localization.prob >= local_prob_cutoff,
  ]
# ...


# Localization probability -- visualize locprob cutoff
# ---
# Two-row table: percentage of rows kept by the localization-probability
# filter and percentage of rows rejected by it.
# NOTE(review): the first label reads ">" although the filter keeps rows
#   with probability >= cutoff; label text is left unchanged here.
n_unfiltered <- nrow(filtered_data)
n_retained <- nrow(loc_prob_filtered_data)
loc_prob_graph_data <-
  data.frame(
    group = c(paste0(">", toString(local_prob_cutoff)),
              paste0("<", toString(local_prob_cutoff))),
    value = c(
      n_retained / n_unfiltered * 100,
      (n_unfiltered - n_retained) / n_unfiltered * 100
    )
  )
gigi <-
  ggplot(loc_prob_graph_data, aes(x = "", y = value, fill = group)) +
  geom_bar(width = 0.5,
           stat = "identity",
           color = "black") +
  labs(x = NULL,
       y = "percent",
       title = "Phosphopeptides partitioned by localization-probability cutoff"
  ) +
  scale_fill_discrete(name = "phosphopeptide\nlocalization-\nprobability") +
  theme_minimal() +
  theme(
    legend.position = "right",
    legend.title = element_text(),
    plot.title = element_text(hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5),
    plot.title.position = "plot"
  )
# Render the same plot to both the PDF and the SVG output paths
pdf(loc_prob_cutoff_graph_filename)
print(gigi)
dev.off()
svg(loc_prob_cutoff_graph_fn_svg)
print(gigi)
dev.off()
# ...


# Extract quantitative values from filtered data
# ---
# drop = FALSE keeps quant_data a data.frame (with column names) even when
# there is only one sample column; otherwise it would collapse to a vector.
quant_data <-
  loc_prob_filtered_data[
    ,
    seq(from = start_col, by = interval_col, length.out = num_samples),
    drop = FALSE
  ]
# ...


# Generate Phosphopeptide Sequence
#   for latest version of MaxQuant (Version 1.5.3.30)
# ---
# Columns: the first eight input columns, then the eight consecutive columns
# starting at phospho_col, then the per-sample quantitative columns.
# (Column names are assigned explicitly just below.)
metadata_df <-
  data.frame(
    loc_prob_filtered_data[, 1:8],
    loc_prob_filtered_data[, phospho_col:(phospho_col + 7)],
    quant_data
  )
colnames(metadata_df) <-
  c(
    "Proteins",
    "Positions within proteins",
    "Leading proteins",
    "Protein",
    "Protein names",
    "Gene names",
    "Fasta headers",
    "Localization prob",
    "Number of Phospho (STY)",
    "Amino Acid",
    "Sequence window",
    "Modification window",
    "Peptide window coverage",
    "Phospho (STY) Probabilities",
    "Phospho (STY) Score diffs",
    "Position in peptide",
    colnames(quant_data)
  )
# 'phosphopeptide_func' generates a phosphopeptide sequence
#   for each row of data.
# It reads only the "Phospho (STY) Score diffs" entry of its argument, so it
#   is applied to that column alone rather than coercing every row of the
#   mixed-type data.frame to character with apply().
score_diffs_col <- "Phospho (STY) Score diffs"
metadata_df$phosphopeptide <-
  vapply(
    X = as.character(metadata_df[[score_diffs_col]]),
    FUN = function(score_diffs) {
      phosphopeptide_func(setNames(score_diffs, score_diffs_col))
    },
    FUN.VALUE = character(1),
    USE.NAMES = FALSE
  )
colnames(metadata_df)[1] <- "Phosphopeptide"
# Move the quant data columns to the right end of the data.frame
metadata_df <- movetolast(metadata_df, c(colnames(quant_data)))
# ...


# Write quantitative values for debugging purposes
# ---
quant_write <- cbind(metadata_df[, "Sequence window"], quant_data)
colnames(quant_write)[1] <- "Sequence.Window"
write.table(
  quant_write,
  file = quant_file_name,
  sep = "\t",
  quote = FALSE,
  col.names = TRUE,
  row.names = FALSE
)
# ...


# Make new data frame containing only Phosphopeptides
#   that are to be mapped to quant data (merge_df)
# ---
metadata_df <-
  setDT(metadata_df, keep.rownames = TRUE) # row name will be used to map
merge_df <-
  data.frame(
    as.integer(metadata_df$rn), # row index to merge data frames
    metadata_df$phosphopeptide
  )
colnames(merge_df) <- c("rn", "Phosphopeptide")
# ...


# Add Phosphopeptide column to quant columns for quality control checking
# ---
quant_data_qc <- as.data.frame(quant_data)
setDT(quant_data_qc, keep.rownames = TRUE) # will use to match rowname to data
quant_data_qc$rn <- as.integer(quant_data_qc$rn)
quant_data_qc <- merge(merge_df, quant_data_qc, by = "rn")
quant_data_qc$rn <- NULL # remove rn column
# ...


# Collapse multiphosphorylated peptides
# ---
# collapse_fn is a function name given on the command line.  The option help
#   advertises 'average', which is not the name of an R function, so map it
#   to mean(); any other value is resolved by name, as aggregate() would do.
collapse_fun <-
  if (identical(collapse_fn, "average")) mean else match.fun(collapse_fn)
quant_data_qc_collapsed <-
  aggregate(. ~ Phosphopeptide, quant_data_qc, FUN = collapse_fun)
# ...
print("quant_data_qc_collapsed")
head(quant_data_qc_collapsed)

# Compute (as string) % of phosphopeptides that are multiphosphorylated
#   (for use in next step)
# ---
# Rows removed by the collapse, as a fraction of twice the pre-collapse rows
pct_multiphos <-
  (
    nrow(quant_data_qc) - nrow(quant_data_qc_collapsed)
  ) / (2 * nrow(quant_data_qc))
pct_multiphos <- sprintf("%0.1f%s", 100 * pct_multiphos, "%")
# ...


# Compute and visualize breakdown of pY, pS, and pT before enrichment filter
# ---
# Row masks: which collapsed phosphopeptides contain each phosphoresidue
has_py <- str_detect(quant_data_qc_collapsed$Phosphopeptide, "pY")
has_ps <- str_detect(quant_data_qc_collapsed$Phosphopeptide, "pS")
has_pt <- str_detect(quant_data_qc_collapsed$Phosphopeptide, "pT")

py_data <- quant_data_qc_collapsed[has_py, ]
ps_data <- quant_data_qc_collapsed[has_ps, ]
pt_data <- quant_data_qc_collapsed[has_pt, ]

py_num <- nrow(py_data)
ps_num <- nrow(ps_data)
pt_num <- nrow(pt_data)

# Visualize enrichment
enrich_graph_data <- data.frame(group = c("pY", "pS", "pT"),
                                value = c(py_num, ps_num, pt_num))

# Residue types with no peptides get no slice
enrich_graph_data <-
  enrich_graph_data[
    enrich_graph_data$value > 0,
  ]

# Plot pie chart with legend
# start: https://stackoverflow.com/a/62522478/15509512
# refine: https://www.statology.org/ggplot-pie-chart/
# colors: https://colorbrewer2.org/#type=diverging&scheme=BrBG&n=8
slices <- enrich_graph_data$value
phosphoresidue <- enrich_graph_data$group
pct <- round(100 * slices / sum(slices))
lbls <-
  paste(enrich_graph_data$group, "\n", pct, "%\n(", slices, ")", sep = "")
# Midpoint of each slice along the stacked percentage axis:
#   (running total before the slice) + (half of the slice)
slc_ctr <- cumsum(pct) - pct / 2.0
lbl_y <- 100 - slc_ctr
df <-
  data.frame(slices,
             pct,
             lbls,
             phosphoresidue = factor(phosphoresidue, levels = phosphoresidue))
gigi <- ggplot(df, aes(x = 1, y = pct, fill = phosphoresidue)) +
  geom_col(position = "stack", orientation = "x") +
  geom_text(aes(x = 1, y = lbl_y, label = lbls), col = "black") +
  coord_polar(theta = "y", direction = -1) +
  labs(
    x = NULL,
    y = NULL,
    fill = NULL,
    title = "Percentages (and counts) of phosphosites, by type of residue",
    caption = sprintf(
      "Roughly %s of peptides have multiple phosphosites.",
      pct_multiphos
    )
  ) +
  theme_classic() +
  theme(
    legend.position = "right",
    axis.line = element_blank(),
    axis.text = element_blank(),
    axis.ticks = element_blank(),
    plot.title = element_text(hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5),
    plot.caption = element_text(hjust = 0.5),
    plot.title.position = "plot"
  ) +
  scale_fill_manual(breaks = phosphoresidue,
                    values = c("#c7eae5", "#f6e8c3", "#dfc27d"))

# Render the same plot to both the PDF and the SVG output paths
pdf(enrich_graph_filename)
print(gigi)
dev.off()
svg(enrich_graph_filename_svg)
print(gigi)
dev.off()
# ...


# Filter phosphopeptides by enrichment
# --
if (identical(enriched, "Y")) {
  quant_data_qc_enrichment <- quant_data_qc_collapsed[has_py, ]
} else if (identical(enriched, "ST")) {
  quant_data_qc_enrichment <- quant_data_qc_collapsed[has_ps | has_pt, ]
} else {
  # Fail here with the real cause; merely printing would let the script run
  # on and die below with an unrelated "object not found" error.
  stop("Error in enriched variable. Set to either 'Y' or 'ST'")
}
# ...

print("quant_data_qc_enrichment")
head(quant_data_qc_enrichment)

# Write phosphopeptides filtered by enrichment
# --
write.table(
  quant_data_qc_enrichment,
  file = output_filename,
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)
# ...
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PhosphoPeptide_Upstream_Kinase_Mapping.pl Mon Jul 11 19:22:25 2022 +0000 @@ -0,0 +1,2192 @@ +#!/usr/local/bin/perl +############################################################################################################################### +# perl Kinase_enrichment_analysis_complete_v0.pl +# +# Nick Graham, USC +# 2016-02-27 +# +# Built from scripts written by NG at UCLA in Tom Graeber's lab: +# CombinePhosphoSites.pl +# Retrieve_p_motifs.pl +# NetworKIN_Motif_Finder_v7.pl +# +# Given a list of phospho-peptides, find protein information and upstream kinases. +# Output file can be used for KS enrichment score calculations using Enrichment_Score4Directory.pl +# +# Updated 2022-01-13, Art Eschenlauer, UMN on behalf of Justin Drake's lab: +# Added warnings and used strict; +# fixed some code paths resulting in more NetworKIN matches; +# applied Aho-Corasick algorithm (via external Python script because Perl implementation was still too slow) +# to speed up "Match the non_p_peptides to the @sequences array"; +# added support for SQLite-formatted UniProtKB/Swiss-Prot data as an alternative to FASTA-formatted data; +# added support for SQLite output in addition to tabular files. 
+# +# +############################################################################################################################### + +use strict; +use warnings 'FATAL' => 'all'; + +use Getopt::Std; +use DBD::SQLite::Constants qw/:file_open/; +use DBI qw(:sql_types); +use File::Copy; +use File::Basename; +use POSIX qw(strftime); +use Time::HiRes qw(gettimeofday); +#use Data::Dump qw(dump); + +my $USE_SEARCH_PPEP_PY = 1; +#my $FAILED_MATCH_SEQ = "Failed match"; +my $FAILED_MATCH_SEQ = 'No Sequence'; +my $FAILED_MATCH_GENE_NAME = 'No_Gene_Name'; + +my $dirname = dirname(__FILE__); +my %opts; +my ($file_in, $average_or_sum, $db_out, $file_out, $file_melt, $phospho_type); +my $dbtype; +my ($fasta_in, $networkin_in, $motifs_in, $PSP_Kinase_Substrate_in, $PSP_Regulatory_Sites_in); +my (@samples, %sample_id_lut, %ppep_id_lut, %data, @tmp_data, %n); +my $line = 0; +my @failed_match = ($FAILED_MATCH_SEQ); +my @failed_matches; +my (%all_data); +my (@p_peptides, @non_p_peptides); +my @parsed_fasta; +my (@accessions, @names, @sequences, @databases, $database); +my ($dbfile, $dbh, $stmth); +my @col_names; +my (%matched_sequences, %accessions, %names, %sites, ); +my (@tmp_matches, @tmp_accessions, @tmp_names, @tmp_sites); +my (%p_residues, @tmp_p_residues, @p_sites, $left, $right, %p_motifs, @tmp_motifs_array, $tmp_motif, $tmp_site, %residues); +my (@kinases_observed, $kinases); +my (@kinases_observed_lbl, @phosphosites_observed_lbl); +my ($p_sequence_kinase, $p_sequence, $kinase); +my (@motif_sequence, @motif_description, @motif_type_key_ary, %motif_type, %motif_count); +my (@kinases_PhosphoSite, $kinases_PhosphoSite); +my ($p_sequence_kinase_PhosphoSite, $p_sequence_PhosphoSite, $kinase_PhosphoSite); +my (%regulatory_sites_PhosphoSite_hash); +my (%domain, %ON_FUNCTION, %ON_PROCESS, %ON_PROT_INTERACT, %ON_OTHER_INTERACT, %notes, %organism); +my (%unique_motifs); +my ($kinase_substrate_NetworKIN_matches, $kinase_substrate_PhosphoSite_matches); +my %psp_regsite_protein_2; +my 
(%domain_2, %ON_FUNCTION_2, %ON_PROCESS_2, %ON_PROT_INTERACT_2, %N_PROT_INTERACT, %ON_OTHER_INTERACT_2, %notes_2, %organism_2); +my @timeData; +my $PhosphoSitePlusCitation; +my (%site_description, %site_id); + +my %kinase_substrate_NetworKIN_matches; +my %kinase_motif_matches; +my $regulatory_sites_PhosphoSite; +my ($seq_plus5aa, $seq_plus7aa, %seq_plus7aa_2); +my %kinase_substrate_PhosphoSite_matches; +my @formatted_sequence; +my $pSTY_sequence; +my $i; +my @a; +my $use_sqlite; +my $verbose; + +########## +## opts ## +########## + ## input files + # i : path to input file, e.g., 'outputfile_STEP2.txt' + # f : path to UniProtKB/SwissProt FASTA + # s : optional species argument + # n : path to NetworKIN_201612_cutoffscore2.0.txt + # m : path to pSTY_Motifs.txt + # p : path to 2017-03_PSP_Kinase_Substrate_Dataset.txt + # r : path to 2017-03_PSP_Regulatory_sites.txt + ## options + # P : phospho_type + # F : function + # v : verbose output + ## output files + # o : path to output file + # O : path to "melted" output file + # D : path to output SQLite file + +sub usage() + { + print STDERR <<"EOH"; + This program given a list of phospho-peptides, finds protein information and upstream kinases. 
+ usage: $0 [-hvd] -f FASTA_file + -h : this (help) message + -v : slightly verbose + -a : use SQLite less + ## input files + -i : path to input file, e.g., 'outputfile_STEP2.txt' + -f : path to UniProtDB/SwissProt FASTA + -s : optional species filter argument for PSP records; defaults to 'human' + -n : path to NetworKIN_201612_cutoffscore2.0.txt + -m : path to pSTY_Motifs.txt + -p : path to 2017-03_PSP_Kinase_Substrate_Dataset.txt + -r : path to 2017-03_PSP_Regulatory_sites.txt + ## options + -P : phospho_type + -F : function + ## output files + -o : path to output file + -O : path to "melted" output file + -D : path to output SQLite file + example: $0 +EOH + exit; + } + +sub format_localtime_iso8601 { + # ref: https://perldoc.perl.org/Time::HiRes + my ($seconds, $microseconds) = gettimeofday; + # ref: https://pubs.opengroup.org/onlinepubs/9699919799/functions/strftime.html + return strftime("%Y-%m-%dT%H:%M:%S",localtime(time)) . sprintf(".%03d", $microseconds/1000); +} + +sub replace_pSpTpY { + my ($formatted_sequence, $phospho_type) = @_; + if ($phospho_type eq 'y') { + $formatted_sequence =~ s/pS/S/g; + $formatted_sequence =~ s/pT/T/g; + $formatted_sequence =~ s/pY/y/g; + } + elsif ($phospho_type eq "sty") { + $formatted_sequence =~ s/pS/s/g; + $formatted_sequence =~ s/pT/t/g; + $formatted_sequence =~ s/pY/y/g; + } + $formatted_sequence; +} + +sub pseudo_sed +{ + # pseudo_sed produces "UniProt_ID\tDescription\tOS\tOX\tGN\tPE\tSV" + # Comments give the sed equivalent + my ($t) = @_; + my $s = $t; + # / GN=/!{ s:\(OX=[^ \t]*\):\1 GN=N/A:; }; + unless ($s =~ m / GN=/s) + { + $s =~ s :(OX=[^ \t]*):${1} GN=N/A:s; + } + # / PE=/!{ s:\(GN=[^ \t]*\):\1 PE=N/A:; }; + unless ($s =~ m / PE=/s) + { + $s =~ s :(GN=[^ \t]*):${1} PE=N/A:s; + } + # / SV=/!{ s:\(PE=[^ \t]*\):\1 SV=N/A:; }; + unless ($s =~ m / SV=/s) + { + $s =~ s :(PE=[^ \t]*):${1} SV=N/A:s; + } + # s/^sp.//; + $s =~ s :^...::s; + # s/[|]/\t/g; + $s =~ s :[|]:\t:sg; + if ( !($s =~ m/ OX=/s) + && !($s =~ m/ 
GN=/s) + && !($s =~ m/ PE=/s) + && !($s =~ m/ SV=/s) + ) { + # OS= is used elsewhere, but it's not helpful without OX and GN + $s =~ s/OS=/Species /g; + # supply sensible default values + $s .= "\tN/A\t-1\tN/A\tN/A\tN/A"; + } else { + # s/ OS=/\t/; + if ($s =~ m/ OS=/s) { $s =~ s: OS=:\t:s; } else { $s =~ s:(.*)\t:$1\tN/A\t:x; }; + # s/ OX=/\t/; + if ($s =~ m/ OX=/s) { $s =~ s: OX=:\t:s; } else { $s =~ s:(.*)\t:$1\t-1\t:x; }; + # s/ GN=/\t/; + if ($s =~ m/ GN=/s) { $s =~ s: GN=:\t:s; } else { $s =~ s:(.*)\t:$1\tN/A\t:x; }; + # s/ PE=/\t/; + if ($s =~ m/ PE=/s) { $s =~ s: PE=:\t:s; } else { $s =~ s:(.*)\t:$1\tN/A\t:x; }; + # s/ SV=/\t/; + if ($s =~ m/ SV=/s) { $s =~ s: SV=:\t:s; } else { $s =~ s:(.*)\t:$1\tN/A\t:x; }; + } + return $s; +} # sub pseudo_sed + +getopts('i:f:s:n:m:p:r:P:F:o:O:D:hva', \%opts) ; + + +if (exists($opts{'h'})) { + usage(); +} +if (exists($opts{'a'})) { + $USE_SEARCH_PPEP_PY = 0; +} +if (exists($opts{'v'})) { + $verbose = 1; +} else { + $verbose = 0; +} +if (!exists($opts{'i'}) || !-e $opts{'i'}) { + die('Input File not found'); +} else { + $file_in = $opts{'i'}; +} +if (!exists($opts{'f'}) || !-e $opts{'f'}) { + die('FASTA not found'); +} else { + $fasta_in = $opts{'f'}; + $use_sqlite = 0; +} +my $species; +if ((!exists($opts{'s'})) || ($opts{'s'} eq '')) { + $species = 'human'; +} else { + $species = $opts{'s'}; + print "'-s' option is '$species'\n"; +} +print "species filter is '$species'\n"; + +if (!exists($opts{'n'}) || !-e $opts{'n'}) { + die('Input NetworKIN File not found'); +} else { + $networkin_in = $opts{'n'}; +} +if (!exists($opts{'m'}) || !-e $opts{'m'}) { + die('Input pSTY_Motifs File not found'); +} else { + $motifs_in = $opts{'m'}; +} +if (!exists($opts{'p'}) || !-e $opts{'p'}) { + die('Input PSP_Kinase_Substrate_Dataset File not found'); +} else { + $PSP_Kinase_Substrate_in = $opts{'p'}; +} +if (!exists($opts{'r'}) || !-e $opts{'r'}) { + die('Input PSP_Regulatory_sites File not found'); +} else { + $PSP_Regulatory_Sites_in = 
$opts{'r'}; +} +if (exists($opts{'P'})) { + $phospho_type = $opts{'P'}; +} +else { + $phospho_type = "sty"; +} +if (exists($opts{'F'})) { + $average_or_sum = $opts{'F'}; +} +else { + $average_or_sum = "sum"; +} +if (exists($opts{'D'})) { + $db_out = $opts{'D'}; +} +else { + $db_out = "db_out.sqlite"; +} +if (exists($opts{'O'})) { + $file_melt = $opts{'O'}; +} +else { + $file_melt = "output_melt.tsv"; +} +if (exists($opts{'o'})) { + $file_out = $opts{'o'}; +} +else { + $file_out = "output.tsv"; +} + + +############################################################################################################################### +# Print the relevant file names to the screen +############################################################################################################################### +# print "\nData file: $data_in\nFASTA file: $fasta_in\nSpecies: $species\nOutput file: $motifs_out\n\n"; +print "\n--- parameters:\n"; +print "Data file: $file_in\nAverage or sum identical p-sites? 
$average_or_sum\nOutput file: $file_out\nMelted map: $file_melt\n";
# Report the reference inputs.  UniProtKB comes from a FASTA file when
# $use_sqlite == 0, otherwise from a previously built SQLite file.
# (Fixed: the NetworKIN path used to print as the literal text "networkin_in".)
if ($use_sqlite == 0) {
    print "Motifs file: $motifs_in\nNetworKIN file: $networkin_in\nPhosphosite kinase substrate data: $PSP_Kinase_Substrate_in\nPhosphosite regulatory site data: $PSP_Regulatory_Sites_in\nUniProtKB/SwissProt FASTA file: $fasta_in\nOutput SQLite file: $db_out\n";
} else {
    print "Motifs file: $motifs_in\nNetworKIN file: $networkin_in\nPhosphosite kinase substrate data: $PSP_Kinase_Substrate_in\nPhosphosite regulatory site data: $PSP_Regulatory_Sites_in\nUniProtKB/SwissProt SQLIte file: $dbfile\nOutput SQLite file: $db_out\n";
}
print "...\n\n";

print "Phospho-residues(s) = $phospho_type\n\n";
if ($phospho_type ne 'y') {
    if ($phospho_type ne 'sty') {
        die "\nUsage error:\nYou must choose a phospho-type, either y or sty\n\n";
    }
}

###############################################################################
# read the input data file
# average or sum identical phospho-sites, depending on the value of $average_or_sum
###############################################################################

# Three-argument open: the path comes from the command line and must never be
# interpreted as a mode or a pipe.
open (my $in_fh, '<', $file_in) or die "I couldn't find the input file: $file_in\n";

die "\n\nScript died: You must choose either average or sum for \$average_or_sum\n\n" if (($average_or_sum ne "sum") && ($average_or_sum ne "average")) ;


$line = 0;

while (<$in_fh>) {
    chomp;
    my @x = split(/\t/);
    for my $n (0 .. $#x) {$x[$n] =~ s/\r//g; $x[$n] =~ s/\n//g; $x[$n] =~ s/\"//g;}

    # Read in the samples (header row: column 0 is the peptide column)
    if ($line == 0) {
        for my $n (1 .. $#x) {
            push (@samples, $x[$n]);
            $sample_id_lut{$x[$n]} = $n;
        }
        $line++;
    } else {
        # a blank line carries no phospho-peptide; do not create an empty key
        next unless @x;

        # check whether we have already seen a phospho-peptide
        if (exists($data{$x[0]})) {
            # Update the stored intensities in place.  The previous code
            # wrote sums to %all_data (leaving %data unchanged) and averaged
            # from the empty %all_data using $x[0], the peptide string,
            # instead of the k-th intensity.
            my $seen = $data{$x[0]};
            if ($average_or_sum eq "sum") {        # add the data
                for my $k (0 .. $#{$seen}) { $seen->[$k] = $seen->[$k] + $x[$k+1]; }
                # keep %all_data populated as before, now with the running total
                $all_data{$x[0]} = [ @{$seen} ];

            } elsif ($average_or_sum eq "average") {        # average the data
                # running mean: (old_mean * n + new_value) / (n + 1)
                for my $k (0 .. $#{$seen}) {
                    $seen->[$k] = ( $seen->[$k]*$n{$x[0]} + $x[$k+1] ) / ($n{$x[0]} + 1);
                }
                $n{$x[0]}++;
            }
        }
        # if the phospho-sequence has not been seen, save the data
        else {
            for my $k (1 .. $#x) { push(@{$data{$x[0]}}, $x[$k]); }
            $n{$x[0]} = 1;
        }
    }
}
close($in_fh);


###############################################################################
# Search the FASTA database for phospho-sites and motifs
#
# based on Retrieve_p_peptide_motifs_v2.pl
###############################################################################


###############################################################################
#
# Read in the Data file:
#    1) make @p_peptides array as in the original file
#    2) make @non_p_peptides array w/o residue modifications (p, #, other)
#
###############################################################################

foreach my $peptide (keys %data) {
    # lower-case s/t/y mark phospho-residues; spell them out as pS/pT/pY
    $peptide =~ s/s/pS/g;    $peptide =~ s/t/pT/g;    $peptide =~ s/y/pY/g;
    push (@p_peptides, $peptide);
    $peptide =~ s/p//g;
    push(@non_p_peptides, $peptide);
}

if ($use_sqlite == 0) {
    ###########################################################################
    #
    # Read in the UniProtKB/Swiss-Prot data from FASTA; save to @sequences array and SQLite output database
    #
    ###########################################################################

    # e.g.
# >sp|Q9Y3B9|RRP15_HUMAN RRP15-like protein OS=Homo sapiens OX=9606 GN=RRP15 PE=1 SV=2
    # MAAAAPDSRVSEEENLKKTPKKKMKMVTGAVASVLEDEATDTSDSEGSCGSEKDHFYSDD
    # DAIEADSEGDAEPCDKENENDGESSVGTNMGWADAMAKVLNKKTPESKPTILVKNKKLEK
    # EKEKLKQERLEKIKQRDKRLEWEMMCRVKPDVVQDKETERNLQRIATRGVVQLFNAVQKH
    # QKNVDEKVKEAGSSMRKRAKLISTVSKKDFISVLRGMDGSTNETASSRKKPKAKQTEVKS
    # EEGPGWTILRDDFMMGASMKDWDKESDGPDDSRPESASDSDT
    #   accession: Q9Y3B9
    #   name: RRP15_HUMAN RRP15-like protein OS=Homo sapiens OX=9606 GN=RRP15 PE=1 SV=2
    #   sequence: the five sequence lines above, concatenated
    #
    # e.g.
    # >gi|114939|sp|P00722.2|BGAL_ECOLI Beta-galactosidase (Lactase) cRAP
    # >gi|52001466|sp|P00366.2|DHE3_BOVIN Glutamate dehydrogenase 1, mitochondrial precursor (GDH) cRAP
    #
    # e.g.
    # >zs|P00009.24.AR-V2_1.zs|zs_peptide_0024_AR-V2_1


    # Three-argument open so the path is never parsed as a mode or pipe.
    open (my $fasta_fh, '<', $fasta_in) or die "I couldn't find $fasta_in\n";
    print "Reading FASTA file $fasta_in\n";
    # ref: https://perldoc.perl.org/perlsyn#Compound-Statements
    #   a readline used as the whole `while` condition is implicitly
    #   assigned to `$_`.
    while (<$fasta_fh>) {
        chomp;
        # ref: https://perldoc.perl.org/functions/split#split-/PATTERN/,EXPR
        #   "If only PATTERN is given, EXPR defaults to $_."
        my (@x) = split(/\|/);
        # Headers with more than three '|'-fields (e.g.
        # >gi|114939|sp|P00722.2|BGAL_ECOLI ...): keep only the last three.
        if (@x > 3) {
            @x = (">".$x[$#x - 2], $x[$#x - 1], $x[$#x]);
        }
        for my $i (0 .. $#x) {
            $x[$i] =~ s/\r//g; $x[$i] =~ s/\n//g; $x[$i] =~ s/\"//g;
        }
        # blank lines split to an empty list; nothing to parse
        next unless (exists($x[0]));
        if ($x[0] =~ /^>/) {
            # parsing header line
            $x[0] =~ s/\>//g;
            push (@databases, $x[0]);
            push (@accessions, $x[1]);
            push (@names, $x[2]);
            # format tags of standard UniProtKB headers as tab-separated values
            # pseudo_sed produces "UniProt_ID\tDescription\tOS\tOX\tGN\tPE\tSV"
            $_ = pseudo_sed(join "\t", (">".$x[0], $x[1], $x[2]));
            # append tab as separator between header and sequence
            s/$/\t/;
            # parsed_fasta gets "UniProt_ID\tDescription\tOS\tOX\tGN\tPE\tSV\t"
            push (@parsed_fasta, $_);
        } elsif ($x[0] =~ /^\w/) {
            # line is a portion of the sequence
            # A sequence line before any header has no record to extend;
            # $sequences[-1] on an empty array would be a fatal error.
            next unless (@accessions);
            if (defined $sequences[$#accessions]) {
                $sequences[$#accessions] = $sequences[$#accessions].$x[0];
            } else {
                $sequences[$#accessions] = $x[0];
            }
            $parsed_fasta[$#accessions] = $parsed_fasta[$#accessions].$x[0];
        }
    }
    close $fasta_fh;
    print "Done Reading FASTA file $fasta_in\n";
    $dbfile = $db_out;
    print "Begin writing $dbfile at " . format_localtime_iso8601() . "\n";
    $dbh = DBI->connect("dbi:SQLite:$dbfile", undef, undef)
        or die "Could not open SQLite file $dbfile: $DBI::errstr\n";
    my $auto_commit = $dbh->{AutoCommit};
    print "auto_commit was $auto_commit and is now 0\n" if ($verbose);
    # load all rows inside a single transaction
    $dbh->{AutoCommit} = 0;

    # begin DDL-to-SQLite
    # ---
    $dbh->do("
        DROP TABLE IF EXISTS UniProtKB;
        ");

    $dbh->do("
        CREATE TABLE UniProtKB (
            Uniprot_ID TEXT PRIMARY KEY ON CONFLICT IGNORE,
            Description TEXT,
            Organism_Name TEXT,
            Organism_ID INTEGER,
            Gene_Name TEXT,
            PE TEXT,
            SV TEXT,
            Sequence TEXT,
            Database TEXT
        )
        ");
    $dbh->do("
        CREATE UNIQUE INDEX idx_uniq_UniProtKB_0 on UniProtKB(Uniprot_ID);
        ");
    $dbh->do("
        CREATE INDEX idx_UniProtKB_0 on UniProtKB(Gene_Name);
        ");
    # ...
    # end DDL-to-SQLite

    # insert all rows
    # begin store-to-SQLite "UniProtKB" table
    # ---
    $stmth = $dbh->prepare("
        INSERT INTO UniProtKB (
            Uniprot_ID,
            Description,
            Organism_Name,
            Organism_ID,
            Gene_Name,
            PE,
            SV,
            Sequence,
            Database
        ) VALUES (?,?,?,?,?,?,?,?,?)
        ");
    my $row_count = 1;
    my $row_string;
    my @row;
    while ( scalar @parsed_fasta > 0 ) {
        # @databases is parallel to @parsed_fasta; read it before the pop
        $database = $databases[$#parsed_fasta];
        # row_string gets "UniProt_ID\tDescription\tOS\tOX\tGN\tPE\tSV\t"
        #                  1           2            3   4   5   6   7  sequence database
        $row_string = pop(@parsed_fasta);
        @row = (split /\t/, $row_string);
        if ((not exists($row[4])) || ($row[4] eq "")) {
            die("invalid fasta line\n$row_string\n");
        };
        if ($row[4] eq "N/A") {
            print "Organism_ID is 'N/A' for row $row_count:\n'$row_string'\n";
            $row[4] = -1;
        };
        for my $col (1..3,5..8) {
            $stmth->bind_param($col, $row[$col]);
        }
        $stmth->bind_param(9, $database);
        # Organism_ID is the only INTEGER column
        $stmth->bind_param(4, $row[4], { TYPE => SQL_INTEGER });
        if (not $stmth->execute()) {
            print "Error in row $row_count: " . $dbh->errstr . "\n";
            print "Row $row_count: $row_string\n";
            # Show tabs as '@'.  Substitute on a copy: s///g returns the
            # substitution count, which is what used to be printed here.
            (my $visible_tabs = $row_string) =~ s/\t/@/g;
            print "Row $row_count: " . $visible_tabs . "\n";
        }
        $row_count += 1;
    }
    # ...
    # end store-to-SQLite "UniProtKB" table

    print "begin commit at " . format_localtime_iso8601() . "\n";
    # Commit explicitly; restoring AutoCommit only commits when the saved
    # value was "on".
    $dbh->commit;
    $dbh->{AutoCommit} = $auto_commit;
    print "auto_commit is now $auto_commit\n" if ($verbose);
    $dbh->disconnect if ( defined $dbh );
    print "Finished writing $dbfile at " . format_localtime_iso8601() .
"\n\n";
    $dbtype = "FASTA";
}

if ($use_sqlite == 1) {
    ###########################################################################
    #
    # Read in the UniProtKB/Swiss-Prot data from SQLite; save to @sequences array
    #
    ###########################################################################

    # all later work is done on the copy, $db_out
    copy($dbfile, $db_out) or die "Copy $dbfile to $db_out failed: $!";

    # https://metacpan.org/pod/DBD::SQLite#Read-Only-Database
    $dbh = DBI->connect("dbi:SQLite:$dbfile", undef, undef, {
      sqlite_open_flags => SQLITE_OPEN_READONLY,
    }) or die "Could not open SQLite file $dbfile: $DBI::errstr\n";
    print "DB connection $dbh is to $dbfile\n";

    # Uniprot_ID, Description, Organism_Name, Organism_ID, Gene_Name, PE, SV, Sequence
    # The stored header tags are re-attached to Description unless they hold
    # the 'N/A' / -1 placeholders.
    $stmth = $dbh->prepare("
    SELECT Uniprot_ID
    , Description
      || CASE WHEN Organism_Name = 'N/A' THEN '' ELSE ' OS=' || Organism_Name END
      || CASE WHEN Organism_ID = -1 THEN '' ELSE ' OX=' || Organism_ID END
      || CASE WHEN Gene_Name = 'N/A' THEN '' ELSE ' GN=' || Gene_Name END
      || CASE WHEN PE = 'N/A' THEN '' ELSE ' PE=' || PE END
      || CASE WHEN SV = 'N/A' THEN '' ELSE ' SV=' || SV END
      AS Description
    , Sequence
    , Database
    FROM
      UniProtKB
    ");
    $stmth->execute();
    @col_names = @{$stmth->{NAME}};
    print "\nColumn names selected from UniProtKB SQLite table: " . join(", ", @col_names) . "\n\n" if ($verbose);
    while (my @row = $stmth->fetchrow_array) {
        push (@names,      $row[1]);     # redacted Description
        push (@accessions, $row[0]);     # Uniprot_ID
        $sequences[$#accessions] = $row[2];  # Sequence
        push (@databases,  $row[3]);     # Database (should be 'sp')
    }

    $dbh->disconnect if ( defined $dbh );

    print "Done Reading UniProtKB/Swiss-Prot file $dbfile\n\n";
    $dbtype = "SQLite";
}

# scalar(@accessions) is the count; $#accessions (the last index) under-reported by one
print scalar(@accessions) . " accessions were read from the UniProtKB/Swiss-Prot $dbtype file\n";

######################
# Insert the placeholder protein used for failed matches.  Write it to $db_out,
# the database that is searched below.  In FASTA mode $dbfile is $db_out; in
# SQLite mode $dbfile is the input file, which should not be modified.
$dbh = DBI->connect("dbi:SQLite:$db_out", undef, undef)
    or die "Could not open SQLite file $db_out: $DBI::errstr\n";
# the two variable values are bound as parameters, not spliced into the SQL
$stmth = $dbh->prepare("
    INSERT INTO UniProtKB (
        Uniprot_ID,
        Description,
        Organism_Name,
        Organism_ID,
        Gene_Name,
        PE,
        SV,
        Sequence,
        Database
    ) VALUES (
      'No Uniprot_ID',
      'NO_GENE_SYMBOL No Description',
      'No Organism_Name',
      0,
      ?,
      '0',
      '0',
      ?,
      'No Database'
    )
    ");
if (not $stmth->execute($FAILED_MATCH_GENE_NAME, $FAILED_MATCH_SEQ)) {
    # method calls do not interpolate inside a string; concatenate instead
    print "Error inserting dummy row into UniProtKB: " . $stmth->errstr . "\n";
}
$dbh->disconnect if ( defined $dbh );
######################

@timeData = localtime(time);
print "\n--- Start search at " . format_localtime_iso8601() ."\n";

print "  --> Calling 'search_ppep' script\n\n";
# List form of system() bypasses the shell, so paths containing spaces or
# shell metacharacters are passed through intact.
my @search_cmd = ("python", "$dirname/search_ppep.py", "-u", $db_out, "-p", $file_in);
push(@search_cmd, "--verbose") if ($verbose);
$i = system(@search_cmd);
if ($i) {
    print "python $dirname/search_ppep.py -u $db_out -p $file_in\n  exited with exit code $i\n";
    die "Search failed for phosphopeptides in SwissProt/SQLite file.";
}
print "  <-- Returned from 'search_ppep' script\n";

@timeData = localtime(time);
print "... Finished search at " .
format_localtime_iso8601() ."\n\n";


###############################################################################
#
# Match the non_p_peptides to the @sequences array:
#    1) Format the motifs +/- 10 residues around the phospho-site
#    2) Print the original data plus the phospho-motif to the output file
#
###############################################################################


print "--- Match the non_p_peptides to the \@sequences array:\n";

if ($USE_SEARCH_PPEP_PY) {
    print "Find the matching protein sequence(s) for the peptide using SQLite\n";
} else {
    print "Find the matching protein sequence(s) for the peptide using slow search\n";
}

# https://metacpan.org/pod/DBD::SQLite#Read-Only-Database
$dbh = DBI->connect("dbi:SQLite:$db_out", undef, undef, {
  sqlite_open_flags => SQLITE_OPEN_READONLY,
}) or die "Could not open SQLite file $db_out: $DBI::errstr\n";
print "DB connection $dbh is to $db_out\n";

# CREATE VIEW uniprotid_pep_ppep AS
#   SELECT   deppep_UniProtKB.UniprotKB_ID       AS accession
#          , deppep.seq                          AS peptide
#          , ppep.seq                            AS phosphopeptide
#          , UniProtKB.Sequence                  AS sequence
#          , UniProtKB.Description               AS description
#   FROM     ppep, deppep, deppep_UniProtKB, UniProtKB
#   WHERE    deppep.id = ppep.deppep_id
#   AND      deppep.id = deppep_UniProtKB.deppep_id
#   AND      deppep_UniProtKB.UniprotKB_ID = UniProtKB.Uniprot_ID
#   ORDER BY UniprotKB_ID, deppep.seq, ppep.seq;

# Pass 1: number of view rows per phosphopeptide
my %ppep_to_count_lut;
print "start select peptide counts " . format_localtime_iso8601() . "\n";
my $uniprotkb_pep_ppep_view_stmth = $dbh->prepare("
    SELECT DISTINCT
      phosphopeptide
    , count(*) as i
    FROM
      uniprotkb_pep_ppep_view
    GROUP BY
      phosphopeptide
    ORDER BY
      phosphopeptide
");
if (not $uniprotkb_pep_ppep_view_stmth->execute()) {
    # method calls do not interpolate inside a string; concatenate instead
    die "Error fetching peptide counts: " . $uniprotkb_pep_ppep_view_stmth->errstr . "\n";
}
while (my @row = $uniprotkb_pep_ppep_view_stmth->fetchrow_array) {
    $ppep_to_count_lut{$row[0]} = $row[1];
}

# accession, peptide, sequence, description, phosphopeptide, long_description, pos_start, pos_end, scrubbed, ppep_id
# 0          1        2         3            4               5                 6          7        8         9
my $COL_ACCESSION        = 0;
my $COL_PEPTIDE          = 1;
my $COL_SEQUENCE         = 2;
my $COL_DESCRIPTION      = 3;
my $COL_PHOSPHOPEPTIDE   = 4;
my $COL_LONG_DESCRIPTION = 5;
my $COL_POS_START        = 6;
my $COL_POS_END          = 7;
my $COL_SCRUBBED         = 8;
my $COL_PPEP_ID          = 9;

# Pass 2: every view row, ordered so that rows of one phosphopeptide are adjacent
my %ppep_to_row_lut;
print "start select all records without qualification " . format_localtime_iso8601() . "\n";
$uniprotkb_pep_ppep_view_stmth = $dbh->prepare("
    SELECT DISTINCT
      accession
    , peptide
    , sequence
    , description
    , phosphopeptide
    , long_description
    , pos_start
    , pos_end
    , scrubbed
    , ppep_id
    FROM
      uniprotkb_pep_ppep_view
    ORDER BY
      phosphopeptide
");
if (not $uniprotkb_pep_ppep_view_stmth->execute()) {
    die "Error fetching all records without qualification: " . $uniprotkb_pep_ppep_view_stmth->errstr . "\n";
}
my $current_ppep;
my $counter = 0;
my $former_ppep = "";
@tmp_matches = ();
@tmp_accessions = ();
@tmp_names = ();
@tmp_sites = ();
while (my @row = $uniprotkb_pep_ppep_view_stmth->fetchrow_array) {
    # Identify phosphopeptide for current row;
    #   it is an error for it to change when the counter is not zero.
    $current_ppep = $row[$COL_PHOSPHOPEPTIDE];

    # when counter is zero, prepare for a new phosphopeptide
    if ($current_ppep ne $former_ppep) {
        die "counter is $counter instead of zero" if ($counter != 0);
        $ppep_id_lut{$current_ppep} = $row[$COL_PPEP_ID];
        print "next phosphpepetide: $current_ppep; id: $ppep_id_lut{$current_ppep}\n" if ($verbose);
        # rows still expected for this phosphopeptide (from pass 1)
        $counter = $ppep_to_count_lut{$current_ppep};
        @tmp_matches = ();
        @tmp_accessions = ();
        @tmp_names = ();
        @tmp_sites = ();
    }

    if ($USE_SEARCH_PPEP_PY) {
        push(@tmp_matches,    $row[ $COL_SEQUENCE         ]);
        push(@tmp_accessions, $row[ $COL_ACCESSION        ]);
        push(@tmp_names,      $row[ $COL_LONG_DESCRIPTION ]);
        push(@tmp_sites,      $row[ $COL_POS_START        ]);
    }

    # Prepare counter and phosphopeptide tracker for next row
    $former_ppep = $current_ppep;
    $counter -= 1;

    # Set trackers for later use after last instance of current phosphopeptide
    if ($counter == 0) {
        if ($USE_SEARCH_PPEP_PY) {
            $matched_sequences{$current_ppep} = [ @tmp_matches    ];
            $accessions{       $current_ppep} = [ @tmp_accessions ];
            $names{            $current_ppep} = [ @tmp_names      ];
            $sites{            $current_ppep} = [ @tmp_sites      ];
        }
    }
}


print "end select all records without qualification " . format_localtime_iso8601() . "\n";

for my $j (0 .. $#p_peptides) {

    # With $USE_SEARCH_PPEP_PY set, the matches were already loaded from
    # SQLite above and the slow search below is skipped.
    my $site;
    my $match = 0;
    @tmp_matches = ();
    @tmp_accessions = ();
    @tmp_names = ();
    @tmp_sites = ();

    #Find the matching protein sequence(s) for the peptide using slow search
    $site = -1;
    unless ($USE_SEARCH_PPEP_PY) {
        for my $k (0 ..
$#sequences) {
            $site = index($sequences[$k], $non_p_peptides[$j]);
            if ($site != -1) {
                push(@tmp_matches, $sequences[$k]);
                push(@tmp_accessions, $accessions[$k]);
                push(@tmp_names, $names[$k]);
                push(@tmp_sites, $site);
                # Count hits only.  This used to be incremented for every
                # sequence, so the failed-match branch below never ran.
                $match++;
            }
            $site = -1;
        }
        if ($match == 0) {    # Check to see if no match was found.  Skip to next if no match found.
            print "Warning:  Failed match for $p_peptides[$j]\n";
            $matched_sequences{$p_peptides[$j]} = \@failed_match;
            push(@failed_matches,$p_peptides[$j]);
            next;
        } else {
            $matched_sequences{$p_peptides[$j]} = [ @tmp_matches ];
            $accessions{$p_peptides[$j]} = [ @tmp_accessions ];
            $names{$p_peptides[$j]} = [ @tmp_names ];
            $sites{$p_peptides[$j]} = [ @tmp_sites ];
        }
    }

} # end for my $j (0 .. $#p_peptides)

print "... Finished match the non_p_peptides at " . format_localtime_iso8601() ."\n\n";

print "--- Match the p_peptides to the \@sequences array:\n";

# Membership lookup for failed peptides.  grep($peptide, @failed_matches)
# tested the truth of the peptide string for each element, so a single failed
# match caused every peptide to be skipped.
my %is_failed_match = map { $_ => 1 } @failed_matches;

for my $peptide_to_match ( keys %matched_sequences ) {
    if ($is_failed_match{$peptide_to_match}) {
        print "Failed to match peptide $peptide_to_match\n";
        next;
    }
    my @matches = @{$matched_sequences{$peptide_to_match}};
    @tmp_motifs_array = ();
    for my $i (0 .. $#matches) {

        # Find the location of the phospo-site in the sequence(s)
        $tmp_site = 0; my $offset = 0;
        my $tmp_p_peptide = $peptide_to_match;
        $tmp_p_peptide =~ s/#//g; $tmp_p_peptide =~ s/\d//g; $tmp_p_peptide =~ s/\_//g; $tmp_p_peptide =~ s/\.//g;

        # Find all phosphorylated residues in the p_peptide.  Each 'p' is
        # removed once found, so the recorded index is the position of the
        # residue in the de-phosphorylated peptide.
        @p_sites = ();
        while ($tmp_site != -1) {
            $tmp_site = index($tmp_p_peptide, 'p', $offset);
            if ($tmp_site != -1) {push (@p_sites, $tmp_site);}
            $offset = $tmp_site + 1;
            $tmp_p_peptide =~ s/p//;
        }
        @tmp_p_residues = ();
        for my $l (0 .. $#p_sites) {
            next if not defined $sites{$peptide_to_match}[$i];

            # position of this phospho-residue within the protein sequence
            push (@tmp_p_residues, $p_sites[$l] + $sites{$peptide_to_match}[$i]);

            # Match the sequences around the phospho residues to find the motifs
            my ($desired_residues_L, $desired_residues_R);
            if ($tmp_p_residues[0] - 10 < 0) {    #check to see if there are fewer than 10 residues left of the first p-site
                # eg, XXXpYXX want $desired_residues_L = 3, $p_residues[0] = 3
                $desired_residues_L = $tmp_p_residues[0];
            }
            else {
                $desired_residues_L = 10;
            }
            my $seq_length = length($matched_sequences{$peptide_to_match}[$i]);
            # ">=": residues right of the last p-site = $seq_length - last - 1,
            # which is below 10 when last + 10 >= $seq_length.  With ">" a site
            # with exactly 9 residues to its right asked for 10 and failed the
            # length check below, so its motif was dropped.
            if ($tmp_p_residues[$#tmp_p_residues] + 10 >= $seq_length) {    #check to see if there are fewer than 10 residues right of the last p-site
                $desired_residues_R = $seq_length - ($tmp_p_residues[$#tmp_p_residues] + 1);
                # eg, XXXpYXX want $desired_residues_R = 2, $seq_length = 6, $p_residues[$#p_residues] = 3
            }
            else {
                $desired_residues_R = 10;
            }

            my $total_length = $desired_residues_L + $tmp_p_residues[$#tmp_p_residues] - $tmp_p_residues[0] + $desired_residues_R + 1;
            my $arg2 = $tmp_p_residues[0] - $desired_residues_L;
            my $arg1 = $matched_sequences{$peptide_to_match}[$i];

            # only extract when the window lies inside the sequence
            if (($total_length > 0) && (length($arg1) > $arg2 + $total_length - 1)) {
                $tmp_motif = substr($arg1, $arg2, $total_length);

                # Put the "p" back in front of the appropriate phospho-residue(s).
                my (@tmp_residues, $tmp_position);
                for my $m (0 .. $#p_sites) {
                    # offset of the m-th p-site within the motif window
                    $tmp_position = $desired_residues_L + $p_sites[$m] - $p_sites[0];
                    if ($tmp_position < length($tmp_motif) + 1) {
                        push (@tmp_residues, substr($tmp_motif, $tmp_position, 1));
                        # mark the residue in lower case for the rewrite below
                        if ($tmp_residues[$m] eq "S") {substr($tmp_motif, $tmp_position, 1, "s");}
                        if ($tmp_residues[$m] eq "T") {substr($tmp_motif, $tmp_position, 1, "t");}
                        if ($tmp_residues[$m] eq "Y") {substr($tmp_motif, $tmp_position, 1, "y");}
                    }
                }

                $tmp_motif =~ s/s/pS/g; $tmp_motif =~ s/t/pT/g; $tmp_motif =~ s/y/pY/g;

                # Frame the motif with its 1-based start and end positions
                my $left_residue = $tmp_p_residues[0] - $desired_residues_L+1;
                my $right_residue = $tmp_p_residues[$#tmp_p_residues] + $desired_residues_R+1;
                $tmp_motif = $left_residue."-[ ".$tmp_motif." ]-".$right_residue;
                push(@tmp_motifs_array, $tmp_motif);
                $residues{$peptide_to_match}{$i} = [ @tmp_residues ];
                $p_residues{$peptide_to_match}{$i} = [ @tmp_p_residues ];
            }
        }
        # NOTE(review): a motif is pushed once per p-site iteration above, so a
        # multiply-phosphorylated peptide yields several entries per match.
        # The original comment already flagged the bracket placement as
        # suspect; confirm against the consumers of %p_motifs before changing.
        $p_motifs{$peptide_to_match} = [ @tmp_motifs_array ];
    }  # end for my $i (0 .. $#matches)
}

print "... Finished match the p_peptides to the \@sequences array at " .
format_localtime_iso8601() ."\n\n";

###############################################################################
#
# Annotate the peptides with the NetworKIN predictions and HPRD / Phosida kinase motifs
#
###############################################################################


print "--- Reading various site data:\n";

###############################################################################
#
# Read the NetworKIN_predictions file:
#    1) make a "kinases_observed" array
#    2) annotate the phospho-substrates with the appropriate kinase
#
###############################################################################
my $SITE_KINASE_SUBSTRATE = 1;
$site_description{$SITE_KINASE_SUBSTRATE} = "NetworKIN";

# Three-argument open with a lexical handle: the path is user-supplied and
# must not be parsed as a mode or pipe.
open (my $networkin_fh, '<', $networkin_in) or die "I couldn't find $networkin_in\n";
print "Reading the NetworKIN data:  $networkin_in\n";
while (<$networkin_fh>) {
    chomp;
    my (@x) = split(/\t/);
    for my $i (0 .. $#x) {
        $x[$i] =~ s/\r//g; $x[$i] =~ s/\n//g; $x[$i] =~ s/\"//g;
    }
    # Columns 2 and 10 are needed below.  Blank or truncated lines used to add
    # an undef kinase and a bare "_" key to the lookups.
    next unless (@x > 10);
    next if ($x[0] eq "#substrate");
    unless (exists ($kinases -> {$x[2]})) {
        $kinases -> {$x[2]} = $x[2];
        push (@kinases_observed, $x[2]);
    }
    my $tmp = $x[10]."_".$x[2];    #eg, REEILsEMKKV_PKCalpha
    unless (exists($p_sequence_kinase -> {$tmp})) {
        $p_sequence_kinase -> {$tmp} = $tmp;
    }
}
close $networkin_fh;

###############################################################################
#
# Read the Kinase motifs file:
#    1) make a "motif_sequence" array
#
###############################################################################

# file format (tab separated):
#   x[0] = quasi-primary key (character), e.g., '17' or '23a'
#   x[1] = pattern (egrep pattern), e.g., '(M|I|L|V|F|Y).R..(pS|pT)'
#   x[2] = description, e.g., 'PKA_Phosida' or '14-3-3 domain binding motif (HPRD)' or 'Akt kinase substrate motif (HPRD & Phosida)'
# "counter"	"pcre"	"symbol"	"description"	"pubmed_id"	"classification"	"source"
# "1"	"R.R..(pS|pT)(F|L)"	"PKB_group"	"Akt kinase"	"https://pubmed.ncbi.nlm.nih.gov/?term=8985174"	"kinase substrate"	"HPRD"
#   x[3] = old description, i.e., description in Amanchy (HPRD) and Phosida tables
#   x[4] = pubmed id
#   x[5] = classification
#   x[6] = source (Phosida or HPRD)
my $SITE_HPRD = 2;
$site_description{$SITE_HPRD} = "HPRD";
$site_id{$site_description{$SITE_HPRD}} = $SITE_HPRD;
my $SITE_PHOSIDA = 4;
$site_description{$SITE_PHOSIDA} = "Phosida";
$site_id{$site_description{$SITE_PHOSIDA}} = $SITE_PHOSIDA;

open (my $motifs_fh, '<', $motifs_in) or die "I couldn't find $motifs_in\n";
print "Reading the Motifs file:  $motifs_in\n";

while (<$motifs_fh>) {
    chomp;
    my (@x) = split(/\t/);
    # columns 0..2 are required; skip blank or truncated lines
    next unless (@x > 2);
    my $tmp_motif_description;
    if ($#x == 6) { # a seven-column line has last index 6
        # remove double-quotes which are helpful or necessary for Excel
        $x[6] =~ s/\"//g;
        $tmp_motif_description = $x[6];
    } else {
        $tmp_motif_description = "motif";
    }
    for my $i (0 .. 2) {
        # remove any embedded CR or LF (none should exist)
        $x[$i] =~ s/\r//g;
        $x[$i] =~ s/\n//g;
        # remove double-quotes which are helpful or necessary for Excel
        $x[$i] =~ s/\"//g;
    }
    if (exists ($motif_type{$x[2]})) {
        #ACE-2022.06.20 $motif_type{$x[1]} = $motif_type{$x[1]}." & ".$x[2];
        $motif_type{$x[2]} = $motif_type{$x[2]}."|".$x[2];
    } else {
        $motif_type{$x[2]} = $x[2];
        $motif_count{$x[1]} = 0;
        push (@motif_sequence, $x[1]);
        push (@motif_description, $tmp_motif_description);
        push (@motif_type_key_ary, $x[2]);
    }
}
close ($motifs_fh);


###############################################################################
#  6.28.2011
# Read PSP_Kinase_Substrate data:
#    1) make a "kinases_PhosphoSite" array
#    2) annotate the phospho-substrates with the appropriate kinase
#
#  Columns:
#     (0) GENE          (1) KINASE        (2) KIN_ACC_ID    (3) KIN_ORGANISM
#     (4) SUBSTRATE     (5) SUB_GENE_ID   (6) SUB_ACC_ID    (7) SUB_GENE
#     (8) SUB_ORGANISM  (9) SUB_MOD_RSD   (10) SITE_GRP_ID  (11) SITE_+/-7_AA
#     (12) DOMAIN       (13) IN_VIVO_RXN  (14) IN_VITRO_RXN (15) CST_CAT#
###############################################################################

my $SITE_PHOSPHOSITE = 3;
$site_description{$SITE_PHOSPHOSITE} = "PhosphoSite";


$line = 0;

open (my $psp_ks_fh, '<', $PSP_Kinase_Substrate_in) or die "I couldn't find $PSP_Kinase_Substrate_in\n";
print "Reading the PhosphoSite Kinase-Substrate data:  $PSP_Kinase_Substrate_in\n";

while (<$psp_ks_fh>) {
    chomp;
    my (@x) = split(/\t/);
    for my $i (0 .. $#x) {
        $x[$i] =~ s/\r//g; $x[$i] =~ s/\n//g; $x[$i] =~ s/\"//g;
    }
    # Skip the first line.  Rows need columns 0..11; both kinase and substrate
    # must belong to the selected species.
    if (($line != 0) && (@x > 11) && ($species eq $x[3]) && ($species eq $x[8])) {
        unless (exists ($kinases_PhosphoSite -> {$x[0]})) {
            $kinases_PhosphoSite -> {$x[0]} = $x[0];
            push (@kinases_PhosphoSite, $x[0]);
        }
        # Replace the superfluous lower case s, t and y: upper-case every one
        # except at index 7, the centre of the +/-7 window.
        for my $pos (0 .. length($x[11]) - 1) {
            next if ($pos == 7);
            substr($x[11], $pos, 1) =~ tr/sty/STY/;
        }
        my $tmp = $x[11]."_".$x[0];        #eg, RTPGRPLsSYGMDSR_PAK2
        unless (exists($p_sequence_kinase_PhosphoSite -> {$tmp})) {
            $p_sequence_kinase_PhosphoSite -> {$tmp} = $tmp;
        }
    }
    $line++;
}
close $psp_ks_fh;


###############################################################################
# Read PhosphoSite regulatory site data:
#    1) make a "regulatory_sites_PhosphoSite" hash
#
#  Columns:
#     (0) GENE
#     (2) PROT_TYPE
#     (3) ACC_ID
#     (4) GENE_ID
#     (5) HU_CHR_LOC
#     (6) ORGANISM           --> %organism
#     (7) MOD_RSD
#     (8) SITE_GRP_ID
#     (9) SITE_+/-7_AA       --> %regulatory_sites_PhosphoSite_hash
#     (10) DOMAIN            --> %domain
#     (11) ON_FUNCTION       --> %ON_FUNCTION
#     (12) ON_PROCESS        --> %ON_PROCESS
#     (13) ON_PROT_INTERACT  --> %ON_PROT_INTERACT
#     (14) ON_OTHER_INTERACT --> %ON_OTHER_INTERACT
#     (15) PMIDs
#     (16) LT_LIT
#     (17) MS_LIT
#     (18) MS_CST
#     (19) NOTES             --> %notes
###############################################################################


$dbh = DBI->connect("dbi:SQLite:$db_out", undef, undef)
    or die "Could not open SQLite file $db_out: $DBI::errstr\n";
my $auto_commit = $dbh->{AutoCommit};
$dbh->{AutoCommit} = 0;
print "DB connection $dbh is to $db_out, opened for modification\n";

# add partial PSP_Regulatory_site table (if not exists) regardless of whether SwissProt input was FASTA or SQLite
$dbh->do("
CREATE TABLE IF NOT EXISTS PSP_Regulatory_site (
  SITE_PLUSMINUS_7AA TEXT PRIMARY KEY ON CONFLICT IGNORE,
  DOMAIN             TEXT,
  ON_FUNCTION        TEXT,
  ON_PROCESS         TEXT,
  ON_PROT_INTERACT   TEXT,
  ON_OTHER_INTERACT  TEXT,
  NOTES              TEXT,
  ORGANISM           TEXT,
  PROTEIN            TEXT
)
");

# add partial PSP_Regulatory_site LUT (if not exists) regardless of whether SwissProt input was FASTA or SQLite
$dbh->do("
CREATE TABLE IF NOT EXISTS ppep_regsite_LUT
( ppep_id            INTEGER REFERENCES ppep(id)
, site_plusminus_7AA TEXT    REFERENCES PSP_Regulatory_site(site_plusminus_7AA)
, PRIMARY KEY (ppep_id, site_plusminus_7AA) ON CONFLICT IGNORE
);
");

# $stmth = $dbh->prepare("
# CREATE UNIQUE INDEX idx_PSP_Regulatory_site_0
#   ON PSP_Regulatory_site(site_plusminus_7AA);
# ");
# $stmth->execute();


# add Citation table (if not exists) regardless of whether SwissProt input was FASTA or SQLite
my $citation_sql;
$citation_sql = "
CREATE TABLE IF NOT EXISTS Citation (
  ObjectName   TEXT REFERENCES sqlite_schema(name) ON DELETE CASCADE,
  CitationData TEXT,
  PRIMARY KEY (ObjectName, CitationData) ON CONFLICT IGNORE
)
";
$stmth = $dbh->prepare($citation_sql);
$stmth->execute();


# Three-argument open: the path is user-supplied.
open (my $psp_rs_fh, '<', $PSP_Regulatory_Sites_in) or die "I couldn't find $PSP_Regulatory_Sites_in\n";
print "Reading the PhosphoSite regulatory site data:  $PSP_Regulatory_Sites_in\n";

# Merge one annotation value into $hash->{$key} using " / " as separator.
# Empty or missing values are ignored.  With $skip_if_present set, a value
# already contained in the stored text is not appended again.
my $merge_annotation = sub {
    my ($hash, $key, $value, $skip_if_present) = @_;
    return if (!defined $value || $value eq "");
    if (!defined $hash->{$key} || $hash->{$key} eq "") {
        $hash->{$key} = $value;
    }
    # index() rather than a regex: the value may contain regex metacharacters
    elsif (!$skip_if_present || index($hash->{$key}, $value) < 0) {
        $hash->{$key} = $hash->{$key}." / ".$value;
    }
};

# Set once the "GENE" header row has been seen.  It was declared inside the
# loop, so it was reset on every line and the warning below never printed.
my $found_GENE=0;

$line = -1;
while (<$psp_rs_fh>) {
    $line++;
    chomp;
    # the licence/citation banner precedes the table; echo it and move on
    if ($_ =~ m/PhosphoSitePlus/) {
        $PhosphoSitePlusCitation = $_;
        $PhosphoSitePlusCitation =~ s/\t//g;
        $PhosphoSitePlusCitation =~ s/\r//g;
        $PhosphoSitePlusCitation =~ s/\n//g;
        $PhosphoSitePlusCitation =~ s/""/"/g;
        $PhosphoSitePlusCitation =~ s/^"//g;
        $PhosphoSitePlusCitation =~ s/"$//g;
        print "$PhosphoSitePlusCitation\n";
        next;
    }
    my (@x) = split(/\t/);
    for my $i (0 .. $#x) {
        $x[$i] =~ s/\r//g; $x[$i] =~ s/\n//g; $x[$i] =~ s/\"//g;
    }
    if ( (not exists($x[0])) ) {
        next;
    }
    elsif ( ($x[0] eq "GENE") ) {
        $found_GENE=1;
        next;
    }
    if ( (not exists($x[9])) || ($x[9] eq "") ) {
        if (exists($x[8]) && (not $x[8] eq "")) {
            die "$PSP_Regulatory_Sites_in line $line has no SITE_+/-7_AA: $_\n";
        } else {
            if ( (not exists($x[1])) || (not $x[1] eq "") ) {
                print "$PSP_Regulatory_Sites_in line $line (".length($_)." characters) has no SITE_+/-7_AA: $_\n"
                  if $found_GENE==1;
            }
            next;
        }
    }
    elsif ($line != 0) {
        my $site_7aa = $x[9];
        if ($species ne $x[6]) {
            # Do nothing - this record was filtered out by the species filter
        }
        elsif (!exists($regulatory_sites_PhosphoSite_hash{$site_7aa})) {
            # first record for this site: store every annotation as-is
            if (!defined $domain{$site_7aa} || $domain{$site_7aa} eq "") {
                $regulatory_sites_PhosphoSite_hash{$site_7aa} = $site_7aa;
                $domain{$site_7aa} = $x[10];
                $ON_FUNCTION{$site_7aa} = $x[11];
                $ON_PROCESS{$site_7aa} = $x[12];
                $ON_PROT_INTERACT{$site_7aa} = $x[13];
                $ON_OTHER_INTERACT{$site_7aa} = $x[14];
                $notes{$site_7aa} = $x[19];
                $organism{$site_7aa} = $x[6];
            }
        }
        else {
            # Repeated site: merge each annotation from its own column.  The
            # previous code used column 10 (DOMAIN) for every annotation and
            # assigned $hash{$x[10]} (a lookup) instead of the field value.
            $merge_annotation->(\%domain,            $site_7aa, $x[10], 1);
            $merge_annotation->(\%ON_FUNCTION,       $site_7aa, $x[11], 0);
            $merge_annotation->(\%ON_PROCESS,        $site_7aa, $x[12], 0);
            $merge_annotation->(\%ON_PROT_INTERACT,  $site_7aa, $x[13], 0);
            $merge_annotation->(\%ON_OTHER_INTERACT, $site_7aa, $x[14], 0);
            $merge_annotation->(\%notes,             $site_7aa, $x[19], 0);
            # every record reaching here passed the species filter, so the
            # organism is only filled in when it is still empty
            $merge_annotation->(\%organism,          $site_7aa, $x[6],  1);
        }
    }
}
close $psp_rs_fh;

print "... Finished reading various site data at " . format_localtime_iso8601() ."\n\n";

$stmth = $dbh->prepare("
INSERT INTO Citation (
  ObjectName,
  CitationData
) VALUES (?,?)
");

# Insert one (ObjectName, CitationData) row into the Citation table.
#   $cit_table - name of the table or view that the citation applies to
#   $cit_text  - text of the citation
#   $cit_label - short label for the data source; used only in the error message
# A failed insert is reported with print but is not fatal.
sub add_citation {
    my ($cit_table, $cit_text, $cit_label) = @_;
    $stmth->bind_param(1, $cit_table);
    $stmth->bind_param(2, $cit_text);
    if (not $stmth->execute()) {
        # method calls are not interpolated inside double-quoted strings, so
        # errstr has to be concatenated to show the actual DBI error text
        print "Error writing $cit_label cit for table $cit_table: " . $stmth->errstr . "\n";
    }
}

# Combine an accumulated annotation with an additional one:
#   - when nothing has been accumulated yet, take the addition as-is;
#   - when the addition is empty, keep what was accumulated;
#   - otherwise join the two with " / ".
# Undefined values are treated like empty strings (without emitting warnings).
sub _merge_regsite_annotation {
    my ($accumulated, $addition) = @_;
    return $addition    if (!defined($accumulated) || $accumulated eq "");
    return $accumulated if (!defined($addition)    || $addition eq "");
    return $accumulated . " / " . $addition;
}

my ($citation_text, $citation_table);

# PSP regulatory or kinase/substrate site
# (labels follow the table: PSP_Regulatory_site for the regulatory-site table,
#  PSP_Kinase_Substrate for psp_gene_site and its view)
$citation_text = 'PhosphoSitePlus(R) (PSP) was created by Cell Signaling Technology Inc. It is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License. When using PSP data or analyses in printed publications or in online resources, the following acknowledgements must be included: (a) the words "PhosphoSitePlus(R), www.phosphosite.org" must be included at appropriate places in the text or webpage, and (b) the following citation must be included in the bibliography: "Hornbeck PV, Zhang B, Murray B, Kornhauser JM, Latham V, Skrzypek E PhosphoSitePlus, 2014: mutations, PTMs and recalibrations. Nucleic Acids Res. 2015 43:D512-20. PMID: 25514926."';
$citation_table = "PSP_Regulatory_site";
add_citation($citation_table, $citation_text, "PSP_Regulatory_site");
$citation_table = "psp_gene_site";
add_citation($citation_table, $citation_text, "PSP_Kinase_Substrate");
$citation_table = "psp_gene_site_view";
add_citation($citation_table, $citation_text, "PSP_Kinase_Substrate");
$citation_text = 'Hornbeck, 2014, "PhosphoSitePlus, 2014: mutations, PTMs and recalibrations.", https://pubmed.ncbi.nlm.nih.gov/22135298, https://doi.org/10.1093/nar/gkr1122';
$citation_table = "PSP_Regulatory_site";
add_citation($citation_table, $citation_text, "PSP_Regulatory_site");
$citation_table = "psp_gene_site";
add_citation($citation_table, $citation_text, "PSP_Kinase_Substrate");
$citation_table = "psp_gene_site_view";
add_citation($citation_table, $citation_text, "PSP_Kinase_Substrate");

# NetworKIN site
$citation_text = 'Linding, 2007, "Systematic discovery of in vivo phosphorylation networks.", https://pubmed.ncbi.nlm.nih.gov/17570479, https://doi.org/10.1016/j.cell.2007.05.052';
$citation_table = "psp_gene_site";
add_citation($citation_table, $citation_text, "NetworkKIN");
$citation_table = "psp_gene_site_view";
add_citation($citation_table, $citation_text, "NetworkKIN");
$citation_text = 'Horn, 2014, "KinomeXplorer: an integrated platform for kinome biology studies.", https://pubmed.ncbi.nlm.nih.gov/24874572, https://doi.org/10.1038/nmeth.296';
$citation_table = "psp_gene_site";
add_citation($citation_table, $citation_text, "NetworkKIN");
$citation_table = "psp_gene_site_view";
add_citation($citation_table, $citation_text, "NetworkKIN");
$citation_text = 'Aken, 2016, "The Ensembl gene annotation system.", https://pubmed.ncbi.nlm.nih.gov/33137190, https://doi.org/10.1093/database/baw093';
$citation_table = "psp_gene_site";
add_citation($citation_table, $citation_text, "NetworkKIN");
$citation_table = "psp_gene_site_view";
add_citation($citation_table, $citation_text, "NetworkKIN");

# pSTY motifs
$citation_text = 'Amanchy, 2007, "A curated compendium of phosphorylation motifs.", https://pubmed.ncbi.nlm.nih.gov/17344875, https://doi.org/10.1038/nbt0307-285';
$citation_table = "psp_gene_site";
add_citation($citation_table, $citation_text, "Amanchy_pSTY_motifs");
$citation_table = "psp_gene_site_view";
add_citation($citation_table, $citation_text, "Amanchy_pSTY_motifs");
$citation_text = 'Gnad, 2011, "PHOSIDA 2011: the posttranslational modification database.", https://pubmed.ncbi.nlm.nih.gov/21081558, https://doi.org/10.1093/nar/gkq1159';
$citation_table = "psp_gene_site";
add_citation($citation_table, $citation_text, "Phosida_pSTY_motifs");
$citation_table = "psp_gene_site_view";
add_citation($citation_table, $citation_text, "Phosida_pSTY_motifs");


###############################################################################################################################
#
#  Read the data file:
#    1) find sequences that match the NetworKIN predictions
#    2) find motifs that match the observed sequences
#
###############################################################################################################################

print "--- Find sequences that match the NetworKIN predictions and find motifs that match observed sequences\n";

# statement that records (ppep_id, +/-7AA site sequence) pairs
my $ppep_regsite_LUT_stmth;
$ppep_regsite_LUT_stmth = $dbh->prepare("
    INSERT INTO ppep_regsite_LUT (
        ppep_id,
        site_plusminus_7AA
    ) VALUES (?,?)
");

my ($start_seconds, $start_microseconds) = gettimeofday;

foreach my $peptide (keys %data) {
    # find the unique phospho-motifs for this $peptide
    my @all_motifs = ();
    my $have_all_motifs = 0;
    for my $i (0 .. $#{ $matched_sequences{$peptide} } ) {
        my $tmp_motif = $p_motifs{$peptide}[$i];
        push(@all_motifs, $tmp_motif);
        $have_all_motifs = 1;
    }
    if ($have_all_motifs == 1) {
        # strip the residue numbers and brackets that surround each motif
        for my $j (0 .. $#all_motifs) {
            if (defined $all_motifs[$j]) {
                $all_motifs[$j] =~ s/\d+-\[\s//;
                $all_motifs[$j] =~ s/\s\]\-\d+//;
            }
        }
    }
    my %seen = ();
    if ($have_all_motifs == 1) {
        # keep the first occurrence of each distinct motif, in order
        foreach my $motif (@all_motifs) {
            if (defined $motif) {
                if (exists($seen{$motif})) {
                    next;
                } else {
                    push(@{$unique_motifs{$peptide}}, $motif);
                    $seen{$motif} = 1;
                }
            }
            print "push(\@{\$unique_motifs{$peptide}}, $motif);\n" if ($verbose);
        }
    }

    # count the number of phospo-sites in the motif
    # (only the first unique motif is examined)
    my $number_pY = 0;
    my $number_pSTY = 0;
    if ($phospho_type eq 'y') {
        if (defined(${$unique_motifs{$peptide}}[0])) {
            while (${$unique_motifs{$peptide}}[0] =~ /pY/g) {
                $number_pY++;
            }
        }
    }
    if ($phospho_type eq 'sty') {
        print "looking for unique_motifs for $peptide\n" if ($verbose);
        if (defined(${$unique_motifs{$peptide}}[0])) {
            while (${$unique_motifs{$peptide}}[0] =~ /(pS|pT|pY)/g) {
                $number_pSTY++;
                print "We have found $number_pSTY unique_motifs for $peptide\n" if ($verbose);
            }
        }
    }


    # search each of the unique motifs for matches
    print "searching $#{$unique_motifs{$peptide}} motifs for peptide $peptide\n" if ($verbose);
    for my $i (0 .. $#{$unique_motifs{$peptide}}) {
        print "\$i = $i; peptide = $peptide; unique_motif = ${$unique_motifs{$peptide}}[$i]\n" if ($verbose);
        my $tmp_motif = ${$unique_motifs{$peptide}}[$i];
        print " --- matching unique motif $tmp_motif for peptide $peptide at " . format_localtime_iso8601() ."\n" if ($verbose);
        my $formatted_sequence;
        if (($number_pY == 1) || ($number_pSTY == 1)) {
            my $seq_plus5aa = "";
            my $seq_plus7aa = "";
            $formatted_sequence = &replace_pSpTpY($tmp_motif, $phospho_type);
            print " a #pY $number_pY; #pSTY $number_pSTY; matching formatted motif $formatted_sequence for peptide $peptide at " . format_localtime_iso8601() ."\n" if ($verbose);
            # extract the phospho-residue with up to 5 (resp. 7) flanking
            # residues on each side; undef when the pattern does not match
            if ($phospho_type eq 'y') {
                $seq_plus5aa = (split(/(\w{0,5}y\w{0,5})/, $formatted_sequence))[1];
                $seq_plus7aa = (split(/(\w{0,7}y\w{0,7})/, $formatted_sequence))[1];
            }
            elsif ($phospho_type eq "sty") {
                $seq_plus5aa = (split(/(\w{0,5}(s|t|y)\w{0,5})/, $formatted_sequence))[1];
                $seq_plus7aa = (split(/(\w{0,7}(s|t|y)\w{0,7})/, $formatted_sequence))[1];
            }

            if (defined $seq_plus7aa) {
                # commit the 7aa LUT records
                $ppep_regsite_LUT_stmth->bind_param( 1, $ppep_id_lut{$peptide} );
                $ppep_regsite_LUT_stmth->bind_param( 2, $seq_plus7aa );
                if (not $ppep_regsite_LUT_stmth->execute()) {
                    print "Error writing tuple ($ppep_id_lut{$peptide},$seq_plus7aa) for peptide $peptide to ppep_regsite_LUT: " . $ppep_regsite_LUT_stmth->errstr . "\n";
                }
            }
            for my $i (0 .. $#kinases_observed) {
                if (defined $seq_plus5aa) {
                    my $tmp = $seq_plus5aa."_".$kinases_observed[$i];  #eg, should be PGRPLsSYGMD_PKCalpha
                    if (exists($p_sequence_kinase -> {$tmp})) {
                        $kinase_substrate_NetworKIN_matches{$peptide}{$kinases_observed[$i]} = "X"; #ACE
                    }
                }
            }
            for my $i (0 .. $#motif_sequence) {
                print "matching $motif_sequence[$i]" if ($verbose);
                if ($peptide =~ /$motif_sequence[$i]/) {
                    $kinase_motif_matches{$peptide}{$motif_type{$motif_type_key_ary[$i]}} = "X";
                }
            }
            for my $i (0 .. $#kinases_PhosphoSite) {
                if (defined $seq_plus7aa) {
                    my $tmp = $seq_plus7aa."_".$kinases_PhosphoSite[$i];  #eg, should be RTPGRPLsSYGMDSR_PAK2
                    if (exists($p_sequence_kinase_PhosphoSite -> {$tmp})) {
                        $kinase_substrate_PhosphoSite_matches{$peptide}{$kinases_PhosphoSite[$i]} = "X";
                    }
                }
            }
            # copy the regulatory-site annotations for this +/-7AA sequence
            if (defined $seq_plus7aa and exists($regulatory_sites_PhosphoSite_hash{$seq_plus7aa})) {
                $seq_plus7aa_2{$peptide} = $seq_plus7aa;
                $domain_2{$peptide} = $domain{$seq_plus7aa};
                $ON_FUNCTION_2{$peptide} = $ON_FUNCTION{$seq_plus7aa};
                $ON_PROCESS_2{$peptide} = $ON_PROCESS{$seq_plus7aa};
                $ON_PROT_INTERACT_2{$peptide} = $ON_PROT_INTERACT{$seq_plus7aa};
                $ON_OTHER_INTERACT_2{$peptide} = $ON_OTHER_INTERACT{$seq_plus7aa};
                $notes_2{$peptide} = $notes{$seq_plus7aa};
                $organism_2{$peptide} = $organism{$seq_plus7aa};
            }
        }
        elsif (($number_pY > 1) || ($number_pSTY > 1)) {  #eg, if $x[4] is 1308-[ VIYFQAIEEVpYpYDHLRSAAKKR ]-1329 and $number_pY == 2
            $formatted_sequence = $tmp_motif;
            # NOTE(review): these are not the lexicals declared in the
            # single-site branch above (those are out of scope here);
            # presumably they are declared earlier in the file - confirm.
            $seq_plus5aa = "";
            $seq_plus7aa = "";
            #Create the sequences with only one phosphorylation site
            #eg, 1308-[ VIYFQAIEEVpYpYDHLRSAAKKR ]-1329, which becomes 1308-[ VIYFQAIEEVpYYDHLRSAAKKR ]-1329 and 1308-[ VIYFQAIEEVYpYDHLRSAAKKR ]-1329

            # collect the offset of every "p" marker in the motif
            my (@sites, $offset, $next_p_site);
            $sites[0] = index($tmp_motif, "p");
            $offset = $sites[0] + 1;
            $next_p_site = 0;
            while ($next_p_site != -1) {
                $next_p_site = index($tmp_motif, "p", $offset);
                if ($next_p_site != -1) {
                    push (@sites, $next_p_site);
                }
                $offset = $next_p_site+1;
            }

            # for each site n, build a copy of the motif that keeps only the
            # n-th "p"; the others are removed from right to left so that the
            # remaining offsets stay valid
            my @pSTY_sequences;
            for my $n (0 .. $#sites) {
                $pSTY_sequences[$n] = $tmp_motif;
                for (my $m = $#sites; $m >= 0; $m--) {
                    if ($m != $n) {substr($pSTY_sequences[$n], $sites[$m], 1) = "";}
                }
            }

            my @formatted_sequences;
            for my $k (0 .. $#sites) {
                $formatted_sequences[$k] = &replace_pSpTpY($pSTY_sequences[$k], $phospho_type);
            }

            # NOTE(review): unlike the single-site branch, no ppep_regsite_LUT
            # record is written here - confirm that this is intended.
            for my $k (0 .. $#formatted_sequences) {
                print " b #pY $number_pY; #pSTY $number_pSTY; matching formatted motif $formatted_sequences[$k] for peptide $peptide at " . format_localtime_iso8601() ."\n" if ($verbose);
                if ($phospho_type eq 'y') {
                    $seq_plus5aa = (split(/(\w{0,5}y\w{0,5})/, $formatted_sequences[$k]))[1];
                    $seq_plus7aa = (split(/(\w{0,7}y\w{0,7})/, $formatted_sequences[$k]))[1];
                }
                elsif ($phospho_type eq "sty") {
                    $seq_plus5aa = (split(/(\w{0,5}(s|t|y)\w{0,5})/, $formatted_sequences[$k]))[1];
                    $seq_plus7aa = (split(/(\w{0,7}(s|t|y)\w{0,7})/, $formatted_sequences[$k]))[1];
                }
                for my $i (0 .. $#kinases_observed) {
                    # same guard as the single-site branch: split yields undef
                    # when the pattern does not match
                    if (defined $seq_plus5aa) {
                        my $tmp = $seq_plus5aa."_".$kinases_observed[$i];  #eg, should look like REEILsEMKKV_PKCalpha
                        if (exists($p_sequence_kinase -> {$tmp})) {
                            $kinase_substrate_NetworKIN_matches{$peptide}{$kinases_observed[$i]} = "X";
                        }
                    }
                }
                $pSTY_sequence = $formatted_sequences[$k];
                for my $i (0 .. $#motif_sequence) {
                    if ($pSTY_sequence =~ /$motif_sequence[$i]/) {
                        $kinase_motif_matches{$peptide}{$motif_type{$motif_type_key_ary[$i]}} = "X";
                    }
                }
                for my $i (0 .. $#kinases_PhosphoSite) {
                    if (defined $seq_plus7aa) {
                        my $tmp = $seq_plus7aa."_".$kinases_PhosphoSite[$i];  #eg, should be RTPGRPLsSYGMDSR_PAK2
                        #print "seq_plus7aa._.kinases_PhosphoSite[i] is $tmp";
                        if (exists($p_sequence_kinase_PhosphoSite -> {$tmp})) {
                            $kinase_substrate_PhosphoSite_matches{$peptide}{$kinases_PhosphoSite[$i]} = "X";
                        }
                    }
                }
                # Use the same lookup as the single-site branch: the annotation
                # hashes read below are keyed like
                # %regulatory_sites_PhosphoSite_hash.
                if (defined $seq_plus7aa and exists($regulatory_sites_PhosphoSite_hash{$seq_plus7aa})) {
                    $seq_plus7aa_2{$peptide} = $seq_plus7aa;

                    # accumulate the annotations of every site, " / "-separated
                    $domain_2{$peptide}            = _merge_regsite_annotation($domain_2{$peptide},            $domain{$seq_plus7aa});
                    $ON_FUNCTION_2{$peptide}       = _merge_regsite_annotation($ON_FUNCTION_2{$peptide},       $ON_FUNCTION{$seq_plus7aa});
                    $ON_PROCESS_2{$peptide}        = _merge_regsite_annotation($ON_PROCESS_2{$peptide},        $ON_PROCESS{$seq_plus7aa});
                    $ON_PROT_INTERACT_2{$peptide}  = _merge_regsite_annotation($ON_PROT_INTERACT_2{$peptide},  $ON_PROT_INTERACT{$seq_plus7aa});
                    $ON_OTHER_INTERACT_2{$peptide} = _merge_regsite_annotation($ON_OTHER_INTERACT_2{$peptide}, $ON_OTHER_INTERACT{$seq_plus7aa});

                    # NOTE(review): the original code merged notes and organism
                    # like the fields above and then immediately overwrote the
                    # result, so the value of the last matching site wins; that
                    # effective behavior is preserved here - confirm intent.
                    $notes_2{$peptide} = $notes{$seq_plus7aa};
                    $organism_2{$peptide} = $organism{$seq_plus7aa};
                }
            } # for my $k (0 .. $#formatted_sequences)
        } # if/else number of phosphosites
    } # for each motif i # for my $i (0 .. $#{$unique_motifs{$peptide}})
} # for each $peptide

my ($end_seconds, $end_microseconds) = gettimeofday;

my $delta_seconds = $end_seconds - $start_seconds;
my $delta_microseconds = $end_microseconds - $start_microseconds;
$delta_microseconds += 1000000 * $delta_seconds;
my $key_count = keys(%data);
# avoid a fatal division by zero when there are no phosphopeptides
print sprintf("Average search time is %d microseconds per phopshopeptide\n", ($key_count > 0 ? $delta_microseconds / $key_count : 0));

($start_seconds, $start_microseconds) = gettimeofday;

print "Writing PSP_Regulatory_site records\n";

my $psp_regulatory_site_stmth = $dbh->prepare("
    INSERT INTO PSP_Regulatory_site (
        DOMAIN,
        ON_FUNCTION,
        ON_PROCESS,
        ON_PROT_INTERACT,
        ON_OTHER_INTERACT,
        NOTES,
        SITE_PLUSMINUS_7AA,
        ORGANISM
    ) VALUES (?,?,?,?,?,?,?,?)
    ");

# write one row per peptide that has a non-empty domain annotation
foreach my $peptide (keys %data) {
    if (exists($domain_2{$peptide}) and (defined $domain_2{$peptide}) and (not $domain_2{$peptide} eq "") ) {
        $psp_regulatory_site_stmth->bind_param(1, $domain_2{$peptide});
        $psp_regulatory_site_stmth->bind_param(2, $ON_FUNCTION_2{$peptide});
        $psp_regulatory_site_stmth->bind_param(3, $ON_PROCESS_2{$peptide});
        $psp_regulatory_site_stmth->bind_param(4, $ON_PROT_INTERACT_2{$peptide});
        $psp_regulatory_site_stmth->bind_param(5, $ON_OTHER_INTERACT_2{$peptide});
        $psp_regulatory_site_stmth->bind_param(6, $notes_2{$peptide});
        $psp_regulatory_site_stmth->bind_param(7, $seq_plus7aa_2{$peptide});
        $psp_regulatory_site_stmth->bind_param(8, $organism_2{$peptide});
        if (not $psp_regulatory_site_stmth->execute()) {
            print "Error writing PSP_Regulatory_site for one regulatory site with peptide '$peptide' (domain '$domain_2{$peptide}'): " . $psp_regulatory_site_stmth->errstr . "\n";
        }
    } elsif (exists($domain_2{$peptide}) and (not defined $domain_2{$peptide})) {
        print "\$domain_2{$peptide} is undefined\n"; #ACE
    }
}

$dbh->{AutoCommit} = $auto_commit;
# auto_commit implicitly
# finishes psp_regulatory_site_stmth, apparently # $psp_regulatory_site_stmth->finish;
$dbh->disconnect if ( defined $dbh );


# report how long writing the PSP_Regulatory_site records took
($end_seconds, $end_microseconds) = gettimeofday;

$delta_seconds = $end_seconds - $start_seconds;
$delta_microseconds = $end_microseconds - $start_microseconds;
$delta_microseconds += 1000000 * $delta_seconds;
$key_count = keys(%data);
print sprintf("Write time is %d microseconds\n", ($delta_microseconds));

print "... Finished find sequences that match the NetworKIN predictions and find motifs that match observed sequences at " . format_localtime_iso8601() ."\n\n";

###############################################################################################################################
#
# Print to the output file
#
###############################################################################################################################


# include the OS error ($!) so that a failure to open is diagnosable
open (OUT, ">$file_out") || die "could not open the fileout: $file_out: $!";
open (MELT, ">$file_melt") || die "could not open the fileout: $file_melt: $!";

# print the header info
print MELT "phospho_peptide\tgene_names\tsite_type\tkinase_map\n";
print OUT "p-peptide\tProtein description\tGene name(s)\tFASTA name\tPhospho-sites\tUnique phospho-motifs, no residue numbers\tAccessions\tPhospho-motifs for all members of protein group with residue numbers\t";

# print the PhosphoSite regulatory data
print OUT "Domain\tON_FUNCTION\tON_PROCESS\tON_PROT_INTERACT\tON_OTHER_INTERACT\tPhosphoSite notes\t";

# print the sample names
for my $i (0 .. $#samples) { print OUT "$samples[$i]\t"; }

# print the kinases and groups
for my $i (0 .. $#kinases_observed) {
    my $temp = $kinases_observed[$i]."_NetworKIN";
    print OUT "$temp\t";
    push(@kinases_observed_lbl, $temp);
}
my @motif_type_keys = keys %motif_type;
# One header cell per motif-type key, starting at index 0: the data rows
# emit one cell for every index 0 .. $#motif_type_keys, so starting at 1
# left the header one column short of the data.
for my $i (0 .. $#motif_type_keys) {
    print OUT "$motif_type{$motif_type_keys[$i]}\t";
}
# the last PhosphoSite kinase column terminates the header line
for my $i (0 .. $#kinases_PhosphoSite) {
    my $temp = $kinases_PhosphoSite[$i]; # ."_PhosphoSite";
    if ($i < $#kinases_PhosphoSite) { print OUT "$temp\t"; }
    if ($i == $#kinases_PhosphoSite) { print OUT "$temp\n"; }
    push(@phosphosites_observed_lbl, $temp);
}

# begin DDL-to-SQLite
# ---
# fail with the DBI error rather than with an obscure "undefined value"
# error on the next line when the connection cannot be opened
$dbh = DBI->connect("dbi:SQLite:$db_out", undef, undef)
    or die "could not connect to SQLite database $db_out: $DBI::errstr\n";
# remember the AutoCommit setting so that it can be restored later;
# it is switched off while the rows are inserted
$auto_commit = $dbh->{AutoCommit};
$dbh->{AutoCommit} = 0;
print "DB connection $dbh is to $db_out, opened for modification\n";

my $sample_stmth;
$sample_stmth = $dbh->prepare("
    INSERT INTO sample (
        id,
        name
    ) VALUES (?,?)
");

my $ppep_intensity_stmth;
$ppep_intensity_stmth = $dbh->prepare("
    INSERT INTO ppep_intensity (
        ppep_id,
        sample_id,
        intensity
    ) VALUES (?,?,?)
");

my $site_type_stmth;
$site_type_stmth = $dbh->prepare("
    insert into site_type (
        id,
        type_name
    ) values (?,?)
");

my $ppep_gene_site_stmth;
$ppep_gene_site_stmth = $dbh->prepare("
    insert into ppep_gene_site (
        ppep_id,
        gene_names,
        kinase_map,
        site_type_id
    ) values (?,?,?,?)
");

my $ppep_metadata_stmth;
$ppep_metadata_stmth = $dbh->prepare("
    INSERT INTO ppep_metadata
      ( ppep_id
      , protein_description
      , gene_name
      , FASTA_name
      , phospho_sites
      , motifs_unique
      , accessions
      , motifs_all_members
      , domain
      , ON_FUNCTION
      , ON_PROCESS
      , ON_PROT_INTERACT
      , ON_OTHER_INTERACT
      , notes
      ) VALUES (
        ?,?,?,?,?,?,?
      , ?,?,?,?,?,?,?
      )
");
# end DDL-to-SQLite
# ...

# begin store-to-SQLite "sample" table
# ---
# %sample_id_lut maps name -> ID
for my $sample_name (keys %sample_id_lut) {
    $sample_stmth->bind_param( 2, $sample_name );
    $sample_stmth->bind_param( 1, $sample_id_lut{$sample_name} );
    if (not $sample_stmth->execute()) {
        # errstr is a method call and is not interpolated inside a string
        print "Error writing tuple ($sample_name,$sample_id_lut{$sample_name}): " . $sample_stmth->errstr . "\n";
    }
}
# end store-to-SQLite "sample" table
# ...

# begin store-to-SQLite "site_type" table
# ---
# Insert one (id, type_name) row into the site_type table; dies on failure.
#   $site_type_id        - value for site_type.id
#   $site_type_type_name - value for site_type.type_name
sub add_site_type {
    my ($site_type_id, $site_type_type_name) = @_;
    $site_type_stmth->bind_param( 2, $site_type_type_name );
    $site_type_stmth->bind_param( 1, $site_type_id );
    if (not $site_type_stmth->execute()) {
        # errstr is a method call and is not interpolated inside a string
        die "Error writing tuple ($site_type_id,$site_type_type_name): " . $site_type_stmth->errstr . "\n";
    }
}
add_site_type($SITE_KINASE_SUBSTRATE, $site_description{$SITE_KINASE_SUBSTRATE});
add_site_type($SITE_HPRD            , $site_description{$SITE_HPRD            });
add_site_type($SITE_PHOSIDA         , $site_description{$SITE_PHOSIDA         });
add_site_type($SITE_PHOSPHOSITE     , $site_description{$SITE_PHOSPHOSITE     });
# end store-to-SQLite "site_type" table
# ...

# Write one row per phosphopeptide to OUT (wide table), the corresponding
# "melted" rows to MELT, and the matching records to the SQLite database.
foreach my $peptide (sort(keys %data)) {
    # Skip peptides that are listed in @failed_matches.  The block form of
    # grep compares each element; the expression form `grep($peptide, ...)`
    # was true for every element and so skipped every peptide whenever
    # @failed_matches was non-empty.
    next if (grep { $_ eq $peptide } @failed_matches);
    my $ppep_id = $ppep_id_lut{$peptide};
    my @ppep_metadata = ();
    my @ppep_intensity = ();
    my @gene = ();
    my $gene_names;
    my $j;
    # Print the peptide itself
    # column 1: p-peptide
    print OUT "$peptide\t";
    push (@ppep_metadata, $ppep_id);
    push (@ppep_intensity, $peptide);

    my $verbose_cond = 0; # $peptide eq 'AAAAAAAGDpSDpSWDADAFSVEDPVR' || $peptide eq 'KKGGpSpSDEGPEPEAEEpSDLDSGSVHSASGRPDGPVR';
    # skip over failed matches
    print "\nfirst match for '$peptide' is '$matched_sequences{$peptide}[0]' and FAILED_MATCH_SEQ is '$FAILED_MATCH_SEQ'\n" if $verbose_cond;
    if ($matched_sequences{$peptide}[0] eq $FAILED_MATCH_SEQ) {
        # column 2: Protein description
        # column 3: Gene name(s)
        # column 4: FASTA name
        # column 5: phospho-residues
        # Column 6: UNIQUE phospho-motifs
        # Column 7: accessions
        # Column 8: ALL motifs with residue numbers
        #                     2                   3   4   5   6   7   8
        print OUT "Sequence not found in FASTA database\tNA\tNA\tNA\tNA\tNA\tNA\t";
        print "No match found for '$peptide' in sequence database\n";
        # NOTE(review): single quotes store the literal text
        # '$FAILED_MATCH_GENE_NAME'; presumably a variable of that name was
        # meant to be interpolated - confirm that it exists before changing.
        $gene_names = '$FAILED_MATCH_GENE_NAME';
    } else {
        my @description = ();
        my %seen = ();
        # Print just the protein description
        # (the words between the first word of the name and " OS")
        for $i (0 .. $#{$names{$peptide}}) {
            my $long_name = $names{$peptide}[$i];
            my @naming_parts = split(/\sOS/, $long_name);
            my @front_half = split(/\s/, $naming_parts[0]);
            push(@description, join(" ", @front_half[1..($#front_half)]));
        }
        # column 2: Protein description
        print OUT join(" /// ", @description), "\t";
        push (@ppep_metadata, join(" /// ", @description));

        # Print just the gene name
        # (the token following "GN="), keeping first occurrences only
        for $i (0 .. $#{$names{$peptide}}) {
            my $tmp_gene = $names{$peptide}[$i];
            $tmp_gene =~ s/^.*GN=//;
            $tmp_gene =~ s/\s.*//;
            if (!exists($seen{$tmp_gene})) {
                push(@gene, $tmp_gene);
                $seen{$tmp_gene} = $tmp_gene;
            }
        }
        # column 3: Gene name(s)
        $gene_names = join(" /// ", @gene);
        print OUT $gene_names, "\t";
        push (@ppep_metadata, join(" /// ", @gene));

        # column 4: FASTA name
        print OUT join(" /// ", @{$names{$peptide}}), "\t";
        push (@ppep_metadata, join(" /// ", @{$names{$peptide}}));

        # column 5: phospho-residues
        my $tmp_for_insert = "";
        my $foobar;
        for my $i (0 .. $#{ $matched_sequences{$peptide} } ) {
            print "match $i for '$peptide' is '$matched_sequences{$peptide}[$i]'\n" if $verbose_cond;
            if ($i < $#{ $matched_sequences{$peptide} }) {
                # every match but the last: sites are ", "-separated and the
                # match is terminated with " /// "
                if (defined $p_residues{$peptide}{$i}) {
                    @tmp_p_residues = @{$p_residues{$peptide}{$i}};
                    for $j (0 .. $#tmp_p_residues) {
                        if ($j < $#tmp_p_residues) {
                            my $tmp_site_for_printing = $p_residues{$peptide}{$i}[$j] + 1;   # added 12.05.2012 for Justin's data
                            print OUT "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing, ";
                            $tmp_for_insert .= "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing, ";
                        }
                        elsif ($j == $#tmp_p_residues) {
                            my $tmp_site_for_printing = $p_residues{$peptide}{$i}[$j] + 1;   # added 12.05.2012 for Justin's data
                            print OUT "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing /// ";
                            $tmp_for_insert .= "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing /// ";
                        }
                    }
                }
            }
            elsif ($i == $#{ $matched_sequences{$peptide} }) {
                # last match: the final site terminates the column with a tab
                my $there_were_sites = 0;
                if (defined $p_residues{$peptide}{$i}) {
                    @tmp_p_residues = @{$p_residues{$peptide}{$i}};
                    # NOTE(review): "> 0" means a last match having exactly one
                    # site prints nothing here, and OUT gets no "p" prefix in
                    # this branch, unlike the branch above - confirm intent.
                    if ($#tmp_p_residues > 0) {
                        for my $j (0 .. $#tmp_p_residues) {
                            if ($j < $#tmp_p_residues) {
                                if (defined $p_residues{$peptide}{$i}[$j]) {
                                    my $tmp_site_for_printing = $p_residues{$peptide}{$i}[$j] + 1;   # added 12.05.2012 for Justin's data
                                    $foobar = $residues{$peptide}{$i}[$j];
                                    if (defined $foobar) {
                                        print OUT "$foobar";
                                        print OUT "$tmp_site_for_printing, ";
                                        $tmp_for_insert .= "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing, ";
                                        $there_were_sites = 1;
                                    }
                                }
                            }
                            elsif ($j == $#tmp_p_residues) {
                                if (defined $p_residues{$peptide}{$i}[$j]) {
                                    $foobar = $residues{$peptide}{$i}[$j];
                                    if (defined $foobar) {
                                        my $tmp_site_for_printing = $p_residues{$peptide}{$i}[$j] + 1;   # added 12.05.2012 for Justin's data
                                        print OUT "$foobar";
                                        print OUT "$tmp_site_for_printing\t";
                                        $tmp_for_insert .= "p$residues{$peptide}{$i}[$j]$tmp_site_for_printing";
                                        $there_were_sites = 1;
                                    }
                                }
                            }
                        }
                    }
                }
                if (0 == $there_were_sites) {
                    print OUT "\t";
                }
            }
        }
        print "tmp_for_insert '$tmp_for_insert' for '$peptide'\n" if $verbose_cond;
        push (@ppep_metadata, $tmp_for_insert);

        # Column 6: UNIQUE phospho-motifs
        print OUT join(" /// ", @{$unique_motifs{$peptide}}), "\t";
        push (@ppep_metadata, join(" /// ", @{$unique_motifs{$peptide}}));

        # Column 7: accessions
        if (defined $accessions{$peptide}) {
            print OUT join(" /// ", @{$accessions{$peptide}}), "\t";
            push (@ppep_metadata, join(" /// ", @{$accessions{$peptide}}));
        } else {
            print OUT "\t";
            push (@ppep_metadata, "");
        }

        # Column 8: ALL motifs with residue numbers
        if (defined $p_motifs{$peptide}) {
            print OUT join(" /// ", @{$p_motifs{$peptide}}), "\t";
            push (@ppep_metadata, join(" /// ", @{$p_motifs{$peptide}}));
        } else {
            print OUT "\t";
            push (@ppep_metadata, "");
        }

    }

    # Print the PhosphoSite regulatory data
    # (the values are copied first so that looking them up does not create
    #  entries in the annotation hashes; undefined values become "")
    my @regsite_values = (
        $domain_2{$peptide},
        $ON_FUNCTION_2{$peptide},
        $ON_PROCESS_2{$peptide},
        $ON_PROT_INTERACT_2{$peptide},
        $ON_OTHER_INTERACT_2{$peptide},
        $notes_2{$peptide},
    );
    for my $regsite_value (@regsite_values) {
        $regsite_value = "" if (not defined $regsite_value);
        print OUT "$regsite_value\t";
        push (@ppep_metadata, $regsite_value);
    }

    # begin store-to-SQLite "ppep_metadata" table
    # ---
    # placeholders that have no corresponding element are bound to undef
    for $i (1..14) {
        $ppep_metadata_stmth->bind_param($i, $ppep_metadata[$i-1]);
    }
    if (not $ppep_metadata_stmth->execute()) {
        # identify the row by its peptide; indexing @ppep_metadata with the
        # loop variable after the loop did not name the phosphopeptide
        print "Error writing ppep_metadata row for phosphopeptide $peptide: " . $ppep_metadata_stmth->errstr . "\n";
    }
    # ...
    # end store-to-SQLite "ppep_metadata" table

    # Print the data
    @tmp_data = ();
    foreach (@{$data{$peptide}}) {
        push(@tmp_data, $_);
    }
    print OUT join("\t", @tmp_data), "\t";

    # begin store-to-SQLite "ppep_intensity" table
    # ---
    # commit the sample intensities
    # (the n-th intensity is paired with the n-th name in @samples)
    $i = 0;
    foreach (@{$data{$peptide}}) {
        my $intense = $_;
        $ppep_intensity_stmth->bind_param( 1, $ppep_id );
        $ppep_intensity_stmth->bind_param( 2, $sample_id_lut{$samples[$i]} );
        $ppep_intensity_stmth->bind_param( 3, $intense );
        if (not $ppep_intensity_stmth->execute()) {
            print "Error writing tuple ($peptide,$samples[$i],$intense): " . $ppep_intensity_stmth->errstr . "\n";
        }
        $i += 1;
    }
    # ...
    # end store-to-SQLite "ppep_intensity" table

    # print the kinase-substrate data
    for my $i (0 .. $#kinases_observed) {
        if (exists($kinase_substrate_NetworKIN_matches{$peptide}{$kinases_observed[$i]})) {
            print OUT "X\t";
            my $NetworKIN_label = $kinases_observed[$i];  #."_NetworKIN";
            print MELT "$peptide\t$gene_names\t$site_description{$SITE_KINASE_SUBSTRATE}\t$NetworKIN_label\n";
            # begin store-to-SQLite "ppep_gene_site" table
            # ---
            $ppep_gene_site_stmth->bind_param(1, $ppep_id);                # ppep_gene_site.ppep_id
            $ppep_gene_site_stmth->bind_param(2, $gene_names);             # ppep_gene_site.gene_names
            $ppep_gene_site_stmth->bind_param(3, $NetworKIN_label);        # ppep_gene_site.kinase_map
            $ppep_gene_site_stmth->bind_param(4, $SITE_KINASE_SUBSTRATE);  # ppep_gene_site.site_type_id
            if (not $ppep_gene_site_stmth->execute()) {
                print "Error writing tuple ($peptide,$gene_names,$kinases_observed[$i]): " . $ppep_gene_site_stmth->errstr . "\n";
            }
            # ...
            # end store-to-SQLite "ppep_gene_site" table
        }
        else { print OUT "\t";}
    }
    my %wrote_motif;
    my $motif_parts_0;
    my @motif_split;
    my $one_motif;

    # NOTE(review): the matching code stores %kinase_motif_matches under
    # $motif_type{...} values, whereas the lookup here uses the keys of
    # %motif_type - verify that the two agree.
    for my $i (0 .. $#motif_type_keys) {
        if (exists($kinase_motif_matches{$peptide}{$motif_type_keys[$i]})) {
            print OUT "X\t";
            #ACE-2022.06.20 $motif_parts_0 = $motif_type{$motif_sequence[$i]}." ".$motif_sequence[$i];
            $motif_parts_0 = $motif_type{$motif_type_keys[$i]};
            # a motif type may name several "|"-separated motifs
            @motif_split = split("[|]", $motif_parts_0);
            #ACE-2022.06.20 my $key = "$peptide\t$gene_names\t$motif_parts_0";
            for my $j (0 .. $#motif_split) {
                $one_motif = $motif_split[$j];
                #ACE-2022.06.20 my $key = "$peptide\t$gene_names\t$motif_parts_0";
                my $key = "$peptide\t$gene_names\t$one_motif";
                # write each (peptide, gene names, motif) combination only once
                if (!exists($wrote_motif{$key})) {
                    $wrote_motif{$key} = $key;
                    print MELT "$peptide\t$gene_names\t$motif_description[$i]\t$one_motif\n";
                    # print "Line 657: i is $i\t$kinase_motif_matches{$peptide}{$motif_sequence[$i]}\n"; #debug
                    # begin store-to-SQLite "ppep_gene_site" table
                    # ---
                    $ppep_gene_site_stmth->bind_param(1, $ppep_id);                          # ppep_gene_site.ppep_id
                    $ppep_gene_site_stmth->bind_param(2, $gene_names);                       # ppep_gene_site.gene_names
                    $ppep_gene_site_stmth->bind_param(3, $one_motif);                        # ppep_gene_site.kinase_map
                    $ppep_gene_site_stmth->bind_param(4, $site_id{$motif_description[$i]});  # ppep_gene_site.site_type_id
                    if (not $ppep_gene_site_stmth->execute()) {
                        print "Error writing tuple ($peptide,$gene_names,$one_motif): " . $ppep_gene_site_stmth->errstr . "\n";
                    }
                    # ...
                    # end store-to-SQLite "ppep_gene_site" table
                }
            }
        }
        else { print OUT "\t";}
    }
    # the last PhosphoSite kinase column terminates the output row
    for my $i (0 .. $#kinases_PhosphoSite) {
        if (exists($kinase_substrate_PhosphoSite_matches{$peptide}{$kinases_PhosphoSite[$i]})) {
            print MELT "$peptide\t$gene_names\t$site_description{$SITE_PHOSPHOSITE}\t$phosphosites_observed_lbl[$i]\n";
            if ($i < $#kinases_PhosphoSite) {
                print OUT "X\t";
            }
            else {
                print OUT "X\n";
            }
            # begin store-to-SQLite "ppep_gene_site" table
            # ---
            $ppep_gene_site_stmth->bind_param(1, $ppep_id);                        # ppep_gene_site.ppep_id
            $ppep_gene_site_stmth->bind_param(2, $gene_names);                     # ppep_gene_site.gene_names
            $ppep_gene_site_stmth->bind_param(3, $phosphosites_observed_lbl[$i]);  # ppep_gene_site.kinase_map
            $ppep_gene_site_stmth->bind_param(4, $SITE_PHOSPHOSITE);               # ppep_gene_site.site_type_id
            if (not $ppep_gene_site_stmth->execute()) {
                print "Error writing tuple ($peptide,$gene_names,$phosphosites_observed_lbl[$i]): " . $ppep_gene_site_stmth->errstr . "\n";
            }
            # ...
            # end store-to-SQLite "ppep_gene_site" table
        }
        else {
            if ($i < $#kinases_PhosphoSite) {
                print OUT "\t";
            }
            elsif ($i == $#kinases_PhosphoSite) {
                print OUT "\n";
            }
        }
    }
}

close OUT;
close MELT;
$ppep_gene_site_stmth->finish;
print "begin DB commit at " . format_localtime_iso8601() . "\n";
# restore the AutoCommit setting that was saved when the connection was opened
$dbh->{AutoCommit} = $auto_commit;
$dbh->disconnect if ( defined $dbh );

print "\nFinished writing output at " . format_localtime_iso8601() ."\n\n";

###############################################################################################################################
<!-- File: macros.xml -->
<!--
  Shared macros for the mqppep Galaxy tools: the version tokens
  (@TOOL_VERSION@, @VERSION_SUFFIX@) and the "requirements" macro listing
  the pinned conda packages.
-->
<macros>
    <token name="@TOOL_VERSION@">0.1.13</token>
    <token name="@VERSION_SUFFIX@">0</token>
    <xml name="requirements">
        <requirements>
            <requirement type="package" version="1.56.0" >bioconductor-preprocesscore</requirement>
            <requirement type="package" version="1.22.2" >numpy</requirement>
            <requirement type="package" version="0.3.3" >openblas</requirement>
            <requirement type="package" version="1.4.1" >pandas</requirement>
            <requirement type="package" version="1.64" >perl-dbd-sqlite</requirement>
            <requirement type="package" version="5.26.2" >perl</requirement>
            <requirement type="package" version="1.4.0" >pyahocorasick</requirement>
            <requirement type="package" version="3.9.10" >python</requirement>
            <requirement type="package" version="1.14.2" >r-data.table</requirement>
            <requirement type="package" version="1.1.2" >r-dbi</requirement>
            <requirement type="package" version="3.3.5" >r-ggplot2</requirement>
            <requirement type="package" version="3.1.3" >r-gplots</requirement>
            <requirement type="package" version="0.9.4" >r-latex2exp</requirement>
            <requirement type="package" version="1.7.1" >r-optparse</requirement>
            <requirement type="package" version="1.4.4" >r-reshape2</requirement>
            <requirement type="package" version="2.11" >r-rmarkdown</requirement>
            <requirement type="package" version="2.2.8" >r-rsqlite</requirement>
            <requirement type="package" version="0.4.0" >r-sass</requirement>
            <requirement type="package" version="0.4_11" >r-sqldf</requirement>
            <requirement type="package" version="1.4.0" >r-stringr</requirement>
            <requirement type="package" version="0.37" >r-tinytex</requirement>
            <requirement type="package" version="0.3.7" >r-vioplot</requirement>
            <!--
            It would be nice to use conda-forge/texlive-core rather than r-tinytex because the
            former installs texlive when the package is built, but issue 23 blocked PDF-creation.
            In addition, texlive-core gave pango font errors (output had missing symbols replaced
            with boxes) unless I specified the build as well as the version when building a
            conda environment, e.g.: texlive-core=20210325=h97429d4_0
            -->
        </requirements>
        <!-- I specified the versions above because it takes a VERY long time to search for package versions when they are omitted; also, version numbers should lead to reproducible behavior.  Contrast execution times of this (about 18 seconds):
            echo n | time conda create -n mqppep_ver -c conda-forge -c bioconda \
              bioconductor-preprocesscore=1.56.0 \
              numpy=1.22.2 \
              openblas=0.3.3 \
              pandas=1.4.1 \
              perl-dbd-sqlite=1.64 \
              perl-dbd-sqlite=1.64 \
              perl=5.26.2 \
              pyahocorasick=1.4.0 \
              python=3.9.10 \
              r-data.table=1.14.2 \
              r-dbi=1.1.2 \
              r-ggplot2=3.3.5 \
              r-gplots=3.1.3 \
              r-latex2exp=0.9.4 \
              r-optparse=1.7.1 \
              r-reshape2=1.4.4 \
              r-rmarkdown=2.11 \
              r-rsqlite=2.2.8 \
              r-sass=0.4.0 \
              r-sqldf=0.4_11 \
              r-stringr=1.4.0 \
              r-tinytex=0.37 \
              r-vioplot=0.3.7
        with this (42 or more seconds):
            echo n | time conda create -n mqppep_nover -c conda-forge -c bioconda \
              bioconductor-preprocesscore= \
              numpy \
              openblas=0.3.3 \
              pandas \
              perl \
              perl-dbd-sqlite \
              perl-dbd-sqlite \
              pyahocorasick \
              python \
              r-data.table \
              r-dbi \
              r-ggplot2 \
              r-gplots \
              r-latex2exp \
              r-optparse \
              r-reshape2 \
              r-rmarkdown \
              r-rsqlite \
              r-sass \
              r-sqldf \
              r-stringr \
              r-tinytex \
              r-vioplot

        -->
    </xml>
</macros>
#!/usr/bin/env Rscript
# File: mqppep_anova.R
#
# Command-line driver invoked by the "MaxQuant Phosphopeptide ANOVA" Galaxy
# tool (mqppep_anova.xml).  It
#   1. parses and validates the command-line options,
#   2. reads the three regular-expression parameters from the config files
#      whose paths are passed on the command line, and
#   3. renders `mqppep_anova_script.Rmd` (expected in the same directory as
#      this script) to a PDF report, passing the options as Rmd parameters.

# libraries
library(optparse)
library(data.table)
library(stringr)

# ref for parameterizing Rmd document: https://stackoverflow.com/a/37940285

# parse options
option_list <- list(
  make_option(
    c("-i", "--inputFile"),
    action = "store",
    default = NA,
    type = "character",
    help = "Phosphopeptide Intensities sparse input file path"
  ),
  make_option(
    c("-a", "--alphaFile"),
    action = "store",
    default = NA,
    type = "character",
    help = paste0("List of alpha cutoff values for significance testing;",
             " path to text file having one column and no header")
  ),
  make_option(
    c("-S", "--preproc_sqlite"),
    action = "store",
    default = NA,
    type = "character",
    help = "Path to 'preproc_sqlite' produced by `mqppep_mrgfltr.py`"
  ),
  make_option(
    c("-K", "--ksea_sqlite"),
    action = "store",
    default = NA,
    type = "character",
    help = "Path to 'ksea_sqlite' output produced by this tool"
  ),
  make_option(
    c("-f", "--firstDataColumn"),
    action = "store",
    default = "^Intensity[^_]",
    type = "character",
    help = "First column of intensity values"
  ),
  make_option(
    c("-m", "--imputationMethod"),
    action = "store",
    default = "random",
    type = "character",
    help = paste0("Method for missing-value imputation,",
             " one of c('group-median','median','mean','random')")
  ),
  make_option(
    c("-p", "--meanPercentile"),
    action = "store",
    default = 3,
    type = "integer",
    help = paste0("Mean percentile for randomly generated imputed values;",
              " range [1,99]")
  ),
  make_option(
    c("-d", "--sdPercentile"),
    action = "store",
    default = 3,
    type = "double",
    help = paste0("Adjustment value for standard deviation of",
              " randomly generated imputed values; real")
  ),
  make_option(
    c("-s", "--regexSampleNames"),
    action = "store",
    default = "\\.(\\d+)[A-Z]$",
    type = "character",
    help = "Regular expression extracting sample-names"
  ),
  make_option(
    c("-g", "--regexSampleGrouping"),
    action = "store",
    default = "(\\d+)",
    type = "character",
    help = paste0("Regular expression extracting sample-group",
             " from an extracted sample-name")
  ),
  make_option(
    c("-o", "--imputedDataFile"),
    action = "store",
    default = "output_imputed.tsv",
    type = "character",
    help = "Imputed Phosphopeptide Intensities output file path"
  ),
  make_option(
    c("-n", "--imputedQNLTDataFile"),
    action = "store",
    default = "output_imp_qn_lt.tsv",
    type = "character",
    help =
      paste(
        "Imputed, Quantile-Normalized Log-Transformed Phosphopeptide",
        "Intensities output file path"
      )
  ),
  make_option(
    c("-r", "--reportFile"),
    action = "store",
    default = "QuantDataProcessingScript.html",
    type = "character",
    # the report is rendered with rmarkdown::pdf_document() below
    help = "Report file path (the report is rendered as PDF)"
  ),
  make_option(
    c("-k", "--ksea_cutoff_statistic"),
    action = "store",
    default = "FDR",
    type = "character",
    help = paste0("Statistic used as the KSEA significance cutoff,",
             " one of c('FDR','p.value'), but don't expect 'p.value' to work well.")
  ),
  make_option(
    c("-t", "--ksea_cutoff_threshold"),
    action = "store",
    default = 0.05,
    type = "double",
    help = paste0("Maximum score to be used to score a kinase enrichment as significant")
  ),
  make_option(
    c("-M", "--anova_ksea_metadata"),
    action = "store",
    default = "anova_ksea_metadata.tsv",
    type = "character",
    help = "Phosphopeptide metadata, ANOVA FDR, and KSEA enrichments"
  )
)
args <- parse_args(OptionParser(option_list = option_list))
print("args is:")
str(args)

# Check parameter values

# Abort the script (non-zero exit status under Rscript) unless `value` is
# exactly one of `choices`.  Exact matching is used deliberately: a regular-
# expression match would also accept fragments such as "me" or "med".
require_choice <- function(value, choices, arg_name) {
  if (!isTRUE(value %in% choices)) {
    stop(sprintf("bad %s argument: %s", arg_name, value), call. = FALSE)
  }
  invisible(value)
}

# --inputFile defaults to NA, which must be rejected before file.exists()
if (
  is.null(args$inputFile) ||
    is.na(args$inputFile) ||
    !file.exists(args$inputFile)
) {
  stop(paste("Input file", args$inputFile, "does not exist"), call. = FALSE)
}
input_file <- args$inputFile
alpha_file <- args$alphaFile
preproc_sqlite <- args$preproc_sqlite
imputed_data_file_name <- args$imputedDataFile
imp_qn_lt_data_filenm <- args$imputedQNLTDataFile
anova_ksea_metadata <- args$anova_ksea_metadata
report_file_name <- args$reportFile
ksea_sqlite <- args$ksea_sqlite
ksea_cutoff_statistic <- args$ksea_cutoff_statistic
ksea_cutoff_threshold <- args$ksea_cutoff_threshold
require_choice(
  ksea_cutoff_statistic,
  c("FDR", "p.value"),
  "ksea_cutoff_statistic"
)

imputation_method <- args$imputationMethod
require_choice(
  imputation_method,
  c("group-median", "median", "mean", "random"),
  "imputationMethod"
)

# read with default values, when applicable
mean_percentile <- args$meanPercentile
sd_percentile <- args$sdPercentile
# these two values are reported only for "random" imputation; the Galaxy
# wrapper passes them on the command line only in that case
if (imputation_method == "random") {
  print("mean_percentile is:")
  str(mean_percentile)

  print("sd_percentile is:")
  str(sd_percentile)
}

# convert string parameters that are passed in via config files:
# - firstDataColumn
# - regexSampleNames
# - regexSampleGrouping

# Read up to `limit` characters from the file `fname`, trim leading and
# trailing spaces/tabs/newlines, and undo the escapes applied by the Galaxy
# sanitizer (e.g. "__ob__" -> "[").  Returns the resulting string.
read_config_file_string <- function(fname, limit) {
  # eliminate any leading whitespace
  result <- gsub("^[ \t\n]*", "", readChar(fname, limit))
  # eliminate any trailing whitespace
  result <- gsub("[ \t\n]*$", "", result)
  # substitute characters escaped by Galaxy sanitizer
  galaxy_escapes <- c(
    "__lt__" = "<",
    "__le__" = "<=",
    "__eq__" = "==",
    "__ne__" = "!=",
    "__gt__" = ">",
    "__ge__" = ">=",
    "__sq__" = "'",
    "__dq__" = '"',
    "__ob__" = "[",
    "__cb__" = "]"
  )
  for (token in names(galaxy_escapes)) {
    result <- gsub(token, galaxy_escapes[[token]], result, fixed = TRUE)
  }
  result
}
cat(paste0("first_data_column file: ", args$firstDataColumn, "\n"))
cat(paste0("regex_sample_names file: ", args$regexSampleNames, "\n"))
cat(paste0("regex_sample_grouping file: ", args$regexSampleGrouping, "\n"))
# maximum number of characters read from each config file
nc <- 1000
regex_sample_names <- read_config_file_string(args$regexSampleNames, nc)
regex_sample_grouping <- read_config_file_string(args$regexSampleGrouping, nc)
first_data_column <- read_config_file_string(args$firstDataColumn, nc)
cat(paste0("first_data_column: ", first_data_column, "\n"))
cat(paste0("regex_sample_names: ", regex_sample_names, "\n"))
cat(paste0("regex_sample_grouping: ", regex_sample_grouping, "\n"))

# from: https://github.com/molgenis/molgenis-pipelines/wiki/
#       How-to-source-another_file.R-from-within-your-R-script
# Function location_of_this_script returns the location of this .R script
# (may be needed to source other files in same dir).
# Returns the directory as a string, or NULL when it cannot be determined.
location_of_this_script <- function() {
  this_file <- NULL
  # This file may be 'sourced': look for a `source()` call among the
  # enclosing frames and take the file it is reading
  for (i in -seq_len(sys.nframe())) {
    if (identical(sys.function(i), base::source)) {
      this_file <- (normalizePath(sys.frame(i)$ofile))
    }
  }

  if (!is.null(this_file)) return(dirname(this_file))

  # But it may also be called from the command line
  cmd_args <- commandArgs(trailingOnly = FALSE)
  cmd_args_trailing <- commandArgs(trailingOnly = TRUE)
  # keep only the interpreter's own arguments (drop the script's arguments)
  cmd_args <- cmd_args[
    seq.int(
      from = 1,
      length.out = length(cmd_args) - length(cmd_args_trailing)
    )
  ]
  # "--file=<path>" becomes "<path>"; every other argument becomes ""
  res <- gsub("^(?:--file=(.*)|.*)$", "\\1", cmd_args)

  # If multiple --file arguments are given, R uses the last one
  res <- tail(res[res != ""], 1)
  if (0 < length(res)) return(dirname(res))

  # Both are not the case. Maybe we are in an R GUI?
  return(NULL)
}

script_dir <- location_of_this_script()

rmarkdown_params <- list(
    inputFile = input_file
  , alphaFile = alpha_file
  , preprocDb = preproc_sqlite
  , firstDataColumn = first_data_column
  , imputationMethod = imputation_method
  , meanPercentile = mean_percentile
  , sdPercentile = sd_percentile
  , regexSampleNames = regex_sample_names
  , regexSampleGrouping = regex_sample_grouping
  , imputedDataFilename = imputed_data_file_name
  , imputedQNLTDataFile = imp_qn_lt_data_filenm
  , anovaKseaMetadata = anova_ksea_metadata
  , kseaAppPrepDb = ksea_sqlite
  , kseaCutoffThreshold = ksea_cutoff_threshold
  , kseaCutoffStatistic = ksea_cutoff_statistic
  )

print("rmarkdown_params")
str(rmarkdown_params)

# freeze the random number generator so the same results will be produced
#  from run to run
set.seed(28571)

# BUG (or "opportunity")
# To render as PDF for the time being requires installing the conda
# package `r-texlive` until this issue in `texlive-core` is resolved:
#   https://github.com/conda-forge/texlive-core-feedstock/issues/19
# This workaround is detailed in the fourth comment of:
#   https://github.com/conda-forge/texlive-core-feedstock/issues/61

library(tinytex)
tinytex::install_tinytex()
rmarkdown::render(
  input = paste(script_dir, "mqppep_anova_script.Rmd", sep = "/")
, output_format = rmarkdown::pdf_document(toc = TRUE)
, output_file = report_file_name
, params = rmarkdown_params
)
<!-- File: mqppep_anova.xml -->
<!--
  Galaxy tool wrapper that runs mqppep_anova.R, which in turn renders
  mqppep_anova_script.Rmd to produce the outputs declared below.
-->
<tool
  id="mqppep_anova"
  name="MaxQuant Phosphopeptide ANOVA"
  version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@"
  profile="21.05"
  >
    <description>Runs ANOVA and KSEA for phosphopeptides.</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <edam_topics>
        <edam_topic>topic_0121</edam_topic><!-- proteomics -->
        <edam_topic>topic_3520</edam_topic><!-- proteomics experiment-->
    </edam_topics>
    <edam_operations>
        <edam_operation>operation_0276</edam_operation><!-- Analyse a network of protein interactions. -->
        <edam_operation>operation_0531</edam_operation><!-- Heat map generation -->
        <edam_operation>operation_2938</edam_operation><!-- Dendrogram generation -->
        <edam_operation>operation_3557</edam_operation><!-- Imputation -->
        <edam_operation>operation_3435</edam_operation><!-- Standardisation and normalisation -->
        <edam_operation>operation_3501</edam_operation><!-- Enrichment analysis -->
        <edam_operation>operation_3658</edam_operation><!-- Statistical inference -->
    </edam_operations>
    <expand macro="requirements"/>
    <!--
      The weird invocation used here is because knitr and install_tinytex
      both need access to a writeable directory, but most directories in a
      biocontainer are read-only, so the script and the Rmd document are
      first copied into the job working directory and run from there.
      The three regular-expression parameters are passed through config
      files (see "configfiles" below) rather than on the command line.
    -->
    <command detect_errors="exit_code"><![CDATA[
      cp '$__tool_directory__/mqppep_anova_script.Rmd' . &&
      cp '$__tool_directory__/mqppep_anova.R' . &&
      Rscript mqppep_anova.R
        --inputFile '$input_file'
        --alphaFile '$alpha_file'
        --preproc_sqlite '$preproc_sqlite'
        --firstDataColumn '$intensity_column_regex_f'
        --imputationMethod '$imputation.imputation_method'
        #if $imputation.imputation_method == "random"
          --meanPercentile '$imputation.meanPercentile'
          --sdPercentile '$imputation.sdPercentile'
        #end if
        --regexSampleNames '$sample_names_regex_f'
        --regexSampleGrouping '$sample_grouping_regex_f'
        --imputedDataFile '$imputed_data_file'
        --imputedQNLTDataFile '$imp_qn_lt_file'
        --ksea_sqlite '$ksea_sqlite'
        --ksea_cutoff_threshold '$ksea_cutoff_threshold'
        --ksea_cutoff_statistic 'FDR'
        --reportFile '$report_file'
        --anova_ksea_metadata '$anova_ksea_metadata'
    ]]></command>
    <configfiles>
        <configfile name="sample_names_regex_f">
            $sample_names_regex
        </configfile>
        <configfile name="sample_grouping_regex_f">
            $sample_grouping_regex
        </configfile>
        <configfile name="intensity_column_regex_f">
            $intensity_column_regex
        </configfile>
    </configfiles>
    <inputs>
        <param name="input_file" type="data" format="tabular" label="Filtered Phosphopeptide Intensities"
               help="Phosphopeptide intensities filtered for minimal quality.  First column label 'Phosphopeptide'; sample-intensities must begin in column 10 and must have column labels to match argument [sample_names_regex]"
        />
        <param name="alpha_file" type="data" format="tabular" label="ANOVA alpha cutoff level"
               help="ANOVA alpha cutoff values for significance testing: tabular data having one column and no header"
        />
        <param name="preproc_sqlite" type="data" format="sqlite" label="preproc_sqlite dataset from mqppep_preproc"
               help="'preproc_sqlite' dataset produced by 'MaxQuant Phosphopeptide Preprocessing' tool"
        />
        <param name="intensity_column_regex" type="text" value="^Intensity[^_]"
               label="Intensity-column pattern"
               help="Pattern matching columns that have peptide intensity data (PERL-compatible regular expression matching column label)"
        />
        <!-- imputation_method <- c("group-median","median","mean","random")[1] -->
        <conditional name="imputation">
            <param name="imputation_method" type="select" label="Imputation method"
                   help="Impute missing values by (1) using median for each sample-group; (2) using median across all samples; (3) using mean across all samples; or (4) using randomly generated values having same std. dev. as across all samples (with mean specified by [meanPercentile])"
            >
                <option value="random" selected="true">random</option>
                <option value="group-median">group-median</option>
                <option value="median">median</option>
                <option value="mean">mean</option>
            </param>
            <when value="group-median" />
            <when value="median" />
            <when value="mean" />
            <when value="random">
                <param name="meanPercentile" type="integer" value="1" min="1" max="99"
                       label="Mean percentile for random values"
                       help="Percentile center of random values; range [1,99]"
                />
                <param name="sdPercentile" type="float" value="1.0"
                       label="Percentile std. dev. for random values"
                       help="Standard deviation adjustment-factor for random values; real number.  (1.0 means SD equal to the SD for the entire data set.)"
                />
            </when>
        </conditional>
        <param name="sample_names_regex" type="text" value="\.\d+[A-Z]$"
               help="Pattern extracting sample-names from names of columns that have peptide intensity data (PERL-compatible regular expression)"
               label="Sample-extraction pattern">
            <sanitizer>
                <valid initial="string.printable">
                    <remove value="&apos;"/>
                </valid>
            </sanitizer>
        </param>
        <param name="sample_grouping_regex" type="text" value="\d+"
               help="Pattern extracting sample-group from the sample-names that are extracted by 'Sample-extraction pattern' (PERL-compatible regular expression)"
               label="Group-extraction pattern">
            <sanitizer>
                <valid initial="string.printable">
                    <remove value="&apos;"/>
                </valid>
            </sanitizer>
        </param>
        <param name="ksea_cutoff_threshold" type="float" value="0.05"
               label="KSEA threshold level"
               help="Maximum FDR to be used to score a kinase enrichment as significant"
        />
    </inputs>
    <outputs>
        <data name="imputed_data_file" format="tabular" label="${input_file.name}.${imputation.imputation_method}-imputed_intensities" ></data>
        <data name="imp_qn_lt_file" format="tabular" label="${input_file.name}.${imputation.imputation_method}-imputed_QN_LT_intensities" ></data>
        <data name="anova_ksea_metadata" format="tabular" label="${input_file.name}.${imputation.imputation_method}-anova_ksea_metadata" ></data>
        <!--
        <data name="report_file" format="html" label="${input_file.name}.${imputation.imputation_method}-imputed_report (download/unzip to view)" ></data>
        -->
        <data name="report_file" format="pdf" label="${input_file.name}.${imputation.imputation_method}-imputed_report" ></data>
        <data name="ksea_sqlite" format="sqlite" label="${input_file.name}.${imputation.imputation_method}-imputed_ksea_sqlite">
        </data>
    </outputs>
    <tests>
        <test>
            <param name="input_file" ftype="tabular" value="test_input_for_anova.tabular"/>
            <param name="preproc_sqlite" ftype="sqlite" value="test_input_for_anova.sqlite"/>
            <param name="alpha_file" ftype="tabular" value="alpha_levels.tabular"/>
            <param name="intensity_column_regex" value="^Intensity[^_]"/>
            <param name="imputation_method" value="median"/>
            <param name="sample_names_regex" value="\.\d+[A-Z]$"/>
            <param name="sample_grouping_regex" value="\d+"/>
            <output name="imputed_data_file">
                <assert_contents>
                    <has_text text="Phosphopeptide" />
                    <has_text text="AAAITDMADLEELSRLpSPLPPGpSPGSAAR" />
                    <!--                                               missing missing observd missing observd observd -->
                    <has_text_matching expression="pSQKQEEENPAEETGEEK.*8765300.8765300.8765300.8765300.2355900.14706000" />

                </assert_contents>
            </output>
            <output name="imp_qn_lt_file">
                <assert_contents>
                    <has_text text="Phosphopeptide" />
                    <has_text text="AAAITDMADLEELSRLpSPLPPGpSPGSAAR" />
                    <!--                                               missing   missing   observed  missing   observed  observed -->
                    <has_text_matching expression="pSQKQEEENPAEETGEEK.*6.962256.*6.908828.*6.814580.*6.865411.*6.908828.*7.088909" />

                    <has_text text="pSQKQEEENPAEETGEEK" />
                </assert_contents>
            </output>
        </test>
        <test>
            <param name="input_file" ftype="tabular" value="test_input_for_anova.tabular"/>
            <param name="preproc_sqlite" ftype="sqlite" value="test_input_for_anova.sqlite"/>
            <param name="alpha_file" ftype="tabular" value="alpha_levels.tabular"/>
            <param name="intensity_column_regex" value="^Intensity[^_]"/>
            <param name="imputation_method" value="mean"/>
            <param name="sample_names_regex" value="\.\d+[A-Z]$"/>
            <param name="sample_grouping_regex" value="\d+"/>
            <output name="imputed_data_file">
                <assert_contents>
                    <has_text text="Phosphopeptide" />
                    <has_text text="AAAITDMADLEELSRLpSPLPPGpSPGSAAR" />
                    <!--                                               missing missing observd missing observd observd -->
                    <has_text_matching expression="pSQKQEEENPAEETGEEK.*6721601.6721601.8765300.6721601.2355900.14706000" />

                </assert_contents>
            </output>
            <output name="imp_qn_lt_file">
                <assert_contents>
                    <has_text text="Phosphopeptide" />
                    <has_text text="AAAITDMADLEELSRLpSPLPPGpSPGSAAR" />
                    <!--                                               missing   missing   observed  missing   observed  observed -->
                    <has_text_matching expression="pSQKQEEENPAEETGEEK.*6.839850.*6.797424.*6.797424.*6.797424.*6.896609.*7.092451" />
                </assert_contents>
            </output>
        </test>
        <test>
            <param name="input_file" ftype="tabular" value="test_input_for_anova.tabular"/>
            <param name="preproc_sqlite" ftype="sqlite" value="test_input_for_anova.sqlite"/>
            <param name="alpha_file" ftype="tabular" value="alpha_levels.tabular"/>
            <param name="intensity_column_regex" value="^Intensity[^_]"/>
            <param name="imputation_method" value="group-median"/>
            <param name="sample_names_regex" value="\.\d+[A-Z]$"/>
            <param name="sample_grouping_regex" value="\d+"/>
            <output name="imputed_data_file">
                <assert_contents>
                    <has_text text="Phosphopeptide" />
                    <has_text text="AAAITDMADLEELSRLpSPLPPGpSPGSAAR" />
                    <!--                                               missing missing observd missing observd observd -->
                    <has_text_matching expression="pSQKQEEENPAEETGEEK.*8765300.8765300.8765300.5886074.2355900.14706000" />

                </assert_contents>
            </output>
            <output name="imp_qn_lt_file">
                <assert_contents>
                    <has_text text="Phosphopeptide" />
                    <has_text text="AAAITDMADLEELSRLpSPLPPGpSPGSAAR" />
                    <!--                                               missing   missing   observed  missing   observed  observed -->
                    <has_text_matching expression="pSQKQEEENPAEETGEEK.*6.946112.*6.888985.*6.792137.*6.792137.*6.888985.*7.089555" />
                </assert_contents>
            </output>
        </test>
        <test>
            <param name="input_file" ftype="tabular" value="test_input_for_anova.tabular"/>
            <param name="preproc_sqlite" ftype="sqlite" value="test_input_for_anova.sqlite"/>
            <param name="alpha_file" ftype="tabular" value="alpha_levels.tabular"/>
            <param name="intensity_column_regex" value="^Intensity[^_]"/>
            <param name="imputation_method" value="random"/>
            <param name="meanPercentile" value="1" />
            <param name="sdPercentile" value="1.0" />
            <param name="sample_names_regex" value="\.\d+[A-Z]$"/>
            <param name="sample_grouping_regex" value="\d+"/>
            <output name="imputed_data_file">
                <assert_contents>
                    <has_text text="Phosphopeptide" />
                    <has_text text="AAAITDMADLEELSRLpSPLPPGpSPGSAAR" />
                    <!--                                               observd   observd   observd -->
                    <has_text_matching expression="pSQKQEEENPAEETGEEK.*8765300.*2355900.*4706000" />

                </assert_contents>
            </output>
            <output name="imp_qn_lt_file">
                <assert_contents>
                    <has_text text="Phosphopeptide" />
                    <has_text text="AAAITDMADLEELSRLpSPLPPGpSPGSAAR" />
                    <has_text text="5.409549" /> <!-- log-transformed value for pTYVDPFTpYEDPNQAVR .1B -->
                    <has_text text="6.464714" /> <!-- log-transformed value for pSQKQEEENPAEETGEEK .2A -->
                </assert_contents>
            </output>
        </test>
    </tests>
    <help><![CDATA[
====================================================
Phosphoproteomic Enrichment Pipeline ANOVA and KSEA
====================================================

**Input files**

``Filtered Phosphopeptide Intensities``
  Phosphopeptides annotated with SwissProt and phosphosite metadata (in tabular format).
  This is the output from the "Phosphoproteomic Enrichment Pipeline Merge and Filter"
  (``mqppep_mrgflt``) tool.

``ANOVA alpha cutoff level``
  List of alpha cutoff values for significance testing; text file having one column and no header.  For example:

::

  0.2
  0.1
  0.05

**Input parameters**

``Intensity-column pattern``
  First column of ``input_file`` having intensity values (integer or PERL-compatible regular expression matching column label).  Default: ``^Intensity[^_]``

``Imputation method``
  Impute missing values by:

  1. ``group-median`` - use median for each sample-group;
  2. ``mean`` - use mean across all samples;
  3. ``median`` - use median across all samples; or
  4. ``random`` - use randomly generated values where:

    - ``Mean percentile for random values`` specifies the percentile among non-missing values to be used as mean of random values, and
    - ``Percentile std. dev. for random values`` specifies the factor to be multiplied by the standard deviation among the non-missing values (across all samples) to determine the standard deviation of random values.

``Sample-extraction pattern``
  PERL-compatible regular expression extracting the sample-name from the name of a column of intensities (from ``input_file``) for one sample.

    - For example, ``"\.\d+[A-Z]$"`` applied to ``Intensity.splunge.10A`` would produce ``.10A``
    - Note that *this is case sensitive* by default.

``Group-extraction pattern``
  PERL-compatible regular expression extracting the sample-grouping from the sample-name that was extracted with ``sample_names_regex`` from a column of intensities (from ``input_file``).

    - For example, ``"\d+$"`` applied to ``.10A`` would produce ``10``
    - Note that *this is case sensitive* by default.

``KSEA threshold level``
  Specifies maximum FDR at which a kinase will be considered to be enriched; the default choice of 0.05 is arbitrary.

**Outputs**

``imputed_intensities (input_file.imputation_method-imputed_intensities)``
  Phosphopeptide MS intensities where missing values have been **imputed** by the chosen method, in tabular format.

``imputed_QN_LT_intensities (input_file.imputation_method-imputed_QN_LT_intensities)``
  Phosphopeptide MS intensities where missing values have been **imputed** by the chosen method, quantile-normalized (**QN**), and log10-transformed (**LT**), in tabular format.

``report_file (input_file.imputation_method-imputed_report)``
  Summary report for normalization, imputation, and **ANOVA**, in PDF format.

``anova_ksea_metadata (input_file.imputation_method-anova_ksea_metadata)``
  Phosphopeptide metadata including ANOVA significance and KSEA enrichments.

``ksea_sqlite (input_file.imputation_method-imputed_ksea_sqlite)``
  SQLite database for ad-hoc report creation.

**Algorithm**

The KSEA algorithm used here is as in the KSEAapp package as reported in [Wiredja 2017].
The code is adapted from "Danica D. Wiredja (2017). KSEAapp: Kinase-Substrate Enrichment Analysis. R package version 0.99.0." to work with output from the "MaxQuant Phosphopeptide Preprocessing" Galaxy tool.

**Authors**

``Larry C. Cheng``
  (`ORCiD 0000-0002-6922-6433 <https://orcid.org/0000-0002-6922-6433>`_) wrote the original script.

``Arthur C. Eschenlauer``
  (`ORCiD 0000-0002-2882-0508 <https://orcid.org/0000-0002-2882-0508>`_) adapted the script to run in Galaxy.

===================================
PERL-compatible regular expressions
===================================

Note that the PERL-compatible regular expressions accepted by this tool are documented at http://rdrr.io/r/base/regex.html

    ]]></help>
    <citations>
        <!-- Cheng_2018 "Phosphopeptide Enrichment ..." PMID: 30124664 -->
        <citation type="doi">10.3791/57996</citation>
        <!-- Wiredja_2017 "The KSEA App ..." PMID: 28655153 -->
        <citation type="doi">10.1093/bioinformatics/btx415</citation>
    </citations>
</tool>
---
title: "MaxQuant Phosphoproteomic Enrichment Pipeline ANOVA/KSEA"
author:
- "Nick Graham^[ORCiD 0000-0002-6811-1941, University of Southern California: Los Angeles, CA, US]"
- "Larry Cheng^[ORCiD 0000-0002-6922-6433, Rutgers School of Graduate Studies: New Brunswick, NJ, US]"
- "Art Eschenlauer^[ORCiD 0000-0002-2882-0508, University of Minnesota: Minneapolis, Minnesota, US]"
date:
- "May 28, 2018"
- "; revised June 23, 2022"
output:
  pdf_document:
    toc: true
    toc_depth: 3
    keep_tex: true
header-includes:
  - \usepackage{longtable}
  - \newcommand\T{\rule{0pt}{2.6ex}}       % Top strut
  - \newcommand\B{\rule[-1.2ex]{0pt}{0pt}} % Bottom strut
params:
  alphaFile: "test-data/alpha_levels.tabular"
  inputFile: "test-data/test_input_for_anova.tabular"
  preprocDb: "test-data/test_input_for_anova.sqlite"
  kseaAppPrepDb: !r c(":memory:", "test-data/mqppep.sqlite")[2]
  show_toc: true
  firstDataColumn: "^Intensity[^_]"
  imputationMethod: !r c("group-median", "median", "mean", "random")[1]
  meanPercentile: 1
  sdPercentile: 1.0
  regexSampleNames: "\\.\\d+[A-Z]$"
  regexSampleGrouping: "\\d+"
  imputedDataFilename: "test-data/limbo/imputedDataFilename.txt"
  imputedQNLTDataFile: "test-data/limbo/imputedQNLTDataFile.txt"
  anovaKseaMetadata: "test-data/limbo/anovaKseaMetadata.txt"
  oneWayManyCategories: !r c("aov", "kruskal.test", "oneway.test")[1]
  oneWayTwoCategories: !r c("aov", "kruskal.test", "oneway.test")[3]
  kseaCutoffStatistic: !r c("p.value", "FDR")[2]
  kseaCutoffThreshold: !r c( 0.1, 0.05)[2]
  kseaMinKinaseCount: 1
  intensityHeatmapRows: 75
---
<!--
  File: mqppep_anova_script.Rmd
  Alternative parameter values (commented out):

  kseaCutoffStatistic: !r c("p.value", "FDR")[2]
  kseaCutoffThreshold: !r c(0.05, 0.1)[1]

  alphaFile: "test-data/alpha_levels.tabular"
  inputFile: "test-data/test_input_for_anova.tabular"
  preprocDb: "test-data/test_input_for_anova.sqlite"
  kseaAppPrepDb: !r c(":memory:", "test-data/mqppep.sqlite")[2]

  alphaFile: "test-data/alpha_levels.tabular"
  inputFile: "test-data/UT_phospho_ST_sites.preproc.tabular"
  preprocDb: "test-data/UT_phospho_ST_sites.preproc.sqlite"
  kseaAppPrepDb: !r c(":memory:", "test-data/UT_phospho_ST_sites.ksea.sqlite")[2]

  alphaFile: "test-data/alpha_levels.tabular"
  inputFile: "test-data/pY_Sites_NancyDu.txt.ppep_intensities.ppep_map.preproc.tabular"
  preprocDb: "test-data/pY_Sites_NancyDu.txt.ppep_intensities.ppep_map.preproc.sqlite"
  kseaAppPrepDb: !r c(":memory:", "test-data/pST_Sites_NancyDu.ksea.sqlite")[2]

  alphaFile: "test-data/alpha_levels.tabular"
  inputFile: "test-data/pST_Sites_NancyDu.txt.preproc.tabular"
  preprocDb: "test-data/pST_Sites_NancyDu.txt.preproc.sqlite"
  kseaAppPrepDb: !r c(":memory:", "test-data/pST_Sites_NancyDu.ksea.sqlite")[2]

  inputFile: "test-data/density_failure.preproc_tab.tabular"
  kseaAppPrepDb: !r c(":memory:", "mqppep.sqlite")[2]
  latex_document: default
-->
```{r setup, include = FALSE}
#ref for debugging: https://yihui.org/tinytex/r/#debugging
options(tinytex.verbose = TRUE)

# ref for parameterizing Rmd document: https://stackoverflow.com/a/37940285
# ref for top and bottom struts: https://tex.stackexchange.com/a/50355
knitr::opts_chunk$set(echo = FALSE, fig.dim = c(9, 10))

# freeze the random number generator so the same results will be produced
#  from run to run
set.seed(28571)

### LIBRARIES
library(gplots)
library(DBI)
library(RSQLite)
# Suppress "Warning: no DISPLAY variable so Tk is not available"
suppressWarnings(suppressMessages(library(sqldf)))

# required but not added to search list:
# - DBI
# - RSQLite
# - ggplot2
# - knitr
# - latex2exp
# - preprocessCore
# - reshape2
# - vioplot

### CONSTANTS

const_parfin <- par("fin")
const_boxplot_fill <- "grey94"
const_stripchart_cex <- 0.5
const_stripsmall_cex <-
  sqrt(const_stripchart_cex * const_stripchart_cex / 2)
const_stripchart_jitter <- 0.3
# set to TRUE to make write_debug_file() write its argument to test-data/
const_write_debug_files <- FALSE
const_table_anchor_bp <- "bp"
const_table_anchor_ht <- "ht"
const_table_anchor_p <- "p"
const_table_anchor_tbp <- "tbp"


const_ksea_astrsk_kinases    <- 1
const_ksea_nonastrsk_kinases <- 2
const_ksea_all_kinases       <- 3

const_log10_e <- log10(exp(1))

### FUNCTIONS

# from `demo(error.catching)`
##' Catch *and* save both errors and warnings, and in the case of
##' a warning, also keep the computed result.
##'
##' @title tryCatch both warnings (with value) and errors
##' @param expr an \R expression to evaluate
##' @return a list with 'value' and 'warning', where
##'   'value' may be an error caught.
##' @author Martin Maechler;
##' Copyright (C) 2010-2012  The R Core Team
try_catch_w_e <- function(expr) {
  wrn <- NULL
  # warning handler: remember the warning, then resume evaluation of `expr`
  w_handler <- function(w) {
    wrn <<- w
    invokeRestart("muffleWarning")
  }
  list(
    value = withCallingHandlers(
      tryCatch(
        expr,
        # an error is returned as the value rather than propagated
        error = function(e) e
      ),
      warning = w_handler
    ),
    warning = wrn
  )
}


# When const_write_debug_files is TRUE, write `s` as a tab-separated table
# to "test-data/<expression passed as s>.txt"; otherwise do nothing.
# The file name is derived from the unevaluated argument, so pass a plain
# variable name.
write_debug_file <- function(s) {
  if (const_write_debug_files) {
    s_path <- sprintf("test-data/%s.txt", deparse(substitute(s)))
    print(sprintf("DEBUG writing file %s", s_path))
    write.table(
      s,
      file = s_path,
      sep = "\t",
      col.names = TRUE,
      row.names = TRUE,
      quote = FALSE
    )
  }
}

# ref: http://adv-r.had.co.nz/Environments.html
# "When creating your own environment, note that you should set its parent
#   environment to be the empty environment. This ensures you don't
#   accidentally inherit objects from somewhere else."
# Caution: this prevents `with(my_env, expr)` from working when `expr`
#   contains anything from the global environment, even operators!
#   Hence, `x <- 1; get("x", new_env())` fails by design.
new_env <- function() {
  new.env(parent = emptyenv())
}

### numerical/statistical helper functions

# NOTE(review): despite its name, this returns TRUE when NO element of `x`
# equals "NaN" (and FALSE when at least one does). Behavior is kept as-is
# because callers outside this section may rely on it.
any_nan <- function(x) {
  !any(x == "NaN")
}

# standard deviation of the finite members of x (NA, NaN, Inf are ignored);
# used to determine the standard deviation of the quantile to impute
sd_finite <- function(x) {
  ok <- is.finite(x)
  sd(x[ok])
}

# Return the p-value of a one-way test of intensities `x` across the groups
# in `grouping_factor`.
# - one_way_f is either stats::aov (p-value read from the summary table) or a
#   function returning an object with a `p.value` member.
anova_func <- function(x, grouping_factor, one_way_f) {
  subject <- data.frame(
    intensity = x
  )
  x_aov <-
    one_way_f(
      formula = intensity ~ grouping_factor,
      data = subject
    )
  if (identical(one_way_f, aov)) {
    summary(x_aov)[[1]][["Pr(>F)"]][1]
  } else {
    x_aov$p.value
  }
}


### LaTeX functions

# cat the members of v joined by collapse_string, escaping underscores for
# LaTeX unless underscore_whack is FALSE
latex_collapsed_vector <- function(collapse_string, v, underscore_whack = TRUE) {
  v_sub <- if (underscore_whack) gsub("_", "\\\\_", v) else v
  cat(
    paste0(
      v_sub,
      collapse = collapse_string
    )
  )
}

# cat an itemize environment whose body is v joined by collapse_string
latex_itemized_collapsed <- function(collapse_string, v, underscore_whack = TRUE) {
  cat("\\begin{itemize}\n\\item ")
  latex_collapsed_vector(collapse_string, v, underscore_whack)
  cat("\n\\end{itemize}\n")
}

# cat an itemize environment having one \item per member of v
latex_itemized_list <- function(v, underscore_whack = TRUE) {
  latex_itemized_collapsed("\n\\item ", v, underscore_whack)
}

# cat an enumerate environment whose body is v joined by collapse_string
latex_enumerated_collapsed <- function(collapse_string, v, underscore_whack = TRUE) {
  cat("\\begin{enumerate}\n\\item ")
  latex_collapsed_vector(collapse_string, v, underscore_whack)
  cat("\n\\end{enumerate}\n")
}

# cat an enumerate environment having one \item per member of v
# (underscore_whack added for parity with latex_itemized_list; the default
# reproduces the previous behavior)
latex_enumerated_list <- function(v, underscore_whack = TRUE) {
  latex_enumerated_collapsed("\n\\item ", v, underscore_whack)
}

# cat one tabular row: members of v separated by " & ", then `extra`, then
# the LaTeX row terminator
latex_table_row <- function(v, extra = "", underscore_whack = TRUE) {
  latex_collapsed_vector(" & ", v, underscore_whack)
  cat(extra)
  cat(" \\\\\n")
}

# Use this like print.data.frame, from which it is adapted:
# it cats a LaTeX table environment for x and returns x invisibly.
data_frame_latex <-
  function(
    x,
    ...,
    # digits to pass to format.data.frame
    digits = NULL,
    # TRUE -> right-justify columns; FALSE -> left-justify
    right = TRUE,
    # maximum number of rows to print
    max = NULL,
    # string with justification of each column
    justification = NULL,
    # TRUE to center on page
    centered = TRUE,
    # optional caption
    caption = NULL,
    # h(inline); b(bottom); t (top) or p (separate page)
    anchor = "h",
    # set underscore_whack to TRUE to escape underscores
    underscore_whack = TRUE
  ) {
    if (is.null(justification)) {
      # one "r" or "l" per column, separated by spaces
      justification <-
        paste(
          rep_len(if (right) "r" else "l", length(colnames(x))),
          collapse = " "
        )
    }
    n <- length(rownames(x))
    if (length(x) == 0L) {
      cat(
        sprintf(
          # if n is one, use singular 'row', else use plural 'rows'
          ngettext(
            n,
            "data frame with 0 columns and %d row",
            "data frame with 0 columns and %d rows"
          ),
          n
        ),
        "\n",
        sep = ""
      )
    } else if (n == 0L) {
      cat("0 rows for:\n")
      latex_itemized_list(
        v = names(x),
        underscore_whack = underscore_whack
      )
    } else {
      if (is.null(max)) {
        max <- getOption("max.print", 99999L)
      }
      if (!is.finite(max)) {
        stop("invalid 'max' / getOption(\"max.print\"): ",
             max)
      }
      # n0 is the number of rows that fit within `max` printed entries
      omit <- (n0 <- max %/% length(x)) < n
      m <- as.matrix(
        format.data.frame(
          if (omit) x[seq_len(n0), , drop = FALSE] else x,
          digits = digits,
          na.encode = FALSE
        )
      )
      cat(
        # h(inline); b(bottom); t (top) or p (separate page)
        paste0("\\begin{table}[", anchor, "]\n")
      )
      if (!is.null(caption)) {
        cat(paste0(" \\caption{", caption, "}"))
      }
      if (centered) cat("\\centering\n")
      cat(
        paste(
          " \\begin{tabular}{",
          justification,
          "}\n",
          sep = ""
        )
      )
      # ref: https://tex.stackexchange.com/a/50353
      # Describes use of \rule{0pt}{3ex}
      if (!is.null(caption)) {
        cat("\\B \\\\ \\hline\\hline\n")
      }
      # ref for top and bottom struts: https://tex.stackexchange.com/a/50355
      latex_table_row(
        v = colnames(m),
        extra = "\\T\\B",
        underscore_whack = underscore_whack
      )
      cat("\\hline\n")
      for (i in seq_len(nrow(m))) {
        latex_table_row(
          v = m[i, ],
          underscore_whack = underscore_whack
        )
      }
      cat(
        paste(
          " \\end{tabular}",
          "\\end{table}",
          sep = "\n"
        )
      )
      if (omit) {
        cat(" [ reached 'max' / getOption(\"max.print\") -- omitted",
            n - n0, "rows ]\n")
      }
    }
    invisible(x)
  }

# Convert a heading to a hyperlink target: lower-case, with each run of
# characters other than a-z and 0-9 replaced by a single hyphen and with
# leading/trailing hyphens removed
hypersub <-
  function(s) {
    hyper <- tolower(s)
    hyper <- gsub("[^a-z0-9]+", "-", hyper)
    hyper <- gsub("[-]+", "-", hyper)
    hyper <- sub("^[-]", "", hyper)
    sub("[-]$", "", hyper)
  }

# cat a LaTeX subsection heading with a hypertarget and label from hypersub
subsection_header <-
  function(s) {
    hyper <- hypersub(s)
    cat(
      sprintf(
        "\\hypertarget{%s}\n{\\subsection{%s}\\label{%s}}\n",
        hyper, s, hyper
      )
    )
  }

# cat a LaTeX subsubsection heading with a hypertarget and label from hypersub
subsubsection_header <-
  function(s) {
    hyper <- hypersub(s)
    cat(
      sprintf(
        "\\hypertarget{%s}\n{\\subsubsection{%s}\\label{%s}}\n",
        hyper, s, hyper
      )
    )
  }

### SQLite functions

# Execute a DDL statement. The reporting branch is deliberately disabled
# (`FALSE &&`), so nothing is ever printed.
ddl_exec <- function(db, sql) {
  discard <- DBI::dbExecute(conn = db, statement = sql)
  if (FALSE && discard != 0) {
    need_newpage <- TRUE
    if (need_newpage) {
      need_newpage <<- FALSE
      cat("\\newpage\n")
    }
    o_file <- stdout()
    cat("\n\\begin{verbatim}\n")
    cat(sql, file = o_file)
    cat(sprintf("\n%d rows affected by DDL\n", discard), file = o_file)
    cat("\n\\end{verbatim}\n")
  }
}

# Execute a DML statement that is expected to affect no rows; when rows ARE
# affected, print the statement and the row count on a new page.
# NOTE(review): `need_newpage` is set locally to TRUE and then `<<-` assigns
# FALSE to a variable of that name in an enclosing/global scope, so a new
# page is always emitted here; kept as-is.
dml_no_rows_exec <- function(db, sql) {
  discard <- DBI::dbExecute(conn = db, statement = sql)
  if (discard != 0) {
    need_newpage <- TRUE
    if (need_newpage) {
      need_newpage <<- FALSE
      cat("\\newpage\n")
    }
    cat("\n\\begin{verbatim}\n")
    o_file <- stdout()
    cat(sql, file = o_file)
    cat(sprintf("\n%d rows affected by DML\n", discard), file = o_file)
    cat("\n\\end{verbatim}\n")
  }
}

### KSEA functions and helpers

# Adapted from KSEAapp::KSEA.Scores to allow retrieval of:
# - maximum log2(FC)
# Returns a data.frame with columns
#   "Kinase.Gene", "mS", "Enrichment", "m", "z.score", "p.value", "FDR"
ksea_scores <- function(

  # For human data, typically, ksdata = KSEAapp::ksdata
  ksdata,

  # Input data file having columns:
  # - Protein      : abbreviated protein name
  # - Gene         : HUGO gene name
  # - Peptide      : peptide sequence without indications of phosphorylation
  # - Residue.Both : position(s) of phosphorylation within Gene sequence
  #                  - First letter designates AA that is modified
  #                  - Numbers indicate position within Gene
  #                  - Multiple values are separated by semicolons
  # - p            : p-value
  # - FC           : fold-change
  px,

  # A binary input of TRUE or FALSE, indicating whether or not to include
  # NetworKIN predictions
  networkin,

  # A numeric value between 1 and infinity setting the minimum NetworKIN
  # score (can be left out if networkin = FALSE)
  networkin_cutoff

) {
  has_semicolon <- grep(";", px$Residue.Both)
  if (length(has_semicolon) == 0) {
    # There are no Residue.Both entries having semicolons, so new is
    #   simply px except two columns are renamed and a column is added
    #   for log2(abs(fold-change))
    new <- px
    colnames(new)[c(2, 4)] <- c("SUB_GENE", "SUB_MOD_RSD")
    new$log2_fc <- log2(abs(as.numeric(as.character(new$FC))))
    new <- new[complete.cases(new$log2_fc), ]
  } else {
    # Split each row having semicolons in Residue.Both into rows that are
    #   duplicated in all respects except that each row has a single
    #   member of the set "split-on-semicolon-Residue.Both"
    px_double <- px[has_semicolon, ]
    residues <- as.character(px_double$Residue.Both)
    split <- strsplit(residues, split = ";")
    # x gets count of residues in each row,
    #   i.e., 1 + count of semicolons
    x <- lengths(split)
    # Here is the set of split rows
    px_single <- data.frame(
      Protein      = rep(px_double$Protein, x),
      Gene         = rep(px_double$Gene, x),
      Peptide      = rep(px_double$Peptide, x),
      Residue.Both = unlist(split),
      p            = rep(px_double$p, x),
      FC           = rep(px_double$FC, x)
    )
    # new first gets the rows that didn't need splitting in the first place
    new <- px[-has_semicolon, ]
    # to new, append the split rows
    new <- rbind(new, px_single)
    # map Gene         to SUB_GENE
    # map Residue.Both to SUB_MOD_RSD
    colnames(new)[c(2, 4)] <- c("SUB_GENE", "SUB_MOD_RSD")
    # Eliminate any non-positive fold-changes to prevent introduction of
    #   infinite or NaN values.
    # (Previously this filter tested `new$log2_fc` before that column
    #   existed, so it removed nothing and log2(0) = -Inf was retained.)
    fold_change <- as.numeric(as.character(new$FC))
    fold_change[!is.na(fold_change) & fold_change <= 0] <- NA
    # Because of preceding step, there is no need for abs in the next line
    new$log2_fc <- log2(fold_change)
    # Eliminate rows having missing values (e.g., non-imputed data)
    new <- new[complete.cases(new$log2_fc), ]
  }
  if (networkin == TRUE) {
    # When NetworKIN is true, filter on NetworKIN.cutoff which includes
    #   PhosphoSitePlus data *because its networkin_score is set to Inf*
    ksdata_filtered <- ksdata[grep("[a-z]", ksdata$Source), ]
    ksdata_filtered <- ksdata_filtered[
      (ksdata_filtered$networkin_score >= networkin_cutoff), ]
  } else {
    # Otherwise, simply use PhosphoSitePlus rows
    ksdata_filtered <- ksdata[
      grep("PhosphoSitePlus", ksdata$Source), ]
  }
  # Join the two data.frames on common columns SUB_GENE and SUB_MOD_RSD
  #   colnames of ksdata_filtered:
  #     "KINASE" "KIN_ACC_ID" "GENE" "KIN_ORGANISM" "SUBSTRATE" "SUB_GENE_ID"
  #     "SUB_ACC_ID" "SUB_GENE" "SUB_ORGANISM" "SUB_MOD_RSD" "SITE_GRP_ID"
  #     "SITE_...7_AA" "networkin_score" "Source"
  #   colnames of new:
  #     "Protein" "SUB_GENE" "Peptide" "SUB_MOD_RSD" "p" "FC" "log2_fc"
  # Equivalent to:
  #   SELECT a.*, b.Protein, b.Peptide, b.p, b.FC, b.log2_fc
  #     FROM ksdata_filtered a
  #       INNER JOIN new b
  #         ON a.SUB_GENE = b.SUB_GENE
  #           AND a.SUB_MOD_RSD = b.SUB_MOD_RSD
  ksdata_dataset <- base::merge(ksdata_filtered, new)
  # colnames of ksdata_dataset:
  #   "KINASE"      "KIN_ACC_ID"   "GENE"       "KIN_ORGANISM" "SUBSTRATE"
  #   "SUB_GENE_ID" "SUB_ACC_ID"   "SUB_GENE"   "SUB_ORGANISM" "SUB_MOD_RSD"
  #   "SITE_GRP_ID" "SITE_...7_AA" "networkin_score"  "Source" "Protein"
  #   "Peptide"     "p"            "FC"         "log2_fc" (uniprot_no_isoform)
  # NOTE(review): base::merge places the join columns (SUB_GENE, SUB_MOD_RSD)
  #   first, which is what the positional selection below relies on.
  # Re-order dataset; prior to accounting for isoforms
  ksdata_dataset <- ksdata_dataset[order(ksdata_dataset$GENE), ]
  # Extract non-isoform accession in UniProtKB
  ksdata_dataset$uniprot_no_isoform <- vapply(
    ksdata_dataset$KIN_ACC_ID,
    function(x) unlist(strsplit(as.character(x), split = "-"))[1],
    character(1)
  )
  # Discard previous results while selecting interesting columns ...
  ksdata_dataset_abbrev <- ksdata_dataset[, c(5, 1, 2, 16:19, 14)]
  # Column names are now:
  #   "GENE" "SUB_GENE" "SUB_MOD_RSD" "Peptide" "p"
  #   "FC" "log2_fc" "Source"
  # Make column names human-readable
  colnames(ksdata_dataset_abbrev) <- c(
    "Kinase.Gene", "Substrate.Gene", "Substrate.Mod", "Peptide", "p",
    "FC", "log2FC", "Source"
  )
  # SELECT * FROM ksdata_dataset_abbrev
  #   ORDER BY Kinase.Gene, Substrate.Gene, Substrate.Mod, p
  ksdata_dataset_abbrev <-
    ksdata_dataset_abbrev[
      order(
        ksdata_dataset_abbrev$Kinase.Gene,
        ksdata_dataset_abbrev$Substrate.Gene,
        ksdata_dataset_abbrev$Substrate.Mod,
        ksdata_dataset_abbrev$p),
      ]
  # First aggregation step to account for multiply phosphorylated peptides
  #   and differing peptide sequences; the goal here is to combine results
  #   for all measurements of the same substrate.
  # SELECT `Kinase.Gene`, `Substrate.Gene`, `Substrate.Mod`,
  #        `Source`, avg(log2FC) AS log2FC
  #   FROM ksdata_dataset_abbrev
  #   GROUP BY `Kinase.Gene`, `Substrate.Gene`, `Substrate.Mod`,
  #        `Source`
  #   ORDER BY `Kinase.Gene`;
  # in two steps:
  # (1) compute average log_2(fold-change)
  ksdata_dataset_abbrev <- aggregate(
    log2FC ~ Kinase.Gene + Substrate.Gene + Substrate.Mod + Source,
    data = ksdata_dataset_abbrev,
    FUN = mean
  )
  # (2) order by Kinase.Gene
  ksdata_dataset_abbrev <-
    ksdata_dataset_abbrev[order(ksdata_dataset_abbrev$Kinase.Gene), ]
  # SELECT `Kinase.Gene`, count(*)
  #   FROM ksdata_dataset_abbrev
  #   GROUP BY `Kinase.Gene`;
  # in two steps:
  # (1) Extract the list of Kinase.Gene names
  kinase_list <- as.vector(ksdata_dataset_abbrev$Kinase.Gene)
  # (2) Convert to a named list of counts of kinases in ksdata_dataset_abbrev,
  #     named by Kinase.Gene
  kinase_list <- as.matrix(table(kinase_list))
  # Second aggregation step to account for all substrates per kinase
  # CREATE TABLE mean_fc
  #   AS
  #     SELECT `Kinase.Gene`, avg(log2FC) AS log2FC
  #       FROM ksdata_dataset_abbrev
  #       GROUP BY `Kinase.Gene`
  mean_fc <- aggregate(
    log2FC ~ Kinase.Gene,
    data = ksdata_dataset_abbrev,
    FUN = mean
  )
  # mean_fc columns: "Kinase.Gene", "log2FC"
  if (FALSE) {
    # I need to re-think this; I was trying to find the most-represented
    #   peptide, but that horse has already left the barn
    # SELECT `Kinase.Gene`, max(abs(log2FC)) AS log2FC
    #   FROM ksdata_dataset_abbrev
    #   GROUP BY `Kinase.Gene`
    max_fc <- aggregate(
      log2FC ~ Kinase.Gene,
      data = ksdata_dataset_abbrev,
      FUN = function(r) max(abs(r))
    )
  }

  # Create column 3: mS
  mean_fc$m_s <- mean_fc[, 2]
  # Create column 4: Enrichment
  mean_fc$enrichment <- mean_fc$m_s / abs(mean(new$log2_fc, na.rm = TRUE))
  # Create column 5: m, count of substrates
  mean_fc$m <- kinase_list
  # Create column 6: z-score
  mean_fc$z_score <- (
    (mean_fc$m_s - mean(new$log2_fc, na.rm = TRUE)) *
      sqrt(mean_fc$m)) / sd(new$log2_fc, na.rm = TRUE)
  # Create column 7: p-value, deduced from z-score
  mean_fc$p_value <- pnorm(-abs(mean_fc$z_score))
  # Create column 8: FDR, deduced by Benjamini-Hochberg adjustment from p-value
  mean_fc$fdr <- p.adjust(mean_fc$p_value, method = "fdr")

  # Remove log2FC column, which is duplicated as mS
  mean_fc <- mean_fc[order(mean_fc$Kinase.Gene), -2]
  # Correct the column names which we had to hack because of the linter...
  colnames(mean_fc) <- c(
    "Kinase.Gene", "mS", "Enrichment", "m", "z.score", "p.value", "FDR"
  )
  return(mean_fc)
}

# Helper for low_fdr_barplot and low_fdr_print:
# return whichever of `fdr` or `p_value` is named by the global variable
# `ksea_cutoff_statistic`; stop when it names neither.
ksea_cutoff_values <- function(fdr, p_value) {
  switch(
    ksea_cutoff_statistic,
    "FDR" = fdr,
    "p.value" = p_value,
    stop(
      sprintf(
        "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'",
        ksea_cutoff_statistic
      )
    )
  )
}

# Draw a horizontal bar plot of z-scores for the kinases of contrast `i`
# whose cutoff statistic is below the global `ksea_cutoff_threshold`;
# nothing is drawn unless at least two kinases pass.
# - fold_change is accepted for signature parity with low_fdr_print; unused.
low_fdr_barplot <- function(
  rslt,
  i_cntrst,
  i,
  a_level,
  b_level,
  fold_change,
  caption
) {
  rslt_score_list_i <- rslt$score_list[[i]]
  if (!is.null(rslt_score_list_i)) {
    rslt_score_list_i_nrow <- nrow(rslt_score_list_i)
    k <- data.frame(
      contrast = as.integer(i_cntrst),
      a_level = rep.int(a_level, rslt_score_list_i_nrow),
      b_level = rep.int(b_level, rslt_score_list_i_nrow),
      kinase_gene = rslt_score_list_i$Kinase.Gene,
      mean_log2_fc = rslt_score_list_i$mS,
      enrichment = rslt_score_list_i$Enrichment,
      substrate_count = rslt_score_list_i$m,
      z_score = rslt_score_list_i$z.score,
      p_value = rslt_score_list_i$p.value,
      fdr = rslt_score_list_i$FDR
    )
    selector <- ksea_cutoff_values(fdr = k$fdr, p_value = k$p_value)

    # which() drops kinases whose statistic is NA instead of producing
    # all-NA rows
    k <- k[which(selector < ksea_cutoff_threshold), ]

    if (nrow(k) > 1) {
      op <- par(mai = c(1, 1.5, 0.4, 0.4))
      numeric_z_score <- as.numeric(k$z_score)
      z_score_order <- order(numeric_z_score)
      kinase_name <- k$kinase_gene
      long_caption <-
        sprintf(
          "Kinase z-score, %s < %s, %s",
          ksea_cutoff_statistic,
          ksea_cutoff_threshold,
          caption
        )
      # shrink the title when it is longer than 65 characters
      my_cex_caption <- 65.0 / max(65.0, nchar(long_caption))
      cat("\n\\clearpage\n")
      barplot(
        height = numeric_z_score[z_score_order],
        border = NA,
        xpd = FALSE,
        cex.names = 1.0,
        cex.axis = 1.0,
        main = long_caption,
        cex.main = my_cex_caption,
        names.arg = kinase_name[z_score_order],
        horiz = TRUE,
        srt = 45,
        las = 1)
      par(op)
    }
  }
}

# Write the KSEA scores for contrast `i` to table "contrast_ksea_scores" of
# the global connection `db` and print a LaTeX table of the kinases whose
# cutoff statistic is below the global `ksea_cutoff_threshold`.
# - note that this adds elements to the global variable `ksea_asterisk_hash`
# - the table is replaced for the first contrast (i_cntrst < 2) and appended
#   to for later contrasts
# - fold_change is unused

low_fdr_print <- function(
  rslt,
  i_cntrst,
  i,
  a_level,
  b_level,
  fold_change,
  caption
) {
  rslt_score_list_i <- rslt$score_list[[i]]
  if (!is.null(rslt_score_list_i)) {
    rslt_score_list_i_nrow <- nrow(rslt_score_list_i)
    contrast_ksea_scores <- data.frame(
      contrast = as.integer(i_cntrst),
      a_level = rep.int(a_level, rslt_score_list_i_nrow),
      b_level = rep.int(b_level, rslt_score_list_i_nrow),
      kinase_gene = rslt_score_list_i$Kinase.Gene,
      mean_log2_fc = rslt_score_list_i$mS,
      enrichment = rslt_score_list_i$Enrichment,
      substrate_count = rslt_score_list_i$m,
      z_score = rslt_score_list_i$z.score,
      p_value = rslt_score_list_i$p.value,
      fdr = rslt_score_list_i$FDR
    )

    # Evaluate the cutoff once; a missing statistic counts as "not
    # significant" rather than propagating NA into `if` and row selection.
    selector <- ksea_cutoff_values(
      fdr = contrast_ksea_scores$fdr,
      p_value = contrast_ksea_scores$p_value
    )
    is_significant <- !is.na(selector) & selector < ksea_cutoff_threshold

    # save kinase names to ksea_asterisk_hash
    for (kinase_name in contrast_ksea_scores$kinase_gene[is_significant]) {
      ksea_asterisk_hash[[kinase_name]] <- 1
    }

    db_write_table_overwrite <- (i_cntrst < 2)
    db_write_table_append <- !db_write_table_overwrite
    RSQLite::dbWriteTable(
      conn = db,
      name = "contrast_ksea_scores",
      value = contrast_ksea_scores,
      # previously computed but never passed, so a pre-existing table made
      # the first write fail
      overwrite = db_write_table_overwrite,
      append = db_write_table_append
    )

    output_df <- contrast_ksea_scores[
      is_significant,
      c("kinase_gene", "mean_log2_fc", "enrichment", "substrate_count",
        "z_score", "p_value", "fdr")
      ]
    output_order <- with(output_df, order(mean_log2_fc, kinase_gene))
    output_df <- output_df[output_order, ]
    output_df$fdr <- sprintf("%0.4f", output_df$fdr)
    output_df$p_value <- sprintf("%0.2e", output_df$p_value)
    output_df$z_score <- sprintf("%0.2f", output_df$z_score)
    output_df$substrate_count <- sprintf("%d", output_df$substrate_count)
    output_df$enrichment <- sprintf("%0.2f", output_df$enrichment)
    colnames(output_df) <-
      c(
        "Kinase",
        "\\(\\overline{\\log_2 (|\\text{fold-change}|)}\\)",
        "Enrichment",
        "Substrates",
        "z-score",
        "p-value",
        "FDR"
      )
    if (any(is_significant)) {
      # escape braces so that the caption survives inside \text{}
      math_caption <- gsub("{", "\\{", caption, fixed = TRUE)
      math_caption <- gsub("}", "\\}", math_caption, fixed = TRUE)
      data_frame_latex(
        x = output_df,
        justification = "l c c c c c c",
        centered = TRUE,
        caption = sprintf(
          "\\text{%s}, %s < %s",
          math_caption,
          ksea_cutoff_statistic,
          ksea_cutoff_threshold
        ),
        anchor = const_table_anchor_p
      )
    } else {
      cat(
        sprintf(
          "\\break
          No kinases had
          \\(\\text{%s}_\\text{enrichment} < %s\\)
          for contrast %s\\hfill\\break\n",
          ksea_cutoff_statistic,
          ksea_cutoff_threshold,
          caption
        )
      )
    }
  }
}

# create_breaks is a helper for ksea_heatmap
# It returns list(breaks, colors) for gplots::heatmap.2:
# - 30 breaks span each of [-1.6, 0] and [0, 1.6];
# - 10 more breaks extend a side when scores lie beyond +/- 1.6;
# - colors run blue -> white -> red.
create_breaks <- function(merged_scores) {
  score_min <- min(merged_scores, na.rm = TRUE)
  score_max <- max(merged_scores, na.rm = TRUE)
  breaks_neg <- seq(-1.6, 0, length.out = 30)
  if (score_min < -1.6) {
    breaks_neg <-
      append(
        seq(score_min, -1.6, length.out = 10),
        breaks_neg
      )
    breaks_neg <- sort(unique(breaks_neg))
  }
  breaks_pos <- seq(0, 1.6, length.out = 30)
  if (score_max > 1.6) {
    breaks_pos <-
      append(
        breaks_pos,
        seq(1.6, score_max, length.out = 10)
      )
    breaks_pos <- sort(unique(breaks_pos))
  }
  # zero appears in both halves; unique() keeps it once
  breaks_all <- unique(append(breaks_neg, breaks_pos))
  mycol_neg <-
    gplots::colorpanel(n = length(breaks_neg),
               low = "blue",
               high = "white")
  mycol_pos <-
    gplots::colorpanel(n = length(breaks_pos) - 1,
               low = "white",
               high = "red")
  # white ends one panel and starts the other; unique() keeps it once so
  # that there is one color fewer than there are breaks
  mycol <- unique(append(mycol_neg, mycol_pos))
  color_breaks <- list(breaks_all, mycol)
  return(color_breaks)
}

# draw_kseaapp_summary_heatmap is a helper function for ksea_heatmap
# It draws the heatmap only when x is a matrix having at least two rows and
# two columns; otherwise it prints the reason that no plot was drawn.
draw_kseaapp_summary_heatmap <- function(
    x,
    sample_cluster,
    merged_asterisk,
    my_cex_row,
    color_breaks,
    margins,
    ...
) {
  merged_scores <- x
  if (!is.matrix(x)) {
    cat(
      paste0(
        "No plot because \\texttt{typeof(x)} is '",
        typeof(x),
        "' rather than 'matrix'.\n\n"
      )
    )
  } else if (nrow(x) < 2) {
    cat("No plot because matrix x has ", nrow(x), " rows.\n\n")
    cat("\\begin{verbatim}\n")
    str(x)
    cat("\\end{verbatim}\n")
  } else if (ncol(x) < 2) {
    cat("No plot because matrix x has ", ncol(x), " columns.\n\n")
    cat("\\begin{verbatim}\n")
    str(x)
    cat("\\end{verbatim}\n")
  } else {
    gplots::heatmap.2(
      x            = merged_scores,
      Colv         = sample_cluster,
      scale        = "none",
      cellnote     = merged_asterisk,
      notecol      = "white",
      cexCol       = 0.9,
      # Heuristically assign size of row labels
      cexRow       = min(1.0, ((3 * my_cex_row) ^ 1.7) / 2.25),
      srtCol       = 45,
      srtRow       = 45,
      notecex      = 3 * my_cex_row,
      col          = color_breaks[[2]],
      density.info = "none",
      trace        = "none",
      breaks       = color_breaks[[1]],
      lmat         = rbind(c(0, 3), c(2, 1), c(0, 4)),
      lhei         = c(0.4, 8.0, 1.1),
      lwid         = c(0.5, 3),
      key          = FALSE,
      margins      = margins,
      ...
    )
  }
}

# Adapted from KSEAapp::KSEA.Heatmap
# Returns the names of the kinases (rows) selected by `which_kinases`.
ksea_heatmap <- function(
  # the data frame outputs from the KSEA.Scores() function, in list format
  score_list,
  # a character vector of all the sample names for heatmap annotation:
  # - the names must be in the same order as the data in score_list
  # - please avoid long names, as they may get cropped in the final image
  sample_labels,
  # character string of either "p.value" or "FDR" indicating the data column
  #   to use for marking statistically significant scores
  stats,
  # a numeric value between 0 and infinity indicating the min. number of
  #   substrates a kinase must have to be included in the heatmap
  m_cutoff,
  # a numeric value between 0 and 1 indicating the p-value/FDR cutoff
  #   for indicating significant kinases in the heatmap
  p_cutoff =
    stop("argument 'p_cutoff' is required for function 'ksea_heatmap'"),
  # a binary input of TRUE or FALSE, indicating whether or not to perform
  #   hierarchical clustering of the sample columns
  sample_cluster,
  # a binary input of TRUE or FALSE, indicating whether or not to export
  #   the heatmap as a .png image into the working directory
  export = FALSE,
  # bottom and right margins; adjust as needed if contrast names are too long
  margins = c(6, 20),
  # print which kinases?
  # - Mandatory argument, must be one of const_ksea_.*_kinases
  which_kinases,
  # additional arguments to gplots::heatmap.2, such as:
  # - main: main title of plot
  # - xlab: x-axis label
  # - ylab: y-axis label
  # NOTE(review): these are accepted but are NOT currently forwarded to
  #   draw_kseaapp_summary_heatmap; confirm whether forwarding is intended.
  ...
) {
  # keep only kinases having at least m_cutoff substrates
  filter_m <- function(dataset, m_cutoff) {
    dataset[(dataset$m >= m_cutoff), ]
  }
  score_list_m <- lapply(score_list, filter_m, m_cutoff = m_cutoff)
  # suffix the statistic columns with the index of their contrast
  for (i in seq_along(score_list_m)) {
    names <- colnames(score_list_m[[i]])[c(2:7)]
    colnames(score_list_m[[i]])[c(2:7)] <-
      paste(names, i, sep = ".")
  }
  # inner-join all contrasts on Kinase.Gene
  master <-
    Reduce(
      f = function(...) {
        base::merge(..., by = "Kinase.Gene", all = FALSE)
      },
      x = score_list_m
    )

  row.names(master) <- master$Kinase.Gene
  columns <- as.character(colnames(master))
  merged_scores <- as.matrix(master[, grep("z.score", columns), drop = FALSE])
  colnames(merged_scores) <- sample_labels
  # drop = FALSE keeps the kinase row names when there is only one contrast
  merged_stats <- as.matrix(master[, grep(stats, columns), drop = FALSE])
  # matrix of "*" where the statistic is below p_cutoff and "" elsewhere
  asterisk <- function(mtrx, p_cutoff) {
    marks <- matrix("", nrow = nrow(mtrx), ncol = ncol(mtrx))
    marks[!is.na(mtrx) & mtrx < p_cutoff] <- "*"
    marks
  }
  merged_asterisk <- asterisk(merged_stats, p_cutoff)

  # partition kinases by whether any contrast is significant
  has_asterisk <- rowSums(merged_asterisk == "*") > 0
  all_rows <- rownames(merged_stats)
  asterisk_rows <- all_rows[has_asterisk]
  non_asterisk_rows <- all_rows[!has_asterisk]

  row_list <- list()
  row_list[[const_ksea_astrsk_kinases]] <- asterisk_rows
  row_list[[const_ksea_all_kinases]] <- all_rows
  row_list[[const_ksea_nonastrsk_kinases]] <- non_asterisk_rows

  my_row_names <- row_list[[which_kinases]]
  # drop = FALSE keeps these as matrices even when a single kinase is
  # selected; previously that case failed in asterisk()
  scrs <- merged_scores[my_row_names, , drop = FALSE]
  stts <- merged_stats[my_row_names, , drop = FALSE]
  merged_asterisk <- asterisk(stts, p_cutoff)

  color_breaks <- create_breaks(scrs)
  plot_height <- nrow(scrs) ^ 0.55
  plot_width <- ncol(scrs) ^ 0.7
  my_cex_row <- 0.25 * 16 / plot_height
  if (export == "TRUE") {
    png(
      "KSEA.Merged.Heatmap.png",
      width = plot_width * 300,
      height = 2 * plot_height * 300,
      res = 300,
      pointsize = 14
    )
  }
  draw_kseaapp_summary_heatmap(
    x               = scrs,
    sample_cluster  = sample_cluster,
    merged_asterisk = merged_asterisk,
    my_cex_row      = my_cex_row,
    color_breaks    = color_breaks,
    margins         = margins
  )
  if (export == "TRUE") {
    dev.off()
  }
  return(my_row_names)
}

# helper for heatmaps of phosphopeptide intensities
# Returns the number of peptides (rows) selected for the heatmap, or zero
# when hm_heading_function returns FALSE.

draw_intensity_heatmap <-
  function(
    m,                              # matrix with rownames already formatted
    cutoff,                         # cutoff used by hm_heading_function
    hm_heading_function,            # construct and cat heading from m and cutoff
    hm_main_title,                  # main title for plot (drawn below heading)
    suppress_row_dendrogram = TRUE, # set to false to show dendrogram
    max_peptide_count               # experimental:
      = intensity_hm_rows,          #   values of 50 and 75 worked well
    ...                             # passthru parameters for heatmap
  ) {
    peptide_count <- 0
    # emit the heading for the heatmap
    if (hm_heading_function(m, cutoff)) {
      peptide_count <- min(max_peptide_count, nrow(m))
      if (nrow(m) > 1) {
        m_margin <- m[peptide_count:1, ]
        # Margin setting was heuristically derived
        margins <-
          c(0.5, # col
            max(80, sqrt(nchar(rownames(m_margin)))) * 5 / 16 # row
          )
        # hm_main_title used to be ignored; fall back to the former
        # hard-coded title for callers that do not supply one
        main_title <-
          if (missing(hm_main_title)) {
            "Unimputed, unnormalized log(intensities)"
          } else {
            hm_main_title
          }
        # par() returns the previous value of what it sets, which is what
        # must be restored; restoring from par("oma") had no effect
        old_par <- par(cex.main = 0.6)
        tryCatch(
          {
            # Heuristically determined character size adjustment formula
            char_contractor <-
              250000 / (
                max(4500, (nchar(rownames(m_margin)))^2) * intensity_hm_rows
              )
            heatmap(
              m_margin,
              Rowv = if (suppress_row_dendrogram) NA else NULL,
              Colv = NA,
              cexRow = char_contractor,
              cexCol = char_contractor * 50 / max_peptide_count,
              scale = "row",
              margins = margins,
              main = main_title,
              xlab = "",
              las = 1,
              ...
            )
          },
          error = function(e) {
            cat(
              sprintf(
                "\nCould not draw heatmap, possibly because of too many missing values. Internal message: %s\n",
                e$message
              )
            )
          },
          finally = par(old_par)
        )
      }
    }
    return(peptide_count)
  }
```

```{r, echo = FALSE, fig.dim = c(9, 10), results = 'asis'}
cat("\\listoftables\n")
```
# Purpose

To perform the following for phosphopeptides:

- imputation of missing values,
- quantile normalization,
- ANOVA (using the R stats::`r params$oneWayManyCategories` function), and
- KSEA (Kinase-Substrate Enrichment Analysis) using code adapted from the CRAN `KSEAapp` package to search for kinase substrates from the following databases:
    - PhosphoSitePlus [https://www.phosphosite.org](https://www.phosphosite.org)
    - The Human Protein Reference Database [http://hprd.org](http://hprd.org)
    - NetworKIN [http://networkin.science/](http://networkin.science/)
    - Phosida [http://pegasus.biochem.mpg.de/phosida/help/motifs.aspx](http://pegasus.biochem.mpg.de/phosida/help/motifs.aspx)

```{r include = FALSE}

### GLOBAL VARIABLES

# parameters for KSEA

ksea_cutoff_statistic <- params$kseaCutoffStatistic
ksea_cutoff_threshold <- params$kseaCutoffThreshold
ksea_min_kinase_count <- params$kseaMinKinaseCount

# heatmap titles, indexed by the const_ksea_.*_kinases selectors
ksea_heatmap_titles <- list()
ksea_heatmap_titles[[const_ksea_astrsk_kinases]] <-
  sprintf(
    "Summary for all kinases enriched in one or more contrasts at %s < %s",
    ksea_cutoff_statistic,
    ksea_cutoff_threshold
  )
ksea_heatmap_titles[[const_ksea_all_kinases]] <-
  "Summary figure for all contrasts and all kinases"
ksea_heatmap_titles[[const_ksea_nonastrsk_kinases]] <-
  sprintf(
    "Summary for all kinases not enriched at %s < %s in any contrast",
    ksea_cutoff_statistic,
    ksea_cutoff_threshold
  )
# hash to hold names of significantly enriched kinases
ksea_asterisk_hash <- new_env()

# READ PARAMETERS (mostly)

intensity_hm_rows <- params$intensityHeatmapRows
# Input Filename
input_file <- params$inputFile

# First data column - ideally, this could be detected via regexSampleNames,
# but for now leave it as is.
first_data_column <- params$firstDataColumn
# An integer is taken as a column index; any other value is matched against
# the column names as a regular expression once the data have been read.
fdc_is_integer <- is.integer(first_data_column)
if (fdc_is_integer) {
  first_data_column <- as.integer(params$firstDataColumn)
}

# False discovery rate adjustment for ANOVA
# Since pY abundance is low, set to 0.10 and 0.20 in addition to 0.05
val_fdr <-
  read.table(file = params$alphaFile, sep = "\t", header = FALSE, quote = "")

# anyNA() is tested before the range checks so that a missing value is
# reported by the message below rather than as "missing value where
# TRUE/FALSE needed"
if (
  ncol(val_fdr) != 1 ||
    !is.numeric(val_fdr[, 1]) ||
    anyNA(val_fdr[, 1]) ||
    any(val_fdr[, 1] < 0) ||
    any(val_fdr[, 1] > 1)
) {
  stop("alphaFile should be one column of numbers within the range [0.0,1.0]")
}
val_fdr <- val_fdr[, 1]

#Imputed Data filename
imputed_data_filename <- params$imputedDataFilename
imp_qn_lt_data_filenm <- params$imputedQNLTDataFile
anova_ksea_mtdt_file <- params$anovaKseaMetadata

```

```{r echo = FALSE}
# Imputation method, should be one of
#   "random", "group-median", "median", or "mean"
imputation_method <- params$imputationMethod

# Selection of percentile of logvalue data to set the mean for random number
#   generation when using random imputation
mean_percentile <- params$meanPercentile / 100.0

# deviation adjustment-factor for random values; real number.
sd_percentile <- params$sdPercentile

# Regular expression of Sample Names, e.g., "\\.(\\d+)[A-Z]$"
regex_sample_names <- params$regexSampleNames

# Regular expression to extract Sample Grouping from Sample Name;
#   if error occurs, compare sample_treatment_levels vs. sample_name_matches
#   to see if groupings/pairs line up
#   e.g., "(\\d+)"
regex_sample_grouping <- params$regexSampleGrouping

# Look up, by name, the function used for the one-way test across all
# categories; try_catch_w_e returns the error object in `value` when the
# name does not resolve to a function.
one_way_all_categories_fname <- params$oneWayManyCategories
one_way_all_categories <- try_catch_w_e(
  match.fun(one_way_all_categories_fname))
if (!is.function(one_way_all_categories$value)) {
  write("fatal error for parameter oneWayManyCategories:", stderr())
  write(one_way_all_categories$value$message, stderr())
  if (sys.nframe() > 0) quit(save = "no", status = 1)
  stop("Cannot continue. Goodbye.")
}
one_way_all_categories <- one_way_all_categories$value

# Same lookup for the two-category test.
# (This previously read params$oneWayManyCategories, so the
# oneWayTwoCategories parameter was silently ignored.)
one_way_two_categories_fname <- params$oneWayTwoCategories
one_way_two_categories <- try_catch_w_e(
  match.fun(one_way_two_categories_fname))
if (!is.function(one_way_two_categories$value)) {
  write("fatal error for parameter oneWayTwoCategories:", stderr())
  write(one_way_two_categories$value$message, stderr())
  if (sys.nframe() > 0) quit(save = "no", status = 1)
  stop("Cannot continue. Goodbye.")
}
one_way_two_categories <- one_way_two_categories$value

# Results are written to a copy of the preprocessing database so that the
# input database is left untouched.
preproc_db <- params$preprocDb
ksea_app_prep_db <- params$kseaAppPrepDb
result <- file.copy(
  from = preproc_db,
  to = ksea_app_prep_db,
  overwrite = TRUE
  )
if (!result) {
  write(
    sprintf(
      # a trailing comma after the last argument made this call itself fail
      "fatal error copying initial database '%s' to output '%s'",
      preproc_db,
      ksea_app_prep_db
    ),
    stderr()
  )
  if (sys.nframe() > 0) quit(save = "no", status = 1)
  stop("Cannot continue. Goodbye.")
}
```

```{r echo = FALSE}
### READ DATA

# read.table reads a file in table format and creates a data frame from it.
#   - note that `quote = ""` means that quotation marks are treated literally.
full_data <- read.table(
  file = input_file,
  sep = "\t",
  header = TRUE,
  quote = "",
  check.names = FALSE
  )
```

# Extract Sample Names and Treatment Levels

Column names parsed from input file are shown in Table 1; sample names and treatment levels, in Table 2.

```{r echo = FALSE, results = 'asis'}

if (fdc_is_integer) {
  # An integer firstDataColumn is already a column index: every column from
  # there through the last one holds data (this is how quant_data is sliced).
  # It used to be matched against the column NAMES as if it were a regular
  # expression, which reported a meaningless range or failed outright.
  if (first_data_column < 1 || first_data_column > ncol(full_data)) {
    stop(paste("firstDataColumn is out of range:", first_data_column))
  }
  data_column_indices <- seq.int(first_data_column, ncol(full_data))
} else {
  # Otherwise firstDataColumn is a regular expression matching the names of
  # the data columns; the first match becomes the first data column.
  data_column_indices <-
    grep(first_data_column, names(full_data), perl = TRUE)
  if (length(data_column_indices) > 0) {
    first_data_column <- data_column_indices[1]
  } else {
    stop(paste("failed to convert firstDataColumn:", first_data_column))
  }
}

cat(
  sprintf(
    paste(
      "\n\nThe input data file has peptide-intensity data for each sample",
      "in one of columns %d through %d.\n\n"
    ),
    min(data_column_indices),
    max(data_column_indices)
  )
)

# Write column names as a LaTeX table; \verb@...@ keeps special characters
# in the names literal, so underscores must not be escaped again.
column_name_df <- data.frame(
  column = seq_len(length(colnames(full_data))),
  name = paste0("\\verb@", colnames(full_data), "@")
  )
data_frame_latex(
  x = column_name_df,
  justification = "l l",
  centered = TRUE,
  caption = "Input data column names",
  anchor = const_table_anchor_bp,
  underscore_whack = FALSE
  )

```

```{r echo = FALSE, results = 'asis'}
# columns from first_data_column through the last column are intensities
quant_data <- full_data[first_data_column:length(full_data)]
# a zero intensity means "not observed"
quant_data[quant_data == 0] <- NA
rownames(quant_data) <- rownames(full_data) <- full_data$Phosphopeptide
# Extract factors and trt-replicates using regular expressions.
# Typically:
#   regex_sample_names    is "\\.\\d+[A-Z]$"
#   regex_sample_grouping is "\\d+"
# This would distinguish trt-replicates by terminal letter [A-Z]
#   in sample names and group them into trts by the preceding digits.
#   e.g.:
#     group .1A .1B .1C into group 1;
#     group .2A .2B .2C, into group 2;
#     etc.
+m <- regexpr(regex_sample_names, colnames(quant_data), perl = TRUE) +sample_name_matches <- regmatches(names(quant_data), m) +colnames(quant_data) <- sample_name_matches + +write_debug_file(quant_data) + +rx_match <- regexpr(regex_sample_grouping, sample_name_matches, perl = TRUE) +sample_treatment_levels <- as.factor(regmatches(sample_name_matches, rx_match)) +number_of_samples <- length(sample_name_matches) +sample_treatment_df <- data.frame( + level = sample_treatment_levels, + sample = sample_name_matches + ) +data_frame_latex( + x = sample_treatment_df, + justification = "rp{0.2\\linewidth} lp{0.3\\linewidth}", + centered = TRUE, + caption = "Treatment levels", + anchor = const_table_anchor_tbp, + underscore_whack = FALSE + ) +``` + +```{r echo = FALSE, results = 'asis'} +cat("\\newpage\n") +``` + +## Are the log-transformed sample distributions similar? + +```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'} + +quant_data[quant_data == 0] <- NA #replace 0 with NA +quant_data_log <- log10(quant_data) + +rownames(quant_data_log) <- rownames(quant_data) +colnames(quant_data_log) <- sample_name_matches + +write_debug_file(quant_data_log) + +# data visualization +old_par <- par( + mai = par("mai") + c(0.5, 0, 0, 0) +) +# ref: https://r-charts.com/distribution/add-points-boxplot/ +# Vertical plot +boxplot( + quant_data_log +, las = 1 +, col = const_boxplot_fill +, ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") +, xlab = "Sample" +) +par(old_par) + + + +cat("\n\n\n") +cat("\n\n\n") + +``` + +```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 4), results = 'asis'} +if (nrow(quant_data_log) > 1) { + quant_data_log_stack <- stack(quant_data_log) + ggplot2::ggplot(quant_data_log_stack, ggplot2::aes(x = values)) + + ggplot2::xlab(latex2exp::TeX("$log_{10}$(peptide intensity)")) + + ggplot2::ylab("Probability density") + + ggplot2::geom_density(ggplot2::aes(group = ind, colour = ind), na.rm = TRUE) +} else { + cat("No density plot because there are too 
few peptides.\n\n") +} +``` + +## Globally, are peptide intensities approximately unimodal? + +<!-- +# bquote could be used as an alternative to latex2exp::TeX below particularly +# and when plotting math expressions generally, at the expense of mastering +# another syntax, which hardly seems worthwhile when I need to use TeX +# elsewhere; here's an introduction to bquote: +# https://www.r-bloggers.com/2018/03/math-notation-for-r-plot-titles-expression-and-bquote/ +--> +```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 5), results = 'asis'} + +# identify the location of missing values +fin <- is.finite(as.numeric(as.matrix(quant_data_log))) + +logvalues <- as.numeric(as.matrix(quant_data_log))[fin] +logvalues_density <- density(logvalues) +plot( + x = logvalues_density, + main = latex2exp::TeX( + "Smoothed estimated probability density vs. $log_{10}$(peptide intensity)" + ), + xlab = latex2exp::TeX("$log_{10}$(peptide intensity)"), + ylab = "Probability density" + ) +hist( + x = as.numeric(as.matrix(quant_data_log)), + xlim = c(min(logvalues_density$x), max(logvalues_density$x)), + breaks = 100, + main = latex2exp::TeX("Frequency vs. $log_{10}$(peptide intensity)"), + xlab = latex2exp::TeX("$log_{10}$(peptide intensity)") +) +``` + +## Distribution of standard deviations of $log_{10}(\text{intensity})$, ignoring missing values + +```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 5), results = 'asis'} +# determine quantile +q1 <- quantile(logvalues, probs = mean_percentile)[1] + +# 1 = row of matrix (ie, phosphopeptide) +sds <- apply(quant_data_log, 1, sd_finite) +if (sum(!is.na(sds)) > 2) { + plot( + density(sds, na.rm = TRUE) + , main = "Smoothed estimated probability density vs. std. 
deviation" + , sub = "(probability estimation made with Gaussian smoothing)" + , ylab = "Probability density" + ) +} else { + cat( + "At least two non-missing values are required to plot", + "probability density.\n\n" + ) +} + +``` + +```{r echo = FALSE} +# Determine number of cells to impute +temp <- quant_data[is.na(quant_data)] + +# Determine number of values to impute +number_to_impute <- length(temp) + +# Determine percent of missing values +pct_missing_values <- + round(length(temp) / (length(logvalues) + length(temp)) * 100) +``` + +```{r echo = FALSE} + +# prep for trt-median based imputation + +``` +# Impute Missing Values + +```{r echo = FALSE} + +imp_smry_pot_peptides_before <- nrow(quant_data_log) +imp_smry_missing_values_before <- number_to_impute +imp_smry_pct_missing <- pct_missing_values + +``` + +```{r echo = FALSE} +#Determine number of cells to impute + +``` +```{r echo = FALSE} + +# Identify which values are missing and need to be imputed +ind <- which(is.na(quant_data), arr.ind = TRUE) + +``` +```{r echo = FALSE, results = 'asis'} + +# Apply imputation +switch( + imputation_method +, "group-median" = { + quant_data_imp <- quant_data + imputation_method_description <- + paste("Substitute missing value with", + "median peptide-intensity for sample group.\n" + ) + sample_level_integers <- as.integer(sample_treatment_levels) + # Take the accurate ln(x+1) because the data are log-normally distributed + # and because median can involve an average of two measurements. 
+ quant_data_imp <- log1p(quant_data_imp) + for (i in seq_len(length(levels(sample_treatment_levels)))) { + # Determine the columns for this factor-level + level_cols <- i == sample_level_integers + # Extract those columns + lvlsbst <- quant_data_imp[, level_cols, drop = FALSE] + # assign to ind the row-column pairs corresponding to each NA + ind <- which(is.na(lvlsbst), arr.ind = TRUE) + # No group-median exists if there is only one sample + # a given ppep has no measurement; otherwise, proceed. + if (ncol(lvlsbst) > 1) { + the_centers <- + apply(lvlsbst, 1, median, na.rm = TRUE) + for (j in seq_len(nrow(lvlsbst))) { + for (k in seq_len(ncol(lvlsbst))) { + if (is.na(lvlsbst[j, k])) { + lvlsbst[j, k] <- the_centers[j] + } + } + } + quant_data_imp[, level_cols] <- lvlsbst + } + } + # Take the accurate e^x - 1 to match scaling of original input. + quant_data_imp <- round(expm1(quant_data_imp_ln <- quant_data_imp)) + good_rows <- !is.na(rowMeans(quant_data_imp)) + } +, "median" = { + quant_data_imp <- quant_data + imputation_method_description <- + paste("Substitute missing value with", + "median peptide-intensity across all sample classes.\n" + ) + # Take the accurate ln(x+1) because the data are log-normally distributed + # and because median can involve an average of two measurements. + quant_data_imp <- log1p(quant_data_imp) + quant_data_imp[ind] <- apply(quant_data_imp, 1, median, na.rm = TRUE)[ind[, 1]] + # Take the accurate e^x - 1 to match scaling of original input. + quant_data_imp <- round(expm1(quant_data_imp_ln <- quant_data_imp)) + good_rows <- !is.nan(rowMeans(quant_data_imp)) + } +, "mean" = { + quant_data_imp <- quant_data + imputation_method_description <- + paste("Substitute missing value with", + "geometric-mean peptide intensity across all sample classes.\n" + ) + # Take the accurate ln(x+1) because the data are log-normally distributed, + # so arguments to mean should be previously transformed. 
+ # this will have to be + quant_data_imp <- log1p(quant_data_imp) + # Assign to NA cells the mean for the row + quant_data_imp[ind] <- apply(quant_data_imp, 1, mean, na.rm = TRUE)[ind[, 1]] + # Take the accurate e^x - 1 to match scaling of original input. + quant_data_imp <- round(expm1(quant_data_imp_ln <- quant_data_imp)) + good_rows <- !is.nan(rowMeans(quant_data_imp)) + } +, "random" = { + quant_data_imp <- quant_data + m1 <- median(sds, na.rm = TRUE) * sd_percentile #sd to be used is the median sd + # If you want results to be reproducible, you will want to call + # base::set.seed before calling stats::rnorm + imputation_method_description <- + paste("Substitute each missing value with random intensity", + sprintf( + "random intensity $N \\sim (%0.2f, %0.2f)$.\n", + q1, m1 + ) + ) + cat(sprintf("mean_percentile (from input parameter) is %2.0f\n\n", + 100 * mean_percentile)) + cat(sprintf("sd_percentile (from input parameter) is %0.2f\n\n", + sd_percentile)) + quant_data_imp[ind] <- + 10 ^ rnorm(number_to_impute, mean = q1, sd = m1) + quant_data_imp_ln <- log1p(quant_data_imp) + good_rows <- !is.nan(rowMeans(quant_data_imp)) + } +) +quant_data_imp_log10 <- quant_data_imp_ln * const_log10_e + +if (length(good_rows) < 1) { + print("ERROR: Cannot impute data; there are no good rows!") + return(-1) + } +``` + +```{r echo = FALSE, results = 'asis'} +cat("\\quad\n\nImputation method:\n\n\n", imputation_method_description) +``` + +```{r echo = FALSE} + +imp_smry_pot_peptides_after <- sum(good_rows) +imp_smry_rejected_after <- sum(!good_rows) +imp_smry_missing_values_after <- sum(is.na(quant_data_imp[good_rows, ])) +``` +```{r echo = FALSE, results = 'asis'} +# ref: http://www1.maths.leeds.ac.uk/latex/TableHelp1.pdf +tabular_lines_fmt <- paste( + "\\begin{table}[hb]", # h(inline); b(bottom); t (top) or p (separate page) + " \\caption{Imputation Results}", + " \\centering", # \centering centers the table on the page + " \\begin{tabular}{l c c c}", + " \\hline\\hline", 
+ " \\ & potential peptides & missing values & rejected", + " peptides \\\\ [0.5ex]", + " \\hline", + " before imputation & %d & %d (%d\\%s) & \\\\", + " after imputation & %d & %d & %d \\\\ [1ex]", + " \\hline", + " \\end{tabular}", + #" \\label{table:nonlin}", # may be used to refer this table in the text + "\\end{table}", + sep = "\n" + ) +tabular_lines <- + sprintf( + tabular_lines_fmt, + imp_smry_pot_peptides_before, + imp_smry_missing_values_before, + imp_smry_pct_missing, + "%", + imp_smry_pot_peptides_after, + imp_smry_missing_values_after, + imp_smry_rejected_after + ) +cat(tabular_lines) +``` +```{r echo = FALSE} + + +# Zap rows where imputation was ineffective +full_data <- full_data [good_rows, ] +quant_data <- quant_data [good_rows, ] + +quant_data_imp <- quant_data_imp[good_rows, ] +write_debug_file(quant_data_imp) +quant_data_imp_good_rows <- quant_data_imp + +write_debug_file(quant_data_imp_good_rows) +``` + +```{r echo = FALSE, results = 'asis'} + +can_plot_before_after_imp <- TRUE +d_combined <- + as.numeric( + as.matrix( + log10(quant_data_imp) + ) + ) +d_original <- + as.numeric( + as.matrix( + log10(quant_data_imp[!is.na(quant_data)]) + ) + ) + +if (sum(!is.na(d_original)) > 2) { + d_original <- density(d_original) +} else { + can_plot_before_after_imp <- FALSE +} +if (can_plot_before_after_imp && sum(is.na(d_combined)) < 1) { + d_combined <- density(d_combined) +} else { + can_plot_before_after_imp <- FALSE +} + +if (sum(is.na(quant_data)) > 0) { + # There ARE missing values + d_imputed <- + as.numeric( + as.matrix( + log10(quant_data_imp[is.na(quant_data)]) + ) + ) + if (can_plot_before_after_imp && sum(is.na(d_imputed)) < 1) { + d_imputed <- (density(d_imputed)) + } else { + can_plot_before_after_imp <- FALSE + } +} else { + # There are NO missing values + d_imputed <- d_combined +} + +``` + +```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'} +zero_sd_rownames <- + rownames(quant_data_imp)[ + is.na((apply(quant_data_imp, 1, sd, na.rm 
= TRUE)) == 0) + ] + +if (length(zero_sd_rownames) >= nrow(quant_data_imp)) { + stop("All peptides have zero standard deviation. Cannot continue.") +} +if (length(zero_sd_rownames) > 0) { + cat( + sprintf("%d peptides with zero variance were removed from statistical consideration", + length(zero_sd_rownames) + ) + ) + zap_named_rows <- function(df, nms) { + return(df[!(row.names(df) %in% nms), ]) + } + quant_data_imp <- zap_named_rows(quant_data_imp, zero_sd_rownames) + quant_data <- zap_named_rows(quant_data, zero_sd_rownames) + full_data <- zap_named_rows(full_data, zero_sd_rownames) +} + +if (sum(is.na(quant_data)) > 0) { + cat("\\leavevmode\\newpage\n") + # data visualization + old_par <- par( + mai = par("mai") + c(0.5, 0, 0, 0) + ) + # Copy quant data to x + x <- quant_data + # x gets to have values of: + # - NA for observed values + # - 1 for missing values + x[is.na(x)] <- 0 + x[x > 1] <- NA + x[x == 0] <- 1 + # Log-transform imputed data + # update variable because rows may have been eliminated from quant_data_imp + quant_data_imp_log10 <- log10(quant_data_imp) + + write_debug_file(quant_data_imp_log10) + + # Set blue_dots to log of quant data or NA for NA quant data + blue_dots <- log10(quant_data) + # Set red_dots to log of imputed data or NA for observed quant data + red_dots <- quant_data_imp_log10 * x + + count_red <- sum(!is.na(red_dots)) + count_blue <- sum(!is.na(blue_dots)) + ylim_save <- ylim <- c( + min(red_dots, blue_dots, na.rm = TRUE), + max(red_dots, blue_dots, na.rm = TRUE) + ) + show_stripchart <- + 50 > (count_red + count_blue) / length(sample_name_matches) + if (show_stripchart) { + boxplot_sub <- "Light blue = data before imputation; Red = imputed data" + } else { + boxplot_sub <- "" + } + + # Vertical plot + colnames(blue_dots) <- sample_name_matches + boxplot( + blue_dots + , las = 1 # "always horizontal" + , col = const_boxplot_fill + , ylim = ylim + , main = "Peptide intensities after eliminating unusable peptides" + , sub = 
boxplot_sub + , xlab = "Sample" + , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") + ) + + if (show_stripchart) { + # Points + # ref: https://r-charts.com/distribution/add-points-boxplot/ + # NA values are not plotted + stripchart( + blue_dots, # Data + method = "jitter", # Random noise + jitter = const_stripchart_jitter, + pch = 19, # Pch symbols + cex = const_stripsmall_cex, # Size of symbols reduced + col = "lightblue", # Color of the symbol + vertical = TRUE, # Vertical mode + add = TRUE # Add it over + ) + stripchart( + red_dots, # Data + method = "jitter", # Random noise + jitter = const_stripchart_jitter, + pch = 19, # Pch symbols + cex = const_stripsmall_cex, # Size of symbols reduced + col = "red", # Color of the symbol + vertical = TRUE, # Vertical mode + add = TRUE # Add it over + ) + + } + if (TRUE) { + # show measured values in blue on left half-violin plot + cat("\\leavevmode\n\\quad\n\n\\quad\n\n") + vioplot::vioplot( + x = lapply(blue_dots, function(x) x[!is.na(x)]), + col = "lightblue1", + side = "left", + plotCentre = "line", + ylim = ylim_save, + main = "Distributions of observed and imputed data", + sub = "Light blue = observed data; Pink = imputed data", + xlab = "Sample", + ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") + ) + red_violins <- lapply(red_dots, function(x) x[!is.na(x)]) + cols_to_delete <- c() + for (ix in seq_len(length(red_violins))) { + if (length(red_violins[[ix]]) < 1) { + cols_to_delete <- c(cols_to_delete, ix) + } + } + # destroy any unimputable columns + if (!is.null(cols_to_delete)) { + red_violins <- red_violins[-cols_to_delete] + } + # plot imputed values in red on right half-violin plot + vioplot::vioplot( + x = red_violins, + col = "lightpink1", + side = "right", + plotCentre = "line", + add = TRUE + ) + } + + par(old_par) + + # density plot + cat("\\leavevmode\n\n\n\n\n\n\n") + if (can_plot_before_after_imp) { + ylim <- c( + 0, + max( + if (is.list(d_combined)) d_combined$y else d_combined, + if 
(is.list(d_original)) d_original$y else d_original, + if (is.list(d_imputed)) d_imputed$y else d_imputed, + na.rm = TRUE + ) + ) + plot( + d_combined, + ylim = ylim, + sub = + paste( + "Blue = data before imputation; Red = imputed data;", + "Black = combined" + ), + main = "Density of peptide intensity before and after imputation", + xlab = latex2exp::TeX("$log_{10}$(peptide intensity)"), + ylab = "Probability density" + ) + lines(d_original, col = "blue") + lines(d_imputed, col = "red") + } else { + cat( + "There are too few points to plot the density of peptide intensity", + "before and after imputation." + ) + } + cat("\\leavevmode\\newpage\n") +} +``` + +# Perform Quantile Normalization + +The excellent `normalize.quantiles` function from +*[the `preprocessCore` Bioconductor package](http://bioconductor.org/packages/release/bioc/html/preprocessCore.html)* +performs "quantile normalization" as described by Bolstad *et al.* (2003), +DOI *[10.1093/bioinformatics/19.2.185](https://doi.org/10.1093%2Fbioinformatics%2F19.2.185)* +and *its supplementary material [http://bmbolstad.com/misc/normalize/normalize.html](http://bmbolstad.com/misc/normalize/normalize.html)*, +i.e., it assumes that the goal is to detect +subtle differences among grossly similar samples (having similar distributions) +by equalizing intra-quantile quantitations. +Unfortunately, one software library upon which it depends +*[suffers from a concurrency defect](https://support.bioconductor.org/p/122925/#9135989)* +that requires that a specific, non-concurrent version of the library be +installed. 
The installation command equivalent to what was used to install the library to produce the results presented here is: +``` + conda install bioconductor-preprocesscore openblas=0.3.3 +``` + + +<!-- +# Apply quantile normalization using preprocessCore::normalize.quantiles +# --- +# tool repository: http://bioconductor.org/packages/release/bioc/html/preprocessCore.html +# except this: https://support.bioconductor.org/p/122925/#9135989 +# says to install it like this: +# ``` +# BiocManager::install("preprocessCore", configure.args="--disable-threading", force = TRUE, lib=.libPaths()[1]) +# ``` +# conda installation (necessary because of a bug in recent openblas): +# conda install bioconductor-preprocesscore openblas=0.3.3 +# ... +# --- +# normalize.quantiles {preprocessCore} -- Quantile Normalization +# +# Description: +# Using a normalization based upon quantiles, this function normalizes a +# matrix of probe level intensities. +# +# THIS FUNCTIONS WILL HANDLE MISSING DATA (ie NA values), based on the +# assumption that the data is missing at random. +# +# Usage: +# normalize.quantiles(x, copy = TRUE, keep.names = FALSE) +# +# Arguments: +# +# - x: A matrix of intensities where each column corresponds to a chip and each row is a probe. +# +# - copy: Make a copy of matrix before normalizing. Usually safer to work with a copy, +# but in certain situations not making a copy of the matrix, but instead normalizing +# it in place will be more memory friendly. +# +# - keep.names: Boolean option to preserve matrix row and column names in output. +# +# Details: +# This method is based upon the concept of a quantile-quantile plot extended to n dimensions. +# No special allowances are made for outliers. If you make use of quantile normalization +# please cite Bolstad et al, Bioinformatics (2003). +# +# This functions will handle missing data (ie NA values), based on +# the assumption that the data is missing at random. 
#
#   Note that the current implementation optimizes for better memory usage
#   at the cost of some additional run-time.
#
# Value: A normalized matrix.
#
# Author: Ben Bolstad, bmbolstad.com
#
# References
#
#   - Bolstad, B (2001) Probe Level Quantile Normalization of High Density Oligonucleotide
#     Array Data. Unpublished manuscript http://bmbolstad.com/stuff/qnorm.pdf
#
#   - Bolstad, B. M., Irizarry R. A., Astrand, M, and Speed, T. P. (2003) A Comparison of
#     Normalization Methods for High Density Oligonucleotide Array Data Based on Bias
#     and Variance. Bioinformatics 19(2), pp 185-193. DOI 10.1093/bioinformatics/19.2.185
#     http://bmbolstad.com/misc/normalize/normalize.html
# ...
-->
```{r echo = FALSE, results = 'asis'}

# Quantile-normalize the imputed intensities (one column per sample);
#   skip normalization when there are no rows to normalize.
if (nrow(quant_data_imp) > 0) {
  quant_data_imp_qn <- preprocessCore::normalize.quantiles(
    as.matrix(quant_data_imp), keep.names = TRUE
  )
} else {
  quant_data_imp_qn <- as.matrix(quant_data_imp)
}

quant_data_imp_qn <- as.data.frame(quant_data_imp_qn)

write_debug_file(quant_data_imp_qn)

# log10-transform the normalized intensities
quant_data_imp_qn_log <- log10(quant_data_imp_qn)

write_debug_file(quant_data_imp_qn_log)

# Center and scale each peptide (row) of the log-transformed data;
#   scale() works on columns, hence the transposition before and after.
quant_data_imp_qn_ls <- t(scale(t(quant_data_imp_qn_log)))

# Flag rows using any_nan and keep only the flagged rows in the "ls2" copy.
#   - `drop = FALSE` keeps the selection a matrix even when exactly one
#     row is flagged, so that as.data.frame() still produces one row per
#     peptide rather than one column of values
sel <- apply(quant_data_imp_qn_ls, 1, any_nan)
quant_data_imp_qn_ls2 <- quant_data_imp_qn_ls

quant_data_imp_qn_ls2 <- quant_data_imp_qn_ls2[which(sel), , drop = FALSE]
quant_data_imp_qn_ls2 <- as.data.frame(quant_data_imp_qn_ls2)

quant_data_imp_qn_ls <- as.data.frame(quant_data_imp_qn_ls)

write_debug_file(quant_data_imp_qn_ls)
write_debug_file(quant_data_imp_qn_ls2)

# Create data.frame used by ANOVA analysis
# NOTE(review): the first nine columns of full_data are taken as metadata;
#   presumably this should agree with first_data_column - confirm.
data_table_imp_qn_lt <- cbind(full_data[1:9], quant_data_imp_qn_log)
```

<!-- ACE insertion begin -->
## Are normalized, imputed, log-transformed sample distributions similar?
+ +```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'} + +# Save unimputed quant_data_log for plotting below +unimputed_quant_data_log <- quant_data_log + +# log10 transform (after preparing for zero values, +# which should never happen...) +quant_data_imp_qn[quant_data_imp_qn == 0] <- .000000001 +quant_data_log <- log10(quant_data_imp_qn) + +how_many_peptides <- nrow(quant_data_log) + +if ((how_many_peptides) > 0) { + cat( + sprintf( + "Intensities for %d peptides:\n\n\n", + how_many_peptides + ) + ) + cat("\n\n\n") + + + # data visualization + old_par <- par( + mai = par("mai") + c(0.5, 0, 0, 0) + , oma = par("oma") + c(0.5, 0, 0, 0) + ) + # ref: https://r-charts.com/distribution/add-points-boxplot/ + # Vertical plot + colnames(quant_data_log) <- sample_name_matches + boxplot( + quant_data_log + , las = 1 + , col = const_boxplot_fill + , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") + , xlab = "Sample" + ) + par(old_par) +} else { + cat("There are no peptides to plot\n") +} + +cat("\n\n\n") + +``` + +```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 4), results = 'asis'} +if (nrow(quant_data_log) > 1) { + quant_data_log_stack <- stack(quant_data_log) + ggplot2::ggplot(quant_data_log_stack, ggplot2::aes(x = values)) + + ggplot2::xlab(latex2exp::TeX("$log_{10}$(peptide intensity)")) + + ggplot2::ylab("Probability density") + + ggplot2::geom_density(ggplot2::aes(group = ind, colour = ind), na.rm = TRUE) +} else { + cat("No density plot because there are fewer than two peptides to plot.\n\n") +} +``` +```{r echo = FALSE, results = 'asis'} +cat("\\leavevmode\\newpage\n") +``` + +# ANOVA Analysis + +```{r, echo = FALSE} +# Make new data frame containing only Phosphopeptides +# to connect preANOVA to ANOVA (connect_df) +connect_df <- data.frame( + data_table_imp_qn_lt$Phosphopeptide + , data_table_imp_qn_lt[, first_data_column] + ) +colnames(connect_df) <- c("Phosphopeptide", "Intensity") +``` + +```{r echo = FALSE, fig.dim = c(9, 10), results = 
'asis'} +count_of_treatment_levels <- length(levels(sample_treatment_levels)) +if (count_of_treatment_levels < 2) { + nuke_control_sequences <- + function(s) { + s <- gsub("[\\]", "xyzzy_plugh", s) + s <- gsub("[$]", "\\\\$", s) + s <- gsub("xyzzy_plugh", "$\\\\backslash$", s) + return(s) + } + cat( + "ERROR!!!! Cannot perform ANOVA analysis", + "(see next page)\\newpage\n" + ) + cat( + "ERROR: ANOVA analysis", + "requires two or more factor levels!\n\n\n" + ) + + cat("\n\n\n\n\n") + cat("Unparsed sample names are:\n\n\n", + "\\begin{quote}\n", + paste(names(quant_data_imp_qn_log), collapse = "\n\n\n"), + "\n\\end{quote}\n\n") + + regex_sample_names <- nuke_control_sequences(regex_sample_names) + + cat("\\leavevmode\n\n\n") + cat("Parsing rule for SampleNames is", + "\n\n\n", + "\\text{'", + regex_sample_names, + "'}\n\n\n", + sep = "" + ) + + cat("\nParsed sample names are:\n", + "\\begin{quote}\n", + paste(sample_name_matches, collapse = "\n\n\n"), + "\n\\end{quote}\n\n") + + regex_sample_grouping <- nuke_control_sequences(regex_sample_grouping) + + cat("\\leavevmode\n\n\n") + cat("Parsing rule for SampleGrouping is", + "\n\n\n", + "\\text{'", + regex_sample_grouping, + "'}\n\n\n", + sep = "" + ) + + cat("\n\n\n") + cat("Sample group assignments are:\n", + "\\begin{quote}\n", + paste(regmatches(sample_name_matches, rx_match), collapse = "\n\n\n"), + "\n\\end{quote}\n\n") + +} else { + + p_value_data_anova_ps <- + apply( + quant_data_imp_qn_log, + 1, + anova_func, + grouping_factor = sample_treatment_levels, + one_way_f = one_way_all_categories + ) + + p_value_data_anova_ps_fdr <- + p.adjust(p_value_data_anova_ps, method = "fdr") + p_value_data <- data.frame( + phosphopeptide = full_data[, 1] + , + raw_anova_p = p_value_data_anova_ps + , + fdr_adjusted_anova_p = p_value_data_anova_ps_fdr + ) + + # output ANOVA file to constructed filename, + # e.g. 
"Outputfile_pST_ANOVA_STEP5.txt" + # becomes "Outpufile_pST_ANOVA_STEP5_FDR0.05.txt" + + # Re-output datasets to include p-values + metadata_plus_p <- cbind(full_data[1:9], p_value_data[, 2:3]) + write.table( + cbind(metadata_plus_p, quant_data_imp), + file = imputed_data_filename, + sep = "\t", + col.names = TRUE, + row.names = FALSE, + quote = FALSE + ) + + write.table( + cbind(metadata_plus_p, quant_data_imp_qn_log), + file = imp_qn_lt_data_filenm, + sep = "\t", + col.names = TRUE, + row.names = FALSE, + quote = FALSE + ) + + + p_value_data <- + p_value_data[order(p_value_data$fdr_adjusted_anova_p), ] + + first_page_suppress <- 1 + number_of_peptides_found <- 0 + cutoff <- val_fdr[1] + for (cutoff in val_fdr) { + if (number_of_peptides_found > 49) { + cat("\\leavevmode\n\n\n") + break + } + + #loop through FDR cutoffs + + filtered_p <- + p_value_data[ + which(p_value_data$fdr_adjusted_anova_p < cutoff), + , drop = FALSE + ] + filtered_data_filtered <- + quant_data_imp_qn_log[ + rownames(filtered_p), + , drop = FALSE + ] + filtered_data_filtered <- + filtered_data_filtered[ + order(filtered_p$fdr_adjusted_anova_p), + , drop = FALSE + ] + + # <!-- ACE insertion start --> + + if (nrow(filtered_p) && nrow(filtered_data_filtered) > 0) { + if (first_page_suppress == 1) { + first_page_suppress <- 0 + } else { + cat("\\newpage\n") + } + if (nrow(filtered_data_filtered) > 1) { + subsection_header(sprintf( + "Intensity distributions for %d phosphopeptides whose adjusted p-value < %0.2f\n", + nrow(filtered_data_filtered), + cutoff + )) + } else { + subsection_header(sprintf( + "Intensity distribution for one phosphopeptide (%s) whose adjusted p-value < %0.2f\n", + rownames(filtered_data_filtered)[1], + cutoff + )) + } + cat("\n\n\n") + cat("\n\n\n") + + old_oma <- par("oma") + old_par <- par( + mai = (par("mai") + c(0.7, 0, 0, 0)) * c(1, 1, 0.3, 1), + oma = old_oma * c(1, 1, 0.3, 1), + cex.main = 0.9, + cex.axis = 0.7, + fin = c(9, 7.25) + ) + # ref: 
https://r-charts.com/distribution/add-points-boxplot/ + # Vertical plot + colnames(filtered_data_filtered) <- sample_name_matches + tryCatch( + boxplot( + filtered_data_filtered, + main = "Imputed, normalized intensities", # no line plot + las = 1, + col = const_boxplot_fill, + ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") + ), + error = function(e) print(e) + ) + par(old_par) + } else { + cat(sprintf( + "%s < %0.2f\n\n\n\n\n", + "No peptides were found to have cutoff adjusted p-value", + cutoff + )) + } + + if (nrow(filtered_data_filtered) > 0) { + # Add Phosphopeptide column to anova_filtered table + # The assumption here is that the first intensity is unique; + # this is a hokey assumption but almost definitely will + # be true in the real world unless there is a computation + # error upstream. + anova_filtered_merge <- base::merge( + x = connect_df, + y = filtered_data_filtered, + by.x = "Intensity", + by.y = 1 + ) + anova_filtered_merge_order <- rownames(filtered_p) + + anova_filtered <- data.frame( + ppep = anova_filtered_merge$Phosphopeptide, + intense = anova_filtered_merge$Intensity, + data = anova_filtered_merge[, 2:number_of_samples + 1] + ) + colnames(anova_filtered) <- + c("Phosphopeptide", colnames(filtered_data_filtered)) + + # Merge qualitative columns into the ANOVA data + output_table <- data.frame(anova_filtered$Phosphopeptide) + output_table <- base::merge( + x = output_table, + y = data_table_imp_qn_lt, + by.x = "anova_filtered.Phosphopeptide", + by.y = "Phosphopeptide" + ) + + # Produce heatmap to visualize significance and the effect of imputation + + anova_filtered_merge_format <- sapply( + X = filtered_p$fdr_adjusted_anova_p + , + FUN = function(x) { + if (x > 0.0001) + paste0("(%0.", 1 + ceiling(-log10(x)), "f) %s") + else + paste0("(%0.4e) %s") + } + ) + + cat_hm_heading <- function(m, cutoff) { + cat("\\newpage\n") + if (nrow(m) > intensity_hm_rows) { + subsection_header( + paste( + sprintf("Heatmap for the %d most-significant 
peptides", + intensity_hm_rows), + sprintf("whose adjusted p-value < %0.2f\n", cutoff) + ) + ) + } else { + if (nrow(m) == 1) { + return(FALSE) + } else { + subsection_header( + paste( + sprintf("Heatmap for %d usable peptides whose", nrow(m)), + sprintf("adjusted p-value < %0.2f\n", cutoff) + ) + ) + } + } + cat("\n\n\n") + cat("\n\n\n") + return(TRUE) + } + + # construct matrix with appropriate rownames + m <- + as.matrix(unimputed_quant_data_log[anova_filtered_merge_order, ]) + if (nrow(m) > 0) { + rownames_m <- rownames(m) + rownames(m) <- sapply( + X = seq_len(nrow(m)) + , + FUN = function(i) { + sprintf( + anova_filtered_merge_format[i], + filtered_p$fdr_adjusted_anova_p[i], + rownames_m[i] + ) + } + ) + } + # draw the heading and heatmap + if (nrow(m) > 0) { + number_of_peptides_found <- + draw_intensity_heatmap( + m = m, + cutoff = cutoff, + hm_heading_function = cat_hm_heading, + hm_main_title = "Unimputed, unnormalized log(intensities)", + suppress_row_dendrogram = FALSE + ) + } + } + } +} +cat("\\leavevmode\n\n\n") +``` + +```{r sqlite, echo = FALSE, fig.dim = c(9, 10), results = 'asis'} + +if (count_of_treatment_levels > 1) { + # Prepare two-way contrasts with adjusted p-values + # Strategy: + # - use imputed, log-transformed data: + # - remember this when computing log2(fold-change) + # - each contrast is between a combination of trt levels + # - for each contrast, compute samples that are members + # - compute one-way test: + # - use `oneway.test` (Welch test) if numbers of samples + # are not equivalent between trt levels + # - otherwise, aov is fine but offers no advantage + # - adjust p-value, assuming that + # (# of pppeps)*(# of contrasts) tests were performed + + # Each contrast is between a combination of trt levels + m2 <- combn( + x = seq_len(length(levels(sample_treatment_levels))), + m = 2, + simplify = TRUE + ) + contrast_count <- ncol(m2) + + # For each contrast, compute samples that are members + # - local function to construct a 
data.frame for each contrast + # - the contrast in the first "column" + f_m2 <- + function(cntrst, lvl1, lvl2) { + return( + data.frame( + contrast = cntrst, + level = sample_treatment_levels[ + sample_treatment_levels %in% + levels(sample_treatment_levels)[c(lvl1, lvl2)] + ], + label = sample_name_matches[ + sample_treatment_levels %in% + levels(sample_treatment_levels)[c(lvl1, lvl2)] + ] + ) + ) + } + # - compute a df for each contrast + sample_level_dfs <- lapply( + X = 1:contrast_count, + FUN = function(i) f_m2(i, m2[1, i], m2[2, i]) + ) + # - compute a single df for all contrasts + combined_contrast_df <- Reduce(f = rbind, x = sample_level_dfs) + + # - dispose objects to free resources + rm(sample_level_dfs) + + # - write the df to a DB for later join-per-contrast + db <- RSQLite::dbConnect(RSQLite::SQLite(), ksea_app_prep_db) + + RSQLite::dbWriteTable( + conn = db, + name = "contrast", + value = combined_contrast_df, + overwrite = TRUE + ) + + # Create UK for insert + ddl_exec(db, " + CREATE UNIQUE INDEX IF NOT EXISTS contrast__uk__idx + ON contrast(contrast, label); + " + ) + # Create indexes for join + ddl_exec(db, " + -- index for join in contrast_ppep_smpl_qnlt on a.label < b.label + CREATE INDEX IF NOT EXISTS contrast__label__idx + ON contrast(label); + " + ) + ddl_exec(db, " + -- index for joining two contrast_lvl_ppep_avg_quant on contrast + CREATE INDEX IF NOT EXISTS contrast__contrast__idx + ON contrast(contrast); + " + ) + ddl_exec(db, " + -- index for joining two contrast_lvl_ppep_avg_quant on phophospep + CREATE INDEX IF NOT EXISTS contrast__level__idx + ON contrast(level); + " + ) + # - dispose objects to free resources + rm(combined_contrast_df) + + # Use imputed, log-transformed data + # - remember that this was donoe when computing log2(fold-change) + # - melt data matrix for use in later join-per-contrast + casted <- cbind( + data.frame(vrbl = rownames(quant_data_imp_qn_log)), + quant_data_imp_qn_log + ) + quant_data_imp_qn_log_melted <- 
reshape2::melt( + casted, + id.vars = "vrbl" + ) + colnames(quant_data_imp_qn_log_melted) <- + c("phosphopep", "sample", "quant") + # - dispose objects to free resources + rm(casted) + + # - write the df to a DB for use in later join-per-contrast + RSQLite::dbWriteTable( + conn = db, + name = "ppep_smpl_qnlt", + value = quant_data_imp_qn_log_melted, + overwrite = TRUE + ) + # Create UK for insert + ddl_exec(db, " + CREATE UNIQUE INDEX IF NOT EXISTS ppep_smpl_qnlt__uk__idx + ON ppep_smpl_qnlt(phosphopep, sample); + " + ) + # Create index for join + ddl_exec(db, " + -- index for join in contrast_ppep_smpl_qnlt + CREATE INDEX IF NOT EXISTS ppep_smpl_qnlt__sample__idx + ON ppep_smpl_qnlt(sample); + " + ) + ddl_exec(db, " + -- index for joining two contrast_lvl_ppep_avg_quant on phopho.pep + CREATE INDEX IF NOT EXISTS ppep_smpl_qnlt__phosphopep__idx + ON ppep_smpl_qnlt(phosphopep); + " + ) + # - dispose objects to free resources + rm(quant_data_imp_qn_log_melted) + + # - drop views if exist + ddl_exec(db, " + -- drop view dependent on contrast_lvl_ppep_avg_quant + DROP VIEW IF EXISTS v_contrast_log2_fc; + " + ) + ddl_exec(db, " + -- drop table dependent on contrast_ppep_smpl_qnlt + DROP TABLE IF EXISTS contrast_lvl_ppep_avg_quant; + " + ) + ddl_exec(db, " + DROP TABLE IF EXISTS contrast_lvl_lvl_metadata; + " + ) + ddl_exec(db, " + DROP VIEW IF EXISTS v_contrast_lvl_metadata; + " + ) + ddl_exec(db, " + -- drop view dependent on contrast_ppep_smpl_qnlt + DROP VIEW IF EXISTS v_contrast_lvl_ppep_avg_quant; + " + ) + ddl_exec(db, " + DROP VIEW IF EXISTS v_contrast_lvl_lvl; + " + ) + ddl_exec(db, " + -- drop view upon which other views depend + DROP VIEW IF EXISTS contrast_ppep_smpl_qnlt; + " + ) + # - create view + dml_no_rows_exec(db, " + -- view contrast_ppep_smpl_qnlt is used for each phopshopep to + -- compute p-value for test of trt effect for two trt levels + CREATE VIEW contrast_ppep_smpl_qnlt + AS + SELECT contrast, + level, + phosphopep, + sample, + quant + FROM 
contrast AS c, + ppep_smpl_qnlt AS q + WHERE q.sample = c.label + ORDER BY contrast, level, phosphopep + ; + " + ) + # - create simplification views + dml_no_rows_exec(db, " + CREATE VIEW v_contrast_lvl_metadata + AS + SELECT contrast, + level, + group_concat(label, ';') AS samples + FROM contrast + GROUP BY contrast, level + /* view v_contrast_lvl_metadata is used + to simplify creation of table contrast_lvl_lvl_metadata */ + ; + " + ) + dml_no_rows_exec(db, " + CREATE VIEW v_contrast_lvl_ppep_avg_quant + AS + SELECT contrast, + level, + phosphopep, + avg(quant) AS avg_quant + FROM contrast_ppep_smpl_qnlt + GROUP BY contrast, level, phosphopep + /* view v_contrast_lvl_ppep_avg_quant is used + to simplify view v_contrast_log2_fc */ + ; + " + ) + + # - create contrast-metadata table + dml_no_rows_exec(db, " + CREATE TABLE contrast_lvl_lvl_metadata + AS + SELECT DISTINCT + a.contrast AS ab_contrast, + a.level AS a_level, + b.level AS b_level, + a.samples AS a_samples, + b.samples AS b_samples, + 'log2(level_'||a.level||'/level_'||b.level||')' + AS fc_description + FROM v_contrast_lvl_metadata AS a, + v_contrast_lvl_metadata AS b + WHERE a.contrast = b.contrast + AND a.level > b.level + /* view v_contrast_lvl_lvl is used + to simplify view v_contrast_log2_fc */ + ; + " + ) + # - create pseudo-materialized view table + dml_no_rows_exec(db, " + CREATE VIEW v_contrast_lvl_lvl + AS + SELECT DISTINCT + a.contrast AS ab_contrast, + a.level AS a_level, + b.level AS b_level + FROM contrast AS a, + contrast AS b + WHERE a.contrast = b.contrast + AND a.level > b.level + /* view v_contrast_lvl_lvl is used + to simplify view v_contrast_log2_fc */ + ; + " + ) + + # - create view to compute log2(fold-change) + dml_no_rows_exec(db, " + CREATE VIEW v_contrast_log2_fc + AS + SELECT ab.ab_contrast AS contrast, + m.a_level AS a_level, + c.avg_quant AS a_quant, + m.a_samples AS a_samples, + ab.b_level AS b_level, + d.avg_quant AS b_quant, + m.b_samples AS b_samples, + m.fc_description AS 
fc_description, + 3.32193 * ( c.avg_quant - d.avg_quant) AS log2_fc, + d.phosphopep AS phosphopep + FROM contrast_lvl_lvl_metadata AS m, + v_contrast_lvl_ppep_avg_quant AS d, + v_contrast_lvl_lvl AS ab + INNER JOIN v_contrast_lvl_ppep_avg_quant AS c + ON c.contrast = ab.ab_contrast + AND c.level = ab.a_level + WHERE d.contrast = ab.ab_contrast + AND m.ab_contrast = ab.ab_contrast + AND d.level = ab.b_level + AND d.phosphopep = c.phosphopep + /* view to compute log2(fold-change) */ + ; + " + ) + # NOTE: in the view above, c is the a_level average and d is the b_level + # average, so log2_fc is level a relative to level b, which is what + # fc_description ('log2(level_a/level_b)') states; + # 3.32193 ~= log2(10), which presumes avg_quant is log10-scaled (TODO confirm) + + # For each contrast, compute samples that are members + # compute one-way test: + # - use `oneway.test` (Welch test) if numbers of samples + # are not equivalent between trt levels + # - otherwise, aov is fine but offers no advantage + for (contrast in seq_len(contrast_count)) { + # - fetch the long-format quantities for the samples in this contrast + contrast_df <- sqldf::sqldf( + x = paste0(" + SELECT level, phosphopep, sample, quant + FROM contrast_ppep_smpl_qnlt + WHERE contrast = ", contrast, " + ORDER BY phosphopep, level, sample + "), + connection = db + ) + # - pivot to one row per phosphopeptide, one column per sample + contrast_cast <- reshape2::dcast( + data = contrast_df, + formula = phosphopep ~ sample, + value.var = "quant" + ) + contrast_cast_ncol <- ncol(contrast_cast) + contrast_cast_data <- contrast_cast[, 2:contrast_cast_ncol] + contrast_cast_samples <- colnames(contrast_cast_data) + + # - order grouping_factor by order of sample columns of contrast_cast_data + grouping_factor <- sqldf::sqldf( + x = paste0(" + SELECT sample, level + FROM contrast_ppep_smpl_qnlt + WHERE contrast = ", contrast, " + ORDER BY phosphopep, level, sample + LIMIT ", contrast_cast_ncol - 1 + ), + connection = db + ) + rownames(grouping_factor) <- grouping_factor$sample + # - index rows by sample name so that the levels line up with the + # sample columns of contrast_cast_data regardless of query order + grouping_factor <- + grouping_factor[contrast_cast_samples, "level", drop = FALSE] + + # - run the two-level (one-way) test + p_value_data_contrast_ps <- + apply( + X = contrast_cast_data, + MARGIN = 1, # apply to rows + FUN = anova_func, + grouping_factor = + as.factor(as.numeric(grouping_factor$level)), # anova_func arg2
 + one_way_f = one_way_two_categories, # anova_func arg3 + simplify = TRUE # TRUE is the default for simplify + ) + contrast_data_adj_p_values <- p.adjust( + p = p_value_data_contrast_ps, + method = "fdr", + n = length(p_value_data_contrast_ps) # this is the default, length(p) + ) + # - collect raw and adjusted p-values for this contrast + contrast_p_df <- + data.frame( + contrast = contrast, + phosphopep = contrast_cast$phosphopep, + p_value_raw = p_value_data_contrast_ps, + p_value_adj = contrast_data_adj_p_values + ) + # - the first contrast replaces any table left in the database by an + # earlier run; later contrasts append to it (exactly one flag is TRUE) + db_write_table_overwrite <- (contrast < 2) + db_write_table_append <- !db_write_table_overwrite + RSQLite::dbWriteTable( + conn = db, + name = "contrast_ppep_p_val", + value = contrast_p_df, + overwrite = db_write_table_overwrite, + append = db_write_table_append + ) + # Create UK for insert + ddl_exec(db, " + CREATE UNIQUE INDEX IF NOT EXISTS contrast_ppep_p_val__uk__idx + ON contrast_ppep_p_val(phosphopep, contrast); + " + ) + } + # Perhaps this could be done more elegantly using unique keys + # or creating the tables before saving data to them, but this + # is fast and, if the database exists on disk rather than in + # memory, it doesn't stress memory. 
+ dml_no_rows_exec(db, " + CREATE TEMP table contrast_log2_fc + AS + SELECT * + FROM v_contrast_log2_fc + ORDER BY contrast, phosphopep + ; + " + ) + dml_no_rows_exec(db, " + CREATE TEMP table ppep_p_val + AS + SELECT p_value_raw, + p_value_adj, + contrast AS p_val_contrast, + phosphopep AS p_val_ppep + FROM contrast_ppep_p_val + ORDER BY contrast, phosphopep + ; + " + ) + dml_no_rows_exec(db, " + DROP TABLE IF EXISTS contrast_log2_fc_p_val + ; + " + ) + dml_no_rows_exec(db, " + CREATE TABLE contrast_log2_fc_p_val + AS + SELECT a.*, + b.p_value_raw, + b.p_value_adj, + b.p_val_contrast, + b.p_val_ppep + FROM contrast_log2_fc a, ppep_p_val b + WHERE a.rowid = b.rowid + AND a.phosphopep = b.p_val_ppep + ; + " + ) + # Create UK + ddl_exec(db, " + CREATE UNIQUE INDEX IF NOT EXISTS contrast_log2_fc_p_val__uk__idx + ON contrast_log2_fc_p_val(phosphopep, contrast); + " + ) + # Create indices for future queries + ddl_exec(db, " + CREATE INDEX IF NOT EXISTS contrast_log2_fc_p_val__contrast__idx + ON contrast_log2_fc_p_val(contrast); + " + ) + ddl_exec(db, " + CREATE INDEX IF NOT EXISTS contrast_log2_fc_p_val__phosphopep__idx + ON contrast_log2_fc_p_val(phosphopep); + " + ) + ddl_exec(db, " + CREATE INDEX IF NOT EXISTS contrast_log2_fc_p_val__p_value_raw__idx + ON contrast_log2_fc_p_val(p_value_raw); + " + ) + ddl_exec(db, " + CREATE INDEX IF NOT EXISTS contrast_log2_fc_p_val__p_value_adj__idx + ON contrast_log2_fc_p_val(p_value_adj); + " + ) + dml_no_rows_exec(db, " + DROP VIEW IF EXISTS v_contrast_log2_fc_p_val + ; + " + ) + dml_no_rows_exec(db, " + CREATE VIEW v_contrast_log2_fc_p_val + AS + SELECT contrast, + a_level, + a_samples, + b_level, + b_samples, + a_quant, + b_quant, + fc_description, + log2_fc, + p_value_raw, + p_value_adj, + phosphopep + FROM contrast_log2_fc_p_val + ORDER BY contrast, phosphopep + ; + " + ) + ddl_exec(db, " + DROP TABLE IF EXISTS kseaapp_metadata + ; + " + ) + dml_no_rows_exec(db, " + CREATE TABLE kseaapp_metadata + AS + WITH extended(deppep, 
ppep, gene_name, uniprot_id, phosphoresidue) AS ( + SELECT DISTINCT + deppep.seq, + ppep.seq, + GeneName||';', + UniProtID||';', + PhosphoResidue||';' + FROM + ppep, deppep, mrgfltr_metadata + WHERE + mrgfltr_metadata.ppep_id = ppep.id + AND + ppep.deppep_id = deppep.id + ) + SELECT + ppep AS `ppep`, + SUBSTR(uniprot_id, 1, INSTR(uniprot_id,';') - 1 ) AS `Protein`, + SUBSTR(gene_name, 1, INSTR(gene_name,';') - 1 ) AS `Gene`, + deppep AS `Peptide`, + REPLACE( + REPLACE( + SUBSTR(phosphoresidue, 1, INSTR(phosphoresidue,';') - 1 ), + 'p', + '' + ), + ', ', + ';' + ) AS `Residue.Both` + FROM extended + ; + " + ) + # Create indexes for join + ddl_exec(db, " + CREATE INDEX IF NOT EXISTS kseaapp_metadata__ppep__idx + ON kseaapp_metadata(ppep); + " + ) + ddl_exec(db, " + DROP VIEW IF EXISTS v_kseaapp_contrast + ; + " + ) + dml_no_rows_exec(db, " + CREATE VIEW v_kseaapp_contrast + AS + SELECT a.*, b.Protein, b.Gene, b.Peptide, b.`Residue.Both` + FROM v_contrast_log2_fc_p_val a, kseaapp_metadata b + WHERE b.ppep = a.phosphopep + ; + " + ) + ddl_exec(db, " + DROP VIEW IF EXISTS v_kseaapp_input + ; + " + ) + dml_no_rows_exec(db, " + CREATE VIEW v_kseaapp_input + AS + SELECT v.contrast, + v.phosphopep, + m.`Protein`, + m.`Gene`, + m.`Peptide`, + m.`Residue.Both`, + v.p_value_raw AS `p`, + v.log2_fc AS `FC` + FROM kseaapp_metadata AS m, + v_contrast_log2_fc_p_val AS v + WHERE m.ppep = v.phosphopep + AND NOT m.`Gene` = 'No_Gene_Name' + AND NOT v.log2_fc = 0 + ; + " + ) +} +``` + +```{r echo = FALSE, results = 'asis'} +cat("\\newpage\n") +``` + +# KSEA Analysis + +Results of Kinase-Substrate Enrichment Analysis are presented here, if the substrates for any kinases are relatively enriched. Enrichments are found by the CRAN `KSEAapp` package: + +- The package is available on CRAN, at https:/cran.r-project.org/package=KSEAapp +- The method used is described in Casado et al. 
(2013) [doi:10.1126/scisignal.2003573](https://doi.org/10.1126/scisignal.2003573) and Wiredja et al. (2017) [doi:10.1093/bioinformatics/btx415](https://doi.org/10.1093/bioinformatics/btx415). +- An online alternative (supporting only analysis of human data) is available at [https://casecpb.shinyapps.io/ksea/](https://casecpb.shinyapps.io/ksea/). + +For each kinase, $i$, and each two-way contrast of treatments, $j$, an enrichment $z$-score is computed as: + +$$ +\text{kinase enrichment score}_{j,i} = \frac{(\overline{s}_{j,i} - \overline{p}_j)\sqrt{m_{j,i}}}{\delta_j} +$$ + +and fold-enrichment is computed as: + +$$ +\text{Enrichment}_{j,i} = \frac{\overline{s}_{j,i}}{\overline{p}_j} +$$ + +where: + +- $\overline{s}_{j,i}$ is the mean $\log_2 (|\text{fold-change}|)$ in intensities (for contrast $j$) of known substrates of the kinase $i$, +- $\overline{p}_j$ is the mean $\log_2 (|\text{fold-change}|)$ of all phosphosites identified in contrast $j$, +- $m_{j,i}$ is the total number of phosphosite substrates of kinase $i$ identified in contrast $j$, and +- $\delta_j$ is the standard deviation of the $\log_2 (|\text{fold-change}|)$ for contrast $j$ across all phosphosites in the dataset. +- Note that the absolute value of fold-change is used so that both increased and decreased substrates of a kinase will contribute to its enrichment score. + +$\text{FDR}_{j,i}$ is computed from the $p$-value for the $z$-score using the R `stats::p.adjust` function, applying the False Discovery Rate correction from Benjamini and Hochberg (1995) [doi:10.1111/j.2517-6161.1995.tb02031.x](https://doi.org/10.1111/j.2517-6161.1995.tb02031.x). + +Color intensity in heatmaps reflects magnitude of $z$-score for enrichment of respective kinase in respective contrast; hue reflects the sign of the $z$-score (blue, negative; red, positive). + +Asterisks in heatmaps reflect enrichments that are significant at `r ksea_cutoff_statistic` < `r ksea_cutoff_threshold`. 
+ +- Kinase names are generally as presented at Phospho.ELM [http://phospho.elm.eu.org/kinases.html](http://phospho.elm.eu.org/kinases.html) (when available), although Phospho.ELM data are not yet incorporated into this analysis. +- Kinase names having the suffix '(HPRD)' are as presented at [http://hprd.org/serine_motifs](http://hprd.org/serine_motifs) and [http://hprd.org/tyrosine_motifs](http://hprd.org/tyrosine_motifs) and are as originally reported in Amanchy et al., 2007 (doi: [10.1038/nbt0307-285](https://doi.org/10.1038/nbt0307-285)). +- Kinase-substrate data were also taken from [http://networkin.science/download.shtml](http://networkin.science/download.shtml) and from PhosphoSitePlus [https://www.phosphosite.org/staticDownloads](https://www.phosphosite.org/staticDownloads). + +```{r ksea, echo = FALSE, fig.dim = c(9, 10), results = 'asis'} + +db <- RSQLite::dbConnect(RSQLite::SQLite(), ksea_app_prep_db) + +# -- eliminate the table that's about to be defined +ddl_exec(db, " +DROP TABLE IF EXISTS site_metadata; +") + +# -- define the site_metadata table +ddl_exec(db, " +CREATE TABLE site_metadata( + id INTEGER PRIMARY KEY +, site_type_id INTEGER REFERENCES site_type(id) +, full TEXT UNIQUE ON CONFLICT IGNORE +, abbrev TEXT +, pattern TEXT +, motif TEXT +); +") + +# -- populate the table with initial values +ddl_exec(db, " +INSERT INTO site_metadata(full, abbrev, site_type_id) + SELECT DISTINCT kinase_map, kinase_map, site_type_id + FROM ppep_gene_site + ORDER BY kinase_map; +") + +# -- drop bogus KSData view if exists +ddl_exec(db, " +DROP VIEW IF EXISTS ks_data_v; +") + +# -- create view to serve as an impostor for KSEAapp::KSData +ddl_exec(db, " +CREATE VIEW IF NOT EXISTS ks_data_v +AS +SELECT + 'NA' AS KINASE, + 'NA' AS KIN_ACC_ID, + kinase_map AS GENE, + 'NA' AS KIN_ORGANISM, + 'NA' AS SUBSTRATE, + 0 AS SUB_GENE_ID, + 'NA' AS SUB_ACC_ID, + gene_names AS SUB_GENE, + 'NA' AS SUB_ORGANISM, + phospho_peptide AS SUB_MOD_RSD, + 0 AS SITE_GROUP_ID, + 'NA' AS 
'SITE_7AA', + 2 AS networkin_score, + type_name AS Source +FROM ppep_gene_site_view; +") + +contrast_metadata_df <- + sqldf::sqldf("select * from contrast_lvl_lvl_metadata", connection = db) +rslt <- new_env() +rslt$score_list <- list() +rslt$name_list <- list() +rslt$longname_list <- list() + +ddl_exec(db, " + DROP TABLE IF EXISTS contrast_ksea_scores; + " +) + +next_index <- 0 +err_na_subscr_df_const <- + "missing values are not allowed in subscripted assignments of data frames" + +for (i_cntrst in seq_len(nrow(contrast_metadata_df))) { + cntrst_a_level <- contrast_metadata_df[i_cntrst, "a_level"] + cntrst_b_level <- contrast_metadata_df[i_cntrst, "b_level"] + cntrst_fold_change <- contrast_metadata_df[i_cntrst, 6] + contrast_label <- sprintf("%s -> %s", cntrst_b_level, cntrst_a_level) + contrast_longlabel <- ( + sprintf( + "Trt %s {%s} -> Trt %s {%s}", + contrast_metadata_df[i_cntrst, "b_level"], + gsub( + pattern = ";", + replacement = ", ", + x = contrast_metadata_df[i_cntrst, "b_samples"], + fixed = TRUE + ), + contrast_metadata_df[i_cntrst, "a_level"], + gsub( + pattern = ";", + replacement = ", ", + x = contrast_metadata_df[i_cntrst, "a_samples"], + fixed = TRUE + ) + ) + ) + kseaapp_input <- + sqldf::sqldf( + x = sprintf(" + SELECT `Protein`, `Gene`, `Peptide`, phosphopep AS `Residue.Both`, `p`, `FC` + FROM v_kseaapp_input + WHERE contrast = %d + ", + i_cntrst + ), + connection = db + ) + + pseudo_ksdata <- dbReadTable(db, "ks_data_v") + + # This hack is because SQL table has the log2-transformed values + kseaapp_input[, "FC"] <- 2 ** kseaapp_input[, "FC", drop = TRUE] + main_title <- ( + sprintf( + "Change from treatment %s to treatment %s", + contrast_metadata_df[i_cntrst, "b_level"], + contrast_metadata_df[i_cntrst, "a_level"] + ) + ) + sub_title <- contrast_longlabel + tryCatch( + expr = { + ksea_scores_rslt <- + ksea_scores( + ksdata = pseudo_ksdata, # KSEAapp::KSData, + px = kseaapp_input, + networkin = TRUE, + networkin_cutoff = 2 + ) + + if (0 < 
sum(!is.nan(ksea_scores_rslt$FDR))) { + next_index <- 1 + next_index + rslt$score_list[[next_index]] <- ksea_scores_rslt + rslt$name_list[[next_index]] <- contrast_label + rslt$longname_list[[next_index]] <- contrast_longlabel + low_fdr_print( + rslt = rslt, + i_cntrst = i_cntrst, + i = next_index, + a_level = cntrst_a_level, + b_level = cntrst_b_level, + fold_change = cntrst_fold_change, + caption = contrast_longlabel + ) + } + }, + error = function(e) str(e) + ) +} + +plotted_kinases <- NULL +if (length(rslt$score_list) > 1) { + for (i in seq_len(length(ksea_heatmap_titles))) { + hdr <- ksea_heatmap_titles[[i]] + which_kinases <- i + + cat("\\clearpage\n\\begin{center}\n") + if (i == const_ksea_astrsk_kinases) { + subsection_header(hdr) + } else { + subsection_header(hdr) + } + cat("\\end{center}\n") + + plotted_kinases <- ksea_heatmap( + # the data frame outputs from the KSEA.Scores() function, in list format + score_list = rslt$score_list, + # a character vector of all the sample names for heatmap annotation: + # - the names must be in the same order as the data in score_list + # - please avoid long names, as they may get cropped in the final image + sample_labels = rslt$name_list, + # character string of either "p.value" or "FDR" indicating the data column + # to use for marking statistically significant scores + stats = c("p.value", "FDR")[2], + # a numeric value between 0 and infinity indicating the min. 
number of + # substrates a kinase must have to be included in the heatmap + m_cutoff = 1, + # a numeric value between 0 and 1 indicating the p-value/FDR cutoff + # for indicating significant kinases in the heatmap + p_cutoff = 0.05, + # a binary input of TRUE or FALSE, indicating whether or not to perform + # hierarchical clustering of the sample columns + sample_cluster = TRUE, + # a binary input of TRUE or FALSE, indicating whether or not to export + # the heatmap as a .png image into the working directory + export = FALSE, + # additional arguments to gplots::heatmap.2, such as: + # - main: main title of plot + # - xlab: x-axis label + # - ylab: y-axis label + xlab = "Contrast", + ylab = "Kinase", + # print which kinases: + # - 1 : all kinases + # - 2 : significant kinases + # - 3 : non-significant kinases + which_kinases = which_kinases + ) + cat("\\begin{center}\n") + cat("Color intensities reflects $z$-score magnitudes; hue reflects $z$-score sign. Asterisks reflect significance.\n") + cat("\\end{center}\n") + } # end for (i in ... +} # end if (length ... 
+ +for (i_cntrst in seq_len(length(rslt$score_list))) { + next_index <- i_cntrst + cntrst_a_level <- contrast_metadata_df[i_cntrst, "a_level"] + cntrst_b_level <- contrast_metadata_df[i_cntrst, "b_level"] + cntrst_fold_change <- contrast_metadata_df[i_cntrst, 6] + contrast_label <- sprintf("%s -> %s", cntrst_b_level, cntrst_a_level) + contrast_longlabel <- ( + sprintf( + "Trt %s {%s} -> Trt %s {%s}", + contrast_metadata_df[i_cntrst, "b_level"], + gsub( + pattern = ";", + replacement = ", ", + x = contrast_metadata_df[i_cntrst, "b_samples"], + fixed = TRUE + ), + contrast_metadata_df[i_cntrst, "a_level"], + gsub( + pattern = ";", + replacement = ", ", + x = contrast_metadata_df[i_cntrst, "a_samples"], + fixed = TRUE + ) + ) + ) + main_title <- ( + sprintf( + "Change from treatment %s to treatment %s", + contrast_metadata_df[i_cntrst, "b_level"], + contrast_metadata_df[i_cntrst, "a_level"] + ) + ) + sub_title <- contrast_longlabel + tryCatch( + expr = { + ksea_scores_rslt <- rslt$score_list[[next_index]] + + if (0 < sum(!is.nan(ksea_scores_rslt$FDR))) { + low_fdr_barplot( + rslt = rslt, + i_cntrst = i_cntrst, + i = next_index, + a_level = cntrst_a_level, + b_level = cntrst_b_level, + fold_change = cntrst_fold_change, + caption = contrast_longlabel + ) + } + }, + error = function(e) str(e) + ) +} +``` + +```{r enriched, echo = FALSE, fig.dim = c(9, 10), results = 'asis'} + +# Use enriched kinases to find enriched kinase-substrate pairs +enriched_kinases <- data.frame(kinase = ls(ksea_asterisk_hash)) +all_enriched_substrates <- sqldf(" + SELECT + gene AS kinase, + ppep, + '('||group_concat(gene||'-'||sub_gene)||') '||ppep AS label + FROM ( + SELECT DISTINCT gene, sub_gene, SUB_MOD_RSD AS ppep + FROM pseudo_ksdata + WHERE GENE IN (SELECT kinase FROM enriched_kinases) + ) + GROUP BY ppep + ") + +# helper used to label per-kinase substrate enrichment figure +cat_enriched_heading <- function(m, cut_args) { + cutoff <- cut_args$cutoff + kinase <- cut_args$kinase + statistic 
<- cut_args$statistic + threshold <- cut_args$threshold + cat("\\newpage\n") + if (nrow(m) > intensity_hm_rows) { + subsection_header( + paste( + sprintf( + "Lowest p-valued %d (of %d) enriched %s-substrates,", + intensity_hm_rows, + nrow(m), + kinase + ), + sprintf(" KSEA %s < %0.2f\n", statistic, threshold) + ) + ) + } else { + if (nrow(m) == 1) { + return(FALSE) + } else { + subsection_header( + paste( + sprintf( + "%d enriched %s-substrates,", + nrow(m), + kinase + ), + sprintf( + " KSEA %s < %0.2f\n", + statistic, + threshold + ) + ) + ) + } + } + cat("\n\n\n") + cat("\n\n\n") + return(TRUE) +} + +# Disabling heatmaps for substrates pending decision whether to eliminate them altogether +if (FALSE) + for (kinase_name in sort(enriched_kinases$kinase)) { + enriched_substrates <- + all_enriched_substrates[ + all_enriched_substrates$kinase == kinase_name, + , + drop = FALSE + ] + # Get the intensity values for the heatmap + enriched_intensities <- + as.matrix(unimputed_quant_data_log[enriched_substrates$ppep, , drop = FALSE]) + # Remove rows having too many NA values to be relevant + na_counter <- is.na(enriched_intensities) + na_counts <- apply(na_counter, 1, sum) + enriched_intensities <- + enriched_intensities[na_counts < ncol(enriched_intensities) / 2, , drop = FALSE] + # Rename the rows with the display-name for the heatmap + rownames(enriched_intensities) <- + sapply( + X = rownames(enriched_intensities), + FUN = function(rn) { + enriched_substrates[enriched_substrates$ppep == rn, "label"] + } + ) + # Format as matrix for heatmap + m <- as.matrix(enriched_intensities) + # Draw the heading and heatmap + if (nrow(m) > 0) { + cut_args <- new_env() + cut_args$cutoff <- cutoff + cut_args$kinase <- kinase_name + cut_args$statistic <- ksea_cutoff_statistic + cut_args$threshold <- ksea_cutoff_threshold + number_of_peptides_found <- + draw_intensity_heatmap( + m = m, + cutoff = cut_args, + hm_heading_function = cat_enriched_heading, + hm_main_title + = "Unnormalized 
(zero-imputed) intensities of enriched kinase-substrates", + suppress_row_dendrogram = FALSE + ) + } + } + +# Write output tabular files + +# get kinase, ppep, concat(kinase) tuples for enriched kinases + +kinase_ppep_label <- sqldf(" + WITH + t(ppep, label) AS + ( + SELECT DISTINCT + SUB_MOD_RSD AS ppep, + group_concat(gene, '; ') AS label + FROM pseudo_ksdata + WHERE GENE IN (SELECT kinase FROM enriched_kinases) + GROUP BY ppep + ), + k(kinase, ppep_join) AS + ( + SELECT DISTINCT gene AS kinase, SUB_MOD_RSD AS ppep_join + FROM pseudo_ksdata + WHERE GENE IN (SELECT kinase FROM enriched_kinases) + ) + SELECT k.kinase, t.ppep, t.label + FROM t, k + WHERE t.ppep = k.ppep_join + ORDER BY k.kinase, t.ppep + ") + +# extract what we need from full_data +impish <- cbind(rownames(quant_data_imp), quant_data_imp) +colnames(impish)[1] <- "Phosphopeptide" +data_table_imputed_sql <- " + SELECT + f.*, + k.label AS KSEA_enrichments, + q.* + FROM + metadata_plus_p f + LEFT JOIN kinase_ppep_label k + ON f.Phosphopeptide = k.ppep, + impish q + WHERE + f.Phosphopeptide = q.Phosphopeptide + " +data_table_imputed <- sqldf(data_table_imputed_sql) +# Zap the duplicated 'Phosphopeptide' column named 'ppep' +data_table_imputed <- + data_table_imputed[, c(1:12, 14:ncol(data_table_imputed))] + +# Output with imputed, un-normalized data + +write.table( + data_table_imputed + , file = imputed_data_filename + , sep = "\t" + , col.names = TRUE + , row.names = FALSE + , quote = FALSE + ) + + +#output quantile normalized data +impish <- cbind(rownames(quant_data_imp_qn_log), quant_data_imp_qn_log) +colnames(impish)[1] <- "Phosphopeptide" +data_table_imputed <- sqldf(data_table_imputed_sql) +# Zap the duplicated 'Phosphopeptide' column named 'ppep' +data_table_imputed <- + data_table_imputed[, c(1:12, 14:ncol(data_table_imputed))] +write.table( + data_table_imputed, + file = imp_qn_lt_data_filenm, + sep = "\t", + col.names = TRUE, + row.names = FALSE, + quote = FALSE +) + +ppep_kinase <- sqldf(" + 
SELECT DISTINCT k.ppep, k.kinase + FROM ( + SELECT DISTINCT gene AS kinase, SUB_MOD_RSD AS ppep + FROM pseudo_ksdata + WHERE GENE IN (SELECT kinase FROM enriched_kinases) + ) k + ORDER BY k.ppep, k.kinase + ") + +RSQLite::dbWriteTable( + conn = db, + name = "ksea_enriched_ks", + value = ppep_kinase, + append = FALSE + ) + +RSQLite::dbWriteTable( + conn = db, + name = "anova_signif", + value = p_value_data, + append = FALSE + ) + + ddl_exec(db, " + DROP VIEW IF EXISTS stats_metadata_v; + " + ) + dml_no_rows_exec(db, " + CREATE VIEW stats_metadata_v + AS + SELECT DISTINCT m.*, + p.raw_anova_p, + p.fdr_adjusted_anova_p, + kek.kinase AS ksea_enrichments + FROM + mrgfltr_metadata_view m + LEFT JOIN anova_signif p + ON m.phospho_peptide = p.phosphopeptide + LEFT JOIN ksea_enriched_ks kek + ON m.phospho_peptide = kek.ppep + ; + " + ) + +write.table( + dbReadTable(db, "stats_metadata_v"), + file = anova_ksea_mtdt_file, + sep = "\t", + col.names = TRUE, + row.names = FALSE, + quote = FALSE + ) + + +``` + +```{r parmlist, echo = FALSE, fig.dim = c(9, 10), results = 'asis'} +cat("\\leavevmode\n\n\n") + +# write parameters to report + +param_unlist <- unlist(as.list(params)) +param_df <- data.frame( + parameter = paste0("\\verb@", names(param_unlist), "@"), + value = paste0("\\verb@", gsub("$", "\\$", param_unlist, fixed = TRUE), "@") + ) + +data_frame_latex( + x = param_df, + justification = "p{0.35\\linewidth} p{0.6\\linewidth}", + centered = TRUE, + caption = "Input parameters", + anchor = const_table_anchor_bp, + underscore_whack = FALSE + ) + +# write parameters to SQLite output + +mqppep_anova_script_param_df <- data.frame( + script = "mqppep_anova_script.Rmd", + parameter = names(param_unlist), + value = param_unlist + ) +ddl_exec(db, " + DROP TABLE IF EXISTS script_parameter; + " +) +ddl_exec(db, " + CREATE TABLE IF NOT EXISTS script_parameter( + script TEXT, + parameter TEXT, + value ANY, + UNIQUE (script, parameter) ON CONFLICT REPLACE + ) + ; + " +) 
+# Append this script's parameter rows to the script_parameter table +RSQLite::dbWriteTable( + db, + "script_parameter", + mqppep_anova_script_param_df, + append = TRUE +) + +# All output has been written; release the SQLite connection +RSQLite::dbDisconnect(db) +``` +<!-- +There's gotta be a better way... + +loaded_packages_df <- sessioninfo::package_info("loaded") +loaded_packages_df[, "library"] <- as.character(loaded_packages_df$library) +loaded_packages_df <- data.frame( + package = loaded_packages_df$package, + version = loaded_packages_df$loadedversion, + date = loaded_packages_df$date + ) +data_frame_latex( + x = loaded_packages_df, + justification = "l | l l", + centered = FALSE, + caption = "Loaded R packages", + anchor = const_table_anchor_bp + ) +-->
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mqppep_mrgfltr.py Mon Jul 11 19:22:25 2022 +0000 @@ -0,0 +1,1551 @@ +#!/usr/bin/env python + +# Import the packages needed +import argparse +import operator # for operator.itemgetter +import os.path +import re +import shutil # for shutil.copyfile(src, dest) +import sqlite3 as sql +import sys # import the sys module for exc_info +import time +import traceback # for formatting stack-trace +from codecs import getreader as cx_getreader + +import numpy as np +import pandas + +# global constants +N_A = "N/A" + + +# ref: https://stackoverflow.com/a/8915613/15509512 +# answers: "How to handle exceptions in a list comprehensions" +# usage: +# from math import log +# eggs = [1,3,0,3,2] +# print([x for x in [catch(log, egg) for egg in eggs] if x is not None]) +# producing: +# for <built-in function log> +# with args (0,) +# exception: math domain error +# [0.0, 1.0986122886681098, 1.0986122886681098, 0.6931471805599453] +def catch(func, *args, handle=lambda e: e, **kwargs): + + try: + return func(*args, **kwargs) + except Exception as e: + print("For %s" % str(func)) + print(" with args %s" % str(args)) + print(" caught exception: %s" % str(e)) + (ty, va, tb) = sys.exc_info() + print(" stack trace: " + str(traceback.format_exception(ty, va, tb))) + exit(-1) + return None + + +def whine(func, *args, handle=lambda e: e, **kwargs): + + try: + return func(*args, **kwargs) + except Exception as e: + print("Warning: For %s" % str(func)) + print(" with args %s" % str(args)) + print(" caught exception: %s" % str(e)) + (ty, va, tb) = sys.exc_info() + print(" stack trace: " + str(traceback.format_exception(ty, va, tb))) + return None + + +def ppep_join(x): + x = [i for i in x if N_A != i] + result = "%s" % " | ".join(x) + if result != "": + return result + else: + return N_A + + +def melt_join(x): + tmp = {key.lower(): key for key in x} + result = "%s" % " | ".join([tmp[key] for key in tmp]) + return result + + +def __main__(): + # 
Parse Command Line + parser = argparse.ArgumentParser( + description="Phopsphoproteomic Enrichment Pipeline Merge and Filter." + ) + + # inputs: + # Phosphopeptide data for experimental results, including the intensities + # and the mapping to kinase domains, in tabular format. + parser.add_argument( + "--phosphopeptides", + "-p", + nargs=1, + required=True, + dest="phosphopeptides", + help="Phosphopeptide data for experimental results, including the intensities and the mapping to kinase domains, in tabular format", + ) + # UniProtKB/SwissProt DB input, SQLite + parser.add_argument( + "--ppep_mapping_db", + "-d", + nargs=1, + required=True, + dest="ppep_mapping_db", + help="UniProtKB/SwissProt SQLite Database", + ) + # species to limit records chosed from PhosPhositesPlus + parser.add_argument( + "--species", + "-x", + nargs=1, + required=False, + default=[], + dest="species", + help="limit PhosphoSitePlus records to indicated species (field may be empty)", + ) + + # outputs: + # tabular output + parser.add_argument( + "--mrgfltr_tab", + "-o", + nargs=1, + required=True, + dest="mrgfltr_tab", + help="Tabular output file for results", + ) + # CSV output + parser.add_argument( + "--mrgfltr_csv", + "-c", + nargs=1, + required=True, + dest="mrgfltr_csv", + help="CSV output file for results", + ) + # SQLite output + parser.add_argument( + "--mrgfltr_sqlite", + "-S", + nargs=1, + required=True, + dest="mrgfltr_sqlite", + help="SQLite output file for results", + ) + + # "Make it so!" 
(parse the arguments) + options = parser.parse_args() + print("options: " + str(options)) + + # determine phosphopeptide ("upstream map") input tabular file access + if options.phosphopeptides is None: + exit('Argument "phosphopeptides" is required but not supplied') + try: + upstream_map_filename_tab = os.path.abspath(options.phosphopeptides[0]) + input_file = open(upstream_map_filename_tab, "r") + input_file.close() + except Exception as e: + exit("Error parsing phosphopeptides argument: %s" % str(e)) + + # determine input SQLite access + if options.ppep_mapping_db is None: + exit('Argument "ppep_mapping_db" is required but not supplied') + try: + uniprot_sqlite = os.path.abspath(options.ppep_mapping_db[0]) + input_file = open(uniprot_sqlite, "rb") + input_file.close() + except Exception as e: + exit("Error parsing ppep_mapping_db argument: %s" % str(e)) + + # copy input SQLite dataset to output SQLite dataset + if options.mrgfltr_sqlite is None: + exit('Argument "mrgfltr_sqlite" is required but not supplied') + try: + output_sqlite = os.path.abspath(options.mrgfltr_sqlite[0]) + shutil.copyfile(uniprot_sqlite, output_sqlite) + except Exception as e: + exit("Error copying ppep_mapping_db to mrgfltr_sqlite: %s" % str(e)) + + # determine species to limit records from PSP_Regulatory_Sites + if options.species is None: + exit( + 'Argument "species" is required (and may be empty) but not supplied' + ) + try: + if len(options.species) > 0: + species = options.species[0] + else: + species = "" + except Exception as e: + exit("Error parsing species argument: %s" % str(e)) + + # determine tabular output destination + if options.mrgfltr_tab is None: + exit('Argument "mrgfltr_tab" is required but not supplied') + try: + output_filename_tab = os.path.abspath(options.mrgfltr_tab[0]) + output_file = open(output_filename_tab, "w") + output_file.close() + except Exception as e: + exit("Error parsing mrgfltr_tab argument: %s" % str(e)) + + # determine CSV output destination + if 
options.mrgfltr_csv is None: + exit('Argument "mrgfltr_csv" is required but not supplied') + try: + output_filename_csv = os.path.abspath(options.mrgfltr_csv[0]) + output_file = open(output_filename_csv, "w") + output_file.close() + except Exception as e: + exit("Error parsing mrgfltr_csv argument: %s" % str(e)) + + def mqpep_getswissprot(): + + # + # copied from Excel Output Script.ipynb BEGIN # + # + + # String Constants ################# + DEPHOSPHOPEP = "DephosphoPep" + DESCRIPTION = "Description" + FUNCTION_PHOSPHORESIDUE = ( + "Function Phosphoresidue(PSP=PhosphoSitePlus.org)" + ) + GENE_NAME = "Gene_Name" # Gene Name from UniProtKB + ON_FUNCTION = ( + "ON_FUNCTION" # ON_FUNCTION column from PSP_Regulatory_Sites + ) + ON_NOTES = "NOTES" # NOTES column from PSP_Regulatory_Sites + ON_OTHER_INTERACT = "ON_OTHER_INTERACT" # ON_OTHER_INTERACT column from PSP_Regulatory_Sites + ON_PROCESS = ( + "ON_PROCESS" # ON_PROCESS column from PSP_Regulatory_Sites + ) + ON_PROT_INTERACT = "ON_PROT_INTERACT" # ON_PROT_INTERACT column from PSP_Regulatory_Sites + PHOSPHOPEPTIDE = "Phosphopeptide" + PHOSPHOPEPTIDE_MATCH = "Phosphopeptide_match" + PHOSPHORESIDUE = "Phosphoresidue" + PUTATIVE_UPSTREAM_DOMAINS = "Putative Upstream Kinases(PSP=PhosphoSitePlus.org)/Phosphatases/Binding Domains" + SEQUENCE = "Sequence" + SEQUENCE10 = "Sequence10" + SEQUENCE7 = "Sequence7" + SITE_PLUSMINUS_7AA_SQL = "SITE_PLUSMINUS_7AA" + UNIPROT_ID = "UniProt_ID" + UNIPROT_SEQ_AND_META_SQL = """ + select Uniprot_ID, Description, Gene_Name, Sequence, + Organism_Name, Organism_ID, PE, SV + from UniProtKB + order by Sequence, UniProt_ID + """ + UNIPROT_UNIQUE_SEQ_SQL = """ + select distinct Sequence + from UniProtKB + group by Sequence + """ + PPEP_PEP_UNIPROTSEQ_SQL = """ + select distinct phosphopeptide, peptide, sequence + from uniprotkb_pep_ppep_view + order by sequence + """ + PPEP_MELT_SQL = """ + SELECT DISTINCT + phospho_peptide AS 'p_peptide', + kinase_map AS 'characterization', + 'X' AS 'X' + 
FROM ppep_gene_site_view + """ + # CREATE TABLE PSP_Regulatory_site ( + # site_plusminus_7AA TEXT PRIMARY KEY ON CONFLICT IGNORE, + # domain TEXT, + # ON_FUNCTION TEXT, + # ON_PROCESS TEXT, + # ON_PROT_INTERACT TEXT, + # ON_OTHER_INTERACT TEXT, + # notes TEXT, + # organism TEXT + # ); + PSP_REGSITE_SQL = """ + SELECT DISTINCT + SITE_PLUSMINUS_7AA , + DOMAIN , + ON_FUNCTION , + ON_PROCESS , + ON_PROT_INTERACT , + ON_OTHER_INTERACT , + NOTES , + ORGANISM + FROM PSP_Regulatory_site + """ + PPEP_ID_SQL = """ + SELECT + id AS 'ppep_id', + seq AS 'ppep_seq' + FROM ppep + """ + MRGFLTR_DDL = """ + DROP VIEW IF EXISTS mrgfltr_metadata_view; + DROP TABLE IF EXISTS mrgfltr_metadata; + CREATE TABLE mrgfltr_metadata + ( ppep_id INTEGER REFERENCES ppep(id) + , Sequence10 TEXT + , Sequence7 TEXT + , GeneName TEXT + , Phosphoresidue TEXT + , UniProtID TEXT + , Description TEXT + , FunctionPhosphoresidue TEXT + , PutativeUpstreamDomains TEXT + , PRIMARY KEY (ppep_id) ON CONFLICT IGNORE + ) + ; + CREATE VIEW mrgfltr_metadata_view AS + SELECT DISTINCT + ppep.seq AS phospho_peptide + , Sequence10 + , Sequence7 + , GeneName + , Phosphoresidue + , UniProtID + , Description + , FunctionPhosphoresidue + , PutativeUpstreamDomains + FROM + ppep, mrgfltr_metadata + WHERE + mrgfltr_metadata.ppep_id = ppep.id + ORDER BY + ppep.seq + ; + """ + + CITATION_INSERT_STMT = """ + INSERT INTO Citation ( + ObjectName, + CitationData + ) VALUES (?,?) + """ + CITATION_INSERT_PSP = 'PhosphoSitePlus(R) (PSP) was created by Cell Signaling Technology Inc. It is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License. 
When using PSP data or analyses in printed publications or in online resources, the following acknowledgements must be included: (a) the words "PhosphoSitePlus(R), www.phosphosite.org" must be included at appropriate places in the text or webpage, and (b) the following citation must be included in the bibliography: "Hornbeck PV, Zhang B, Murray B, Kornhauser JM, Latham V, Skrzypek E PhosphoSitePlus, 2014: mutations, PTMs and recalibrations. Nucleic Acids Res. 2015 43:D512-20. PMID: 25514926."' + CITATION_INSERT_PSP_REF = 'Hornbeck, 2014, "PhosphoSitePlus, 2014: mutations, PTMs and recalibrations.", https://pubmed.ncbi.nlm.nih.gov/22135298, https://doi.org/10.1093/nar/gkr1122' + + MRGFLTR_METADATA_COLUMNS = [ + "ppep_id", + "Sequence10", + "Sequence7", + "GeneName", + "Phosphoresidue", + "UniProtID", + "Description", + "FunctionPhosphoresidue", + "PutativeUpstreamDomains", + ] + + # String Constants (end) ############ + + class Error(Exception): + """Base class for exceptions in this module.""" + + pass + + class PreconditionError(Error): + """Exception raised for errors in the input. 
+ + Attributes: + expression -- input expression in which the error occurred + message -- explanation of the error + """ + + def __init__(self, expression, message): + self.expression = expression + self.message = message + + # start_time = time.clock() #timer + start_time = time.process_time() # timer + + # get keys from upstream tabular file using readline() + # ref: https://stackoverflow.com/a/16713581/15509512 + # answer to "Use codecs to read file with correct encoding" + file1_encoded = open(upstream_map_filename_tab, "rb") + file1 = cx_getreader("latin-1")(file1_encoded) + + count = 0 + upstream_map_p_peptide_list = [] + re_tab = re.compile("^[^\t]*") + while True: + count += 1 + # Get next line from file + line = file1.readline() + # if line is empty + # end of file is reached + if not line: + break + if count > 1: + m = re_tab.match(line) + upstream_map_p_peptide_list.append(m[0]) + file1.close() + file1_encoded.close() + + # Get the list of phosphopeptides with the p's that represent the phosphorylation sites removed + re_phos = re.compile("p") + + end_time = time.process_time() # timer + print( + "%0.6f pre-read-SwissProt [0.1]" % (end_time - start_time,), + file=sys.stderr, + ) + + # ----------- Get SwissProt data from SQLite database (start) ----------- + # build UniProt sequence LUT and list of unique SwissProt sequences + + # Open SwissProt SQLite database + conn = sql.connect(uniprot_sqlite) + cur = conn.cursor() + + # Set up structures to hold SwissProt data + + uniprot_Sequence_List = [] + UniProtSeqLUT = {} + + # Execute query for unique seqs without fetching the results yet + uniprot_unique_seq_cur = cur.execute(UNIPROT_UNIQUE_SEQ_SQL) + + while 1: + batch = uniprot_unique_seq_cur.fetchmany(size=50) + if not batch: + # handle case where no records are returned + break + for row in batch: + Sequence = row[0] + UniProtSeqLUT[(Sequence, DESCRIPTION)] = [] + UniProtSeqLUT[(Sequence, GENE_NAME)] = [] + UniProtSeqLUT[(Sequence, UNIPROT_ID)] = [] + 
UniProtSeqLUT[Sequence] = [] + + # Execute query for seqs and metadata without fetching the results yet + uniprot_seq_and_meta = cur.execute(UNIPROT_SEQ_AND_META_SQL) + + while 1: + batch = uniprot_seq_and_meta.fetchmany(size=50) + if not batch: + # handle case where no records are returned + break + for ( + UniProt_ID, + Description, + Gene_Name, + Sequence, + OS, + OX, + PE, + SV, + ) in batch: + uniprot_Sequence_List.append(Sequence) + UniProtSeqLUT[Sequence] = Sequence + UniProtSeqLUT[(Sequence, UNIPROT_ID)].append(UniProt_ID) + UniProtSeqLUT[(Sequence, GENE_NAME)].append(Gene_Name) + if OS != N_A: + Description += " OS=" + OS + if OX != -1: + Description += " OX=" + str(OX) + if Gene_Name != N_A: + Description += " GN=" + Gene_Name + if PE != N_A: + Description += " PE=" + PE + if SV != N_A: + Description += " SV=" + SV + UniProtSeqLUT[(Sequence, DESCRIPTION)].append(Description) + + # Close SwissProt SQLite database; clean up local variables + conn.close() + Sequence = "" + UniProt_ID = "" + Description = "" + Gene_Name = "" + + # ----------- Get SwissProt data from SQLite database (finish) ----------- + + end_time = time.process_time() # timer + print( + "%0.6f post-read-SwissProt [0.2]" % (end_time - start_time,), + file=sys.stderr, + ) + + # ----------- Get SwissProt data from SQLite database (start) ----------- + # Open SwissProt SQLite database + conn = sql.connect(uniprot_sqlite) + cur = conn.cursor() + + # Set up dictionary to aggregate results for phosphopeptides correspounding to dephosphoeptide + DephosphoPep_UniProtSeq_LUT = {} + + # Set up dictionary to accumulate results + PhosphoPep_UniProtSeq_LUT = {} + + # Execute query for tuples without fetching the results yet + ppep_pep_uniprotseq_cur = cur.execute(PPEP_PEP_UNIPROTSEQ_SQL) + + while 1: + batch = ppep_pep_uniprotseq_cur.fetchmany(size=50) + if not batch: + # handle case where no records are returned + break + for (phospho_pep, dephospho_pep, sequence) in batch: + # do interesting stuff 
here... + PhosphoPep_UniProtSeq_LUT[phospho_pep] = phospho_pep + PhosphoPep_UniProtSeq_LUT[ + (phospho_pep, DEPHOSPHOPEP) + ] = dephospho_pep + if dephospho_pep not in DephosphoPep_UniProtSeq_LUT: + DephosphoPep_UniProtSeq_LUT[dephospho_pep] = set() + DephosphoPep_UniProtSeq_LUT[ + (dephospho_pep, DESCRIPTION) + ] = [] + DephosphoPep_UniProtSeq_LUT[ + (dephospho_pep, GENE_NAME) + ] = [] + DephosphoPep_UniProtSeq_LUT[ + (dephospho_pep, UNIPROT_ID) + ] = [] + DephosphoPep_UniProtSeq_LUT[(dephospho_pep, SEQUENCE)] = [] + DephosphoPep_UniProtSeq_LUT[dephospho_pep].add(phospho_pep) + + if ( + sequence + not in DephosphoPep_UniProtSeq_LUT[ + (dephospho_pep, SEQUENCE) + ] + ): + DephosphoPep_UniProtSeq_LUT[ + (dephospho_pep, SEQUENCE) + ].append(sequence) + for phospho_pep in DephosphoPep_UniProtSeq_LUT[dephospho_pep]: + if phospho_pep != phospho_pep: + print( + "phospho_pep:'%s' phospho_pep:'%s'" + % (phospho_pep, phospho_pep) + ) + if phospho_pep not in PhosphoPep_UniProtSeq_LUT: + PhosphoPep_UniProtSeq_LUT[phospho_pep] = phospho_pep + PhosphoPep_UniProtSeq_LUT[ + (phospho_pep, DEPHOSPHOPEP) + ] = dephospho_pep + r = list( + zip( + [s for s in UniProtSeqLUT[(sequence, UNIPROT_ID)]], + [s for s in UniProtSeqLUT[(sequence, GENE_NAME)]], + [ + s + for s in UniProtSeqLUT[(sequence, DESCRIPTION)] + ], + ) + ) + # Sort by `UniProt_ID` + # ref: https://stackoverflow.com/a/4174955/15509512 + r = sorted(r, key=operator.itemgetter(0)) + # Get one tuple for each `phospho_pep` + # in DephosphoPep_UniProtSeq_LUT[dephospho_pep] + for (upid, gn, desc) in r: + # Append pseudo-tuple per UniProt_ID but only when it is not present + if ( + upid + not in DephosphoPep_UniProtSeq_LUT[ + (dephospho_pep, UNIPROT_ID) + ] + ): + DephosphoPep_UniProtSeq_LUT[ + (dephospho_pep, UNIPROT_ID) + ].append(upid) + DephosphoPep_UniProtSeq_LUT[ + (dephospho_pep, DESCRIPTION) + ].append(desc) + DephosphoPep_UniProtSeq_LUT[ + (dephospho_pep, GENE_NAME) + ].append(gn) + + # Close SwissProt SQLite database; 
clean up local variables + conn.close() + # wipe local variables + phospho_pep = dephospho_pep = sequence = 0 + upid = gn = desc = r = "" + + # ----------- Get SwissProt data from SQLite database (finish) ----------- + + end_time = time.process_time() # timer + print( + "%0.6f finished reading and decoding '%s' [0.4]" + % (end_time - start_time, upstream_map_filename_tab), + file=sys.stderr, + ) + + print( + "{:>10} unique upstream phosphopeptides tested".format( + str(len(upstream_map_p_peptide_list)) + ) + ) + + # Read in Upstream tabular file + # We are discarding the intensity data; so read it as text + upstream_data = pandas.read_table( + upstream_map_filename_tab, dtype="str", index_col=0 + ) + + end_time = time.process_time() # timer + print( + "%0.6f read Upstream Map from file [1g_1]" + % (end_time - start_time,), + file=sys.stderr, + ) # timer + + upstream_data.index = upstream_map_p_peptide_list + + end_time = time.process_time() # timer + print( + "%0.6f added index to Upstream Map [1g_2]" + % (end_time - start_time,), + file=sys.stderr, + ) # timer + + # ######################################################################## + # # trim upstream_data to include only the upstream map columns + # old_cols = upstream_data.columns.tolist() + # i = 0 + # first_intensity = -1 + # last_intensity = -1 + # intensity_re = re.compile("Intensity.*") + # for col_name in old_cols: + # m = intensity_re.match(col_name) + # if m: + # last_intensity = i + # if first_intensity == -1: + # first_intensity = i + # i += 1 + # # print('last intensity = %d' % last_intensity) + # col_PKCalpha = last_intensity + 2 + # + # data_in_cols = [old_cols[0]] + old_cols[ + # first_intensity: last_intensity + 1 + # ] + # + # if upstream_data.empty: + # print("upstream_data is empty") + # exit(0) + # + # data_in = upstream_data.copy(deep=True)[data_in_cols] + ######################################################################## + # trim upstream_data to include only the upstream map 
columns + old_cols = upstream_data.columns.tolist() + i = 0 + first_intensity = -1 + last_intensity = -1 + intensity_re = re.compile("Intensity.*") + for col_name in old_cols: + m = intensity_re.match(col_name) + if m: + last_intensity = i + if first_intensity == -1: + first_intensity = i + i += 1 + # print('last intensity = %d' % last_intensity) + col_PKCalpha = last_intensity + 2 + + data_in_cols = [old_cols[0]] + old_cols[ + first_intensity - 1: last_intensity + ] + data_col_names = [old_cols[0]] + old_cols[ + first_intensity: last_intensity + 1 + ] + + if upstream_data.empty: + print("upstream_data is empty") + exit(0) + + data_in = upstream_data.copy(deep=True)[data_in_cols] + data_in.columns = data_col_names + print("data_in") + print(data_in) + ######################################################################## + + # Convert floating-point integers to int64 integers + # ref: https://stackoverflow.com/a/68497603/15509512 + data_in[list(data_in.columns[1:])] = ( + data_in[list(data_in.columns[1:])] + .astype("float64") + .apply(np.int64) + ) + + # create another phosphopeptide column that will be used to join later; + # MAY need to change depending on Phosphopeptide column position + # data_in[PHOSPHOPEPTIDE_MATCH] = data_in[data_in.columns.tolist()[0]] + data_in[PHOSPHOPEPTIDE_MATCH] = data_in.index + + end_time = time.process_time() # timer + print( + "%0.6f set data_in[PHOSPHOPEPTIDE_MATCH] [A]" + % (end_time - start_time,), + file=sys.stderr, + ) # timer + + # Produce a dictionary of metadata for a single phosphopeptide. + # This is a replacement of `UniProtInfo_subdict` in the original code. 
def _unique_in_order(items):
    """Return the items of an iterable with duplicates removed, first-seen order kept."""
    seen = set()
    unique = []
    for item in items:
        if item not in seen:
            seen.add(item)
            unique.append(item)
    return unique


def _seq7_window(protein_seq, psite):
    """Build the 15-character +/-7 residue window around a phosphosite.

    Args:
        protein_seq: full protein sequence (str).
        psite: zero-based index of the phosphoresidue in ``protein_seq``.

    Returns:
        A 15-character string whose index 7 is the phosphoresidue written in
        lower case ("s", "t" or "y"; "?" for any other residue), padded with
        "_" on whichever side(s) the protein ends within 7 residues.

    Raises:
        IndexError: if ``psite`` lies beyond the end of ``protein_seq``.
    """
    left = max(0, psite - 7)
    window = protein_seq[left: psite + 8]
    center = psite - left  # index of the phosphoresidue inside the window
    pres = {"S": "s", "T": "t", "Y": "y"}.get(window[center], "?")
    window = window[:center] + pres + window[center + 1:]
    # Pad BOTH sides independently so that the phosphoresidue always lands on
    # index 7, even for proteins shorter than 15 residues where the site is
    # close to both termini.
    pad_left = 7 - center
    pad_right = 15 - pad_left - len(window)
    return "_" * pad_left + window + "_" * pad_right


def _seq10_window(protein_seq, psite):
    """Build the (up to) +/-10 residue window with "p" inserted before the site.

    No padding is added when the window is truncated by either terminus.
    """
    return (
        protein_seq[max(0, psite - 10): psite]
        + "p"
        + protein_seq[psite: psite + 11]
    )


# Produce a dictionary of metadata for a single phosphopeptide.
# This is a replacement of `UniProtInfo_subdict` in the original code.
def pseq_to_subdict(phospho_pep):
    """Collect SwissProt-derived metadata for one phosphopeptide.

    Args:
        phospho_pep: phosphopeptide sequence in which each phosphorylated
            residue is preceded by a lower-case "p".

    Returns:
        A two-element list ``[phospho_pep, result]`` where ``result`` is a
        dict keyed by SEQUENCE, UNIPROT_ID, DESCRIPTION, GENE_NAME,
        PHOSPHORESIDUE, SEQUENCE7 and SEQUENCE10; every value is a string.
        Multiple entries are joined with "; " (", " between the sites of one
        protein for PHOSPHORESIDUE, " | " for SEQUENCE7).

    Raises:
        PreconditionError: when the phosphopeptide or its dephosphorylated
            form is missing from the look-up tables, or when the two tables
            disagree.  A caller may catch it and report
            ``pe.expression`` / ``pe.message``.
    """
    # Strip "p" from phosphopeptide sequence
    dephospho_pep = re_phos.sub("", phospho_pep)

    # Location(s) of phosphoresidue(s) in the dephosphorylated peptide:
    # each "p" marker sits immediately before its residue, so its index minus
    # the number of markers preceding it is the residue's stripped index.
    # (used later for Phosphoresidue, Sequence7, and Sequence10)
    marker_positions = [
        idx for idx, ch in enumerate(phospho_pep) if ch == "p"
    ]
    ploc = [idx - removed for removed, idx in enumerate(marker_positions)]

    # Locate phospho_pep in PhosphoPep_UniProtSeq_LUT
    if phospho_pep not in PhosphoPep_UniProtSeq_LUT:
        raise PreconditionError(
            phospho_pep,
            "no matching phosphopeptide found in PhosphoPep_UniProtSeq_LUT",
        )
    if dephospho_pep not in DephosphoPep_UniProtSeq_LUT:
        raise PreconditionError(
            dephospho_pep,
            "dephosphorylated phosphopeptide not found in DephosphoPep_UniProtSeq_LUT",
        )
    if (
        dephospho_pep != PhosphoPep_UniProtSeq_LUT[(phospho_pep, DEPHOSPHOPEP)]
    ):
        my_err_msg = "dephosphorylated phosphopeptide does not match "
        my_err_msg += "PhosphoPep_UniProtSeq_LUT[(phospho_pep,DEPHOSPHOPEP)] = "
        my_err_msg += PhosphoPep_UniProtSeq_LUT[(phospho_pep, DEPHOSPHOPEP)]
        raise PreconditionError(dephospho_pep, my_err_msg)

    uniprot_ids = DephosphoPep_UniProtSeq_LUT[(dephospho_pep, UNIPROT_ID)]
    descriptions = DephosphoPep_UniProtSeq_LUT[(dephospho_pep, DESCRIPTION)]
    gene_names = DephosphoPep_UniProtSeq_LUT[(dephospho_pep, GENE_NAME)]
    if (dephospho_pep, SEQUENCE) not in DephosphoPep_UniProtSeq_LUT:
        raise PreconditionError(
            dephospho_pep,
            "no matching phosphopeptide found in DephosphoPep_UniProtSeq_LUT",
        )
    UniProtSeqList = DephosphoPep_UniProtSeq_LUT[(dephospho_pep, SEQUENCE)]
    if len(UniProtSeqList) < 1:
        print(
            "Skipping DephosphoPep_UniProtSeq_LUT[('%s',SEQUENCE)] because value has zero length"
            % dephospho_pep
        )

    # One entry per protein sequence; each entry is a list of strings.
    phosphoresidues_by_protein = []
    seq7s_by_protein = []
    seq10s_by_protein = []
    for UniProtSeq in UniProtSeqList:
        protein = str(UniProtSeq)
        phosphoresidues = []
        seq7s = []
        seq10s = []
        # The peptide's offset in the protein does not depend on the site,
        # so look it up once per protein.
        start = protein.find(dephospho_pep)
        # When the peptide is not found in this protein, the per-protein
        # lists stay empty.
        if start >= 0:
            for offset in ploc:
                # location of phosphoresidue on protein sequence
                psite = start + offset

                # add Phosphoresidue (one-based position)
                phosphoresidues.append("p" + protein[psite] + str(psite + 1))

                # add Sequence7
                seq7 = _seq7_window(protein, psite)
                if seq7 not in seq7s:
                    seq7s.append(seq7)

                # add Sequence10
                seq10 = _seq10_window(protein, psite)
                if seq10 not in seq10s:
                    seq10s.append(seq10)

        phosphoresidues_by_protein.append(phosphoresidues)
        seq7s_by_protein.append(seq7s)
        seq10s_by_protein.append(seq10s)

    # Sort by `UniProt_ID`
    # ref: https://stackoverflow.com/a/4174955/15509512
    # NOTE: zip() stops at the shortest of its inputs, so IDs, gene names,
    # descriptions and phosphoresidue lists are all truncated to a common
    # length here.
    rows = sorted(
        zip(
            uniprot_ids,
            gene_names,
            descriptions,
            phosphoresidues_by_protein,
        ),
        key=lambda row: row[0],
    )

    # convert lists to strings in the dictionary
    result = {}
    result[SEQUENCE] = dephospho_pep
    result[UNIPROT_ID] = "; ".join(str(row[0]) for row in rows)
    result[DESCRIPTION] = "; ".join(str(row[2]) for row in rows)
    result[GENE_NAME] = "; ".join(str(row[1]) for row in rows)
    # separate proteins with "; " and the sites within one protein with ", "
    result[PHOSPHORESIDUE] = "; ".join(", ".join(row[3]) for row in rows)
    # separate distinct sequence7's by " | "
    result[SEQUENCE7] = " | ".join(
        _unique_in_order(
            seq7 for seq7s in seq7s_by_protein for seq7 in seq7s
        )
    )
    # separate distinct sequence10's by "; "
    result[SEQUENCE10] = "; ".join(
        _unique_in_order(
            seq10 for seq10s in seq10s_by_protein for seq10 in seq10s
        )
    )

    return [phospho_pep, result]


# Construct list of [string, dictionary] lists
# where the dictionary provides the SwissProt metadata
# for a phosphopeptide
# NOTE(review): `whine` is defined elsewhere; the `is not None` filter below
# suggests it returns None when pseq_to_subdict fails - confirm.
result_list = [
    whine(pseq_to_subdict, psequence)
    for psequence in data_in[PHOSPHOPEPTIDE_MATCH]
]

end_time = time.process_time()  # timer
print(
    "%0.6f added SwissProt annotations to phosphopeptides [B]"
    % (end_time - start_time,),
    file=sys.stderr,
)  # timer

# Construct dictionary from list of lists
# ref: https://www.8bitavenue.com/how-to-convert-list-of-lists-to-dictionary-in-python/
UniProt_Info = {
    result[0]: result[1]
    for result in result_list
    if result is not None
}

end_time = time.process_time()  # timer
print(
    "%0.6f create dictionary mapping phosphopeptide to metadata dictionary [C]"
    % (end_time - start_time,),
    file=sys.stderr,
)  # timer

# cosmetic: add N_A to phosphopeptide rows with no hits
p_peptide_list = []
for key in UniProt_Info:
    p_peptide_list.append(key)
    for nestedKey in UniProt_Info[key]:
        if UniProt_Info[key][nestedKey] == "":
            UniProt_Info[key][nestedKey] = N_A

end_time = time.process_time()  # timer
print(
    "%0.6f performed cosmetic clean-up [D]" % (end_time - start_time,),
    file=sys.stderr,
)  # timer

# convert UniProt_Info dictionary to dataframe
uniprot_df = pandas.DataFrame.transpose(
    pandas.DataFrame.from_dict(UniProt_Info)
)

# reorder columns to match expected output file
uniprot_df[PHOSPHOPEPTIDE] = uniprot_df.index  # make index a column too

cols = uniprot_df.columns.tolist()
uniprot_df = uniprot_df[
    [
        PHOSPHOPEPTIDE,
        SEQUENCE10,
        SEQUENCE7,
        GENE_NAME,
        PHOSPHORESIDUE,
        UNIPROT_ID,
        DESCRIPTION,
    ]
]

end_time = time.process_time()  # timer
print(
    "%0.6f reordered columns to match expected output file [1]"
    % (end_time - start_time,),
    file=sys.stderr,
)  # timer

# concat to split then groupby to collapse:
# one row per (Sequence7, Phosphopeptide) pair
seq7_df = pandas.concat(
    [
        pandas.Series(row[PHOSPHOPEPTIDE], row[SEQUENCE7].split(" | "))
        for _, row in uniprot_df.iterrows()
    ]
).reset_index()
seq7_df.columns = [SEQUENCE7, PHOSPHOPEPTIDE]

# --- -------------- begin read PSP_Regulatory_sites ---------------------------------
# read in PhosphoSitePlus Regulatory Sites dataset
# ----------- Get PhosphoSitePlus Regulatory Sites data from SQLite database (start) -----------
conn = sql.connect(uniprot_sqlite)
try:
    regsites_df = pandas.read_sql_query(PSP_REGSITE_SQL, conn)
finally:
    # Close SwissProt SQLite database even when the query fails
    conn.close()
# ...
# -------------- end read PSP_Regulatory_sites ------------------------------------

# keep only the entries for the requested species in dataframe
if len(species) > 0:
    print(
        'Limit PhosphoSitesPlus records to species "' + species + '"'
    )
    regsites_df = regsites_df[regsites_df.ORGANISM == species]

# merge the seq7 df with the regsites df based off of the sequence7
merge_df = seq7_df.merge(
    regsites_df,
    left_on=SEQUENCE7,
    right_on=SITE_PLUSMINUS_7AA_SQL,
    how="left",
)

# after merging df, select only the columns of interest;
# note that PROTEIN is absent here
merge_df = merge_df[
    [
        PHOSPHOPEPTIDE,
        SEQUENCE7,
        ON_FUNCTION,
        ON_PROCESS,
        ON_PROT_INTERACT,
        ON_OTHER_INTERACT,
        ON_NOTES,
    ]
]
# combine column values of interest
# into one FUNCTION_PHOSPHORESIDUE column
combined_function = merge_df[ON_FUNCTION]
for regsite_col in (
    ON_PROCESS,
    ON_PROT_INTERACT,
    ON_OTHER_INTERACT,
    ON_NOTES,
):
    # na_rep="" turns missing values into empty fields instead of NaN
    combined_function = combined_function.str.cat(
        merge_df[regsite_col], sep="; ", na_rep=""
    )
merge_df[FUNCTION_PHOSPHORESIDUE] = combined_function

# remove the columns that were combined
merge_df = merge_df[
    [PHOSPHOPEPTIDE, SEQUENCE7, FUNCTION_PHOSPHORESIDUE]
]

end_time = time.process_time()  # timer
print(
    "%0.6f merge regsite metadata [1a]" % (end_time - start_time,),
    file=sys.stderr,
)  # timer


def _tidy_function_text(text):
    """Remove the "; " separators left behind by empty fields.

    Returns N_A when nothing remains, to signify that the info was searched
    for but could not be found.
    """
    if text == "; ; ; ; ":  # remove ; from empty hits
        text = ""
    while text.endswith("; "):  # remove ; from the ends
        text = text[:-2]
    while text.startswith("; "):  # remove ; from the beginning
        text = text[2:]
    text = text.replace("; ; ; ; ", "; ")
    text = text.replace("; ; ; ", "; ")
    text = text.replace("; ; ", "; ")
    if text == "":
        text = N_A
    return text


end_time = time.process_time()  # timer
print(
    "%0.6f more cosmetic changes [1b]" % (end_time - start_time,),
    file=sys.stderr,
)  # timer

# cosmetic changes to Function Phosphoresidue column;
# map() applies the clean-up per value without depending on index labels
fp_series = merge_df[FUNCTION_PHOSPHORESIDUE].map(_tidy_function_text)
merge_df[FUNCTION_PHOSPHORESIDUE] = fp_series

end_time = time.process_time()  # timer
print(
    "%0.6f cleaned up semicolons [1c]" % (end_time - start_time,),
    file=sys.stderr,
)  # timer

# merge uniprot df with merge df
uniprot_regsites_merged_df = uniprot_df.merge(
    merge_df,
    left_on=PHOSPHOPEPTIDE,
    right_on=PHOSPHOPEPTIDE,
    how="left",
)

# collapse the merged df: one FUNCTION_PHOSPHORESIDUE value per phosphopeptide
# NOTE(review): ppep_join is defined elsewhere; presumably joins the group's
# values into one string - confirm.
uniprot_regsites_collapsed_df = pandas.DataFrame(
    uniprot_regsites_merged_df.groupby(PHOSPHOPEPTIDE)[
        FUNCTION_PHOSPHORESIDUE
    ].apply(ppep_join)
)

end_time = time.process_time()  # timer
print(
    "%0.6f collapsed pandas dataframe [1d]" % (end_time - start_time,),
    file=sys.stderr,
)  # timer

# add df index as its own column
uniprot_regsites_collapsed_df[
    PHOSPHOPEPTIDE
] = uniprot_regsites_collapsed_df.index

# rename columns
uniprot_regsites_collapsed_df.columns = [
    FUNCTION_PHOSPHORESIDUE,
    "ppp",
]

end_time = time.process_time()  # timer
print(
    "%0.6f selected columns to be merged to uniprot_df [1e]"
    % (end_time - start_time,),
    file=sys.stderr,
)  # timer

# add columns based on Sequence7 matching site_+/-7_AA
uniprot_regsite_df = pandas.merge(
    left=uniprot_df,
    right=uniprot_regsites_collapsed_df,
    how="left",
    left_on=PHOSPHOPEPTIDE,
    right_on="ppp",
)

end_time = time.process_time()  # timer
print(
    "%0.6f added columns based on Sequence7 matching site_+/-7_AA [1f]"
    % (end_time - start_time,),
    file=sys.stderr,
)  # timer

data_in.rename(
    {"Protein description": PHOSPHOPEPTIDE},
    axis="columns",
    inplace=True,
)

# reorder rows of data_in by case-insensitive phosphopeptide sequence
res2 = sorted(
    data_in[PHOSPHOPEPTIDE_MATCH].tolist(), key=lambda s: s.casefold()
)
data_in = data_in.loc[res2]

end_time = time.process_time()  # timer
print(
    "%0.6f sorting time [1f]" % (end_time - start_time,),
    file=sys.stderr,
)  # timer

print("old_cols[:col_PKCalpha]")
print(old_cols[:col_PKCalpha])
# keep the first column plus the columns from position col_PKCalpha - 1 on
cols = [old_cols[0]] + old_cols[col_PKCalpha - 1:]
upstream_data = upstream_data[cols]
print("upstream_data.columns")
print(upstream_data.columns)

end_time = time.process_time()  # timer
print(
    "%0.6f refactored columns for Upstream Map [1g]"
    % (end_time - start_time,),
    file=sys.stderr,
)  # timer


# rename upstream columns in new list
def col_rename(name):
    """Shorten an upstream-map column name.

    Each rule is ``(marker, separator)``: when ``marker`` occurs in the name,
    the name is cut at the first occurrence of ``separator``.  Rules are
    applied in order to the progressively shortened name.

    Args:
        name: column name (str).

    Returns:
        The shortened name, or ``name`` unchanged when no marker occurs.
    """
    rules = (
        ("_NetworKIN", "_"),
        (" motif", " motif"),
        (" sequence ", " sequence"),
        ("_Phosida", "_"),
        ("_PhosphoSite", "_"),
    )
    for marker, separator in rules:
        if marker in name:
            name = name.split(separator)[0]
    return name


new_cols = [col_rename(col) for col in cols]
upstream_data.columns = new_cols

end_time = time.process_time()  # timer
print(
    "%0.6f renamed columns for Upstream Map [1h_1]"
    % (end_time - start_time,),
    file=sys.stderr,
)  # timer

# Create
upstream_data_cast as a copy of upstream_data + # but with first column substituted by the phosphopeptide sequence + upstream_data_cast = upstream_data.copy() + new_cols_cast = new_cols + new_cols_cast[0] = "p_peptide" + upstream_data_cast.columns = new_cols_cast + upstream_data_cast["p_peptide"] = upstream_data.index + + # --- -------------- begin read upstream_data_melt ------------------------------------ + # ----------- Get melted kinase mapping data from SQLite database (start) ----------- + conn = sql.connect(uniprot_sqlite) + upstream_data_melt_df = pandas.read_sql_query(PPEP_MELT_SQL, conn) + # Close SwissProt SQLite database + conn.close() + upstream_data_melt = upstream_data_melt_df.copy() + upstream_data_melt.columns = ["p_peptide", "characterization", "X"] + upstream_data_melt["characterization"] = [ + col_rename(s) for s in upstream_data_melt["characterization"] + ] + + print( + "%0.6f upstream_data_melt_df initially has %d rows" + % (end_time - start_time, len(upstream_data_melt.axes[0])), + file=sys.stderr, + ) + # ref: https://stackoverflow.com/a/27360130/15509512 + # e.g. df.drop(df[df.score < 50].index, inplace=True) + upstream_data_melt.drop( + upstream_data_melt[upstream_data_melt.X != "X"].index, inplace=True + ) + print( + "%0.6f upstream_data_melt_df pre-dedup has %d rows" + % (end_time - start_time, len(upstream_data_melt.axes[0])), + file=sys.stderr, + ) + # ----------- Get melted kinase mapping data from SQLite database (finish) ----------- + # ... -------------- end read upstream_data_melt -------------------------------------- + + end_time = time.process_time() # timer + print( + "%0.6f melted and minimized Upstream Map dataframe [1h_2]" + % (end_time - start_time,), + file=sys.stderr, + ) # timer + # ... 
end read upstream_data_melt + + end_time = time.process_time() # timer + print( + "%0.6f indexed melted Upstream Map [1h_2a]" + % (end_time - start_time,), + file=sys.stderr, + ) # timer + + upstream_delta_melt_LoL = upstream_data_melt.values.tolist() + + melt_dict = {} + for key in upstream_map_p_peptide_list: + melt_dict[key] = [] + + for el in upstream_delta_melt_LoL: + (p_peptide, characterization, X) = tuple(el) + if p_peptide in melt_dict: + melt_dict[p_peptide].append(characterization) + else: + exit( + 'Phosphopeptide %s not found in ppep_mapping_db: "phopsphopeptides" and "ppep_mapping_db" must both originate from the same run of mqppep_kinase_mapping' + % (p_peptide) + ) + + end_time = time.process_time() # timer + print( + "%0.6f appended peptide characterizations [1h_2b]" + % (end_time - start_time,), + file=sys.stderr, + ) # timer + + # for key in upstream_map_p_peptide_list: + # melt_dict[key] = ' | '.join(melt_dict[key]) + + for key in upstream_map_p_peptide_list: + melt_dict[key] = melt_join(melt_dict[key]) + + end_time = time.process_time() # timer + print( + "%0.6f concatenated multiple characterizations [1h_2c]" + % (end_time - start_time,), + file=sys.stderr, + ) # timer + + # map_dict is a dictionary of dictionaries + map_dict = {} + for key in upstream_map_p_peptide_list: + map_dict[key] = {} + map_dict[key][PUTATIVE_UPSTREAM_DOMAINS] = melt_dict[key] + + end_time = time.process_time() # timer + print( + "%0.6f instantiated map dictionary [2]" % (end_time - start_time,), + file=sys.stderr, + ) # timer + + # convert map_dict to dataframe + map_df = pandas.DataFrame.transpose( + pandas.DataFrame.from_dict(map_dict) + ) + map_df["p-peptide"] = map_df.index # make index a column too + cols_map_df = map_df.columns.tolist() + cols_map_df = [cols_map_df[1]] + [cols_map_df[0]] + map_df = map_df[cols_map_df] + + # join map_df to uniprot_regsite_df + output_df = uniprot_regsite_df.merge( + map_df, how="left", left_on=PHOSPHOPEPTIDE, right_on="p-peptide" 
+ ) + + output_df = output_df[ + [ + PHOSPHOPEPTIDE, + SEQUENCE10, + SEQUENCE7, + GENE_NAME, + PHOSPHORESIDUE, + UNIPROT_ID, + DESCRIPTION, + FUNCTION_PHOSPHORESIDUE, + PUTATIVE_UPSTREAM_DOMAINS, + ] + ] + + # cols_output_prelim = output_df.columns.tolist() + # + # print("cols_output_prelim") + # print(cols_output_prelim) + # + # cols_output = cols_output_prelim[:8]+[cols_output_prelim[9]]+[cols_output_prelim[10]] + # + # print("cols_output with p-peptide") + # print(cols_output) + # + # cols_output = [col for col in cols_output if not col == "p-peptide"] + # + # print("cols_output") + # print(cols_output) + # + # output_df = output_df[cols_output] + + # join output_df back to quantitative columns in data_in df + quant_cols = data_in.columns.tolist() + quant_cols = quant_cols[1:] + quant_data = data_in[quant_cols] + + # ----------- Write merge/filter metadata to SQLite database (start) ----------- + # Open SwissProt SQLite database + conn = sql.connect(output_sqlite) + cur = conn.cursor() + + cur.executescript(MRGFLTR_DDL) + + cur.execute( + CITATION_INSERT_STMT, + ("mrgfltr_metadata_view", CITATION_INSERT_PSP), + ) + cur.execute( + CITATION_INSERT_STMT, ("mrgfltr_metadata", CITATION_INSERT_PSP) + ) + cur.execute( + CITATION_INSERT_STMT, + ("mrgfltr_metadata_view", CITATION_INSERT_PSP_REF), + ) + cur.execute( + CITATION_INSERT_STMT, ("mrgfltr_metadata", CITATION_INSERT_PSP_REF) + ) + + # Read ppep-to-sequence LUT + ppep_lut_df = pandas.read_sql_query(PPEP_ID_SQL, conn) + # write only metadata for merged/filtered records to SQLite + mrgfltr_metadata_df = output_df.copy() + # replace phosphopeptide seq with ppep.id + mrgfltr_metadata_df = ppep_lut_df.merge( + mrgfltr_metadata_df, + left_on="ppep_seq", + right_on=PHOSPHOPEPTIDE, + how="inner", + ) + mrgfltr_metadata_df.drop( + columns=[PHOSPHOPEPTIDE, "ppep_seq"], inplace=True + ) + # rename columns + mrgfltr_metadata_df.columns = MRGFLTR_METADATA_COLUMNS + mrgfltr_metadata_df.to_sql( + "mrgfltr_metadata", + con=conn, 
+ if_exists="append", + index=False, + method="multi", + ) + + # Close SwissProt SQLite database + conn.close() + # ----------- Write merge/filter metadata to SQLite database (finish) ----------- + + output_df = output_df.merge( + quant_data, + how="right", + left_on=PHOSPHOPEPTIDE, + right_on=PHOSPHOPEPTIDE_MATCH, + ) + output_cols = output_df.columns.tolist() + output_cols = output_cols[:-1] + output_df = output_df[output_cols] + + # cosmetic changes to Upstream column + output_df[PUTATIVE_UPSTREAM_DOMAINS] = output_df[ + PUTATIVE_UPSTREAM_DOMAINS + ].fillna( + "" + ) # fill the NaN with "" for those Phosphopeptides that got a "WARNING: Failed match for " in the upstream mapping + us_series = pandas.Series(output_df[PUTATIVE_UPSTREAM_DOMAINS]) + i = 0 + while i < len(us_series): + # turn blanks into N_A to signify the info was searched for but cannot be found + if us_series[i] == "": + us_series[i] = N_A + i += 1 + output_df[PUTATIVE_UPSTREAM_DOMAINS] = us_series + + end_time = time.process_time() # timer + print( + "%0.6f establisheed output [3]" % (end_time - start_time,), + file=sys.stderr, + ) # timer + + (output_rows, output_cols) = output_df.shape + + output_df = output_df.convert_dtypes(convert_integer=True) + + # Output onto Final CSV file + output_df.to_csv(output_filename_csv, index=False) + output_df.to_csv( + output_filename_tab, quoting=None, sep="\t", index=False + ) + + end_time = time.process_time() # timer + print( + "%0.6f wrote output [4]" % (end_time - start_time,), + file=sys.stderr, + ) # timer + + print( + "{:>10} phosphopeptides written to output".format(str(output_rows)) + ) + + end_time = time.process_time() # timer + print( + "%0.6f seconds of non-system CPU time were consumed" + % (end_time - start_time,), + file=sys.stderr, + ) # timer + + # Rev. 7/1/2016 + # Rev. 7/3/2016 : fill NaN in Upstream column to replace to N/A's + # Rev. 7/3/2016: renamed Upstream column to PUTATIVE_UPSTREAM_DOMAINS + # Rev. 
12/2/2021: Converted to Python from ipynb; use fast Aho-Corasick searching; \ + # read from SwissProt SQLite database + # Rev. 12/9/2021: Transfer code to Galaxy tool wrapper + + # + # copied from Excel Output Script.ipynb END # + # + + try: + catch( + mqpep_getswissprot, + ) + exit(0) + except Exception as e: + exit("Internal error running mqpep_getswissprot(): %s" % (e)) + + +if __name__ == "__main__": + __main__()
#!/usr/bin/env python
# Search and memoize phosphopeptides in Swiss-Prot SQLite table UniProtKB
# (file: search_ppep.py)

import argparse
import os.path
import re
import sqlite3
import sys  # import the sys module for exc_info
import time
import traceback  # import the traceback module for format_exception
from codecs import getreader as cx_getreader

# For Aho-Corasick search for fixed set of substrings
# - add_word
# - make_automaton
# - iter
import ahocorasick


# ref: https://stackoverflow.com/a/8915613/15509512
#   answers: "How to handle exceptions in a list comprehensions"
#   usage:
#       from math import log
#       eggs = [1,3,0,3,2]
#       print([x for x in [catch(log, egg) for egg in eggs] if x is not None])
#   producing:
#       for <built-in function log>
#         with args (0,)
#         exception: math domain error
#       [0.0, 1.0986122886681098, 1.0986122886681098, 0.6931471805599453]
def catch(func, *args, handle=lambda e: e, **kwargs):
    """Call ``func(*args, **kwargs)``; report any exception and return None.

    Parameters:
        func: callable to invoke.
        *args, **kwargs: forwarded to ``func`` unchanged.
        handle: accepted for interface compatibility; it is not called
            (the handler result was replaced by a plain ``None`` return).

    Returns:
        The return value of ``func``, or ``None`` when ``func`` raised an
        ``Exception`` (details and stack trace are printed to stdout).
    """
    try:
        return func(*args, **kwargs)
    except Exception as e:
        print("For %s" % str(func))
        print(" with args %s" % str(args))
        print(" caught exception: %s" % str(e))
        (ty, va, tb) = sys.exc_info()
        print(" stack trace: " + str(traceback.format_exception(ty, va, tb)))
        # exit(-1)
        return None  # was handle(e)


def __main__():
    """Search UniProtKB sequences for the (de)phosphopeptides in the input.

    Steps, all performed in place on the SQLite database given by
    ``--uniprotkb``:
      1. drop and re-create the peptide tables and views;
      2. read the first column of the ``--phosphopeptides`` tabular file,
         store each phosphopeptide and its dephosphorylated form;
      3. search every ``UniProtKB.Sequence`` for the dephosphopeptides with
         an Aho-Corasick automaton, recording each hit;
      4. link dephosphopeptides without any hit to a dummy accession.

    Raises:
        ValueError: when a UniProtKB row has no sequence.
    """

    DROP_TABLES_SQL = """
        DROP VIEW IF EXISTS ppep_gene_site_view;
        DROP VIEW IF EXISTS uniprot_view;
        DROP VIEW IF EXISTS uniprotkb_pep_ppep_view;
        DROP VIEW IF EXISTS ppep_intensity_view;
        DROP VIEW IF EXISTS ppep_metadata_view;

        DROP TABLE IF EXISTS sample;
        DROP TABLE IF EXISTS ppep;
        DROP TABLE IF EXISTS site_type;
        DROP TABLE IF EXISTS deppep_UniProtKB;
        DROP TABLE IF EXISTS deppep;
        DROP TABLE IF EXISTS ppep_gene_site;
        DROP TABLE IF EXISTS ppep_metadata;
        DROP TABLE IF EXISTS ppep_intensity;
        """

    CREATE_TABLES_SQL = """
        CREATE TABLE deppep
          ( id INTEGER PRIMARY KEY
          , seq TEXT UNIQUE ON CONFLICT IGNORE
          )
          ;
        CREATE TABLE deppep_UniProtKB
          ( deppep_id INTEGER REFERENCES deppep(id) ON DELETE CASCADE
          , UniProtKB_id TEXT REFERENCES UniProtKB(id) ON DELETE CASCADE
          , pos_start INTEGER
          , pos_end INTEGER
          , PRIMARY KEY (deppep_id, UniProtKB_id, pos_start, pos_end)
              ON CONFLICT IGNORE
          )
          ;
        CREATE TABLE ppep
          ( id INTEGER PRIMARY KEY
          , deppep_id INTEGER REFERENCES deppep(id) ON DELETE CASCADE
          , seq TEXT UNIQUE ON CONFLICT IGNORE
          , scrubbed TEXT
          );
        CREATE TABLE site_type
          ( id INTEGER PRIMARY KEY
          , type_name TEXT UNIQUE ON CONFLICT IGNORE
          );
        CREATE INDEX idx_ppep_scrubbed on ppep(scrubbed)
          ;
        CREATE TABLE sample
          ( id INTEGER PRIMARY KEY
          , name TEXT UNIQUE ON CONFLICT IGNORE
          )
          ;
        CREATE VIEW uniprot_view AS
          SELECT DISTINCT
              Uniprot_ID
            , Description
            , Organism_Name
            , Organism_ID
            , Gene_Name
            , PE
            , SV
            , Sequence
            , Description ||
              CASE WHEN Organism_Name = 'N/A'
                   THEN ''
                   ELSE ' OS='|| Organism_Name
                   END ||
              CASE WHEN Organism_ID = -1
                   THEN ''
                   ELSE ' OX='|| Organism_ID
                   END ||
              CASE WHEN Gene_Name = 'N/A'
                   THEN ''
                   ELSE ' GN='|| Gene_Name
                   END ||
              CASE WHEN PE = 'N/A'
                   THEN ''
                   ELSE ' PE='|| PE
                   END ||
              CASE WHEN SV = 'N/A'
                   THEN ''
                   ELSE ' SV='|| SV
                   END AS long_description
            , Database
          FROM UniProtKB
          ;
        CREATE VIEW uniprotkb_pep_ppep_view AS
          SELECT deppep_UniProtKB.UniprotKB_ID AS accession
               , deppep_UniProtKB.pos_start AS pos_start
               , deppep_UniProtKB.pos_end AS pos_end
               , deppep.seq AS peptide
               , ppep.seq AS phosphopeptide
               , ppep.scrubbed AS scrubbed
               , uniprot_view.Sequence AS sequence
               , uniprot_view.Description AS description
               , uniprot_view.long_description AS long_description
               , ppep.id AS ppep_id
          FROM ppep, deppep, deppep_UniProtKB, uniprot_view
          WHERE deppep.id = ppep.deppep_id
            AND deppep.id = deppep_UniProtKB.deppep_id
            AND deppep_UniProtKB.UniprotKB_ID = uniprot_view.Uniprot_ID
          ORDER BY UniprotKB_ID, deppep.seq, ppep.seq
          ;
        CREATE TABLE ppep_gene_site
          ( ppep_id INTEGER REFERENCES ppep(id)
          , gene_names TEXT
          , site_type_id INTEGER REFERENCES site_type(id)
          , kinase_map TEXT
          , PRIMARY KEY (ppep_id, kinase_map) ON CONFLICT IGNORE
          )
          ;
        CREATE VIEW ppep_gene_site_view AS
          SELECT DISTINCT
            ppep.seq AS phospho_peptide
          , ppep_id
          , gene_names
          , type_name
          , kinase_map
          FROM
            ppep, ppep_gene_site, site_type
          WHERE
              ppep_gene_site.ppep_id = ppep.id
            AND
              ppep_gene_site.site_type_id = site_type.id
          ORDER BY
            ppep.seq
            ;
        CREATE TABLE ppep_metadata
          ( ppep_id INTEGER REFERENCES ppep(id)
          , protein_description TEXT
          , gene_name TEXT
          , FASTA_name TEXT
          , phospho_sites TEXT
          , motifs_unique TEXT
          , accessions TEXT
          , motifs_all_members TEXT
          , domain TEXT
          , ON_FUNCTION TEXT
          , ON_PROCESS TEXT
          , ON_PROT_INTERACT TEXT
          , ON_OTHER_INTERACT TEXT
          , notes TEXT
          , PRIMARY KEY (ppep_id) ON CONFLICT IGNORE
          )
          ;
        CREATE VIEW ppep_metadata_view AS
          SELECT DISTINCT
              ppep.seq AS phospho_peptide
            , protein_description
            , gene_name
            , FASTA_name
            , phospho_sites
            , motifs_unique
            , accessions
            , motifs_all_members
            , domain
            , ON_FUNCTION
            , ON_PROCESS
            , ON_PROT_INTERACT
            , ON_OTHER_INTERACT
            , notes
          FROM
            ppep, ppep_metadata
          WHERE
              ppep_metadata.ppep_id = ppep.id
          ORDER BY
            ppep.seq
            ;
        CREATE TABLE ppep_intensity
          ( ppep_id INTEGER REFERENCES ppep(id)
          , sample_id INTEGER
          , intensity INTEGER
          , PRIMARY KEY (ppep_id, sample_id) ON CONFLICT IGNORE
          )
          ;
        CREATE VIEW ppep_intensity_view AS
          SELECT DISTINCT
              ppep.seq AS phospho_peptide
            , sample.name AS sample
            , intensity
          FROM
            ppep, sample, ppep_intensity
          WHERE
              ppep_intensity.sample_id = sample.id
            AND
              ppep_intensity.ppep_id = ppep.id
          ;
        """

    UNIPROT_SEQ_AND_ID_SQL = """
        select Sequence, Uniprot_ID
          from UniProtKB
        """

    # Parse Command Line
    parser = argparse.ArgumentParser(
        description="Phopsphoproteomic Enrichment phosphopeptide SwissProt search (in place in SQLite DB)."
    )

    # inputs:
    #   Phosphopeptide data for experimental results, including the intensities
    #   and the mapping to kinase domains, in tabular format.
    parser.add_argument(
        "--phosphopeptides",
        "-p",
        nargs=1,
        required=True,
        dest="phosphopeptides",
        help="Phosphopeptide data for experimental results, generated by the Phopsphoproteomic Enrichment Localization Filter tool",
    )
    parser.add_argument(
        "--uniprotkb",
        "-u",
        nargs=1,
        required=True,
        dest="uniprotkb",
        help="UniProtKB/Swiss-Prot data, converted from FASTA format by the Phopsphoproteomic Enrichment Kinase Mapping tool",
    )
    parser.add_argument(
        "--schema",
        action="store_true",
        dest="db_schema",
        help="show updated database schema",
    )
    parser.add_argument(
        "--warn-duplicates",
        action="store_true",
        dest="warn_duplicates",
        help="show warnings for duplicated sequences",
    )
    parser.add_argument(
        "--verbose",
        action="store_true",
        dest="verbose",
        help="show somewhat verbose program tracing",
    )
    # "Make it so!" (parse the arguments)
    options = parser.parse_args()
    if options.verbose:
        print("options: " + str(options) + "\n")

    # path to phosphopeptide (e.g., "outputfile_STEP2.txt") input tabular file
    if options.phosphopeptides is None:
        exit('Argument "phosphopeptides" is required but not supplied')
    try:
        f_name = os.path.abspath(options.phosphopeptides[0])
    except Exception as e:
        exit("Error parsing phosphopeptides argument: %s" % (e))

    # path to SQLite input/output tabular file
    if options.uniprotkb is None:
        exit('Argument "uniprotkb" is required but not supplied')
    try:
        db_name = os.path.abspath(options.uniprotkb[0])
    except Exception as e:
        exit("Error parsing uniprotkb argument: %s" % (e))

    # print("options.schema is %d" % options.db_schema)

    # db_name = "demo/test.sqlite"
    # f_name = "demo/test_input.txt"

    con = sqlite3.connect(db_name)
    # two cursors: "cur" streams query results while "ker" performs inserts
    cur = con.cursor()
    ker = con.cursor()

    cur.executescript(DROP_TABLES_SQL)

    # if options.db_schema:
    #     print("\nAfter dropping tables/views that are to be created, schema is:")
    #     cur.execute("SELECT * FROM sqlite_schema")
    #     for row in cur.fetchall():
    #         if row[4] is not None:
    #             print("%s;" % row[4])

    cur.executescript(CREATE_TABLES_SQL)

    if options.db_schema:
        print(
            "\nAfter creating tables/views that are to be created, schema is:"
        )
        cur.execute("SELECT * FROM sqlite_schema")
        for row in cur.fetchall():
            if row[4] is not None:
                print("%s;" % row[4])

    def generate_ppep(f):
        """Yield the first tab-delimited field of each non-header line of f.

        The file is decoded as latin-1, the first line is skipped as a
        header, and double-quote characters are removed from each field.
        """
        # get keys from upstream tabular file using readline()
        # ref: https://stackoverflow.com/a/16713581/15509512
        #      answer to "Use codecs to read file with correct encoding"
        re_tab = re.compile("^[^\t]*")
        re_quote = re.compile('"')
        # "with" guarantees the handle is released even when the consumer
        # stops iterating early or an exception propagates
        with open(f, "rb") as file1_encoded:
            file1 = cx_getreader("latin-1")(file1_encoded)
            count = 0
            while True:
                count += 1
                # Get next line from file
                line = file1.readline()
                # if line is empty
                # end of file is reached
                if not line:
                    break
                if count > 1:
                    m = re_tab.match(line)
                    m = re_quote.sub("", m[0])
                    yield m

    # Build an Aho-Corasick automaton from a trie
    # - ref:
    #   - https://pypi.org/project/pyahocorasick/
    #   - https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm
    #   - https://en.wikipedia.org/wiki/Trie
    auto = ahocorasick.Automaton()
    re_phos = re.compile("p")
    # scrub out unsearchable characters per section
    #   "Match the p_peptides to the @sequences array:"
    # of the original
    #   PhosphoPeptide Upstream Kinase Mapping.pl
    # which originally read
    #   $tmp_p_peptide =~ s/#//g;
    #   $tmp_p_peptide =~ s/\d//g;
    #   $tmp_p_peptide =~ s/\_//g;
    #   $tmp_p_peptide =~ s/\.//g;
    #
    # A character class is required here: without the brackets the pattern
    # only matched the literal text "0-9_", one arbitrary character, and "#",
    # so nothing was ever scrubbed.
    re_scrub = re.compile(r"[0-9_.#]")
    ppep_count = 0
    for ppep in generate_ppep(f_name):
        ppep_count += 1
        add_to_trie = False
        # print(ppep)
        scrubbed = re_scrub.sub("", ppep)
        deppep = re_phos.sub("", scrubbed)
        if options.verbose:
            print("deppep: %s; scrubbed: %s" % (deppep, scrubbed))
        # print(deppep)
        cur.execute("SELECT id FROM deppep WHERE seq = (?)", (deppep,))
        if cur.fetchone() is None:
            add_to_trie = True
        # duplicate sequences are ignored by the UNIQUE ON CONFLICT IGNORE
        cur.execute("INSERT INTO deppep(seq) VALUES (?)", (deppep,))
        cur.execute("SELECT id FROM deppep WHERE seq = (?)", (deppep,))
        deppep_id = cur.fetchone()[0]
        if add_to_trie:
            # print((deppep_id, deppep))
            # Build the trie
            auto.add_word(deppep, (deppep_id, deppep))
        cur.execute(
            "INSERT INTO ppep(seq, scrubbed, deppep_id) VALUES (?,?,?)",
            (ppep, scrubbed, deppep_id),
        )
    # def generate_deppep():
    #     cur.execute("SELECT seq FROM deppep")
    #     for row in cur.fetchall():
    #         yield row[0]
    cur.execute("SELECT count(*) FROM (SELECT seq FROM deppep GROUP BY seq)")
    deppep_count = cur.fetchone()[0]

    cur.execute(
        "SELECT count(*) FROM (SELECT Sequence FROM UniProtKB GROUP BY Sequence)"
    )
    sequence_count = cur.fetchone()[0]

    print("%d phosphopeptides were read from input" % ppep_count)
    print(
        "%d corresponding dephosphopeptides are represented in input"
        % deppep_count
    )
    # Look for cases where both Gene_Name and Sequence are identical
    cur.execute(
        """
      SELECT Uniprot_ID, Gene_Name, Sequence
      FROM UniProtKB
      WHERE Sequence IN (
        SELECT Sequence
        FROM UniProtKB
        GROUP BY Sequence, Gene_Name
        HAVING count(*) > 1
        )
      ORDER BY Sequence
      """
    )
    duplicate_count = 0
    old_seq = ""
    for row in cur.fetchall():
        if duplicate_count == 0:
            print(
                "\nEach of the following sequences is associated with several accession IDs (which are listed in the first column) but the same gene ID (which is listed in the second column)."
            )
        if row[2] != old_seq:
            # first row of a new group of identical sequences
            old_seq = row[2]
            duplicate_count += 1
            if options.warn_duplicates:
                print("\n%s\t%s\t%s" % row)
        else:
            if options.warn_duplicates:
                print("%s\t%s" % (row[0], row[1]))
    if duplicate_count > 0:
        print(
            "\n%d sequences have duplicated accession IDs\n" % duplicate_count
        )

    print("%s accession sequences will be searched\n" % sequence_count)

    # print(auto.dump())

    # Convert the trie to an automaton (a finite-state machine)
    auto.make_automaton()
    # When no word was added, the automaton is never built and auto.iter()
    # would raise; in that case skip the search (nothing could match anyway).
    searchable = auto.kind == ahocorasick.AHOCORASICK
    if not searchable:
        print(
            "No searchable peptides were read from input; skipping sequence search",
            file=sys.stderr,
        )

    # Execute query for seqs and metadata without fetching the results yet
    uniprot_seq_and_id = cur.execute(UNIPROT_SEQ_AND_ID_SQL)
    while 1:
        batch = uniprot_seq_and_id.fetchmany(size=50)
        if not batch:
            break
        for Sequence, UniProtKB_id in batch:
            if Sequence is not None:
                if not searchable:
                    continue
                # end_index is the 0-based index of the last matched residue
                for end_index, (insert_order, original_value) in auto.iter(
                    Sequence
                ):
                    ker.execute(
                        """
                      INSERT INTO deppep_UniProtKB
                        (deppep_id,UniProtKB_id,pos_start,pos_end)
                      VALUES (?,?,?,?)
                      """,
                        (
                            insert_order,
                            UniProtKB_id,
                            1 + end_index - len(original_value),
                            end_index,
                        ),
                    )
            else:
                raise ValueError(
                    "UniProtKB_id %s, but Sequence is None: Check whether SwissProt file is missing sequence for this ID"
                    % (UniProtKB_id,)
                )
    ker.execute(
        """
        SELECT count(*) || ' accession-peptide-phosphopeptide combinations were found'
        FROM uniprotkb_pep_ppep_view
        """
    )
    for row in ker.fetchall():
        print(row[0])

    ker.execute(
        """
      SELECT count(*) || ' accession matches were found', count(*) AS accession_count
      FROM (
        SELECT accession
        FROM uniprotkb_pep_ppep_view
        GROUP BY accession
        )
      """
    )
    for row in ker.fetchall():
        print(row[0])

    ker.execute(
        """
      SELECT count(*) || ' peptide matches were found'
      FROM (
        SELECT peptide
        FROM uniprotkb_pep_ppep_view
        GROUP BY peptide
        )
      """
    )
    for row in ker.fetchall():
        print(row[0])

    ker.execute(
        """
      SELECT count(*) || ' phosphopeptide matches were found', count(*) AS phosphopeptide_count
      FROM (
        SELECT phosphopeptide
        FROM uniprotkb_pep_ppep_view
        GROUP BY phosphopeptide
        )
      """
    )
    for row in ker.fetchall():
        print(row[0])

    # link peptides not found in sequence database to a dummy sequence-record
    ker.execute(
        """
        INSERT INTO deppep_UniProtKB(deppep_id,UniProtKB_id,pos_start,pos_end)
          SELECT id, 'No Uniprot_ID', 0, 0
          FROM deppep
          WHERE id NOT IN (SELECT deppep_id FROM deppep_UniProtKB)
        """
    )

    con.commit()
    ker.execute("vacuum")
    con.close()


if __name__ == "__main__":
    wrap_start_time = time.perf_counter()
    __main__()
    wrap_stop_time = time.perf_counter()
    # print(wrap_start_time)
    # print(wrap_stop_time)
    print(
        "\nThe matching process took %d milliseconds to run.\n"
        % ((wrap_stop_time - wrap_start_time) * 1000),
    )

# vim: sw=4 ts=4 et ai :
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/alpha_levels.tabular Mon Jul 11 19:22:25 2022 +0000 @@ -0,0 +1,3 @@ +0.05 +0.1 +0.2
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/pSTY_motifs.tabular Mon Jul 11 19:22:25 2022 +0000 @@ -0,0 +1,355 @@ +"counter" "pcre" "symbol" "description" "pubmed_id" "classification" "source" +"1" "R.R..(pS|pT)(F|L)" "PKB_group" "Akt kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=8985174" "kinase substrate" "HPRD" +"2" "R.R..(pS|pT)" "PKB_group" "Akt kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=10945990" "kinase substrate" "HPRD" +"3" "GRART(S|T)pSFAE" "PKB_group" "Akt kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=8524413" "kinase substrate" "HPRD" +"4" "(R|Q|K)(R|K|N|Q|P|H)(R|K)(R|S|T)(N|K|Q|H|D|P)pS(F|W|I|M|N|S)(S|T|H)(R|S|K)(S|T|P|Q)" "PKB_group" "Akt kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=15782149" "kinase substrate" "HPRD" +"5" "(R|K).(R|K)(S|T).pS" "PKB_group" "Akt kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=16273072" "kinase substrate" "HPRD" +"6" "(M|V|L|I|F)(R|K|H)...(pS|pT)...(M|V|L|I|F)" "AMPK_group" "AMP-activated protein kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=7902296,7698321" "kinase substrate" "HPRD" +"7" "(M|V|L|I)..(R|K|H).(pS|pT)...(M|V|L|I)" "AMPK_group" "AMP-activated protein kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=7902296" "kinase substrate" "HPRD" +"8" "(M|V|L|I|F)(R|K|H)..(pS|pT)...(M|V|L|I|F)" "AMPK_group" "AMP-activated protein kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=7698321" "kinase substrate" "HPRD" +"9" "(R|K).R..pS...(R|K)" "AMPK_group" "AMP-activated protein kinase 2 substrate motif" "https://pubmed.ncbi.nlm.nih.gov/?term=7698321" "kinase substrate" "HPRD" +"10" "(P|L|I|M).(L|I|D|E)pSQ" "ATM" "ATM kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=10608806" "kinase substrate" "HPRD" +"11" "LpSQE" "ATM" "ATM kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=10801797,11544175" "kinase substrate" "HPRD" +"12" "pSQ" "ATM" "ATM kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=16273072" "kinase substrate" "HPRD" +"13" "(R|K|N)R.(pS|pT)(M|L|V|I)" "Aurora A" "Aurora-A kinase" 
"https://pubmed.ncbi.nlm.nih.gov/?term=16083426" "kinase substrate" "HPRD" +"14" "(D|E)(pS|pT)..." "GRK-2" "b-Adrenergic Receptor kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=1645191" "kinase substrate" "HPRD" +"15" "HpSTSDD" "BCKDK" "Branched chain alpha-ketoacid dehydrogenase kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=3947057" "kinase substrate" "HPRD" +"16" "YRpSVDE" "BCKDK" "Branched chain alpha-ketoacid dehydrogenase kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=3947057" "kinase substrate" "HPRD" +"17" "(M|V|L|I|F).R..(pS|pT)...(M|V|L|I|F)" "CaM-KI_group" "Calmodulin-dependent protein kinase I substrate motif" "https://pubmed.ncbi.nlm.nih.gov/?term=9452427,7698321,8022798" "kinase substrate" "HPRD" +"18" "(M|I|L|V|F|Y).R..(pS|pT)(M|I|L|V|F|Y)" "CaM-KII_alpha" "Calmodulin-dependent protein kinase II alpha substrate motif" "https://pubmed.ncbi.nlm.nih.gov/?term=9452427" "kinase substrate" "HPRD" +"19" "R..(pS|pT)" "CaM-KII_group" "Calmodulin-dependent protein kinase II substrate motif" "https://pubmed.ncbi.nlm.nih.gov/?term=1956339" "kinase substrate" "HPRD" +"20" "(K|F)(R|K)(Q|M)(Q|M|K|L|F)pS(F|I|M|L|V)(D|E|I)(L|M|K|I)(F|K)" "CaM-KII_group" "Calmodulin-dependent protein kinase II substrate motif" "https://pubmed.ncbi.nlm.nih.gov/?term=8887677" "kinase substrate" "HPRD" +"21" "(M|V|L|I|F).(R|K)..(pS|pT).." 
"CaM-KII_group" "Calmodulin-dependent protein kinase II substrate motif" "https://pubmed.ncbi.nlm.nih.gov/?term=8280084" "kinase substrate" "HPRD" +"22" "R..pS" "CaM-KII_group" "Calmodulin-dependent protein kinase II substrate motif" "https://pubmed.ncbi.nlm.nih.gov/?term=16273072" "kinase substrate" "HPRD" +"23" "VPGKARKKpSSCQLL" "CaM-KIV" "Calmodulin-dependent protein kinase IV substrate motif" "https://pubmed.ncbi.nlm.nih.gov/?term=1901412" "kinase substrate" "HPRD" +"24" "PLARTLpSVAGLP" "CaM-KIV" "Calmodulin-dependent protein kinase IV substrate motif" "https://pubmed.ncbi.nlm.nih.gov/?term=1309765" "kinase substrate" "HPRD" +"25" "(M|I|L|V|F|Y).R..(pS|pT)" "CaM-KIV" "Calmodulin-dependent protein kinase IV substrate motif" "https://pubmed.ncbi.nlm.nih.gov/?term=9452427" "kinase substrate" "HPRD" +"26" "E(F|E)D(T|A|G)GpSI(I|F|Y|G)(I|G|F)(F|G)(F|P|L)" "CK1_delta|CK1_group" "Casein Kinase I delta substrate motif" "https://pubmed.ncbi.nlm.nih.gov/?term=8887677" "kinase substrate" "HPRD" +"27" "Y(Y|E)(D|Y)(A|D)(A|G)pSI(I|Y|F|G)(I|G|F)(F|G)(F|P|L)" "CK1_group|CK1_gamma Q9HCP0" "Casein Kinase I gamma substrate motif" "https://pubmed.ncbi.nlm.nih.gov/?term=8887677" "kinase substrate" "HPRD" +"28" "pSP..(pS|pT)" "CK1_group" "Casein Kinase I substrate motif" "https://pubmed.ncbi.nlm.nih.gov/?term=1956339" "kinase substrate" "HPRD" +"29" "(D|E)..(pS|pT)" "CK1_group" "Casein Kinase I substrate motif" "https://pubmed.ncbi.nlm.nih.gov/?term=12925738" "kinase substrate" "HPRD" +"30" "(pS|pT)..(S|T)" "CK1_group" "Casein Kinase I substrate motif" "https://pubmed.ncbi.nlm.nih.gov/?term=12925738" "kinase substrate" "HPRD" +"31" "(pS|pT)...(S|T)(M|L|V|I|F)" "CK1_group" "Casein Kinase I substrate motif" "https://pubmed.ncbi.nlm.nih.gov/?term=12925738" "kinase substrate" "HPRD" +"32" "(E|D|A)(D|E)(E|D)(E|D)pS(E|D|A)(D|E|A)(E|D)(E|D)" "CK2_group" "Casein Kinase II substrate motif" "https://pubmed.ncbi.nlm.nih.gov/?term=8887677" "kinase substrate" "HPRD" +"33" "pS.(E|pS|pT)" 
"CK2_group" "Casein Kinase II substrate motif" "https://pubmed.ncbi.nlm.nih.gov/?term=1650349,3474230" "kinase substrate" "HPRD" +"34" "pS..(E|pS|pT)" "CK2_group" "Casein Kinase II substrate motif" "https://pubmed.ncbi.nlm.nih.gov/?term=1650349,12925738" "kinase substrate" "HPRD" +"35" "(pS|pT)..(E|D)" "CK2_group" "Casein Kinase II substrate motif" "https://pubmed.ncbi.nlm.nih.gov/?term=9272871" "kinase substrate" "HPRD" +"36" "pSD.E" "CK2_group" "Casein kinase II substrate motif" "https://pubmed.ncbi.nlm.nih.gov/?term=16273072" "kinase substrate" "HPRD" +"37" "pS..(E|D)" "CK2_group" "Casein kinase II substrate motif" "https://pubmed.ncbi.nlm.nih.gov/?term=16273072" "kinase substrate" "HPRD" +"38" "pS(D|E).(D|E).(D|E)" "CK2_group" "Casein Kinase II substrate motif" "https://pubmed.ncbi.nlm.nih.gov/?term=16273072" "kinase substrate" "HPRD" +"39" "(D|E)pS(D|E).(D|E)" "CK2_group" "Casein Kinase II substrate motif" "https://pubmed.ncbi.nlm.nih.gov/?term=16273072" "kinase substrate" "HPRD" +"40" "pS(D|E)(D|E)(D|E)" "CK2_group" "Casein Kinase II substrate motif" "https://pubmed.ncbi.nlm.nih.gov/?term=16273072" "kinase substrate" "HPRD" +"41" "(pS|pT)..(D|E)" "CK2_group" "Casein Kinase II substrate motif" "https://pubmed.ncbi.nlm.nih.gov/?term=2044770,2117608" "kinase substrate" "HPRD" +"42" "(pS|pT)..(E|D|pS|pY)" "CK2_group" "Casein Kinase II substrate motif" "https://pubmed.ncbi.nlm.nih.gov/?term=7735314" "kinase substrate" "HPRD" +"43" "(S|E|P|G)(D|S|N|E|P)(E|D|G|Q|W)(Y|E|D|S|W|T)(W|E|D)pS(D|E)(D|E|W|N)(E|D)(E|D|N|Q)" "CK2_group" "Casein Kinase II substrate motif" "https://pubmed.ncbi.nlm.nih.gov/?term=15782149" "kinase substrate" "HPRD" +"44" "(R|K)pSP(R|P)(R|K|H)" "CDK1" "Cdc2 kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=12586835" "kinase substrate" "HPRD" +"45" "(pS|pT)P.(R|K)" "CDK1" "Cdc2 kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=7874496,9003781" "kinase substrate" "HPRD" +"46" "HHH(R|K)pSPR(R|K)R" "CDK1" "Cdc2 kinase" 
"https://pubmed.ncbi.nlm.nih.gov/?term=7874496" "kinase substrate" "HPRD" +"47" "P.(pS|pT)PKK.KK" "CDK1" "Cdc2 like protein kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=8407912" "kinase substrate" "HPRD" +"48" "(pS|pT)P.(R|K)" "CDK1|CDK2|CDK4|CDK6" "CDK1,2, 4, 6 kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=12501191,10607671" "kinase substrate" "HPRD" +"49" "pSP.(R|K)." "CDK_group" "CDK kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=16273072" "kinase substrate" "HPRD" +"50" "PL(pS|pT)PIP(K|R|H)" "CDK4" "CDK4 kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=9003781" "kinase substrate" "HPRD" +"51" "PL(pS|pT)P.(K|R|H)" "CDK4" "CDK4 kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=9003781" "kinase substrate" "HPRD" +"52" "pTP.K" "CDK5" "CDK5 kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=11684694" "kinase substrate" "HPRD" +"53" "(K|H|G)H(H|P)(K|G|H)pSP(R|K)(H|R|K)(R|H|K)" "CDK5" "CDK5 kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=8887677" "kinase substrate" "HPRD" +"54" "(pS|pT)PG(pS|pT)PGTP" "CDK5" "CDK5 kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=9003781" "kinase substrate" "HPRD" +"55" "(M|I|L|V).(R|K)..(pS|pT)" "CHK1" "Chk1 kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=10648819" "kinase substrate" "HPRD" +"56" "R..(pS|pT)..R" "CLK1" "CLK1 kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=10954422" "kinase substrate" "HPRD" +"57" "(R|K).(R|K).(R|K).pS..R" "CLK1" "CLK1|CLK2 kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=10480872" "kinase substrate" "HPRD" +"58" "R(R|H)(R|H)(R|E)RE(R|H)pSR(R|D)L" "CLK1" "CLK2 kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=11827553" "kinase substrate" "HPRD" +"59" "KK.RRpT(L|V)." "DMPK_group" "DMPK1 kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=12897125" "kinase substrate" "HPRD" +"60" "KKR.RpT(L|V)." "DMPK_group" "DMPK1 kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=12897125" "kinase substrate" "HPRD" +"61" "(R|K).RR.(pS|pT)(L|V)." 
"DMPK_group" "DMPK1 kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=12897125" "kinase substrate" "HPRD" +"62" "R..(pS|pT)(L|V)R" "DMPK_group" "DMPK1|DMPK2 kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=10913253" "kinase substrate" "HPRD" +"63" ".pSQ" "DNA-PK" "DNA dependent Protein kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=1751287" "kinase substrate" "HPRD" +"64" "P(pS|pT)." "DNA-PK" "DNA dependent Protein kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=8086496" "kinase substrate" "HPRD" +"65" "R(R|K)R(E|R)R(E|A)(H|R)pSRR(R|D)(L|E)" "CLK1" "DOA/CDC-like kinase 2 substrate motif" "https://pubmed.ncbi.nlm.nih.gov/?term=11827553" "kinase substrate" "HPRD" +"66" "(I|L|V|F|M)RR..(pS|pT)(I|L|M|V|F)" "DCAMKL1" "Doublecortin kinase-1 kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=12590608" "kinase substrate" "HPRD" +"67" "E.pS.R..R" "HRI|EIF2AK2|EIF2AK3" "elF2 alpha kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=8982275,1671834" "kinase substrate" "HPRD" +"68" "(T|P|S)(G|P|E|Y)(P|L|I)(L|M|P)pSP(G|P|F)(P|F|G|Y)(F|Y|I)" "MAP2K1|MAP2K2|MAP2K_group" "ERK1 kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=8887677" "kinase substrate" "HPRD" +"69" "pTEpY" "MAP2K1|MAP2K2|MAP2K_group" "ERK1 Kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=12646559" "kinase substrate" "HPRD" +"70" "P.(pS|pT)PP" "MAP2K1|MAP2K2|MAP2K_group" "ERK1,2 kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=1939237" "kinase substrate" "HPRD" +"71" "..P.(pS|pT)PPP." 
"MAP2K1|MAP2K2|MAP2K_group" "ERK1,2 kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=1939237" "kinase substrate" "HPRD" +"72" "P.(pS|pT)P" "MAP2K1|MAP2K2|MAP2K_group" "ERK1, ERK2 Kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=9792705" "kinase substrate" "HPRD" +"73" "pSP" "MAP2K1|MAP2K2|MAP2K_group" "ERK1, ERK2 Kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=16273072" "kinase substrate" "HPRD" +"74" "KpSPP" "MAP2K1|MAP2K2|MAP2K_group|CDK5|GSK-3 (HPRD)" "ERK1, ERK2, SAPK, CDK5 and GSK3 kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=12586839" "kinase substrate" "HPRD" +"75" "(D|Y|W|E)(C)(P|S|C|E)(P|C|S|L|T|V)(L|M|T)pS(P|A)(T|S|G|R|C|F)(W|P|S)(W|F)" "MAP2K1|MAP2K_group" "ERK2 kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=15782149" "kinase substrate" "HPRD" +"76" "..(pS|pT)E" "GRK-1" "G protein-coupled receptor kinase 1 substrate motif" "https://pubmed.ncbi.nlm.nih.gov/?term=1645191" "kinase substrate" "HPRD" +"77" ".(pS|pT)...(A|P|S|T)" "GRK-1" "G protein-coupled receptor kinase 1 substrate motif" "https://pubmed.ncbi.nlm.nih.gov/?term=1645191" "kinase substrate" "HPRD" +"78" "(pS|pT)P.(K|R)" "CDK2|MOD_CDK_SPxK_1" "Growth associated histone HI kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=1956339" "kinase substrate|ELM" "HPRD" +"79" "(K|R)(pS|pT)P" "GSK-3 (HPRD)|MAP2K1|MAP2K2|MAP2K_group|CDK5|MOD_ProDKin_1" "Growth associated histone HI kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=1956339" "kinase substrate|ELM" "HPRD" +"80" "(pS|pT)P(K|R)" "CDK2|MOD_CDK_SPK_2" "Growth associated histone HI kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=1956339" "kinase substrate|ELM" "HPRD" +"81" "pS...pS" "GSK-3 (HPRD)" "GSK3 kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=1956339,16141410" "kinase substrate" "HPRD" +"82" "P.pTP" "GSK-3 (HPRD)|MAP2K1|MAP2K2|MAP2K_group|CDK5" "GSK3, Erk1, Erk2 and CDK5 kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=16377132" "kinase substrate" "HPRD" +"83" "R..pSPV" "GSK-3 (HPRD)|MAP2K1|MAP2K2|MAP2K_group|CDK5" "GSK-3, ERK1, ERK2, CDK5 substrate 
motif" "https://pubmed.ncbi.nlm.nih.gov/?term=15358237" "kinase substrate" "HPRD" +"84" "K(pS|pT)P.K" "GSK-3 (HPRD)|MAP2K1|MAP2K2|MAP2K_group|CDK5" "GSK-3, ERK1, ERK2, CDK5 substrate motif" "https://pubmed.ncbi.nlm.nih.gov/?term=9592082" "kinase substrate" "HPRD" +"85" "KpSP...K" "GSK-3 (HPRD)|MAP2K1|MAP2K2|MAP2K_group|CDK5" "GSK-3, ERK1, ERK2, CDK5 substrate motif" "https://pubmed.ncbi.nlm.nih.gov/?term=9592082" "kinase substrate" "HPRD" +"86" "KpSP..K" "GSK-3 (HPRD)|MAP2K1|MAP2K2|MAP2K_group|CDK5" "GSK-3, ERK1, ERK2, CDK5 substrate motif" "https://pubmed.ncbi.nlm.nih.gov/?term=9592082" "kinase substrate" "HPRD" +"87" "KpSP....K" "GSK-3 (HPRD)|MAP2K1|MAP2K2|MAP2K_group|CDK5" "GSK-3, ERK1, ERK2, CDK5 substrate motif" "https://pubmed.ncbi.nlm.nih.gov/?term=9592082" "kinase substrate" "HPRD" +"88" "KpTPAKEE" "GSK-3 (HPRD)|MAP2K1|MAP2K2|MAP2K_group|CDK5" "GSK-3, ERK1, ERK2, CDK5 substrate motif" "https://pubmed.ncbi.nlm.nih.gov/?term=9819213" "kinase substrate" "HPRD" +"89" "P.pSP" "GSK-3 (HPRD)|MAP2K1|MAP2K2|MAP2K_group|CDK5" "GSK-3, ERK1, ERK2, CDK5 substrate motif" "https://pubmed.ncbi.nlm.nih.gov/?term=16020478" "kinase substrate" "HPRD" +"90" ".(pS|pT)P" "GSK-3 (HPRD)|MAP2K1|MAP2K2|MAP2K_group|CDK5" "GSK-3, ERK1, ERK2, CDK5 substrate motif" "https://pubmed.ncbi.nlm.nih.gov/?term=16020478" "kinase substrate" "HPRD" +"91" "..pSP" "GSK-3 (HPRD)|MAP2K1|MAP2K2|MAP2K_group|CDK5" "GSK-3, ERK1, ERK2, CDK5 substrate motif" "https://pubmed.ncbi.nlm.nih.gov/?term=16020478" "kinase substrate" "HPRD" +"93" "GP(Q|M)pSPI" "JNK_group" "JNK1 Kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=15629715" "kinase substrate" "HPRD" +"94" "LRpT" "LKB1" "LKB1 Kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=14985505" "kinase substrate" "HPRD" +"95" "(R|K).R..pS" "RSK-1|RSK-2|RSK_group" "MAPKAPK1 kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=7498520" "kinase substrate" "HPRD" +"96" "RRR.pS" "RSK_group" "MAPKAPK1 kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=7498520" "kinase substrate" "HPRD" 
+"97" "(L|F|I)...R(Q|S|T)L(pS|pT)(M|L|I|V)" "MAPKAPK2" "MAPKAPK2 kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=15807522" "kinase substrate" "HPRD" +"98" "..[^P].R..pS.." "MAPKAPK2" "MAPKAPK2 kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=8280084" "kinase substrate" "HPRD" +"99" "pS...(pS|pT)" "MAPKAPK2" "MAPKAPK2 kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=15629715,8280084" "kinase substrate" "HPRD" +"100" "pT(G|P|E)pY" "MAPK11|MAPK13|MAPK14" "MAPK 11,13,14 Kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=9295308,7535770" "kinase substrate" "HPRD" +"101" "RRFGpS[^P]RRF" "MEKK (HPRD)" "MEKK kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=7874496" "kinase substrate" "HPRD" +"102" "RRFGpS(M|L|V|I|F)RR(M|L|V|I|F)" "MEKK (HPRD)" "MEKK kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=7874496" "kinase substrate" "HPRD" +"103" "KKR..pS.(R|K)(R|K)" "MLCK_group" "MLCK kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=7961752" "kinase substrate" "HPRD" +"104" "FpTY" "mTOR" "mTOR kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=14560963" "kinase substrate" "HPRD" +"105" "IRRLpSTRRR" "NEK2" "Nek 2 kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=8982275,7759549" "kinase substrate" "HPRD" +"106" "(R|N)(F|L|M)(R|K)(R|K)pS(R|I|V|M)(R|I|M|V)(M|I|F|V)(I|F|M)" "NIMA (HPRD)" "NIMA kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=8887677,1416988" "kinase substrate" "HPRD" +"107" "FR.(pS|pT)" "NIMA (HPRD)" "NIMA kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=7759549,8120013" "kinase substrate" "HPRD" +"108" "RF(R|K)(R|K)pS(R|I)(R|I)MI" "NIMA (HPRD)" "NIMA kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=8120013,8982275" "kinase substrate" "HPRD" +"109" "(R|K).R..(pS|pT)(M|L|V|I)" "p70S6K" "p70 Ribosomal S6 kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=7498520,1737763" "kinase substrate" "HPRD" +"110" "VFLGFpTYVAP" "p70S6K" "p70 Ribosomal S6 kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=7498520" "kinase substrate" "HPRD" +"111" "AKRRRLSpSLRA" "PAK1" "PAK1 kinase" 
"https://pubmed.ncbi.nlm.nih.gov/?term=8051089" "kinase substrate" "HPRD" +"112" "VRKRpTLRRL" "PAK1" "PAK1 kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=8051089" "kinase substrate" "HPRD" +"113" "(R|K)(R|.).(pS|pT)" "PAK2" "PAK2 kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=9405039" "kinase substrate" "HPRD" +"114" "F..F(pS|pT)(F|Y)" "PDK-1" "PDK1 kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=11516946" "kinase substrate" "HPRD" +"115" "KRKQIpSVR" "PHK_group" "Phosphorylase kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=8180216" "kinase substrate" "HPRD" +"116" "(F|M|K)(R|K)(M|R|Q|F)(M|F|L|I)pS(F|I|M|L)(F|R|K)(L|I)(F|L|I)" "PHK_group" "Phosphorylase kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=8887677" "kinase substrate" "HPRD" +"117" "(K|R)..pS(V|I)" "PHK_group" "Phosphorylase kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=1956339" "kinase substrate" "HPRD" +"118" "(R|K)(R|K)(R|K).(pS|pT)." "Pim1 (HPRD)" "Pim1 kinase substrate sequence" "https://pubmed.ncbi.nlm.nih.gov/?term=1416988" "kinase substrate" "HPRD" +"119" "(R|K)(R|K|A|Q|P)(R|K)(R|Q|H|N|Y)(P|H|K)pS(G|S|T)(P|S|G|Q|H|S|T)(S|P|Q|G|D)(T|S|P|G)" "Pim2 (HPRD)" "Pim2 kinase substrate sequence" "https://pubmed.ncbi.nlm.nih.gov/?term=15782149" "kinase substrate" "HPRD" +"120" "RR.pS(M|I|L|V|F|Y)" "PKA_group" "PKA kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=8463304,194899" "kinase substrate" "HPRD" +"121" "R.pS" "PKA_group" "PKA kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=1956339" "kinase substrate" "HPRD" +"122" "KR..pS" "PKA_group" "PKA kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=1956339" "kinase substrate" "HPRD" +"123" "R..pS" "PKA_group" "PKA kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=1956339" "kinase substrate" "HPRD" +"124" "(R|K).(pS|pT)" "PKA_group" "PKA kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=1956339" "kinase substrate" "HPRD" +"125" "K..(pS|pT)" "PKA_group" "PKA kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=1956339" "kinase substrate" "HPRD" +"126" "(R|K)(R|K).(pS|pT)" 
"PKA_group" "PKA kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=1956339" "kinase substrate" "HPRD" +"127" "K...(pS|pT)" "PKA_group" "PKA kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=1956339" "kinase substrate" "HPRD" +"128" "(pS|pT).(R|K)" "PKA_group" "PKA kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=1956339" "kinase substrate" "HPRD" +"129" "RRRRpSIIFI" "PKA_group" "PKA kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=7874496" "kinase substrate" "HPRD" +"130" "RR.pS" "PKA_group" "PKA kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=8982275,1848111" "kinase substrate" "HPRD" +"131" "R(R|K).(pS|pT)(I|L|V|F|Y)(D|C|.).D" "PKA_group" "PKA kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=14679191" "kinase substrate" "HPRD" +"132" "RR.pS" "PKA_group" "PKA kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=8163498" "kinase substrate" "HPRD" +"133" "RRR(R|N)pSII(F|D)" "PKA_group" "PKA kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=8887677" "kinase substrate" "HPRD" +"135" "R(R|K).(pS|pT)[^P]" "PKA_alpha|MOD_PKA_1" "PKA, PKG kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=7654713" "kinase substrate|ELM" "HPRD" +"136" "ARKGpSLRQ" "PKC_alpha" "PKC alpha kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=8887677" "kinase substrate" "HPRD" +"137" "R(R|F)RR(R|K)GpSF(R|K)(R|K)" "PKC_alpha" "PKC alpha kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=8887677" "kinase substrate" "HPRD" +"138" "(L|R|F)(R|K)R(K|Q)GpS(F|M)KK.A" "PKC_beta" "PKC beta kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=12566450,10574945" "kinase substrate" "HPRD" +"139" "R.RKGpSF" "PKC_delta" "PKC delta kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=8995387" "kinase substrate" "HPRD" +"140" "KRQGpSVRR" "PKC_epsilon" "PKC epsilon kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=8995387" "kinase substrate" "HPRD" +"141" "R(K|E|R).pS" "PKC_epsilon" "PKC epsilon kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=8995387" "kinase substrate" "HPRD" +"142" "AR..R(R|K)RpSFRR" "PKC_eta" "PKC eta kinase" 
"https://pubmed.ncbi.nlm.nih.gov/?term=8995387" "kinase substrate" "HPRD" +"143" "F..F(pS|pT)(F|Y)" "PKC_group" "PKC family kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=8995387" "kinase substrate" "HPRD" +"145" ".R..(pS|pT).R." "PKC_group" "PKC kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=2473066" "kinase substrate" "HPRD" +"146" "(pS|pT).(R|K)" "PKC_group" "PKC kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=1956339" "kinase substrate" "HPRD" +"147" "(R|K)..(pS|pT)" "PKC_group" "PKC kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=1956339" "kinase substrate" "HPRD" +"148" "(R|K)..(pS|pT).(R|K)" "PKC_group" "PKC kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=1956339" "kinase substrate" "HPRD" +"149" "(K|R).(pS|pT)" "PKC_group" "PKC kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=1956339" "kinase substrate" "HPRD" +"150" "(R|K).(pS|pT).(R|K)" "PKC_group" "PKC kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=1956339" "kinase substrate" "HPRD" +"151" "(L|V)(V|L|A)R(Q|K|E)MpS" "PKD1" "PKC mu kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=8995387" "kinase substrate" "HPRD" +"152" "(R|F|W|M)(W|A|K|S)(R|S|K|H)(R|H|S|Q)(R|K|N|P|G|Q)pS(I|F|R|V|K|S|L|M)(K|M|R|S|T)(R|S|K|W)(R|K|G)" "PKC_theta" "PKC theta kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=15782149" "kinase substrate" "HPRD" +"153" "F.R..pS(F|M)(F|M)" "PKC_zeta" "PKC zeta kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=8995387" "kinase substrate" "HPRD" +"154" "(L|V|I)(R|K|Q)(R|K)(R|K|T|Q|M)(N|K|R|L|M|H)pS(F|W|I|M|L|V)(S|N)(R|S|P|Y|W)(S|R|N|L)" "PKD" "PKD kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=15782149" "kinase substrate" "HPRD" +"155" "R(R|K).(pS|pT)[^P]" "PKA_group|MOD_PKA_1" "PKG kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=7654713" "kinase substrate|ELM" "HPRD" +"156" "R..(pS|pT).R..R" "EIF2AK2" "PKR kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=1671834" "kinase substrate" "HPRD" +"157" "(D|E).(pS|pT)(I|L|V|M).(D|E)" "PLK1" "Plk1 kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=12738781" "kinase 
substrate" "HPRD" +"158" ".pS..D.." "PDHK1" "Pyruvate dehydrogenase kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=3002277" "kinase substrate" "HPRD" +"159" "PLpTLP" "RAF1" "RAF1 kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=8376361" "kinase substrate" "HPRD" +"160" "PLLpTP" "RAF1" "RAF1 kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=8376361" "kinase substrate" "HPRD" +"161" "PLpTP" "RAF1" "RAF1 kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=8376361" "kinase substrate" "HPRD" +"162" "PpTLP" "RAF1" "RAF1 kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=8376361" "kinase substrate" "HPRD" +"163" "PLpTLP" "RAF1" "RAF1 kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=8376361" "kinase substrate" "HPRD" +"164" "PpTLP" "RAF1" "RAF1 kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=8376361" "kinase substrate" "HPRD" +"165" "LpTP" "RAF1" "RAF1 kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=8376361" "kinase substrate" "HPRD" +"166" "KKKKKK(pS|pT)..." "TGF-beta (HPRD)" "TGF beta receptor kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=8524844" "kinase substrate" "HPRD" +"167" "(R|K|Q|N)(M|C|W)(R|T|S|N)(E|D|S|N)(R|K|E|D|N)pS(S|D|E)(S|GC|D)(SM|R|N)(N|H|S|R|C)" "TGF-beta (HPRD)" "TGF beta receptor kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=15782149" "kinase substrate" "HPRD" +"168" "RR..pS" "DAPK3" "ZIP kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=15001356" "kinase substrate" "HPRD" +"169" "KR.RpS" "DAPK3" "ZIP kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=15001356" "kinase substrate" "HPRD" +"170" "KRR.pT" "DAPK3" "ZIP kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=15001356" "kinase substrate" "HPRD" +"171" "pTEY" "DUSP1 P28562" "Dual specificity protein phosphatase 1 substrate motif" "https://pubmed.ncbi.nlm.nih.gov/?term=16183637" "phosphatase substrate" "HPRD" +"172" "pT.pY" "DUSP6 Q16828" "Dual specificity protein phosphatase 6 substrate motif" "https://pubmed.ncbi.nlm.nih.gov/?term=11432864" "phosphatase substrate" "HPRD" +"173" "RRA(pS|pT)VA" "PKA_group|MOD_PKA_1" "PP2A, 
PP2C substrate motif" "https://pubmed.ncbi.nlm.nih.gov/?term=7508382,1653021,3027075" "kinase substrate|ELM" "HPRD" +"174" ".R..pSVA" "Calcineurin (HPRD)" "PP2B substrate motif" "https://pubmed.ncbi.nlm.nih.gov/?term=7508382" "kinase substrate" "HPRD" +"175" ".pT.pY." "Wip1 O15297" "PP2C delta substrate motif" "https://pubmed.ncbi.nlm.nih.gov/?term=15807522" "kinase substrate" "HPRD" +"1" "KCSpTWP" "14-3-3 (HPRD)" "14-3-3 domain binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=12819209" "domain binding" "HPRD" +"2" "R..pS" "14-3-3 (HPRD)" "14-3-3 domain binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=9524113,9341175" "domain binding" "HPRD" +"3" "R.R..pS.P" "14-3-3 (HPRD)" "14-3-3 domain binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=9524113" "domain binding" "HPRD" +"4" "YpTV" "14-3-3 (HPRD)" "14-3-3 domain binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=12196105" "domain binding" "HPRD" +"5" "RS.(pS|pT).P" "14-3-3 (HPRD)" "14-3-3 domain binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=15139812" "domain binding" "HPRD" +"6" "R.(Y|F).pS.P" "LIG_14-3-3_CanoR_1" "14-3-3 domain binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=9428519" "domain binding|ELM" "HPRD" +"7" "RPVSSAApSVY" "14-3-3 (HPRD)" "14-3-3 domain binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=9524113" "domain binding" "HPRD" +"8" "pS(D|E)(D|E)E" "BARD1 Q99728" "BARD1 BRCT domain binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=14578343" "domain binding" "HPRD" +"9" "DpSG..pS" "BTRC WD40" "Beta-TrCP1 domain binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=15070733,12820959" "domain binding" "HPRD" +"10" "pS(F|Y|H)(V|F|Y)(F|Y)" "LIG_BRCT_BRCA1_1" "BRCA1 BRCT domain binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=14578343" "domain binding|ELM" "HPRD" +"11" "(I|L)(I|L|P)pTP(R|K)" "hCDC4 Q969H0" "CDC4 WD40 domain binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=11734846" "domain binding" "HPRD" +"12" "HFDpTYLI" "LIG_FHA_1" "Chk2 FHA domain binding 
motif" "https://pubmed.ncbi.nlm.nih.gov/?term=11106755,12049740" "domain binding|ELM" "HPRD" +"13" "(R|D|H)(L|Y)(L|M)(K|A)pT(Q|L|M|E|V)(K|L|I|R)" "FHA (HPRD)" "FHA domain binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=11106755" "domain binding" "HPRD" +"14" "S(pS|pT)." "MDC1 FHA" "MDC1 BRCT domain binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=14578343" "domain binding" "HPRD" +"15" "S(pS|pT)." "PLK1 PBD" "Plk1 PBD domain binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=15139812,12595692,14532005" "domain binding" "HPRD" +"16" "pSYII" "RAD9 BRCT (HPRD)" "RAD9 BRCT domain binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=14578343" "domain binding" "HPRD" +"17" "(pS|pT)P" "DOC_WW_Pin1_4" "WW domain binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=11607836,11248545,15139812,10037602" "domain binding|ELM" "HPRD" +"1" "pYM.M" "Abl" "Abl kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=8227078" "kinase substrate" "HPRD" +"2" "EDAIpY" "Abl" "Abl kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=8125961" "kinase substrate" "HPRD" +"3" ".VIpYAAPF" "Abl" "Abl kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=8578591" "kinase substrate" "HPRD" +"4" "EAIpYAAPF" "Abl" "Abl kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=11860343,7845468" "kinase substrate" "HPRD" +"5" "EEIpYEEpY" "Abl" "Abl kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=11860343" "kinase substrate" "HPRD" +"6" "E.IpY..P." 
"Abl" "Abl kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=11860343" "kinase substrate" "HPRD" +"7" "EEIpYYYVH" "Abl" "Abl kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=11860343" "kinase substrate" "HPRD" +"8" "ERIpYARTK" "Abl" "Abl kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=11860343" "kinase substrate" "HPRD" +"9" "AEV(I|V|L|F)pYAA(P|F)F" "Abl" "Abl kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=7845468" "kinase substrate" "HPRD" +"10" "pY...YY" "ALK|PLCG1 SH2" "ALK kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=15938644" "kinase substrate" "HPRD" +"11" "pY(D|E).(I|L|V|M)" "ALK" "ALK kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=16273072" "kinase substrate" "HPRD" +"12" "(D|E)..pY" "ALK" "ALK kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=16273072" "kinase substrate" "HPRD" +"13" "pY....(F|Y)" "ALK" "ALK kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=16273072" "kinase substrate" "HPRD" +"14" "EE(D|E)IpYFFFF" "Csk" "CSK kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=9425036" "kinase substrate" "HPRD" +"15" "...IpY(M|I|F)FFF" "Csk" "CSK kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=8982275" "kinase substrate" "HPRD" +"16" "EEEEpYFELV" "EGFR" "EGFR kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=8578591" "kinase substrate" "HPRD" +"17" "(E|D|R|A)(D|E)(D|E)(E|D|I)pY(F|V|I|E)(E|F|D)(L|I|F|V)V" "EGFR" "EGFR kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=7845468,8578591" "kinase substrate" "HPRD" +"18" ".(D|E)pY." 
"EGFR" "EGFR kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=8578591" "kinase substrate" "HPRD" +"19" "pYIPP" "EGFR" "EGFR kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=12522132" "kinase substrate" "HPRD" +"20" ".(D|E)pY(I|L|V)" "EGFR" "EGFR kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=8578591" "kinase substrate" "HPRD" +"21" "EEEIpYEEIE" "Fes" "Fes kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=8578591" "kinase substrate" "HPRD" +"22" "(E|A|D)(E|A)(E|A)(I|E|V)pY(D|E)(D|E)(I|V|E)(E|I|V)" "Fes" "Fes kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=7845468" "kinase substrate" "HPRD" +"23" "EEEpYFFLF" "FGFR (HPRD)" "FGFR kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=8578591" "kinase substrate" "HPRD" +"24" "A(E|A)EEpY(F|V)F(L|F|M|I|V)F" "FGFR (HPRD)" "FGFR kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=7845468,8578591" "kinase substrate" "HPRD" +"25" "ME(E|N)(I|V)pY(G|E)IFF" "Fgr" "Fgr kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=8982275" "kinase substrate" "HPRD" +"26" "KSPGEpYVNIEFG" "IGF1R|INSR" "IGF1 receptor kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=8530377" "kinase substrate" "HPRD" +"27" "pYM.M" "INSR" "Insulin receptor kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=8227078,1312712" "kinase substrate" "HPRD" +"28" "EE(E|N|D)pY(M|F)(M|F)(M|F|I|E)(M|F)" "INSR" "Insulin receptor kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=7845468,8578591" "kinase substrate" "HPRD" +"29" ".EEEpYMMMM" "INSR" "Insulin receptor kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=8578591" "kinase substrate" "HPRD" +"30" "KKSRGDpYMTMQIG" "INSR" "Insulin receptor kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=8530377,1312712" "kinase substrate" "HPRD" +"31" "KKKLPATGDpYMNMSPVGD" "INSR" "Insulin receptor kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=8530377,1312712" "kinase substrate" "HPRD" +"32" "pY..(L|I|V)" "JAK2" "JAK2 kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=15143187" "kinase substrate" "HPRD" +"33" "pTPpY" "MAP2K7|MAP2K6" "JNK kinase" 
"https://pubmed.ncbi.nlm.nih.gov/?term=11390361" "kinase substrate" "HPRD" +"34" ".E.IpYGVLF" "Lck" "Lck kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=8578591" "kinase substrate" "HPRD" +"35" "E.(I|V|L|F)pY(G|A)V(L|V|F|I)(F|L|V|I)" "Lck" "Lck kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=7845468" "kinase substrate" "HPRD" +"36" "DEEIpY(E|G)EL." "Lyn" "Lyn kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=8982275" "kinase substrate" "HPRD" +"37" "(D|E).......(D|E)..pY..L.......Y..(L|I)" "Lyn" "Lyn kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=7526393,10452987" "kinase substrate" "HPRD" +"38" "EEEEpYVFI." "PDGFR_group" "PDGFR kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=8578591" "kinase substrate" "HPRD" +"39" "(L|N)(R|I)TpY" "PDGFR_group" "PDGFR kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=8843147" "kinase substrate" "HPRD" +"40" "(D|E)(D|E)(D|E)(D|E)pY(V|E|I)F(I|V|F)" "PDGFR_group" "PDGFR kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=7845468,8578591" "kinase substrate" "HPRD" +"41" "(D|E).......(D|E)..pY..L.......Y..(L|I)" "SRC_group" "Src family kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=7526393,10452987" "kinase substrate" "HPRD" +"42" "(I|V|L|S).pY..(L|I)" "SRC_group" "Src family kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=9469421" "kinase substrate" "HPRD" +"43" "pYM.M" "SRC" "Src kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=8227078" "kinase substrate" "HPRD" +"44" "YIpYGSFK" "SRC" "Src kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=7558590" "kinase substrate" "HPRD" +"45" "EEEIpY(G|E)EFD" "SRC" "Src kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=8578591" "kinase substrate" "HPRD" +"46" "D(D|E)(E|D|G)(I|V|L)pY(G|E)E(F|I)F" "SRC" "Src kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=7845468" "kinase substrate" "HPRD" +"47" "(D|E).......(D|E)..pY..L.......Y..(L|I)" "SRC" "Src kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=7612891,15173175" "kinase substrate" "HPRD" +"48" "(D|E)(D|E)(E|D|G)(I|V|L)pY(G|E|D)E(F|I|L|V)(D|E)" "SRC" "Src kinase" 
"https://pubmed.ncbi.nlm.nih.gov/?term=7845468" "kinase substrate" "HPRD" +"49" "pY(A|G|S|T|D|E)" "SRC" "Src kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=16273072" "kinase substrate" "HPRD" +"50" "(E|D|pT|pY).pYEE" "SYK" "Syk kinase" "https://pubmed.ncbi.nlm.nih.gov/?term=8982275" "kinase substrate" "HPRD" +"51" "(D|E)pYpY(R|K)" "PTP1B (HPRD)" "PTP1B phosphatase substrate motif" "https://pubmed.ncbi.nlm.nih.gov/?term=11694501,11163213" "phosphatase substrate" "HPRD" +"52" "EFpY(G|A)TY(G|A)" "PTP1B (HPRD)" "PTP1B phosphatase substrate motif" "https://pubmed.ncbi.nlm.nih.gov/?term=14578355" "phosphatase substrate" "HPRD" +"53" "E(Y|F|D)pYM" "PTP1B (HPRD)" "PTP1B phosphatase substrate motif" "https://pubmed.ncbi.nlm.nih.gov/?term=12237455" "phosphatase substrate" "HPRD" +"54" "(E|P)(M|L|I|V|F)pY(G|A).(M|L|I|V|F|Y)A" "PTP1B (HPRD)" "PTP1B phosphatase substrate motif" "https://pubmed.ncbi.nlm.nih.gov/?term=14578355" "phosphatase substrate" "HPRD" +"55" "RD.Y.TDYpYR" "PTP1B (HPRD)" "PTP1B phosphatase substrate motif" "https://pubmed.ncbi.nlm.nih.gov/?term=12237455" "phosphatase substrate" "HPRD" +"56" "E(F|D|Y)pY" "PTP1B (HPRD)" "PTP1B phosphatase substrate motif" "https://pubmed.ncbi.nlm.nih.gov/?term=9527843,9843364,12237455" "phosphatase substrate" "HPRD" +"57" "DpYpYR" "PTPN6 SH2|PTPN11 SH2" "PTP1B, TC-PTP phosphatase substrate motif" "https://pubmed.ncbi.nlm.nih.gov/?term=11163213" "phosphatase substrate" "HPRD" +"58" "(D|E)FpY(G|A)(F|Y)(A|G)" "PTPRH SH2 (HPRD)" "PTPRH phosphatase substrate motif" "https://pubmed.ncbi.nlm.nih.gov/?term=14578355" "phosphatase substrate" "HPRD" +"59" "F(M|L|V|I)pY" "PTPRJ SH2 (HPRD)" "PTPRJ phosphatase substrate motif" "https://pubmed.ncbi.nlm.nih.gov/?term=14578355" "phosphatase substrate" "HPRD" +"60" "(D|E).(L|I|V).pY..(L|I|V)" "PTPN6 SH2" "SHP1 phosphatase substrate motif" "https://pubmed.ncbi.nlm.nih.gov/?term=10660565" "phosphatase substrate" "HPRD" +"61" "(D|E).(L|I|V)..pY..(L|I|V)" "PTPN6 SH2" "SHP1 phosphatase substrate 
motif" "https://pubmed.ncbi.nlm.nih.gov/?term=10660565" "phosphatase substrate" "HPRD" +"62" "(D|E)(D|E)(D|E|L).pY..(F|M|L|V|I)(D|E)" "PTPN6 SH2" "SHP1 phosphatase substrate motif" "https://pubmed.ncbi.nlm.nih.gov/?term=14699166" "phosphatase substrate" "HPRD" +"63" "(D|E).pY" "PTPN6 SH2" "SHP1 phosphatase substrate motif" "https://pubmed.ncbi.nlm.nih.gov/?term=11994017" "phosphatase substrate" "HPRD" +"64" "(E|P)(F|I|L)pYA.(F|I|L|V)" "PTPN6 SH2" "SHP1 phosphatase substrate motif" "https://pubmed.ncbi.nlm.nih.gov/?term=14578355" "phosphatase substrate" "HPRD" +"65" "pYIDL" "PTPN11 SH2" "SHP2 phosphatase substrate motif" "https://pubmed.ncbi.nlm.nih.gov/?term=7504175,9756938" "phosphatase substrate" "HPRD" +"66" "pYASI" "PTPN11 SH2" "SHP2 phosphatase substrate motif" "https://pubmed.ncbi.nlm.nih.gov/?term=16020478,9756938" "phosphatase substrate" "HPRD" +"67" "EFpYA.(V|I)G(R|K|H)S" "PTPN11 SH2" "SHP2 phosphatase substrate motif" "https://pubmed.ncbi.nlm.nih.gov/?term=14578355" "phosphatase substrate" "HPRD" +"68" "(D|E)(D|E)...pYVA" "TC-PTP SH2 (HPRD)" "TC-PTP phosphatase substrate motif" "https://pubmed.ncbi.nlm.nih.gov/?term=7678807" "phosphatase substrate" "HPRD" +"69" "(E|D|Y)pY" "TC-PTP SH2 (HPRD)" "TC-PTP phosphatase substrate motif" "https://pubmed.ncbi.nlm.nih.gov/?term=11352902,12237455" "phosphatase substrate" "HPRD" +"1" "pY(E|M|V)(N|V|I)" "SH3BP2 SH2" "3BP2 SH2 domain binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=7511210" "domain binding" "HPRD" +"2" "pYENP" "ABL1 SH2" "Abl SH2 domain binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=7511210,7680959" "domain binding" "HPRD" +"3" "pY..P" "CRK SH2" "Crk SH2 domain binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=11994738,11607838,7511210" "domain binding" "HPRD" +"4" "pYDHP" "CRK SH2" "Crk SH2 domain binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=11994738,7680959" "domain binding" "HPRD" +"5" "pY(T|A|S)(K|R|Q|N)(M|I|V|R)" "Csk SH2" "Csk SH2 domain binding motif" 
"https://pubmed.ncbi.nlm.nih.gov/?term=11994738" "domain binding" "HPRD" +"6" "pY(Y|I|V)N(F|L|I|V)" "GRB2 SH2" "Grb2 SH2 domain binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=7511210" "domain binding" "HPRD" +"7" "pYE.(V|I)" "FES SH2" "Fes SH2 domain binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=11994738,7511210" "domain binding" "HPRD" +"8" "pYEE(I|V)" "FGR SH2" "Fgr SH2 domain binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=7680959" "domain binding" "HPRD" +"9" "pYEDP" "Fyn SH2" "Fyn SH2 domain binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=8622893" "domain binding" "HPRD" +"10" "pY(M|I|L|V).(M|I|L|V)" "FES SH2|SH3BP2 SH2|Csk SH2|GRB2 SH2|SYK SH2" "GRB2, 3BP2, Csk, Fes, Syk C-terminal SH2 domain binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=7511210,7680959" "domain binding" "HPRD" +"11" "pY(Q|Y|V)N(Y|Q|F)" "GRB2 SH2" "Grb2 SH2 domain binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=7511210" "domain binding" "HPRD" +"12" "pY.N" "GRB2 SH2" "Grb2 SH2 domain binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=7511210,11994738" "domain binding" "HPRD" +"13" "(F|Y)pY(E|T|Y|S)N(I|L|V|P|T|Y|S)" "GRB7 SH2|GRB10 SH2" "GRB7, GRB10 SH2 domain binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=14679191" "domain binding" "HPRD" +"14" "pYF.(F|P|L|Y)" "PTPN6 SH2" "HCP SH2 domain binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=7511210" "domain binding" "HPRD" +"15" "pY(A|E|V)(Y|F|E|S|N|V)(P|F|I|H)" "ITK SH2" "Itk SH2 domain binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=10636929" "domain binding" "HPRD" +"16" "pYDYV" "Lck SH2|Src SH2" "Lck and Src SH2 domain binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=16245368" "domain binding" "HPRD" +"17" "pYDEP" "NCK SH2" "Nck SH2 domain binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=7511210,7680959" "domain binding" "HPRD" +"18" "pYM.M" "PIK3R1 SH2" "PI3 Kinase p85 SH2 domain binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=7511210,11994738" "domain binding" 
"HPRD" +"19" "pY..M" "PIK3R1 SH2" "PI3 Kinase p85 SH2 domain binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=1380456" "domain binding" "HPRD" +"20" "pYMPMS" "PIK3R1 SH2" "PI3 Kinase p85 SH2 domain binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=16020478" "domain binding" "HPRD" +"21" "pY(L|I|V)E(L|I|V)" "PLCG1 SH2|PTPN11 SH2" "PLCgamma C and N-terminal SH2 domain binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=7511210,7680959" "domain binding" "HPRD" +"22" "pY..P" "RASA_group SH2" "RasGAP C-terminal SH2 domain binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=9233798" "domain binding" "HPRD" +"23" "pYILV.(M|L|I|V|P)" "RASA_group SH2" "RasGAP N-terminal SH2 domain binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=9233798" "domain binding" "HPRD" +"24" "TIpY..(V|I)" "SH2D1A SH2|SH2D1B SH2" "SAP and EAT2 SH2 domain binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=10549287" "domain binding" "HPRD" +"25" "pY(L|V)N(V|P)" "GRB2 SH2|STAT3 SH2" "Sem5 SH2 domain binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=7680959" "domain binding" "HPRD" +"26" "pY(T|V|I).L" "SHB SH2" "Shb SH2 domain binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=7537362" "domain binding" "HPRD" +"27" "pY(I|E|Y|L).(I|L|M)" "SHC_group SH2|SHC1 SH2|SHC2 SH2" "SHC SH2 domain binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=7511210" "domain binding" "HPRD" +"28" "(I|V|L|S).pY..(L|I)" "PTPN11 SH2|PTPN6 SH2" "SHIP2 SH2 domain binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=10789675" "domain binding" "HPRD" +"29" "(V|I|L).pYA.(L|V)" "PTPN6 SH2" "SHP1 C-terminal SH2 domain binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=11052678" "domain binding" "HPRD" +"30" "..pYYM(K|R)" "PTPN6 SH2" "SHP1 C-terminal SH2 domain binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=11052678" "domain binding" "HPRD" +"31" "L(Y|H)pY(M|F).(F|M)" "PTPN6 SH2" "SHP1 N-terminal SH2 domain binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=11052678" "domain binding" "HPRD" 
+"32" "L.pYA.L" "PTPN6 SH2" "SHP1 N-terminal SH2 domain binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=11052678" "domain binding" "HPRD" +"33" "(I|V).pY..(L|V)" "PTPN6 SH2" "SHP1 SH2 domain binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=9148918" "domain binding" "HPRD" +"34" "(V|I|L).pY(M|L|F).P" "PTPN6 SH2|PTPN11 SH2" "SHP1, SHP2 SH2 domain binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=16274240" "domain binding" "HPRD" +"35" "(T|V|I|Y).pY(A|S|T|V).(I|V|L)" "PTPN11 SH2" "SHP2 CSH2 domain binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=16274240" "domain binding" "HPRD" +"36" "(I|L|V)(I|L|V)(I|L|V|F|T|Y)pY(T|I|L|V)(I|L)(I|L|V|P)" "PTPN11 SH2" "SHP2 C-terminal SH2 domain binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=14679191" "domain binding" "HPRD" +"37" "(H|F).V.(T|S|A)pY" "PTPN11 SH2" "SHP2 N-terminal SH2 domain binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=16142918" "domain binding" "HPRD" +"38" "(I|V|L).pY(F|M).P" "PTPN11 SH2" "SHP2 N-terminal SH2 domain binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=16274240" "domain binding" "HPRD" +"39" "pY(I|V).(I|V)" "PTPN11 SH2" "SHP2 N-terminal SH2 domain binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=7680959" "domain binding" "HPRD" +"40" "(I|L|V|M).pY(T|V|A).(I|V|L|F)" "PTPN11 SH2" "SHP2 N-terminal SH2 domain binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=16274240" "domain binding" "HPRD" +"41" "(I|V).pY(L|M|T)Y(A|P|T)SG" "PTPN11 SH2" "SHP2 N-terminal SH2 domain binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=16274240" "domain binding" "HPRD" +"42" "W(M|T|V)pY(Y|R)(I|L)." 
"PTPN11 SH2" "SHP2 N-terminal SH2 domain binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=11052678" "domain binding" "HPRD" +"43" "pYIPP" "PLCG1 SH2|PTPN11 SH2" "SHP2, PLCgamma SH2 domain binding motifs" "https://pubmed.ncbi.nlm.nih.gov/?term=9516477" "domain binding" "HPRD" +"44" "pYM.M" "PIK3R1 SH2|Src SH2" "Src and Abl SH2 domain binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=8227078" "domain binding" "HPRD" +"45" "pY(R|K|H|Q|E|D)(R|K|H|Q|E|D)(I|P)" "Src SH2|Fyn SH2|Lck SH2|FGR SH2|ABL1 SH2|CRK SH2|NCK SH2" "Src, Fyn, Lck, Fgr, Abl, Crk, Nck SH2 domain binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=7680959" "domain binding" "HPRD" +"46" "PP.pY" "Src SH2|Fyn SH2|Lck SH2|Csk SH2|NCK SH2|SHC1 SH2" "Src, Fyn,Csk, Nck and SHC SH2 domain binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=11724572" "domain binding" "HPRD" +"47" "pYEEI" "Src SH2|Fyn SH2|Lck SH2" "Src,Lck and Fyn SH2 domains binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=8578591,7680959" "domain binding" "HPRD" +"48" "pY(D|E)(P|R)(R|P|Q)" "STAT1 SH2" "STAT1 SH2 domain binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=12591923" "domain binding" "HPRD" +"49" "pY..Q" "STAT3 SH2" "STAT3 SH2 domain binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=14966128" "domain binding" "HPRD" +"50" "pY(M|L|V|I|F)(P|R|K|H)Q" "STAT3 SH2" "STAT3 SH2 domain binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=12591923" "domain binding" "HPRD" +"51" "pY(Q|T|E)(E|Q)(L|I)" "SYK SH2" "Syk C-terminal SH2 domain binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=7511210" "domain binding" "HPRD" +"52" "pYTT(I|L|M)" "SYK SH2" "Syk N-terminal SH2 domain binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=8578591" "domain binding" "HPRD" +"53" "(D|E).......(D|E)..pY..L.......Y..(L|I)" "SYK SH2|SHC_group SH2|Lyn SH2|ZAP70" "Syk, ZAP-70, Shc, Lyn SH2 domain binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=10452987" "domain binding" "HPRD" +"54" "pYEN(F|I|V)" "FES SH2|SH3BP2 SH2" "Tensin 
SH2 domain binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=8578591" "domain binding" "HPRD" +"55" "pY(M|L|E)EP" "VAV1 SH2|VAV2 SH2|VAV_group SH2" "Vav SH2 domain binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=7511210" "domain binding" "HPRD" +"56" "pYESP" "VAV1 SH2|VAV2 SH2|VAV_group SH2" "Vav SH2 domain binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=9151714" "domain binding" "HPRD" +"57" "D(N|D).pY" "CBL PTB" "Cbl PTB domain binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=9407100,10078535" "domain binding" "HPRD" +"58" "N.LpY" "DOK_group PTB" "Dok1 PTB domain binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=14607833,12665569" "domain binding" "HPRD" +"59" "N..pY" "FRS2 PTB" "FRIP PTB domain binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=9697832" "domain binding" "HPRD" +"60" "NP.pY" "SHC1 PTB" "Shc PTB domain binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=7542744,7541030" "domain binding" "HPRD" +"61" "DD.pY" "SHB SH2" "Shb PTB domain binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=9484780" "domain binding" "HPRD" +"62" "NP.pYF.R" "ShcA PTB" "ShcA PTB domain binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=8662772" "domain binding" "HPRD" +"63" "HN(M|L|V|I)(M|L|V|I|N)NP(S|T)pY" "ShcC PTB" "ShcC PTB domain binding motif" "https://pubmed.ncbi.nlm.nih.gov/?term=8662772" "domain binding" "HPRD" +"1" "R.(pS|pT)" "PKA_group" "PKA" "https://pubmed.ncbi.nlm.nih.gov/1956339" "kinase substrate" "Phosida" +"2" "R(R|K).(pS|pT)" "PKA_group" "PKA" "https://pubmed.ncbi.nlm.nih.gov/1956339" "kinase substrate" "Phosida" +"3" "KR..(pS|pT)" "PKA_group" "PKA" "https://pubmed.ncbi.nlm.nih.gov/1956339" "kinase substrate" "Phosida" +"4" "S..(pS|pT)" "CK1_group" "CK1" "https://pubmed.ncbi.nlm.nih.gov/1956339" "kinase substrate" "Phosida" +"5" "(S|T)...pS" "CK1_group" "CK1" "https://pubmed.ncbi.nlm.nih.gov/1956339" "kinase substrate" "Phosida" +"6" "(pS|pT)..E" "CK2_group" "CK2" "https://pubmed.ncbi.nlm.nih.gov/1956339" "kinase 
substrate" "Phosida" +"7" "pS...S" "GSK3" "GSK3" "https://pubmed.ncbi.nlm.nih.gov/2156841" "kinase substrate" "Phosida" +"8" "(pS|pT)P.(K|R)" "CDK2" "CDK2" "https://pubmed.ncbi.nlm.nih.gov/1956339" "kinase substrate" "Phosida" +"9" "R..(pS|pT)" "CaM-KII_group" "CAMK2" "https://pubmed.ncbi.nlm.nih.gov/1956339" "kinase substrate" "Phosida" +"10" "R..(pS|pT)V" "CaM-KII_group" "CAMK2" "https://pubmed.ncbi.nlm.nih.gov/1956339" "kinase substrate" "Phosida" +"11" "P.(pS|pT)P" "MAP2K_group" "ERK/MAPK" "https://pubmed.ncbi.nlm.nih.gov/8325833" "kinase substrate" "Phosida" +"12" "V.(pS|pT)P" "MAP2K_group" "ERK/MAPK" "https://pubmed.ncbi.nlm.nih.gov/8325833" "kinase substrate" "Phosida" +"13" "PE(pS|pT)P" "MAP2K_group" "ERK/MAPK" "https://pubmed.ncbi.nlm.nih.gov/8325833" "kinase substrate" "Phosida" +"14" "R(R|S|T).(pS|pT).(S|T)" "PKB_group" "PKB/AKT" "https://pubmed.ncbi.nlm.nih.gov/15789031" "kinase substrate" "Phosida" +"15" "R.R..(pS|pT)" "PKB_group" "PKB/AKT" "https://pubmed.ncbi.nlm.nih.gov/15789031" "kinase substrate" "Phosida" +"16" "R..(pS|pT).R" "PKC_group" "PKC" "https://pubmed.ncbi.nlm.nih.gov/15782149" "kinase substrate" "Phosida" +"17" "(L|V|I).(R|K)..(pS|pT)" "PKD" "PKD" "https://pubmed.ncbi.nlm.nih.gov/15782149" "kinase substrate" "Phosida" +"18" "(I|E|V)pY(E|G)(E|D|P|N)(I|V|L)" "Lck" "LCK" "https://pubmed.ncbi.nlm.nih.gov/7845468" "kinase substrate" "Phosida" +"19" "(I|V|L)pY..(P|F)" "ABL1" "ABL" "https://pubmed.ncbi.nlm.nih.gov/7845468" "kinase substrate" "Phosida" +"20" "(E|D)..pY..(D|E|A|G|S|T)" "SRC_group" "SRC" "https://pubmed.ncbi.nlm.nih.gov/16273072" "kinase substrate" "Phosida" +"21" "pY..(I|L|V|M)" "ALK" "ALK" "https://pubmed.ncbi.nlm.nih.gov/16273072" "kinase substrate" "Phosida" +"22" "(D|P|S|A|E|N).pY(V|L|D|E|I|N|P)" "EGFR" "EGFR" "https://pubmed.ncbi.nlm.nih.gov/16381900" "kinase substrate" "Phosida" +"23" "(pS|pT)P.(K|R)" "CDK1" "CDK1" "https://pubmed.ncbi.nlm.nih.gov/12501191" "kinase substrate" "Phosida" +"24" "(pS|pT)P(K|R)" "CDK1" "CDK1" 
"https://pubmed.ncbi.nlm.nih.gov/12501191" "kinase substrate" "Phosida" +"25" "(R|K).(pS|pT)(I|L|V)" "Aurora A" "AURORA" "https://pubmed.ncbi.nlm.nih.gov/12408861" "kinase substrate" "Phosida" +"26" "(R|K|N)R.(pS|pT)(M|L|V|I)" "Aurora A" "AURORA-A" "https://pubmed.ncbi.nlm.nih.gov/16083426" "kinase substrate" "Phosida" +"27" "(D|E).(pS|pT)(V|I|L|M).(D|E)" "PLK" "PLK" "https://pubmed.ncbi.nlm.nih.gov/12738781" "kinase substrate" "Phosida" +"28" "(E|D).(pS|pT)(F|L|I|Y|W|V|M)" "PLK" "PLK1" "https://pubmed.ncbi.nlm.nih.gov/12738781" "kinase substrate" "Phosida" +"29" "L..(pS|pT)" "NEK6" "NEK6" "https://pubmed.ncbi.nlm.nih.gov/12023960" "kinase substrate" "Phosida" +"30" "L.R..(pS|pT)" "CHK1" "CHK1/2" "https://pubmed.ncbi.nlm.nih.gov/17464182" "kinase substrate" "Phosida" +"31" "(M|I|L|V).(R|K)..(pS|pT)" "CHK1" "CHK1" "https://pubmed.ncbi.nlm.nih.gov/10648819" "kinase substrate" "Phosida" +"32" "F..F(pS|pT)(F|Y)" "PDK1" "PDK1" "https://pubmed.ncbi.nlm.nih.gov/11516946" "kinase substrate" "Phosida" +"33" "(F|L|M)(R|K)(R|K)(pS|pT)" "NIMA" "NIMA" "https://pubmed.ncbi.nlm.nih.gov/8887677" "kinase substrate" "Phosida"
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_input_for_anova.tabular Mon Jul 11 19:22:25 2022 +0000 @@ -0,0 +1,24 @@ +Phosphopeptide Sequence10 Sequence7 Gene_Name Phosphoresidue UniProt_ID Description Function Phosphoresidue(PSP=PhosphoSitePlus.org) Putative Upstream Kinases(PSP=PhosphoSitePlus.org)/Phosphatases/Binding Domains Intensity.shL.1A Intensity.shL.1B Intensity.shL.1C Intensity.shR.2A Intensity.shR.2B Intensity.shR.2C +AAAAPDSRVpSEEENLK MAAAAPDSRVpSEEENLKKTPK AAPDSRVsEEENLKK RRP15 pS11 Q9Y3B9 RRP15_HUMAN RRP15-like protein OS=Homo sapiens OX=9606 GN=RRP15 PE=1 SV=2 N/A CK2alpha | BARD1 Q99728 38150000 39445000 56305000 55338000 7010600 70203000 +AAAITDMADLEELSRLpSPLPPGpSPGSAAR MADLEELSRLpSPLPPGSPGSA; LSRLSPLPPGpSPGSAARGRAE LEELSRLsPLPPGSP | LSPLPPGsPGSAARG AEBP2; AEBP2 pS18, pS24; pS18, pS24 Q6ZN18; Q6ZN18-2 AEBP2_HUMAN Zinc finger protein AEBP2 OS=Homo sapiens OX=9606 GN=AEBP2 PE=1 SV=2; AEBP2_HUMAN Isoform 2 of Zinc finger protein AEBP2 OS=Homo sapiens OX=9606 GN=AEBP2 N/A N/A 5416400 7101800 385280000 208060000 41426000 352400000 +ADALQAGASQFETpSAAK LQAGASQFETpSAAKLKRKYWW GASQFETsAAKLKRK VAMP2; VAMP3 pS80; pS63 P63027; Q15836 VAMP2_HUMAN_Vesicle-associated membrane protein 2 OS=Homo sapiens OX=9606 GN=VAMP2 PE=1 SV=3; VAMP3_HUMAN_Vesicle-associated membrane protein 3 OS=Homo sapiens OX=9606 GN=VAMP3 PE=1 SV=3 N/A PKD3 | PKCiota 44627000 41445000 69094000 42521000 5738000 61819000 +DQKLpSELDDR DKVLERDQKLpSELDDRADALQ LERDQKLsELDDRAD VAMP1; VAMP1; VAMP1; VAMP2; VAMP3 pS63; pS63; pS63; pS61; pS44 P23763; P23763-2; P23763-3; P63027; Q15836 VAMP1_HUMAN_Vesicle-associated membrane protein 1 OS=Homo sapiens OX=9606 GN=VAMP1 PE=1 SV=1; VAMP1_HUMAN_Isoform 3 of Vesicle-associated membrane protein 1 OS=Homo sapiens OX=9606 GN=VAMP1; VAMP1_HUMAN_Isoform 2 of Vesicle-associated membrane protein 1 OS=Homo sapiens OX=9606 GN=VAMP1; VAMP2_HUMAN_Vesicle-associated membrane protein 2 OS=Homo sapiens OX=9606 GN=VAMP2 PE=1 SV=3; 
VAMP3_HUMAN_Vesicle-associated membrane protein 3 OS=Homo sapiens OX=9606 GN=VAMP3 PE=1 SV=3 N/A CK2alpha | PKAbeta | PKAgamma | PKCiota | PDHK1 75542000 44814000 32924000 35016000 11023000 4669900 +EFVpSSDESSSGENK SESFKSKEFVpSSDESSSGENK FKSKEFVsSDESSSG SSRP1 pS667 Q08945 SSRP1_HUMAN FACT complex subunit SSRP1 OS=Homo sapiens OX=9606 GN=SSRP1 PE=1 SV=1 N/A CK2alpha | CK2a2 | CDK7 | GSK3 12562000 16302000 23000000 7857800 0 18830000 +EGMNPSYDEYADpSDEDQHDAYLER MNPSYDEYADpSDEDQHDAYLE SYDEYADsDEDQHDA SSRP1 pS444 Q08945 SSRP1_HUMAN FACT complex subunit SSRP1 OS=Homo sapiens OX=9606 GN=SSRP1 PE=1 SV=1 N/A CK2alpha | CK2a2 | CDK7 | CK1alpha | GRK-2 | PDHK1 0 0 0 0 0 0 +IGNEEpSDLEEACILPHpSPINVDK DDEEKIGNEEpSDLEEACILPH; DLEEACILPHpSPINVDKRPIA EKIGNEEsDLEEACI | EACILPHsPINVDKR HERC2 pS1577, pS1588 O95714 HERC2_HUMAN E3 ubiquitin-protein ligase HERC2 OS=Homo sapiens OX=9606 GN=HERC2 PE=1 SV=2 N/A CK2alpha | GRK-2 | DOC_WW_Pin1_4 | NEK6 167764000 121218000 155736000 140640000 83642000 128468000 +IRAEEEDLAAVPFLApSDNEEEEDEK EDLAAVPFLApSDNEEEEDEKG AAVPFLAsDNEEEED HERC2 pS2928 O95714 HERC2_HUMAN E3 ubiquitin-protein ligase HERC2 OS=Homo sapiens OX=9606 GN=HERC2 PE=1 SV=2 N/A CK2alpha 22562000 18225000 9119700 11689000 0 0 +KGLLApTpSGNDGTIR VWCNKKGLLApTSGNDGTIRVW; WCNKKGLLATpSGNDGTIRVWN NKKGLLAtSGNDGTI | KKGLLATsGNDGTIR HERC1 pT3445, pS3446 Q15751 HERC1_HUMAN Probable E3 ubiquitin-protein ligase HERC1 OS=Homo sapiens OX=9606 GN=HERC1 PE=1 SV=2 N/A N/A 7843600 0 241700000 0 0 10042600 +KpSSLVTSK PTPQDLPQRKpSSLVTSKLAGG; PTPQDLPQRKpSSLVTSKLAG QDLPQRKsSLVTSKL ENSA; ENSA; ENSA; ENSA; ENSA; ENSA; ENSA; ENSA pS108; pS108; pS124; pS131; pS104; pS104; pS120; pS124 O43768; O43768-2; O43768-3; O43768-4; O43768-5; O43768-6; O43768-7; O43768-9 ENSA_HUMAN Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA PE=1 SV=1; ENSA_HUMAN Isoform 2 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 3 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 4 of 
Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 5 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 6 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 7 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 9 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA N/A N/A 0 0 18629000 0 0 0 +KSpSLVTSK TPQDLPQRKSpSLVTSKLAGGQ; TPQDLPQRKSpSLVTSKLAG DLPQRKSsLVTSKLA ENSA; ENSA; ENSA; ENSA; ENSA; ENSA; ENSA; ENSA pS109; pS109; pS125; pS132; pS105; pS105; pS121; pS125 O43768; O43768-2; O43768-3; O43768-4; O43768-5; O43768-6; O43768-7; O43768-9 ENSA_HUMAN Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA PE=1 SV=1; ENSA_HUMAN Isoform 2 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 3 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 4 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 5 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 6 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 7 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 9 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA molecular association, regulation; protein conformation; SNCA(DISRUPTS) MDC1 FHA | GSK3 | PLK1 PBD 7090300 8341200 9691500 10030000 1675200 9952100 +LpSPNPWQEK MLAVDIEDRLpSPNPWQEKREI VDIEDRLsPNPWQEK HERC2 pS3462 O95714 HERC2_HUMAN E3 ubiquitin-protein ligase HERC2 OS=Homo sapiens OX=9606 GN=HERC2 PE=1 SV=2 N/A DOC_WW_Pin1_4 0 11706000 12495000 0 7273000 8877800 +NLLEDDpSDEEEDFFLR SERRNLLEDDpSDEEEDFFLRG RNLLEDDsDEEEDFF VAMP4 pS30 O75379 VAMP4_HUMAN_Vesicle-associated membrane protein 4 OS=Homo sapiens OX=9606 GN=VAMP4 PE=1 SV=2 N/A CK2alpha | GRK-2 | BARD1 Q99728 | Csnk2a1 1592100000 973800000 1011600000 1450300000 631970000 878760000 +pSQKQEEENPAEETGEEK MpSQKQEEENPAE ______MsQKQEEEN ENSA; ENSA; ENSA; ENSA; ENSA; ENSA pS2; pS2; pS2; pS2; pS2; pS2 O43768; O43768-2; 
O43768-3; O43768-4; O43768-8; O43768-9 ENSA_HUMAN Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA PE=1 SV=1; ENSA_HUMAN Isoform 2 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 3 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 4 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 8 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 9 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA N/A N/A 0 0 8765300 0 2355900 14706000 +pTYVDPFTpYEDPNQAVR EEKHLNQGVRpTYVDPFTYEDP; GVRTYVDPFTpYEDPNQAVREF HLNQGVRtYVDPFTY | TYVDPFTyEDPNQAV EPHA4; EPHA4 pT595, pY602; pT544, pY551 P54764; P54764-2 EPHA4_HUMAN Ephrin type-A receptor 4 OS=Homo sapiens OX=9606 GN=EPHA4 PE=1 SV=1; EPHA4_HUMAN Isoform 2 of Ephrin type-A receptor 4 OS=Homo sapiens OX=9606 GN=EPHA4 N/A EPHA4 | EphA1 | EphA2 | EphA3 | EphA5 | EphA7 | EphA6 | Abl | EphA8 | Fgr | Yes | BLK | HCK | EphB6 | EphB3 725460 0 1651300 655850 646420 0 +QLSEpSFK SKSSSRQLSEpSFKSKEFVSSD SSRQLSEsFKSKEFV SSRP1 pS659 Q08945 SSRP1_HUMAN FACT complex subunit SSRP1 OS=Homo sapiens OX=9606 GN=SSRP1 PE=1 SV=1 N/A CK2a2 | CDK7 | PKCalpha | PKCbeta | DNAPK | NEK6 68201000 87774000 138300000 95357000 19966000 149110000 +RGpSLEMSSDGEPLSR SSATSGGRRGpSLEMSSDGEPL TSGGRRGsLEMSSDG AEBP2; AEBP2 pS206; pS206 Q6ZN18; Q6ZN18-2 AEBP2_HUMAN Zinc finger protein AEBP2 OS=Homo sapiens OX=9606 GN=AEBP2 PE=1 SV=2; AEBP2_HUMAN Isoform 2 of Zinc finger protein AEBP2 OS=Homo sapiens OX=9606 GN=AEBP2 N/A GSK3 19262000 11103000 19454000 0 1816900 22028000 +SDGpSLEDGDDVHR IEDGGARSDGpSLEDGDDVHRA GGARSDGsLEDGDDV SERINC1 pS364 Q9NRX5 SERC1_HUMAN Serine incorporator 1 OS=Homo sapiens OX=9606 GN=SERINC1 PE=1 SV=1 N/A PLK1 | PDHK1 31407000 17665000 20892000 23194000 5132400 54893000 +SEpSLTAESR EGGGLMTRSEpSLTAESRLVHT GLMTRSEsLTAESRL HERC1 pS1491 Q15751 HERC1_HUMAN Probable E3 ubiquitin-protein ligase HERC1 OS=Homo sapiens OX=9606 GN=HERC1 PE=1 SV=2 N/A GRK-2 11766000 13176000 
20540000 16963000 4364700 21308000 +STGPTAATGpSNRR MSTGPTAATGpSNRRLQQTQNQ GPTAATGsNRRLQQT VAMP3 pS11 Q15836 VAMP3_HUMAN_Vesicle-associated membrane protein 3 OS=Homo sapiens OX=9606 GN=VAMP3 PE=1 SV=3 N/A PKCalpha | PKCbeta | PKCzeta 3057100 4718800 12052000 5047700 1070900 8333500 +TEDLEATpSEHFK RNKTEDLEATpSEHFKTTSQKV TEDLEATsEHFKTTS VAMP8 pS55 Q9BV40 VAMP8_HUMAN_Vesicle-associated membrane protein 8 OS=Homo sapiens OX=9606 GN=VAMP8 PE=1 SV=1 activity, inhibited; abolish function in SNARE complex during mast cell secretion, reduces in vitro ensemble vesicle fusion N/A 20400000 9738500 7862300 0 0 76518000 +TFWpSPELK SSMNSIKTFWpSPELKKERVLR NSIKTFWsPELKKER ERC2 pS187 O15083 ERC2_HUMAN ERC protein 2 OS=Homo sapiens OX=9606 GN=ERC2 PE=1 SV=3 N/A IKKalpha | IKKbeta | HIPK2 | DOC_WW_Pin1_4 29764000 20957000 24855000 30752000 8304800 23771000 +YFDpSGDYNMAK CADEMQKYFDpSGDYNMAKAKM; RLQKGQKYFDpSGDYNMAKAKM; MKSVEQKYFDpSGDYNMAKAKM EMQKYFDsGDYNMAK | KGQKYFDsGDYNMAK | VEQKYFDsGDYNMAK ENSA; ENSA; ENSA; ENSA; ENSA; ENSA; ENSA; ENSA pS67; pS67; pS83; pS90; pS63; pS63; pS79; pS83 O43768; O43768-2; O43768-3; O43768-4; O43768-5; O43768-6; O43768-7; O43768-9 ENSA_HUMAN Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA PE=1 SV=1; ENSA_HUMAN Isoform 2 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 3 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 4 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 5 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 6 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 7 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 9 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA molecular association, regulation; cell cycle regulation; PPP2CA(INDUCES) GRK-2 323250000 127970000 0 67123000 12790000 71378000
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_input_for_preproc.tabular Mon Jul 11 19:22:25 2022 +0000 @@ -0,0 +1,39 @@ +Proteins Positions within proteins Leading proteins Protein Fasta headers Localization prob Score diff PEP Score Delta score Score for localization Localization prob shL.1A Score diff shL.1A PEP shL.1A Score shL.1A Localization prob shL.1B Score diff shL.1B PEP shL.1B Score shL.1B Localization prob shL.1C Score diff shL.1C PEP shL.1C Score shL.1C Localization prob shR.2A Score diff shR.2A PEP shR.2A Score shR.2A Localization prob shR.2B Score diff shR.2B PEP shR.2B Score shR.2B Localization prob shR.2C Score diff shR.2C PEP shR.2C Score shR.2C Diagnostic peak Number of Phospho (STY) Amino acid Sequence window Modification window Peptide window coverage Phospho (STY) Probabilities Phospho (STY) Score diffs Position in peptide Charge Mass error [ppm] Identification type shL.1A Identification type shL.1B Identification type shL.1C Identification type shR.2A Identification type shR.2B Identification type shR.2C Intensity Intensity___1 Intensity___2 Intensity___3 Ratio mod/base Intensity shL.1A Intensity shL.1B Intensity shL.1C Intensity shR.2A Intensity shR.2B Intensity shR.2C Ratio mod/base shL.1A Ratio mod/base shL.1B Ratio mod/base shL.1C Ratio mod/base shR.2A Ratio mod/base shR.2B Ratio mod/base shR.2C Intensity shL.1A___1 Intensity shL.1A___2 Intensity shL.1A___3 Intensity shL.1B___1 Intensity shL.1B___2 Intensity shL.1B___3 Intensity shL.1C___1 Intensity shL.1C___2 Intensity shL.1C___3 Intensity shR.2A___1 Intensity shR.2A___2 Intensity shR.2A___3 Intensity shR.2B___1 Intensity shR.2B___2 Intensity shR.2B___3 Intensity shR.2C___1 Intensity shR.2C___2 Intensity shR.2C___3 Occupancy shL.1A Occupancy ratioshL.1A Occupancy error scale shL.1A Occupancy shL.1B Occupancy ratioshL.1B Occupancy error scale shL.1B Occupancy shL.1C Occupancy ratioshL.1C Occupancy error scale shL.1C Occupancy shR.2A Occupancy ratioshR.2A Occupancy error 
scale shR.2A Occupancy shR.2B Occupancy ratioshR.2B Occupancy error scale shR.2B Occupancy shR.2C Occupancy ratioshR.2C Occupancy error scale shR.2C Reverse Potential contaminant id Protein group IDs Positions Position Peptide IDs Mod. peptide IDs Evidence IDs MS/MS IDs Best localization evidence ID Best localization MS/MS ID Best localization raw file Best localization scan number Best score evidence ID Best score MS/MS ID Best score raw file Best score scan number Best PEP evidence ID Best PEP MS/MS ID Best PEP raw file Best PEP scan number +sp|O43768-2|ENSA_HUMAN;sp|O43768|ENSA_HUMAN;sp|O43768-9|ENSA_HUMAN;sp|O43768-3|ENSA_HUMAN;sp|O43768-4|ENSA_HUMAN;sp|O43768-6|ENSA_HUMAN;sp|O43768-5|ENSA_HUMAN;sp|O43768-7|ENSA_HUMAN 108;108;124;124;131;104;104;120 sp|O43768-2|ENSA_HUMAN sp|O43768-2|ENSA_HUMAN 0.877317 8.54376 0.001041 110.11 55.028 110.11 1 S TGDHIPTPQDLPQRKSSLVTSKLAG______ X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXXXXXXXXXPPPPPPPPXXXXXXXXX KS(0.877)S(0.123)LVTSK KS(8.54)S(-8.54)LVT(-58.58)S(-72.01)K 2 2 0.022801 By MS/MS 18629000 18629000 0 0 0 0 18629000 0 0 0 0 0 0 0 0 0 18629000 0 0 0 0 0 0 0 0 0 0 0 700 529 108 108 12310;20039 13742;22688 99166 91729 99166 91729 QE05099 5593 99166 91729 QE05099 5593 99166 91729 QE05099 5593 +sp|O43768-2|ENSA_HUMAN;sp|O43768|ENSA_HUMAN;sp|O43768-9|ENSA_HUMAN;sp|O43768-3|ENSA_HUMAN;sp|O43768-4|ENSA_HUMAN;sp|O43768-6|ENSA_HUMAN;sp|O43768-5|ENSA_HUMAN;sp|O43768-7|ENSA_HUMAN 109;109;125;125;132;105;105;121 sp|O43768-2|ENSA_HUMAN sp|O43768-2|ENSA_HUMAN 0.877764 9.23011 0.00135208 98.182 25.939 55.754 1 S GDHIPTPQDLPQRKSSLVTSKLAG_______ X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXXXXXXXXPPPPPPPPXXXXXXXXXX KS(0.105)S(0.878)LVT(0.015)S(0.002)K KS(-9.23)S(9.23)LVT(-17.65)S(-25.69)K 3 2 -0.061619 By MS/MS By MS/MS By matching By matching By matching By MS/MS 81973000 81973000 0 0 7090300 8341200 9691500 10030000 1675200 9952100 7090300 0 0 8341200 0 0 9691500 
0 0 10030000 0 0 1675200 0 0 9952100 0 0 701 529 109 109 12310;20039 13742;22688 99164;99165;99168;99169;160369;160370;160371;160372;160373;160374 91727;91728;91731;142479 99164 91727 QE05097 5219 99167 91730 QE05100 5516 99167 91730 QE05100 5516 +CON__P02662 46 CON__P02662 CON__P02662 0.99978 36.4544 1.10E-08 122.19 116.48 122.19 2 S VFGKEKVNELSKDIGSESTEDQAMEDIKQME X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;Phospho (STY);X;X;X;X;X;Oxidation (M);X;X;X;X;X;X;X XXXXXXXXXXXXPPPPPPPPPPPPPPPPXXX DIGS(1)ES(0.972)T(0.029)EDQAMEDIK DIGS(36.45)ES(15.33)T(-15.33)EDQAMEDIK 4 2 0.56139 By MS/MS By MS/MS By MS/MS 49187000 0 49187000 0 NaN 16494000 0 20139000 0 0 12553000 NaN NaN NaN NaN NaN NaN 0 16494000 0 0 0 0 0 20139000 0 0 0 0 0 0 0 0 12553000 0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN + 2 14 46 46 3452 3862;3863 27864;27865;27866;27867 25820;25821;25822;25823 27865 25821 QE05099 36641 27865 25821 QE05099 36641 27865 25821 QE05099 36641 +CON__P02662 48 CON__P02662 CON__P02662 0.971522 15.3284 1.10E-08 122.19 116.48 122.19 2 S GKEKVNELSKDIGSESTEDQAMEDIKQMEAE X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;Phospho (STY);X;X;X;X;X;Oxidation (M);X;X;X;X;X;X;X;X;X XXXXXXXXXXPPPPPPPPPPPPPPPPXXXXX DIGS(1)ES(0.972)T(0.029)EDQAMEDIK DIGS(36.45)ES(15.33)T(-15.33)EDQAMEDIK 6 2 0.56139 By MS/MS By MS/MS By MS/MS 49187000 0 49187000 0 NaN 16494000 0 20139000 0 0 12553000 NaN NaN NaN NaN NaN NaN 0 16494000 0 0 0 0 0 20139000 0 0 0 0 0 0 0 0 12553000 0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN + 3 14 48 48 3452 3862;3863 27864;27865;27866;27867 25820;25821;25822;25823 27865 25821 QE05099 36641 27865 25821 QE05099 36641 27865 25821 QE05099 36641 +CON__P02662 115 CON__P02662 CON__P02662 1 50.1781 4.91E-07 124.08 88.205 50.178 1 S RLKKYKVPQLEIVPNSAEERLHSMKEGIHAQ X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXPPPPPPPPPPPPPPXXXXXXXXXXX VPQLEIVPNS(1)AEER VPQLEIVPNS(50.18)AEER 10 3 -0.26085 By MS/MS By 
matching By MS/MS By matching By matching By MS/MS 228160000 228160000 0 0 NaN 36938000 3667100 7945800 0 2359500 8418700 NaN NaN NaN NaN NaN NaN 36938000 0 0 3667100 0 0 7945800 0 0 0 0 0 2359500 0 0 8418700 0 0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN + 4 14 115 115 23142 26196 185609;185610;185611;185612;185613;185614;185615 165233;165234;165235;165236 185612 165236 QE05102 41518 185610 165234 QE05097 41110 185610 165234 QE05097 41110 +sp|O43768-2|ENSA_HUMAN;sp|O43768|ENSA_HUMAN;sp|O43768-9|ENSA_HUMAN;sp|O43768-3|ENSA_HUMAN;sp|O43768-4|ENSA_HUMAN;sp|O43768-8|ENSA_HUMAN 2;2;2;2;2;2 sp|O43768-2|ENSA_HUMAN sp|O43768-2|ENSA_HUMAN 1.0 73.249 3.69e-06 83.395 74.925 83.395 1 S ______________MSQKQEEENPAEETGEE X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXXXXXXXXXXPPPPPPPPPPPPPPPP S(1)QKQEEENPAEETGEEK S(73.25)QKQEEENPAEET(-73.25)GEEK 1 2 -0.84902 By matching By matching By MS/MS 25828000 25828000 0 0 0 0 8765300 0 2355900 14706000 0 0 0 0 0 0 8765300 0 0 0 0 0 2355900 0 0 14706000 0 0 702 529 2 2 19781 22398 158249;158250;158251 140920 158249 140920 QE05102 12907 158249 140920 QE05102 12907 158249 140920 QE05102 12907 +sp|O43768-2|ENSA_HUMAN;sp|O43768|ENSA_HUMAN;sp|O43768-9|ENSA_HUMAN;sp|O43768-3|ENSA_HUMAN;sp|O43768-4|ENSA_HUMAN;sp|O43768-6|ENSA_HUMAN;sp|O43768-5|ENSA_HUMAN;sp|O43768-7|ENSA_HUMAN;sp|P56211-2|ARP19_HUMAN;sp|P56211|ARP19_HUMAN 67;67;83;83;90;63;63;79;46;62 sp|O43768-2|ENSA_HUMAN;sp|P56211-2|ARP19_HUMAN sp|O43768-2|ENSA_HUMAN 0.999907 42.1841 4.04e-05 77.894 72.756 77.894 1 S DFLMKRLQKGQKYFDSGDYNMAKAKMKNKQL;DFLRKRLQKGQKYFDSGDYNMAKAKMKNKQL X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXXXXXXXPPPPPPPPPPPXXXXXXXX YFDS(1)GDYNMAK Y(-44.9)FDS(42.18)GDY(-42.18)NMAK 4 2 0.090313 By MS/MS By MS/MS By matching By MS/MS By MS/MS 602510000 602510000 0 0 323250000 127970000 0 67123000 12790000 71378000 323250000 0 0 127970000 0 0 0 0 0 67123000 0 0 12790000 0 0 71378000 0 
0 703 529;2007 67;46 67 23817 26932 190543;190544;190545;190546;190547 169398;169399;169400;169401 190543 169398 QE05097 28697 190543 169398 QE05097 28697 190543 169398 QE05097 28697 +sp|O95714|HERC2_HUMAN;sp|Q9BVR0|HRC23_HUMAN 1577;304 sp|O95714|HERC2_HUMAN sp|O95714|HERC2_HUMAN 1.0 100.152 1.12e-15 100.15 94.415 100.15 2 S KPESTDDEEKIGNEESDLEEACILPHSPINV X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X XXXXXXXXXXPPPPPPPPPPPPPPPPPPPPP IGNEES(1)DLEEACILPHS(1)PINVDK IGNEES(100.15)DLEEACILPHS(100.15)PINVDK 6 3 -0.31776 By matching By matching By matching By matching By MS/MS By MS/MS 398730000 0 398730000 0 83882000 60609000 77868000 70320000 41821000 64234000 0 83882000 0 0 60609000 0 0 77868000 0 0 70320000 0 0 41821000 0 0 64234000 0 1295 867 1577 1577 11517 12858 93270;93271;93272;93273;93274;93275 86700;86701 93271 86701 QE05102 51298 93271 86701 QE05102 51298 93271 86701 QE05102 51298 +sp|O95714|HERC2_HUMAN;sp|Q9BVR0|HRC23_HUMAN 1588;315 sp|O95714|HERC2_HUMAN sp|O95714|HERC2_HUMAN 1.0 100.152 1.12e-15 100.15 94.415 100.15 2 S GNEESDLEEACILPHSPINVDKRPIAIKSPK X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X PPPPPPPPPPPPPPPPPPPPPPXXXXXXXXX IGNEES(1)DLEEACILPHS(1)PINVDK IGNEES(100.15)DLEEACILPHS(100.15)PINVDK 17 3 -0.31776 By matching By matching By matching By matching By MS/MS By MS/MS 398730000 0 398730000 0 83882000 60609000 77868000 70320000 41821000 64234000 0 83882000 0 0 60609000 0 0 77868000 0 0 70320000 0 0 41821000 0 0 64234000 0 1296 867 1588 1588 11517 12858 93270;93271;93272;93273;93274;93275 86700;86701 93271 86701 QE05102 51298 93271 86701 QE05102 51298 93271 86701 QE05102 51298 +sp|O95714|HERC2_HUMAN 2928 sp|O95714|HERC2_HUMAN sp|O95714|HERC2_HUMAN 1.0 44.9549 6.81e-12 84.285 78.578 44.955 1 S IRAEEEDLAAVPFLASDNEEEEDEKGNSGSL X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X PPPPPPPPPPPPPPPPPPPPPPPPPXXXXXX IRAEEEDLAAVPFLAS(1)DNEEEEDEK 
IRAEEEDLAAVPFLAS(44.95)DNEEEEDEK 16 3 -0.24823 By MS/MS By MS/MS By matching By matching 61597000 61597000 0 0 22562000 18225000 9119700 11689000 0 0 22562000 0 0 18225000 0 0 9119700 0 0 11689000 0 0 0 0 0 0 0 0 1297 867 2928 2928 11904 13281 96043;96044;96045;96046 89048;89049 96044 89049 QE05098 52942 96043 89048 QE05097 52381 96043 89048 QE05097 52381 +sp|O95714|HERC2_HUMAN 1938 sp|O95714|HERC2_HUMAN sp|O95714|HERC2_HUMAN 0.427104 0.0 4.17e-06 44.164 42.292 44.164 S KYDLKLAELPAAAQPSAEDSDTEDDSEAEQT X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXPPPPPPPPPPPPPPPPPPPPPPPPPP LAELPAAAQPS(0.427)AEDS(0.427)DT(0.142)EDDS(0.003)EAEQTER LAELPAAAQPS(0)AEDS(0)DT(-4.78)EDDS(-20.87)EAEQT(-37.92)ER 11 3 -1.2171 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1298 867 1938 1938 12395 13829 99721 92163 QE05099 31358 99721 92163 QE05099 31358 99721 92163 QE05099 31358 +sp|O95714|HERC2_HUMAN 1942 sp|O95714|HERC2_HUMAN sp|O95714|HERC2_HUMAN 0.427104 0.0 4.17e-06 44.164 42.292 44.164 S KLAELPAAAQPSAEDSDTEDDSEAEQTERNI X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX LAELPAAAQPS(0.427)AEDS(0.427)DT(0.142)EDDS(0.003)EAEQTER LAELPAAAQPS(0)AEDS(0)DT(-4.78)EDDS(-20.87)EAEQT(-37.92)ER 15 3 -1.2171 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1299 867 1942 1942 12395 13829 99721 92163 QE05099 31358 99721 92163 QE05099 31358 99721 92163 QE05099 31358 +sp|O95714|HERC2_HUMAN 3462 sp|O95714|HERC2_HUMAN sp|O95714|HERC2_HUMAN 1.0 41.1171 0.0267288 41.117 33.02 41.117 1 S NGEECMLAVDIEDRLSPNPWQEKREIVSSED X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXXXXXXXXXPPPPPPPPPXXXXXXXX LS(1)PNPWQEK LS(41.12)PNPWQEK 2 2 0.64603 By matching By MS/MS By matching By matching 40352000 40352000 0 0 0 11706000 12495000 0 7273000 8877800 0 0 0 11706000 0 0 12495000 0 0 0 0 0 7273000 0 0 8877800 0 0 1300 867 3462 3462 14140 15756 112737;112738;112739;112740 102778 112737 102778 QE05099 
28079 112737 102778 QE05099 28079 112737 102778 QE05099 28079 +sp|Q08945|SSRP1_HUMAN 667 sp|Q08945|SSRP1_HUMAN sp|Q08945|SSRP1_HUMAN 0.824557 6.72928 2.29e-05 88.385 80.253 88.385 1 S SSRQLSESFKSKEFVSSDESSSGENKSKKKR X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXXXXXXXPPPPPPPPPPPPPPXXXXX EFVS(0.825)S(0.175)DESSSGENK EFVS(6.73)S(-6.73)DES(-34.1)S(-47.3)S(-52.91)GENK 4 2 -0.31453 By MS/MS By MS/MS By MS/MS By MS/MS By MS/MS 78553000 78553000 0 0 12562000 16302000 23000000 7857800 0 18830000 12562000 0 0 16302000 0 0 23000000 0 0 7857800 0 0 0 0 0 18830000 0 0 3469 2387 667 667 6499 7276 53820;53821;53822;53823;53824 51145;51146;51147;51148;51149 53820 51145 QE05097 12983 53820 51145 QE05097 12983 53820 51145 QE05097 12983 +sp|Q08945|SSRP1_HUMAN 444 sp|Q08945|SSRP1_HUMAN sp|Q08945|SSRP1_HUMAN 0.999939 44.165 7.94e-20 97.469 93.771 97.469 1 S GLKEGMNPSYDEYADSDEDQHDAYLERMKEE X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXPPPPPPPPPPPPPPPPPPPPPPPPXXXX EGMNPSYDEYADS(1)DEDQHDAYLER EGMNPS(-49.21)Y(-49.82)DEY(-44.17)ADS(44.17)DEDQHDAY(-90.19)LER 13 3 0.19918 By MS/MS By MS/MS 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3470 2387 444 444 6658 7448 55048;55049 52320;52321 55048 52320 QE05099 31926 55048 52320 QE05099 31926 55048 52320 QE05099 31926 +sp|Q08945|SSRP1_HUMAN 659 sp|Q08945|SSRP1_HUMAN sp|Q08945|SSRP1_HUMAN 0.999878 39.1416 0.00235198 117.7 65.216 117.7 1 S SRGSSSKSSSRQLSESFKSKEFVSSDESSSG X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X XXXXXXXXXXXPPPPPPPXXXXXXXXXXXXX QLSES(1)FK QLS(-39.14)ES(39.14)FK 5 2 0.14738 By MS/MS By MS/MS By MS/MS By MS/MS By matching By MS/MS 558700000 558700000 0 0 68201000 87774000 138300000 95357000 19966000 149110000 68201000 0 0 87774000 0 0 138300000 0 0 95357000 0 0 19966000 0 0 149110000 0 0 3471 2387 659 659 16873 19002 134380;134381;134382;134383;134384;134385 120469;120470;120471;120472;120473 134381 120470 
QE05098 17736 134381 120470 QE05098 17736 134381 120470 QE05098 17736 +sp|Q15751|HERC1_HUMAN 3446 sp|Q15751|HERC1_HUMAN sp|Q15751|HERC1_HUMAN 0.999981 47.2167 0.0187791 47.548 7.8172 47.548 2 S VMTCVWCNKKGLLATSGNDGTIRVWNVTKKQ X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXXXXPPPPPPPPPPPPPPXXXXXXXX KGLLAT(1)S(1)GNDGTIR KGLLAT(47.2)S(47.22)GNDGT(-47.2)IR 7 2 -0.95722 By matching By MS/MS By matching 129800000 0 129800000 0 3921800 0 120850000 0 0 5021300 0 3921800 0 0 0 0 0 120850000 0 0 0 0 0 0 0 0 5021300 0 4421 2824 3446 3446 12194 13609 98227;98228;98229 90789 98227 90789 QE05099 12004 98227 90789 QE05099 12004 98227 90789 QE05099 12004 +sp|Q15751|HERC1_HUMAN 1491 sp|Q15751|HERC1_HUMAN sp|Q15751|HERC1_HUMAN 0.9956 24.4686 0.000725254 80.245 41.065 80.245 1 S STSASEGGGLMTRSESLTAESRLVHTSPNYR X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXXXXXXXXPPPPPPPPPXXXXXXXXX S(0.004)ES(0.996)LT(0.001)AESR S(-24.47)ES(24.47)LT(-30.8)AES(-48.77)R 3 2 -0.02332 By matching By MS/MS By MS/MS By MS/MS By matching By MS/MS 88117000 88117000 0 0 11766000 13176000 20540000 16963000 4364700 21308000 11766000 0 0 13176000 0 0 20540000 0 0 16963000 0 0 4364700 0 0 21308000 0 0 4422 2824 1491 1491 18146 20455 144586;144587;144588;144589;144590;144591 129449;129450;129451;129452 144587 129450 QE05099 10286 144587 129450 QE05099 10286 144587 129450 QE05099 10286 +sp|Q15751|HERC1_HUMAN 1510 sp|Q15751|HERC1_HUMAN sp|Q15751|HERC1_HUMAN 0.330689 0.0 7.97e-05 45.193 39.23 45.193 S ESRLVHTSPNYRLIKSRSESDLSQPESDEEG X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXXXXXXXXXXPPPPPPPPPPPPPPPP S(0.331)RS(0.331)ES(0.331)DLS(0.008)QPESDEEGYALSGR S(0)RS(0)ES(0)DLS(-16.27)QPES(-35.13)DEEGY(-44.24)ALS(-45.11)GR 1 3 0.88872 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 4423 2824 1510 1510 19884 22510 159108 141525 QE05102 26609 159108 141525 QE05102 26609 159108 141525 QE05102 26609 
+sp|Q15751|HERC1_HUMAN 1512 sp|Q15751|HERC1_HUMAN sp|Q15751|HERC1_HUMAN 0.473289 2.22394 8.37e-06 56.783 53.982 56.783 S RLVHTSPNYRLIKSRSESDLSQPESDEEGYA X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXXXXXXXXPPPPPPPPPPPPPPPPPP S(0.284)RS(0.473)ES(0.219)DLS(0.024)QPESDEEGYALSGR S(-2.22)RS(2.22)ES(-3.34)DLS(-13.02)QPES(-39.32)DEEGY(-52.92)ALS(-56.34)GR 3 3 -0.16378 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 4424 2824 1512 1512 19884 22510 159107 141524 QE05101 26243 159107 141524 QE05101 26243 159107 141524 QE05101 26243 +sp|Q15751|HERC1_HUMAN 1514 sp|Q15751|HERC1_HUMAN sp|Q15751|HERC1_HUMAN 0.330689 0.0 7.97e-05 45.193 39.23 45.193 S VHTSPNYRLIKSRSESDLSQPESDEEGYALS X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX S(0.331)RS(0.331)ES(0.331)DLS(0.008)QPESDEEGYALSGR S(0)RS(0)ES(0)DLS(-16.27)QPES(-35.13)DEEGY(-44.24)ALS(-45.11)GR 5 3 0.88872 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 4425 2824 1514 1514 19884 22510 159108 141525 QE05102 26609 159108 141525 QE05102 26609 159108 141525 QE05102 26609 +sp|Q6ZN18-2|AEBP2_HUMAN;sp|Q6ZN18|AEBP2_HUMAN 18;18 sp|Q6ZN18-2|AEBP2_HUMAN sp|Q6ZN18-2|AEBP2_HUMAN 0.998316 27.7896 1.21e-62 181.56 176.76 181.56 2 S AAITDMADLEELSRLSPLPPGSPGSAARGRA X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X PPPPPPPPPPPPPPPPPPPPPPPPPPPPXXX AAAITDMADLEELS(0.002)RLS(0.998)PLPPGS(0.809)PGS(0.191)AAR AAAIT(-99.88)DMADLEELS(-27.79)RLS(27.79)PLPPGS(6.28)PGS(-6.28)AAR 17 3 0.97551 By matching By matching By matching By MS/MS By MS/MS By MS/MS 499850000 0 499850000 0 2708200 3550900 192640000 104030000 20713000 176200000 0 2708200 0 0 3550900 0 0 192640000 0 0 104030000 0 0 20713000 0 0 176200000 0 5468 3335 18 18 28 35 264;265;266;267;268;269 236;237;238;239 264 236 QE05100 65231 264 236 QE05100 65231 264 236 QE05100 65231 +sp|Q6ZN18-2|AEBP2_HUMAN;sp|Q6ZN18|AEBP2_HUMAN 24;24 sp|Q6ZN18-2|AEBP2_HUMAN sp|Q6ZN18-2|AEBP2_HUMAN 
0.809237 6.27624 1.21e-62 181.56 176.76 181.56 2 S ADLEELSRLSPLPPGSPGSAARGRAEPPEEE X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X PPPPPPPPPPPPPPPPPPPPPPXXXXXXXXX AAAITDMADLEELS(0.002)RLS(0.998)PLPPGS(0.809)PGS(0.191)AAR AAAIT(-99.88)DMADLEELS(-27.79)RLS(27.79)PLPPGS(6.28)PGS(-6.28)AAR 23 3 0.97551 By matching By matching By matching By MS/MS By MS/MS By MS/MS 499850000 0 499850000 0 2708200 3550900 192640000 104030000 20713000 176200000 0 2708200 0 0 3550900 0 0 192640000 0 0 104030000 0 0 20713000 0 0 176200000 0 5469 3335 24 24 28 35 264;265;266;267;268;269 236;237;238;239 264 236 QE05100 65231 264 236 QE05100 65231 264 236 QE05100 65231 +sp|Q6ZN18-2|AEBP2_HUMAN;sp|Q6ZN18|AEBP2_HUMAN 206;206 sp|Q6ZN18-2|AEBP2_HUMAN sp|Q6ZN18-2|AEBP2_HUMAN 0.999982 48.3708 1.18e-09 128.05 118.25 128.05 1 S TGGGGSSATSGGRRGSLEMSSDGEPLSRMDS X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXXXXXXXXPPPPPPPPPPPPPPPXXX RGS(1)LEMSSDGEPLSR RGS(48.37)LEMS(-48.37)S(-54.13)DGEPLS(-99.69)R 3 2 -0.10602 By MS/MS By MS/MS By MS/MS By matching By MS/MS 73663000 73663000 0 0 19262000 11103000 19454000 0 1816900 22028000 19262000 0 0 11103000 0 0 19454000 0 0 0 0 0 1816900 0 0 22028000 0 0 5470 3335 206 206 17255 19413 137099;137100;137101;137102;137103 122913;122914;122915;122916 137099 122913 QE05097 23240 137099 122913 QE05097 23240 137099 122913 QE05097 23240 + REV__sp|P35908|K22E_HUMAN REV__sp|P35908|K22E_HUMAN 1 71.692 0.00457965 71.692 14.102 71.692 1 S X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXXXXXPPPPPPPPPXXXXXXXXXXXX IIKELS(1)DGR IIKELS(71.69)DGR 6 2 2.0005 By matching By MS/MS By matching By matching By matching 431850000 431850000 0 0 NaN 103010000 67359000 64124000 74201000 0 55805000 NaN NaN NaN NaN NaN NaN 103010000 0 0 67359000 0 0 64124000 0 0 74201000 0 0 0 0 0 55805000 0 0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN + + 61 57 252 252 11589 12932 
93729;93730;93731;93732;93733;93734 87100 93729 87100 QE05098 47490 93729 87100 QE05098 47490 93729 87100 QE05098 47490 + REV__sp|Q9NSB4|KRT82_HUMAN REV__sp|Q9NSB4|KRT82_HUMAN 1 45.368 0.0161156 45.368 28.697 45.368 1 S X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXXXXXXXPPPPPPPPPPXXXXXXXXX VDGS(1)VCDLRR VDGS(45.37)VCDLRR 4 2 0.77096 By matching By matching By matching By matching By matching By MS/MS 1670400000 1670400000 0 0 NaN 218420000 241200000 328130000 240860000 52984000 294390000 NaN NaN NaN NaN NaN NaN 218420000 0 0 241200000 0 0 328130000 0 0 240860000 0 0 52984000 0 0 294390000 0 0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN + + 62 58 330 330 22307 25289 178961;178962;178963;178964;178965;178966;178967 159240 178961 159240 QE05102 16922 178961 159240 QE05102 16922 178961 159240 QE05102 16922 + REV__sp|Q6S5H4-2|POTEB_HUMAN REV__sp|Q6S5H4-2|POTEB_HUMAN 1 51.2862 0.045235 51.286 32.662 51.286 S X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXXXXXXXXPPPPPPPPPXXXXXXXXX EVS(1)EIEELK EVS(51.29)EIEELK 3 2 0.81181 By matching By matching By matching By matching By matching 50767000 50767000 0 0 0.044169 0 8469100 14247000 11062000 1262600 15726000 0 0.056281 0.030122 0.051456 0.037786 0.081346 0 0 0 8469100 0 0 14247000 0 0 11062000 0 0 1262600 0 0 15726000 0 0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN + 63 59 22 22 8166 9110 66515;66516;66517;66518;66519 61714;61715 66516 61715 QE05100 38402 66516 61715 QE05100 38402 66516 61715 QE05100 38402 +sp|Q8IUD2-4|RB6I2_HUMAN;sp|Q8IUD2-2|RB6I2_HUMAN;sp|Q8IUD2-3|RB6I2_HUMAN;sp|Q8IUD2|RB6I2_HUMAN;sp|Q8IUD2-5|RB6I2_HUMAN;sp|O15083|ERC2_HUMAN 191;191;191;191;191;187 sp|Q8IUD2-4|RB6I2_HUMAN sp|Q8IUD2-4|RB6I2_HUMAN 0.999998 58.0663 0.00181554 89.827 67.799 89.827 1 S ESKLSSSMNSIKTFWSPELKKERALRKDEAS X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXXXXXXXPPPPPPPPXXXXXXXXXXX 
TFWS(1)PELK T(-58.07)FWS(58.07)PELK 4 2 0.075831 By MS/MS By MS/MS By MS/MS By MS/MS By MS/MS By MS/MS 138400000 138400000 0 0 29764000 20957000 24855000 30752000 8304800 23771000 29764000 0 0 20957000 0 0 24855000 0 0 30752000 0 0 8304800 0 0 23771000 0 0 6037 3584 191 191 21148 23984 169817;169818;169819;169820;169821;169822 151176;151177;151178;151179;151180;151181 169822 151181 QE05102 49176 169822 151181 QE05102 49176 169822 151181 QE05102 49176 +sp|Q9NRX5|SERC1_HUMAN 364 sp|Q9NRX5|SERC1_HUMAN sp|Q9NRX5|SERC1_HUMAN 0.999996 54.0798 2.24e-16 159.22 148.1 159.22 1 S DESTLIEDGGARSDGSLEDGDDVHRAVDNER X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXXXXXXXPPPPPPPPPPPPPXXXXXX SDGS(1)LEDGDDVHR S(-54.08)DGS(54.08)LEDGDDVHR 4 2 0.64808 By MS/MS By MS/MS By matching By MS/MS By MS/MS By MS/MS 222110000 222110000 0 0 31407000 17665000 20892000 23194000 5132400 54893000 31407000 0 0 17665000 0 0 20892000 0 0 23194000 0 0 5132400 0 0 54893000 0 0 8729 5187 364 364 17793 20026 141355;141356;141357;141358;141359;141360;141361;141362;141363;141364;141365 126543;126544;126545;126546;126547;126548;126549 141361 126549 QE05102 10564 141361 126549 QE05102 10564 141361 126549 QE05102 10564 +sp|Q9Y3B9|RRP15_HUMAN 11 sp|Q9Y3B9|RRP15_HUMAN sp|Q9Y3B9|RRP15_HUMAN 0.997432 25.8922 9.39e-31 175.33 139.7 175.33 1 S _____MAAAAPDSRVSEEENLKKTPKKKMKM X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXPPPPPPPPPPPPPPPPXXXXXXXXX AAAAPDS(0.003)RVS(0.997)EEENLK AAAAPDS(-25.89)RVS(25.89)EEENLK 10 2 -0.029697 By matching By matching By MS/MS By MS/MS By MS/MS By MS/MS 266450000 266450000 0 0 38150000 39445000 56305000 55338000 7010600 70203000 38150000 0 0 39445000 0 0 56305000 0 0 55338000 0 0 7010600 0 0 70203000 0 0 9895 5791 11 11 12 17 158;159;160;161;162;163 166;167;168;169 159 167 QE05100 23225 159 167 QE05100 23225 159 167 QE05100 23225 +sp|Q15751|HERC1_HUMAN 3445 sp|Q15751|HERC1_HUMAN sp|Q15751|HERC1_HUMAN 0.999981 47.2024 0.0187791 
47.548 7.8172 47.548 2 T RVMTCVWCNKKGLLATSGNDGTIRVWNVTKK X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXXXXXPPPPPPPPPPPPPPXXXXXXX KGLLAT(1)S(1)GNDGTIR KGLLAT(47.2)S(47.22)GNDGT(-47.2)IR 6 2 -0.95722 By matching By MS/MS By matching 129800000 0 129800000 0 3921800 0 120850000 0 0 5021300 0 3921800 0 0 0 0 0 120850000 0 0 0 0 0 0 0 0 5021300 0 10983 2824 3445 3445 12194 13609 98227;98228;98229 90789 98227 90789 QE05099 12004 98227 90789 QE05099 12004 98227 90789 QE05099 12004 +sp|O75379|VAMP4_HUMAN 30 sp|O75379|VAMP4_HUMAN sp|O75379|VAMP4_HUMAN 1 67.6437 1.44E-52 203.56 187.24 67.644 1 S TGSVKSERRNLLEDDSDEEEDFFLRGPSGPR X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXXXXPPPPPPPPPPPPPPPPPPPPPP NLLEDDS(1)DEEEDFFLR NLLEDDS(67.64)DEEEDFFLR 7 3 -0.051914 By MS/MS By MS/MS By MS/MS By MS/MS By MS/MS By MS/MS 7929000000 7929000000 0 0 NaN 1592100000 973800000 1011600000 1450300000 631970000 878760000 NaN NaN NaN NaN NaN NaN 1592100000 0 0 973800000 0 0 1011600000 0 0 1450300000 0 0 631970000 0 0 878760000 0 0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 963 669 30 30 15558;15559 17538;17539 124829;124830;124831;124832;124833;124834;124835;124836;124837;124838;124839;124840;124841;124842;124843;124844;124845;124846 112951;112952;112953;112954;112955;112956;112957;112958;112959;112960;112961;112962;112963;112964;112965;112966;112967;112968;112969;112970;112971;112972 124840 112969 QE05102 57877 124833 112957 QE05099 57820 124833 112957 QE05099 57820 +sp|O95183|VAMP5_HUMAN 48 sp|O95183|VAMP5_HUMAN sp|O95183|VAMP5_HUMAN 0.72657 5.36697 5.72E-05 79.514 55.133 79.514 1 S KLAELQQRSDQLLDMSSTFNKTTQNLAQKKC X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXXXPPPPPPPPPPPPPXXXXXXXXXX SDQLLDMS(0.727)S(0.211)T(0.062)FNK S(-64.13)DQLLDMS(5.37)S(-5.37)T(-10.67)FNK 8 2 -0.18713 By matching By matching By MS/MS By matching By matching By matching 86590000 
86590000 0 0 0.032027 17447000 15753000 20219000 14001000 6284700 12885000 0.028348 0.025719 0.032895 0.033925 0.083789 0.034516 17447000 0 0 15753000 0 0 20219000 0 0 14001000 0 0 6284700 0 0 12885000 0 0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1189 809 48 48 17891 20149 142427;142428;142429;142430;142431;142432 127454 142427 127454 QE05099 48504 142427 127454 QE05099 48504 142427 127454 QE05099 48504 +sp|Q15836|VAMP3_HUMAN;sp|P63027|VAMP2_HUMAN 63;80 sp|Q15836|VAMP3_HUMAN sp|Q15836|VAMP3_HUMAN 0.920811 10.6555 1.81E-09 124.1 98.278 107.25 1 S DRADALQAGASQFETSAAKLKRKYWWKNCKM X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXPPPPPPPPPPPPPPPPPXXXXXXXXXXXX ADALQAGASQFET(0.079)S(0.921)AAK ADALQAGAS(-49.99)QFET(-10.66)S(10.66)AAK 14 2 0.23449 By MS/MS By MS/MS By MS/MS By MS/MS By matching By MS/MS 265240000 265240000 0 0 0.036151 44627000 41445000 69094000 42521000 5738000 61819000 0.03226 0.028442 0.039791 0.036967 0.030963 0.043392 44627000 0 0 41445000 0 0 69094000 0 0 42521000 0 0 5738000 0 0 61819000 0 0 0.47624 0.90925 12.188 0.51677 1.0694 7.2217 NaN NaN NaN 0.81588 4.4311 19.209 NaN NaN NaN 0.4388 0.78189 5.9861 4442 2836 63 63 279 319 2297;2298;2299;2300;2301;2302 1992;1993;1994;1995;1996 2300 1995 QE05100 30086 2301 1996 QE05102 30007 2301 1996 QE05102 30007 +sp|Q15836|VAMP3_HUMAN;sp|P63027|VAMP2_HUMAN;sp|P23763-2|VAMP1_HUMAN;sp|P23763-3|VAMP1_HUMAN;sp|P23763|VAMP1_HUMAN 44;61;63;63;63 sp|Q15836|VAMP3_HUMAN sp|Q15836|VAMP3_HUMAN 1 65.4951 2.36E-06 126.19 98.602 65.495 1 S MRVNVDKVLERDQKLSELDDRADALQAGASQ X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXXXXXXPPPPPPPPPPXXXXXXXXXX DQKLS(1)ELDDR DQKLS(65.5)ELDDR 5 3 -0.72518 By MS/MS By MS/MS By MS/MS By MS/MS By matching By MS/MS 412950000 412950000 0 0 NaN 75542000 44814000 32924000 35016000 11023000 4669900 NaN NaN NaN NaN NaN NaN 75542000 0 0 44814000 0 0 32924000 0 0 35016000 0 0 11023000 0 0 4669900 0 0 NaN NaN NaN NaN 
NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 4443 2836 44 44 4530 5083 37093;37094;37095;37096;37097;37098;37099;37100;37101;37102;37103;37104 34712;34713;34714;34715;34716;34717;34718;34719 37100 34719 QE05102 18436 37093 34712 QE05097 18245 37093 34712 QE05097 18245 +sp|Q15836|VAMP3_HUMAN 11 sp|Q15836|VAMP3_HUMAN sp|Q15836|VAMP3_HUMAN 0.97018 15.1316 0.000117365 79.652 72.041 79.652 1 S _____MSTGPTAATGSNRRLQQTQNQVDEVV X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXPPPPPPPPPPPPPXXXXXXXXXXXX STGPTAAT(0.03)GS(0.97)NRR S(-66.94)T(-63.48)GPT(-42.47)AAT(-15.13)GS(15.13)NRR 10 2 -0.15791 By matching By matching By MS/MS By matching By matching By MS/MS 34280000 34280000 0 0 NaN 3057100 4718800 12052000 5047700 1070900 8333500 NaN NaN NaN NaN NaN NaN 3057100 0 0 4718800 0 0 12052000 0 0 5047700 0 0 1070900 0 0 8333500 0 0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 4444 2836 11 11 20280 22978 162490;162491;162492;162493;162494;162495 144222;144223 162490 144222 QE05099 7582 162490 144222 QE05099 7582 162490 144222 QE05099 7582 +sp|Q9BV40|VAMP8_HUMAN 55 sp|Q9BV40|VAMP8_HUMAN sp|Q9BV40|VAMP8_HUMAN 0.959784 13.7778 3.78E-05 91.969 27.98 91.969 1 S NLEHLRNKTEDLEATSEHFKTTSQKVARKFW X;X;X;X;X;X;X;X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXXXPPPPPPPPPPPPXXXXXXXXXXX TEDLEAT(0.04)S(0.96)EHFK T(-83.18)EDLEAT(-13.78)S(13.78)EHFK 8 2 0.40785 By matching By matching By matching By MS/MS 114520000 114520000 0 0 NaN 20400000 9738500 7862300 0 0 76518000 NaN NaN NaN NaN NaN NaN 20400000 0 0 9738500 0 0 7862300 0 0 0 0 0 0 0 0 76518000 0 0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 7902 4687 55 55 21013 23827 168874;168875;168876;168877 150433 168874 150433 QE05102 19524 168874 150433 QE05102 19524 168874 150433 QE05102 19524 +sp|P54764-2|EPHA4_HUMAN;sp|P54764|EPHA4_HUMAN 551;602 sp|P54764-2|EPHA4_HUMAN sp|P54764-2|EPHA4_HUMAN 0.871707 6.48916 4.61E-08 65.374 58.758 
65.374 + 2 Y KHLNQGVRTYVDPFTYEDPNQAVREFAKEID X;X;X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;Phospho (STY);X;X;X;X;X;X;X;X;X;X;X;X;X;X;X XXXXXXXXPPPPPPPPPPPPPPPPXXXXXXX T(0.499)Y(0.501)VDPFT(0.128)Y(0.872)EDPNQAVR T(0.85)Y(-0.85)VDPFT(-6.49)Y(6.49)EDPNQAVR 8 3 0.97415 By matching By MS/MS By matching By matching 3679100 0 3679100 0 NaN 725460 0 1651300 655850 646420 0 NaN NaN NaN NaN NaN NaN 0 725460 0 0 0 0 0 1651300 0 0 655850 0 0 646420 0 0 0 0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 242 260 551 551 972 999 4968;4969;4970;4971 3421 4968 3421 QE04980 9557 4968 3421 QE04980 9557 4968 3421 QE04980 9557
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_kinase_substrate.tabular Mon Jul 11 19:22:25 2022 +0000 @@ -0,0 +1,5 @@ +GENE KINASE KIN_ACC_ID KIN_ORGANISM SUBSTRATE SUB_GENE_ID SUB_ACC_ID SUB_GENE SUB_ORGANISM SUB_MOD_RSD SITE_GRP_ID SITE_+/-7_AA DOMAIN IN_VIVO_RXN IN_VITRO_RXN CST_CAT# +Csnk2a1 CK2A1 Q60737 human VAMP4 53330 O70480 Vamp4 human S30 454285 RNLLEDDsDEEEDFF X +EPHA2 EphA2 P29317 human EphA2 1969 P29317 EPHA2 human Y588 450859 QLkPLktyVDPHtyE EphA2_TM X X 7423; 12677 +EPHA4 EphA4 P54764 human EphA4 2043 P54764 EPHA4 human Y596 450856 LNQGVRtyVDPFtyE EphA2_TM X +EPHA4 EphA4 P54764 human EphA4 2043 P54764 EPHA4 human Y602 450857 tyVDPFtyEDPNQAV EphA2_TM X
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_networkin.tabular Mon Jul 11 19:22:25 2022 +0000 @@ -0,0 +1,101 @@ +#substrate position id networkin_score tree netphorest_group netphorest_score string_identifier string_score substrate_name sequence string_path +VAMP4 (ENSP00000236192) 30 CK2alpha 35.6396 KIN CK2_group 0.5228 ENSP00000236192 0.85 VAMP4 LLEDDsDEEED "ENSP00000217244, 0.68 ENSP00000236192" +SSRP1 (ENSP00000278412) 444 CK2alpha 28.6345 KIN CK2_group 0.3768 ENSP00000278412 0.874 SSRP1 DEYADsDEDQH "ENSP00000217244, 0.6992 ENSP00000278412" +SSRP1 (ENSP00000278412) 667 CK2alpha 22.2088 KIN CK2_group 0.3168 ENSP00000278412 0.874 SSRP1 SKEFVsSDESS "ENSP00000217244, 0.6992 ENSP00000278412" +HERC2 (ENSP00000261609) 1577 CK2alpha 10.7686 KIN CK2_group 0.5253 ENSP00000261609 0.4514 HERC2 IGNEEsDLEEA "ENSP00000217244, 0.764 ENSP00000346659, 0.76 ENSP00000261609" +HERC2 (ENSP00000261609) 2928 CK2alpha 10.7686 KIN CK2_group 0.4698 ENSP00000261609 0.4514 HERC2 VPFLAsDNEEE "ENSP00000217244, 0.764 ENSP00000346659, 0.76 ENSP00000261609" +RRP15 (ENSP00000355899) 11 CK2alpha 8.5484 KIN CK2_group 0.3566 ENSP00000355899 0.461 RRP15 PDSRVsEEENL "ENSP00000217244, 0.3688 ENSP00000355899" +SSRP1 (ENSP00000278412) 444 CK2a2 7.8435 KIN CK2_group 0.3768 ENSP00000278412 0.615 SSRP1 DEYADsDEDQH "ENSP00000262506, 0.492 ENSP00000278412" +SSRP1 (ENSP00000278412) 667 CK2a2 7.7757 KIN CK2_group 0.3168 ENSP00000278412 0.615 SSRP1 SKEFVsSDESS "ENSP00000262506, 0.492 ENSP00000278412" +VAMP2 (ENSP00000314214) 80 PKD3 6.9217 KIN PKD_group 0.0744 ENSP00000314214 0.949 VAMP2 SQFETsAAKLK "ENSP00000234179, 0.7592 ENSP00000314214" +VAMP2 (ENSP00000314214) 61 CK2alpha 6.3122 KIN CK2_group 0.3338 ENSP00000314214 0.4391 VAMP2 RDQKLsELDDR "ENSP00000217244, 0.7992 ENSP00000222812, 0.7544 ENSP00000314214" +VAMP1 (ENSP00000380148) 63 CK2alpha 6.1363 KIN CK2_group 0.3338 ENSP00000380148 0.4364 VAMP1 RDQKLsELDDR "ENSP00000217244, 0.7944 ENSP00000222812, 0.7544 ENSP00000380148" +ERC1 
(ENSP00000354158) 191 IKKalpha 5.3194 KIN IKKalpha_IKKbeta_group 0.031 ENSP00000354158 0.96 ERC1 IKTFWsPELKK "ENSP00000359424, 0.768 ENSP00000354158" +ERC1 (ENSP00000354158) 191 IKKalpha 5.3194 KIN IKKalpha_IKKbeta_group 0.031 ENSP00000354158 0.96 ERC1 IKTFWsPELKK "ENSP00000359424, 0.768 ENSP00000354158" +VAMP2 (ENSP00000314214) 61 PKAbeta 4.9293 KIN PKA_group 0.1153 ENSP00000314214 0.8 VAMP2 RDQKLsELDDR "ENSP00000359719, 0.64 ENSP00000314214" +VAMP2 (ENSP00000314214) 61 PKAgamma 4.9293 KIN PKA_group 0.1153 ENSP00000314214 0.8 VAMP2 RDQKLsELDDR "ENSP00000366488, 0.64 ENSP00000314214" +VAMP3 (ENSP00000054666) 44 CK2alpha 4.2842 KIN CK2_group 0.3338 ENSP00000054666 0.4201 VAMP3 RDQKLsELDDR "ENSP00000217244, 0.7992 ENSP00000317714, 0.6792 ENSP00000054666" +VAMP2 (ENSP00000314214) 80 PKCiota 3.8971 KIN PKC_group 0.0928 ENSP00000314214 0.899 VAMP2 SQFETsAAKLK "ENSP00000295797, 0.7192 ENSP00000314214" +SSRP1 (ENSP00000278412) 444 CDK7 3.6159 KIN CDK7 0.0186 ENSP00000278412 0.903 SSRP1 DEYADsDEDQH "ENSP00000256443, 0.7224 ENSP00000278412" +SSRP1 (ENSP00000278412) 444 CK1alpha 3.3573 KIN CK1_group 0.1264 ENSP00000278412 0.404 SSRP1 DEYADsDEDQH "ENSP00000261798, 0.3232 ENSP00000278412" +VAMP3 (ENSP00000054666) 11 PKCalpha 3.0633 KIN PKC_group 0.4633 ENSP00000054666 0.3277 VAMP3 TAATGsNRRLQ "ENSP00000284384, 0.6232 ENSP00000359025, 0.6352 ENSP00000054666" +SSRP1 (ENSP00000278412) 659 PKCalpha 3.0524 KIN PKC_group 0.4345 ENSP00000278412 0.237 SSRP1 RQLSEsFKSKE "ENSP00000284384, 0.4552 ENSP00000351885, 0.76 ENSP00000278412" +VAMP2 (ENSP00000314214) 61 PKCiota 2.7785 KIN PKC_group 0.0463 ENSP00000314214 0.899 VAMP2 RDQKLsELDDR "ENSP00000295797, 0.7192 ENSP00000314214" +SSRP1 (ENSP00000278412) 659 CDK7 2.5961 KIN CDK7 0.0104 ENSP00000278412 0.903 SSRP1 RQLSEsFKSKE "ENSP00000256443, 0.7224 ENSP00000278412" +SSRP1 (ENSP00000278412) 667 CDK7 2.5961 KIN CDK7 0.0124 ENSP00000278412 0.903 SSRP1 SKEFVsSDESS "ENSP00000256443, 0.7224 ENSP00000278412" +ERC1 (ENSP00000354158) 191 IKKbeta 
2.571 KIN IKKalpha_IKKbeta_group 0.031 ENSP00000354158 0.946 ERC1 IKTFWsPELKK "ENSP00000339151, 0.7568 ENSP00000354158" +ERC1 (ENSP00000354158) 191 IKKbeta 2.571 KIN IKKalpha_IKKbeta_group 0.031 ENSP00000354158 0.946 ERC1 IKTFWsPELKK "ENSP00000339151, 0.7568 ENSP00000354158" +SSRP1 (ENSP00000278412) 659 PKCbeta 2.4948 KIN PKC_group 0.4345 ENSP00000278412 0.1743 SSRP1 RQLSEsFKSKE "ENSP00000305355, 0.7976 ENSP00000366013, 0.7192 ENSP00000284811, 0.7448 ENSP00000278412" +VAMP3 (ENSP00000054666) 11 PKCbeta 2.4948 KIN PKC_group 0.4633 ENSP00000054666 0.2393 VAMP3 TAATGsNRRLQ "ENSP00000305355, 0.512 ENSP00000348986, 0.7616 ENSP00000054666" +SSRP1 (ENSP00000278412) 659 CK2a2 2.4345 KIN CK2_group 0.0356 ENSP00000278412 0.615 SSRP1 RQLSEsFKSKE "ENSP00000262506, 0.492 ENSP00000278412" +ERC1 (ENSP00000354158) 191 HIPK2 2.2748 KIN HIPK1_HIPK2_group 0.0463 ENSP00000354158 0.4159 ERC1 IKTFWsPELKK "ENSP00000263551, 0.7696 ENSP00000286332, 0.7192 ENSP00000354158" +VAMP3 (ENSP00000054666) 11 PKCzeta 2.0773 KIN PKC_group 0.4633 ENSP00000054666 0.4263 VAMP3 TAATGsNRRLQ "ENSP00000367830, 0.7688 ENSP00000320935, 0.796 ENSP00000054666" +SSRP1 (ENSP00000278412) 659 DNAPK 2.0042 KIN DNAPK 0.0584 ENSP00000278412 0.56 SSRP1 RQLSEsFKSKE "ENSP00000313420, 0.448 ENSP00000278412" +EPHA4 (ENSP00000386829) 602 EphA4 35.9325 KIN Eph_group 0.1443 ENSP00000281821 1 EPHA4 VDPFTyEDPNQ 1 KIN +EPHA4 (ENSP00000386829) 596 EphA4 35.921 KIN Eph_group 0.1442 ENSP00000281821 1 EPHA4 QGVRTyVDPFT 1 KIN +EPHA4 (ENSP00000386829) 779 EphA4 17.3679 KIN Eph_group 0.0482 ENSP00000281821 1 EPHA4 DPEAAyTTRGG 1 KIN +EPHA4 (ENSP00000386829) 798 EphA4 17.3679 KIN Eph_group 0.0482 ENSP00000281821 1 EPHA4 PEAIAyRKFTS 1 KIN +EPHA4 (ENSP00000386829) 928 EphA4 17.3679 KIN Eph_group 0.0482 ENSP00000281821 1 EPHA4 IKMDRyKDNFT 1 KIN +EPHA4 (ENSP00000386829) 602 EphA1 5.7706 KIN Eph_group 0.1443 ENSP00000281821 0.907 EPHA4 VDPFTyEDPNQ "ENSP00000275815, 0.7256 ENSP00000281821" +EPHA4 (ENSP00000386829) 596 EphA1 5.7688 KIN 
Eph_group 0.1442 ENSP00000281821 0.907 EPHA4 QGVRTyVDPFT "ENSP00000275815, 0.7256 ENSP00000281821" +EPHA4 (ENSP00000386829) 602 EphA2 5.7678 KIN Eph_group 0.1443 ENSP00000281821 0.904 EPHA4 VDPFTyEDPNQ "ENSP00000351209, 0.7232 ENSP00000281821" +EPHA4 (ENSP00000386829) 602 EphA3 5.7678 KIN Eph_group 0.1443 ENSP00000281821 0.904 EPHA4 VDPFTyEDPNQ "ENSP00000337451, 0.7232 ENSP00000281821" +EPHA4 (ENSP00000386829) 602 EphA5 5.7678 KIN Eph_group 0.1443 ENSP00000281821 0.904 EPHA4 VDPFTyEDPNQ "ENSP00000273854, 0.7232 ENSP00000281821" +EPHA4 (ENSP00000386829) 602 EphA7 5.7678 KIN Eph_group 0.1443 ENSP00000281821 0.904 EPHA4 VDPFTyEDPNQ "ENSP00000358309, 0.7232 ENSP00000281821" +EPHA4 (ENSP00000386829) 602 EphA6 5.7668 KIN Eph_group 0.1443 ENSP00000281821 0.903 EPHA4 VDPFTyEDPNQ "ENSP00000374323, 0.7224 ENSP00000281821" +EPHA4 (ENSP00000386829) 596 EphA2 5.7659 KIN Eph_group 0.1442 ENSP00000281821 0.904 EPHA4 QGVRTyVDPFT "ENSP00000351209, 0.7232 ENSP00000281821" +EPHA4 (ENSP00000386829) 596 EphA3 5.7659 KIN Eph_group 0.1442 ENSP00000281821 0.904 EPHA4 QGVRTyVDPFT "ENSP00000337451, 0.7232 ENSP00000281821" +EPHA4 (ENSP00000386829) 596 EphA5 5.7659 KIN Eph_group 0.1442 ENSP00000281821 0.904 EPHA4 QGVRTyVDPFT "ENSP00000273854, 0.7232 ENSP00000281821" +EPHA4 (ENSP00000386829) 596 EphA7 5.7659 KIN Eph_group 0.1442 ENSP00000281821 0.904 EPHA4 QGVRTyVDPFT "ENSP00000358309, 0.7232 ENSP00000281821" +EPHA4 (ENSP00000386829) 596 EphA6 5.765 KIN Eph_group 0.1442 ENSP00000281821 0.903 EPHA4 QGVRTyVDPFT "ENSP00000374323, 0.7224 ENSP00000281821" +EPHA4 (ENSP00000386829) 779 Abl 5.6735 KIN Abl_group 0.0573 ENSP00000281821 0.806 EPHA4 DPEAAyTTRGG "ENSP00000361423, 0.6448 ENSP00000281821" +EPHA4 (ENSP00000386829) 602 EphA8 3.8493 KIN Eph_group 0.1443 ENSP00000281821 0.576 EPHA4 VDPFTyEDPNQ "ENSP00000166244, 0.7984 ENSP00000403005, 0.78 ENSP00000281821" +EPHA4 (ENSP00000386829) 596 EphA8 3.8481 KIN Eph_group 0.1442 ENSP00000281821 0.576 EPHA4 QGVRTyVDPFT "ENSP00000166244, 0.7984 
ENSP00000403005, 0.78 ENSP00000281821" +EPHA4 (ENSP00000386829) 596 Abl 3.757 KIN Abl_group 0.0432 ENSP00000281821 0.806 EPHA4 QGVRTyVDPFT "ENSP00000361423, 0.6448 ENSP00000281821" +EPHA4 (ENSP00000386829) 602 Fgr 3.5442 KIN Src_group 0.0705 ENSP00000281821 0.902 EPHA4 VDPFTyEDPNQ "ENSP00000363115, 0.7216 ENSP00000281821" +EPHA4 (ENSP00000386829) 602 Yes 3.5442 KIN Src_group 0.0705 ENSP00000281821 0.902 EPHA4 VDPFTyEDPNQ "ENSP00000324740, 0.7216 ENSP00000281821" +EPHA4 (ENSP00000386829) 602 BLK 3.5431 KIN Src_group 0.0705 ENSP00000281821 0.9 EPHA4 VDPFTyEDPNQ "ENSP00000259089, 0.72 ENSP00000281821" +EPHA4 (ENSP00000386829) 779 Fgr 2.8234 KIN Src_group 0.0583 ENSP00000281821 0.902 EPHA4 DPEAAyTTRGG "ENSP00000363115, 0.7216 ENSP00000281821" +EPHA4 (ENSP00000386829) 779 Yes 2.8234 KIN Src_group 0.0583 ENSP00000281821 0.902 EPHA4 DPEAAyTTRGG "ENSP00000324740, 0.7216 ENSP00000281821" +EPHA4 (ENSP00000386829) 779 BLK 2.8225 KIN Src_group 0.0583 ENSP00000281821 0.9 EPHA4 DPEAAyTTRGG "ENSP00000259089, 0.72 ENSP00000281821" +EPHA4 (ENSP00000386829) 779 EphA1 2.7892 KIN Eph_group 0.0482 ENSP00000281821 0.907 EPHA4 DPEAAyTTRGG "ENSP00000275815, 0.7256 ENSP00000281821" +EPHA4 (ENSP00000386829) 798 EphA1 2.7892 KIN Eph_group 0.0482 ENSP00000281821 0.907 EPHA4 PEAIAyRKFTS "ENSP00000275815, 0.7256 ENSP00000281821" +EPHA4 (ENSP00000386829) 928 EphA1 2.7892 KIN Eph_group 0.0482 ENSP00000281821 0.907 EPHA4 IKMDRyKDNFT "ENSP00000275815, 0.7256 ENSP00000281821" +EPHA4 (ENSP00000386829) 779 EphA2 2.7878 KIN Eph_group 0.0482 ENSP00000281821 0.904 EPHA4 DPEAAyTTRGG "ENSP00000351209, 0.7232 ENSP00000281821" +EPHA4 (ENSP00000386829) 779 EphA3 2.7878 KIN Eph_group 0.0482 ENSP00000281821 0.904 EPHA4 DPEAAyTTRGG "ENSP00000337451, 0.7232 ENSP00000281821" +EPHA4 (ENSP00000386829) 779 EphA5 2.7878 KIN Eph_group 0.0482 ENSP00000281821 0.904 EPHA4 DPEAAyTTRGG "ENSP00000273854, 0.7232 ENSP00000281821" +EPHA4 (ENSP00000386829) 779 EphA7 2.7878 KIN Eph_group 0.0482 ENSP00000281821 0.904 EPHA4 
DPEAAyTTRGG "ENSP00000358309, 0.7232 ENSP00000281821" +EPHA4 (ENSP00000386829) 798 EphA2 2.7878 KIN Eph_group 0.0482 ENSP00000281821 0.904 EPHA4 PEAIAyRKFTS "ENSP00000351209, 0.7232 ENSP00000281821" +EPHA4 (ENSP00000386829) 798 EphA3 2.7878 KIN Eph_group 0.0482 ENSP00000281821 0.904 EPHA4 PEAIAyRKFTS "ENSP00000337451, 0.7232 ENSP00000281821" +EPHA4 (ENSP00000386829) 798 EphA5 2.7878 KIN Eph_group 0.0482 ENSP00000281821 0.904 EPHA4 PEAIAyRKFTS "ENSP00000273854, 0.7232 ENSP00000281821" +EPHA4 (ENSP00000386829) 798 EphA7 2.7878 KIN Eph_group 0.0482 ENSP00000281821 0.904 EPHA4 PEAIAyRKFTS "ENSP00000358309, 0.7232 ENSP00000281821" +EPHA4 (ENSP00000386829) 928 EphA2 2.7878 KIN Eph_group 0.0482 ENSP00000281821 0.904 EPHA4 IKMDRyKDNFT "ENSP00000351209, 0.7232 ENSP00000281821" +EPHA4 (ENSP00000386829) 928 EphA3 2.7878 KIN Eph_group 0.0482 ENSP00000281821 0.904 EPHA4 IKMDRyKDNFT "ENSP00000337451, 0.7232 ENSP00000281821" +EPHA4 (ENSP00000386829) 928 EphA5 2.7878 KIN Eph_group 0.0482 ENSP00000281821 0.904 EPHA4 IKMDRyKDNFT "ENSP00000273854, 0.7232 ENSP00000281821" +EPHA4 (ENSP00000386829) 928 EphA7 2.7878 KIN Eph_group 0.0482 ENSP00000281821 0.904 EPHA4 IKMDRyKDNFT "ENSP00000358309, 0.7232 ENSP00000281821" +EPHA4 (ENSP00000386829) 779 EphA6 2.7874 KIN Eph_group 0.0482 ENSP00000281821 0.903 EPHA4 DPEAAyTTRGG "ENSP00000374323, 0.7224 ENSP00000281821" +EPHA4 (ENSP00000386829) 798 EphA6 2.7874 KIN Eph_group 0.0482 ENSP00000281821 0.903 EPHA4 PEAIAyRKFTS "ENSP00000374323, 0.7224 ENSP00000281821" +EPHA4 (ENSP00000386829) 928 EphA6 2.7874 KIN Eph_group 0.0482 ENSP00000281821 0.903 EPHA4 IKMDRyKDNFT "ENSP00000374323, 0.7224 ENSP00000281821" +EPHA4 (ENSP00000386829) 596 Fgr 2.7541 KIN Src_group 0.036 ENSP00000281821 0.902 EPHA4 QGVRTyVDPFT "ENSP00000363115, 0.7216 ENSP00000281821" +EPHA4 (ENSP00000386829) 596 Yes 2.7541 KIN Src_group 0.036 ENSP00000281821 0.902 EPHA4 QGVRTyVDPFT "ENSP00000324740, 0.7216 ENSP00000281821" +EPHA4 (ENSP00000386829) 596 BLK 2.7532 KIN Src_group 0.036 
ENSP00000281821 0.9 EPHA4 QGVRTyVDPFT "ENSP00000259089, 0.72 ENSP00000281821" +EPHA4 (ENSP00000386829) 798 Fgr 2.7477 KIN Src_group 0.0263 ENSP00000281821 0.902 EPHA4 PEAIAyRKFTS "ENSP00000363115, 0.7216 ENSP00000281821" +EPHA4 (ENSP00000386829) 798 Yes 2.7477 KIN Src_group 0.0263 ENSP00000281821 0.902 EPHA4 PEAIAyRKFTS "ENSP00000324740, 0.7216 ENSP00000281821" +EPHA4 (ENSP00000386829) 928 Fgr 2.7472 KIN Src_group 0.0257 ENSP00000281821 0.902 EPHA4 IKMDRyKDNFT "ENSP00000363115, 0.7216 ENSP00000281821" +EPHA4 (ENSP00000386829) 928 Yes 2.7472 KIN Src_group 0.0257 ENSP00000281821 0.902 EPHA4 IKMDRyKDNFT "ENSP00000324740, 0.7216 ENSP00000281821" +EPHA4 (ENSP00000386829) 798 BLK 2.7468 KIN Src_group 0.0263 ENSP00000281821 0.9 EPHA4 PEAIAyRKFTS "ENSP00000259089, 0.72 ENSP00000281821" +EPHA4 (ENSP00000386829) 928 BLK 2.7463 KIN Src_group 0.0257 ENSP00000281821 0.9 EPHA4 IKMDRyKDNFT "ENSP00000259089, 0.72 ENSP00000281821" +EPHA4 (ENSP00000386829) 596 HCK 2.7098 KIN Src_group 0.036 ENSP00000281821 0.899 EPHA4 QGVRTyVDPFT "ENSP00000365012, 0.7192 ENSP00000281821" +EPHA4 (ENSP00000386829) 602 HCK 2.7098 KIN Src_group 0.0705 ENSP00000281821 0.899 EPHA4 VDPFTyEDPNQ "ENSP00000365012, 0.7192 ENSP00000281821" +EPHA4 (ENSP00000386829) 779 HCK 2.7098 KIN Src_group 0.0583 ENSP00000281821 0.899 EPHA4 DPEAAyTTRGG "ENSP00000365012, 0.7192 ENSP00000281821" +EPHA4 (ENSP00000386829) 798 HCK 2.7098 KIN Src_group 0.0263 ENSP00000281821 0.899 EPHA4 PEAIAyRKFTS "ENSP00000365012, 0.7192 ENSP00000281821" +EPHA4 (ENSP00000386829) 928 HCK 2.7098 KIN Src_group 0.0257 ENSP00000281821 0.899 EPHA4 IKMDRyKDNFT "ENSP00000365012, 0.7192 ENSP00000281821" +EPHA4 (ENSP00000386829) 780 PKCalpha 2.5567 KIN PKC_group 0.3699 ENSP00000281821 0.401 EPHA4 PEAAYtTRGGK "ENSP00000284384, 0.7464 ENSP00000244007, 0.7784 ENSP00000281821" +EPHA4 (ENSP00000386829) 780 PKCbeta 2.4948 KIN PKC_group 0.3699 ENSP00000281821 0.3759 EPHA4 PEAAYtTRGGK "ENSP00000305355, 0.7464 ENSP00000244007, 0.7296 ENSP00000281821" +EPHA4 
(ENSP00000386829) 602 Abl 2.1653 KIN Abl_group 0.0221 ENSP00000281821 0.806 EPHA4 VDPFTyEDPNQ "ENSP00000361423, 0.6448 ENSP00000281821" +EPHA4 (ENSP00000386829) 798 Abl 2.1376 KIN Abl_group 0.0221 ENSP00000281821 0.806 EPHA4 PEAIAyRKFTS "ENSP00000361423, 0.6448 ENSP00000281821" +EPHA4 (ENSP00000386829) 928 Abl 2.1099 KIN Abl_group 0.0221 ENSP00000281821 0.806 EPHA4 IKMDRyKDNFT "ENSP00000361423, 0.6448 ENSP00000281821" +EPHA4 (ENSP00000386829) 602 EphB6 2.04 KIN Eph_group 0.1443 ENSP00000281821 0.5258 EPHA4 VDPFTyEDPNQ "ENSP00000376684, 0.7976 ENSP00000226091, 0.7976 ENSP00000281821" +EPHA4 (ENSP00000386829) 596 EphB6 2.0393 KIN Eph_group 0.1442 ENSP00000281821 0.5258 EPHA4 QGVRTyVDPFT "ENSP00000376684, 0.7976 ENSP00000226091, 0.7976 ENSP00000281821" +EPHA4 (ENSP00000386829) 602 EphB3 2.0282 KIN Eph_group 0.1443 ENSP00000281821 0.5231 EPHA4 VDPFTyEDPNQ "ENSP00000332118, 0.7976 ENSP00000226091, 0.7936 ENSP00000281821" +EPHA4 (ENSP00000386829) 596 EphB3 2.0276 KIN Eph_group 0.1442 ENSP00000281821 0.5231 EPHA4 QGVRTyVDPFT "ENSP00000332118, 0.7976 ENSP00000226091, 0.7936 ENSP00000281821"
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_regulatory_sites.tabular Mon Jul 11 19:22:25 2022 +0000 @@ -0,0 +1,9 @@ +32017 +"PhosphoSitePlus(R) (PSP) was created by Cell Signaling Technology Inc. It is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License. When using PSP data or analyses in printed publications or in online resources, the following acknowledgements must be included: (a) the words ""PhosphoSitePlus(R), www.phosphosite.org"" must be included at appropriate places in the text or webpage, and (b) the following citation must be included in the bibliography: ""Hornbeck PV, Zhang B, Murray B, Kornhauser JM, Latham V, Skrzypek E PhosphoSitePlus, 2014: mutations, PTMs and recalibrations. Nucleic Acids Res. 2015 43:D512-20. PMID: 25514926.""" + +GENE PROTEIN PROT_TYPE ACC_ID GENE_ID HU_CHR_LOC ORGANISM MOD_RSD SITE_GRP_ID SITE_+/-7_AA DOMAIN ON_FUNCTION ON_PROCESS ON_PROT_INTERACT ON_OTHER_INTERACT PMIDs LT_LIT MS_LIT MS_CST NOTES +ENSA ENSA "Inhibitor; Protein phosphatase, regulatory subunit" O43768 2029 1q21.3 human S109-p 477819 DLPQRKSsLVTSKLA Endosulfine "molecular association, regulation; protein conformation" SNCA(DISRUPTS) 18973346 1 34 50 +VAMP8 VAMP8 "Membrane protein, integral; Vesicle" Q9BV40 8673 2p11.2 human S55-p 12738929 TEDLEATsEHFKTTS Synaptobrevin "activity, inhibited" 27402227 1 8 0 "abolish function in SNARE complex during mast cell secretion, reduces in vitro ensemble vesicle fusion" +ENSA ENSA "Inhibitor; Protein phosphatase, regulatory subunit" O43768 2029 1q21.3 human S67-p 455934 KGQKYFDsGDYNMAK Endosulfine "molecular association, regulation" cell cycle regulation PPP2CA(INDUCES) 27889260 3 56 47 +Vamp4 VAMP4 "Membrane protein, integral; Vesicle" O70480 53330 1 H2.1|1 70.29 cM mouse S30-p 454285 RNLLEDDsDEEEDFF "molecular association, regulation; intracellular localization" PACS-1(INDUCES) 14608369 1 64 10 +EPHA4 EphA4 "EC 2.7.10.1; KINASE; Kinase, protein; Membrane 
protein, integral; Protein kinase, TK; Protein kinase, tyrosine (receptor)" P54764 2043 2q36.1 human Y602-p 450857 TYVDPFTyEDPNQAV EphA2_TM "molecular association, regulation" Fyn(INDUCES) 8622893 6 16 155
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test_swissprot.fasta Mon Jul 11 19:22:25 2022 +0000 @@ -0,0 +1,72 @@ +>sp|Q9Y3B9|RRP15_HUMAN RRP15-like protein OS=Homo sapiens OX=9606 GN=RRP15 PE=1 SV=2 +MAAAAPDSRVSEEENLKKTPKKKMKMVTGAVASVLEDEATDTSDSEGSCGSEKDHFYSDDDAIEADSEGDAEPCDKENENDGESSVGTNMGWADAMAKVLNKKTPESKPTILVKNKKLEKEKEKLKQERLEKIKQRDKRLEWEMMCRVKPDVVQDKETERNLQRIATRGVVQLFNAVQKHQKNVDEKVKEAGSSMRKRAKLISTVSKKDFISVLRGMDGSTNETASSRKKPKAKQTEVKSEEGPGWTILRDDFMMGASMKDWDKESDGPDDSRPESASDSDT +>sp|Q08945|SSRP1_HUMAN FACT complex subunit SSRP1 OS=Homo sapiens OX=9606 GN=SSRP1 PE=1 SV=1 +MAETLEFNDVYQEVKGSMNDGRLRLSRQGIIFKNSKTGKVDNIQAGELTEGIWRRVALGHGLKLLTKNGHVYKYDGFRESEFEKLSDFFKTHYRLELMEKDLCVKGWNWGTVKFGGQLLSFDIGDQPVFEIPLSNVSQCTTGKNEVTLEFHQNDDAEVSLMEVRFYVPPTQEDGVDPVEAFAQNVLSKADVIQATGDAICIFRELQCLTPRGRYDIRIYPTFLHLHGKTFDYKIPYTTVLRLFLLPHKDQRQMFFVISLDPPIKQGQTRYHFLILLFSKDEDISLTLNMNEEEVEKRFEGRLTKNMSGSLYEMVSRVMKALVNRKITVPGNFQGHSGAQCITCSYKASSGLLYPLERGFIYVHKPPVHIRFDEISFVNFARGTTTTRSFDFEIETKQGTQYTFSSIEREEYGKLFDFVNAKKLNIKNRGLKEGMNPSYDEYADSDEDQHDAYLERMKEEGKIREENANDSSDDSGEETDESFNPGEEEEDVAEEFDSNASASSSSNEGDSDRDEKKRKQLKKAKMAKDRKSRKKPVEVKKGKDPNAPKRPMSAYMLWLNASREKIKSDHPGISITDLSKKAGEIWKGMSKEKKEEWDRKAEDARRDYEKAMKEYEGGRGESSKRDKSKKKKKVKVKMEKKSTPSRGSSSKSSSRQLSESFKSKEFVSSDESSSGENKSKKKRRRSEDSEEEELASTPPSSEDSASGSDE +>sp|Q96SA4|SERC2_HUMAN Serine incorporator 2 OS=Homo sapiens OX=9606 GN=SERINC2 PE=2 SV=3 +MGACLGACSLLSCASCLCGSAPCILCSCCPASRNSTVSRLIFTFFLFLGVLVSIIMLSPGVESQLYKLPWVCEEGAGIPTVLQGHIDCGSLLGYRAVYRMCFATAAFFFFFTLLMLCVSSSRDPRAAIQNGFWFFKFLILVGLTVGAFYIPDGSFTNIWFYFGVVGSFLFILIQLVLLIDFAHSWNQRWLGKAEECDSRAWYAGLFFFTLLFYLLSIAAVALMFMYYTEPSGCHEGKVFISLNLTFCVCVSIAAVLPKVQDAQPNSGLLQASVITLYTMFVTWSALSSIPEQKCNPHLPTQLGNETVVAGPEGYETQWWDAPSIVGLIIFLLCTLFISLRSSDHRQVNSLMQTEECPPMLDATQQQQQVAACEGRAFDNEQDGVTYSYSFFHFCLVLASLHVMMTLTNWYKPGETRKMISTWTAVWVKICASWAGLLLYLWTLVAPLLLRNRDFS +>sp|Q96SA4-2|SERC2_HUMAN Isoform 2 of Serine incorporator 2 OS=Homo sapiens OX=9606 GN=SERINC2 
+MGAEGAPDFLSCPRVRRASCLCGSAPCILCSCCPASRNSTVSRLIFTFFLFLGVLVSIIMLSPGVESQLYKLPWVCEEGAGIPTVLQGHIDCGSLLGYRAVYRMCFATAAFFFFFTLLMLCVSSSRDPRAAIQNGFWFFKFLILVGLTVGAFYIPDGSFTNIWFYFGVVGSFLFILIQLVLLIDFAHSWNQRWLGKAEECDSRAWYAGLFFFTLLFYLLSIAAVALMFMYYTEPSGCHEGKVFISLNLTFCVCVSIAAVLPKVQDAQPNSGLLQASVITLYTMFVTWSALSSIPEQKCNPHLPTQLGNETVVAGPEGYETQWWDAPSIVGLIIFLLCTLFISLRSSDHRQVNSLMQTEECPPMLDATQQQQQVAACEGRAFDNEQDGVTYSYSFFHFCLVLASLHVMMTLTNWYKPGETRKMISTWTAVWVKICASWAGLLLYLWTLVAPLLLRNRDFS +>sp|Q96SA4-3|SERC2_HUMAN Isoform 3 of Serine incorporator 2 OS=Homo sapiens OX=9606 GN=SERINC2 +MRSMRLREEESPGPSHTASCLCGSAPCILCSCCPASRNSTVSRLIFTFFLFLGVLVSIIMLSPGVESQLYKLPWVCEEGAGIPTVLQGHIDCGSLLGYRAVYRMCFATAAFFFFFTLLMLCVSSSRDPRAAIQNGFWFFKFLILVGLTVGAFYIPDGSFTNIWFYFGVVGSFLFILIQLVLLIDFAHSWNQRWLGKAEECDSRAWYAGLFFFTLLFYLLSIAAVALMFMYYTEPSGCHEGKVFISLNLTFCVCVSIAAVLPKVQDAQPNSGLLQASVITLYTMFVTWSALSSIPEQKCNPHLPTQLGNETVVAGPEGYETQWWDAPSIVGLIIFLLCTLFISLRSSDHRQVNSLMQTEECPPMLDATQQQQQVAACEGRAFDNEQDGVTYSYSFFHFCLVLASLHVMMTLTNWYKPGETRKMISTWTAVWVKICASWAGLLLYLWTLVAPLLLRNRDFS +>sp|Q96SA4-4|SERC2_HUMAN Isoform 4 of Serine incorporator 2 OS=Homo sapiens OX=9606 GN=SERINC2 +MDGRMMRSMRLREEESPGPSHTASCLCGSAPCILCSCCPASRNSTVSRLIFTFFLFLGVLVSIIMLSPGVESQLYKLPWVCEEGAGIPTVLQGHIDCGSLLGYRAVYRMCFATAAFFFFFTLLMLCVSSSRDPRAAIQNGFWFFKFLILVGLTVGAFYIPDGSFTNIWFYFGVVGSFLFILIQLVLLIDFAHSWNQRWLGKAEECDSRAWYAGLFFFTLLFYLLSIAAVALMFMYYTEPSGCHEGKVFISLNLTFCVCVSIAAVLPKVQDAQPNSGLLQASVITLYTMFVTWSALSSIPEQKCNPHLPTQLGNETVVAGPEGYETQWWDAPSIVGLIIFLLCTLFISLRSSDHRQVNSLMQTEECPPMLDATQQQQQVAACEGRAFDNEQDGVTYSYSFFHFCLVLASLHVMMTLTNWYKPGETRKMISTWTAVWVKICASWAGLLLYLWTLVAPLLLRNRDFS +>sp|Q9NRX5|SERC1_HUMAN Serine incorporator 1 OS=Homo sapiens OX=9606 GN=SERINC1 PE=1 SV=1 
+MGSVLGLCSMASWIPCLCGSAPCLLCRCCPSGNNSTVTRLIYALFLLVGVCVACVMLIPGMEEQLNKIPGFCENEKGVVPCNILVGYKAVYRLCFGLAMFYLLLSLLMIKVKSSSDPRAAVHNGFWFFKFAAAIAIIIGAFFIPEGTFTTVWFYVGMAGAFCFILIQLVLLIDFAHSWNESWVEKMEEGNSRCWYAALLSATALNYLLSLVAIVLFFVYYTHPASCSENKAFISVNMLLCVGASVMSILPKIQESQPRSGLLQSSVITVYTMYLTWSAMTNEPETNCNPSLLSIIGYNTTSTVPKEGQSVQWWHAQGIIGLILFLLCVFYSSIRTSNNSQVNKLTLTSDESTLIEDGGARSDGSLEDGDDVHRAVDNERDGVTYSYSFFHFMLFLASLYIMMTLTNWYRYEPSREMKSQWTAVWVKISSSWIGIVLYVWTLVAPLVLTNRDFD +>sp|O43768|ENSA_HUMAN Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA PE=1 SV=1 +MSQKQEEENPAEETGEEKQDTQEKEGILPERAEEAKLKAKYPSLGQKPGGSDFLMKRLQKGQKYFDSGDYNMAKAKMKNKQLPSAGPDKNLVTGDHIPTPQDLPQRKSSLVTSKLAGGQVE +>sp|O43768-2|ENSA_HUMAN Isoform 2 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA +MSQKQEEENPAEETGEEKQDTQEKEGILPERAEEAKLKAKYPSLGQKPGGSDFLMKRLQKGQKYFDSGDYNMAKAKMKNKQLPSAGPDKNLVTGDHIPTPQDLPQRKSSLVTSKLAG +>sp|O43768-3|ENSA_HUMAN Isoform 3 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA +MSQKQEEENPAEETGEEKQDTQEKEGILPERAEEAKLKAKYPSLGQKPGGSDFLMKRLQKGDYKSLHWSVLLCADEMQKYFDSGDYNMAKAKMKNKQLPSAGPDKNLVTGDHIPTPQDLPQRKSSLVTSKLAGGQVE +>sp|O43768-4|ENSA_HUMAN Isoform 4 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA +MSQKQEEENPAEETGEEKQDTQEKEGILPERAEEAKLKAKYPSLGQKPGGSDFLMKRLQKGVWGIASYPLSLGLKEVLRMKSVEQKYFDSGDYNMAKAKMKNKQLPSAGPDKNLVTGDHIPTPQDLPQRKSSLVTSKLAG +>sp|O43768-5|ENSA_HUMAN Isoform 5 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA +MAGGLGCDVCYWFVEDTQEKEGILPERAEEAKLKAKYPSLGQKPGGSDFLMKRLQKGQKYFDSGDYNMAKAKMKNKQLPSAGPDKNLVTGDHIPTPQDLPQRKSSLVTSKLAGGQVE +>sp|O43768-6|ENSA_HUMAN Isoform 6 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA +MAGGLGCDVCYWFVEDTQEKEGILPERAEEAKLKAKYPSLGQKPGGSDFLMKRLQKGQKYFDSGDYNMAKAKMKNKQLPSAGPDKNLVTGDHIPTPQDLPQRKSSLVTSKLAG +>sp|O43768-7|ENSA_HUMAN Isoform 7 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA +MAGGLGCDVCYWFVEDTQEKEGILPERAEEAKLKAKYPSLGQKPGGSDFLMKRLQKGDYKSLHWSVLLCADEMQKYFDSGDYNMAKAKMKNKQLPSAGPDKNLVTGDHIPTPQDLPQRKSSLVTSKLAGGQVE +>sp|O43768-8|ENSA_HUMAN Isoform 8 of 
Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA +MSQKQEEENPAEETGEEKQDTQEKEGILPERAEEAKLKAKYPSLGQKPGGSDFLMKRLQKGVWGIVSYPLSLELKEVLRMKSVEVLLDPFLEVLLLNRSRGEFEI +>sp|O43768-9|ENSA_HUMAN Isoform 9 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA +MSQKQEEENPAEETGEEKQDTQEKEGILPERAEEAKLKAKYPSLGQKPGGSDFLMKRLQKGDYKSLHWSVLLCADEMQKYFDSGDYNMAKAKMKNKQLPSAGPDKNLVTGDHIPTPQDLPQRKSSLVTSKLAG +>sp|Q15751|HERC1_HUMAN Probable E3 ubiquitin-protein ligase HERC1 OS=Homo sapiens OX=9606 GN=HERC1 PE=1 SV=2 +MATMIPPVKLKWLEHLNSSWITEDSESIATREGVAVLYSKLVSNKEVVPLPQQVLCLKGPQLPDFERESLSSDEQDHYLDALLSSQLALAKMVCSDSPFAGALRKRLLVLQRVFYALSNKYHDKGKVKQQQHSPESSSGSADVHSVSERPRSSTDALIEMGVRTGLSLLFALLRQSWMMPVSGPGLSLCNDVIHTAIEVVSSLPPLSLANESKIPPMGLDCLSQVTTFLKGVTIPNSGADTLGRRLASELLLGLAAQRGSLRYLLEWIEMALGASAVVHTMEKGKLLSSQEGMISFDCFMTILMQMRRSLGSSADRSQWREPTRTSDGLCSLYEAALCLFEEVCRMASDYSRTCASPDSIQTGDAPIVSETCEVYVWGSNSSHQLVEGTQEKILQPKLAPSFSDAQTIEAGQYCTFVISTDGSVRACGKGSYGRLGLGDSNNQSTLKKLTFEPHRSIKKVSSSKGSDGHTLAFTTEGEVFSWGDGDYGKLGHGNSSTQKYPKLIQGPLQGKVVVCVSAGYRHSAAVTEDGELYTWGEGDFGRLGHGDSNSRNIPTLVKDISNVGEVSCGSSHTIALSKDGRTVWSFGGGDNGKLGHGDTNRVYKPKVIEALQGMFIRKVCAGSQSSLALTSTGQVYAWGCGACLGCGSSEATALRPKLIEELAATRIVDVSIGDSHCLALSHDNEVYAWGNNSMGQCGQGNSTGPITKPKKVSGLDGIAIQQISAGTSHSLAWTALPRDRQVVAWHRPYCVDLEESTFSHLRSFLERYCDKINSEIPPLPFPSSREHHSFLKLCLKLLSNHLALALAGGVATSILGRQAGPLRNLLFRLMDSTVPDEIQEVVIETLSVGATMLLPPLRERMELLHSLLPQGPDRWESLSKGQRMQLDIILTSLQDHTHVASLLGYSSPSDAADLSSVCTGYGNLSDQPYGTQSCHPDTHLAEILMKTLLRNLGFYTDQAFGELEKNSDKFLLGTSSSENSQPAHLHELLCSLQKQLLAFCHINNISENSSSVALLHKHLQLLLPHATDIYSRSANLLKESPWNGSVGEKLRDVIYVSAAGSMLCQIVNSLLLLPVSVARPLLSYLLDLLPPLDCLNRLLPAADLLEDQELQWPLHGGPELIDPAGLPLPQPAQSWVWLVDLERTIALLIGRCLGGMLQGSPVSPEEQDTAYWMKTPLFSDGVEMDTPQLDKCMSCLLEVALSGNEEQKPFDYKLRPEIAVYVDLALGCSKEPARSLWISMQDYAVSKDWDSATLSNESLLDTVSRFVLAALLKHTNLLSQACGESRYQPGKHLSEVYRCVYKVRSRLLACKNLELIQTRSSSRDRWISENQDSADVDPQEHSFTRTIDEEAEMEEQAERDREEGHPEPEDEEEEREHEVMTAGKIFQCFLSAREVARSRDRDRMNSGAGSGARADDPPPQSQQERRVSTDLPEGQDVYTAACNSVIHRCALLILGVSPVIDELQKRREEGQLQQPSTSASEGGGLMTRSESLTAESRLVHTSPNYRLIK
SRSESDLSQPESDEEGYALSGRRNVDLDLAASHRKRGPMHSQLESLSDSWARLKHSRDWLCNSSYSFESDFDLTKSLGVHTLIENVVSFVSGDVGNAPGFKEPEESMSTSPQASIIAMEQQQLRAELRLEALHQILVLLSGMEEKGSISLAGSRLSSGFQSSTLLTSVRLQFLAGCFGLGTVGHTGGKGESGRLHHYQDGIRAAKRNIQIEIQVAVHKIYQQLSATLERALQANKHHIEAQQRLLLVTVFALSVHYQPVDVSLAISTGLLNVLSQLCGTDTMLGQPLQLLPKTGVSQLSTALKVASTRLLQILAITTGTYADKLSPKVVQSLLDLLCSQLKNLLSQTGVLHMASFGEGEQEDGEEEEKKVDSSGETEKKDFRAALRKQHAAELHLGDFLVFLRRVVSSKAIQSKMASPKWTEVLLNIASQKCSSGIPLVGNLRTRLLALHVLEAVLPACESGVEDDQMAQIVERLFSLLSDCMWETPIAQAKHAIQIKEKEQEIKLQKQGELEEEDENLPIQEVSFDPEKAQCCLVENGQILTHGSGGKGYGLASTGVTSGCYQWKFYIVKENRGNEGTCVGVSRWPVHDFNHRTTSDMWLYRAYSGNLYHNGEQTLTLSSFTQGDFITCVLDMEARTISFGKNGEEPKLAFEDVDAAELYPCVMFYSSNPGEKVKICDMQMRGTPRDLLPGDPICSPVAAVLAEATIQLIRILHRTDRWTYCINKKMMERLHKIKICIKESGQKLKKSRSVQSREENEMREEKESKEEEKGKHTRHGLADLSELQLRTLCIEVWPVLAVIGGVDAGLRVGGRCVHKQTGRHATLLGVVKEGSTSAKVQWDEAEITISFPTFWSPSDTPLYNLEPCEPLPFDVARFRGLTASVLLDLTYLTGVHEDMGKQSTKRHEKKHRHESEEKGDVEQKPESESALDMRTGLTSDDVKSQSTTSSKSENEIASFSLDPTLPSVESQHQITEGKRKNHEHMSKNHDVAQSEIRAVQLSYLYLGAMKSLSALLGCSKYAELLLIPKVLAENGHNSDCASSPVVHEDVEMRAALQFLMRHMVKRAVMRSPIKRALGLADLERAQAMIYKLVVHGLLEDQFGGKIKQEIDQQAEESDPAQQAQTPVTTSPSASSTTSFMSSSLEDTTTATTPVTDTETVPASESPGVMPLSLLRQMFSSYPTTTVLPTRRAQTPPISSLPTSPSDEVGRRQSLTSPDSQSARPANRTALSDPSSRLSTSPPPPAIAVPLLEMGFSLRQIAKAMEATGARGEADAQNITVLAMWMIEHPGHEDEEEPQSGSTADSRPGAAVLGSGGKSNDPCYLQSPGDIPSADAAEMEEGFSESPDNLDHTENAASGSGPSARGRSAVTRRHKFDLAARTLLARAAGLYRSVQAHRNQSRREGISLQQDPGALYDFNLDEELEIDLDDEAMEAMFGQDLTSDNDILGMWIPEVLDWPTWHVCESEDREEVVVCELCECSVVSFNQHMKRNHPGCGRSANRQGYRSNGSYVDGWFGGECGSGNPYYLLCGTCREKYLAMKTKSKSTSSERYKGQAPDLIGKQDSVYEEDWDMLDVDEDEKLTGEEEFELLAGPLGLNDRRIVPEPVQFPDSDPLGASVAMVTATNSMEETLMQIGCHGSVEKSSSGRITLGEQAAALANPHDRVVALRRVTAAAQVLLARTMVMRALSLLSVSGSSCSLAAGLESLGLTDIRTLVRLMCLAAAGRAGLSTSPSAMASTSERSRGGHSKANKPISCLAYLSTAVGCLASNAPSAAKLLVQLCTQNLISAATGVNLTTVDDSIQRKFLPSFLRGIAEENKLVTSPNFVVTQALVALLADKGAKLRPNYDKSEVEKKGPLELANALAACCLSSRLSSQHRQWAAQQLVRTLAAHDRDNQTTLQTLADMGGDLRKCSFIKLEAHQNRVMTCVWCNKKGLLATSGNDGTIRVWNVTKKQYSLQQTCVFNRLEGDAEESLGSPSDPSFSPVSWSISGKYLAGALEKMV
NIWQVNGGKGLVDIQPHWVSALAWPEEGPATAWSGESPELLLVGRMDGSLGLIEVVDVSTMHRRELEHCYRKDVSVTCIAWFSEDRPFAVGYFDGKLLLGTKEPLEKGGIVLIDAHKDTLISMKWDPTGHILMTCAKEDSVKLWGSISGCWCCLHSLCHPSIVNGIAWCRLPGKGSKLQLLMATGCQSGLVCVWRIPQDTTQTNVTSAEGWWEQESNCQDGYRKSSGAKCVYQLRGHITPVRTVAFSSDGLALVSGGLGGLMNIWSLRDGSVLQTVVIGSGAIQTTVWIPEVGVAACSNRSKDVLVVNCTAEWAAANHVLATCRTALKQQGVLGLNMAPCMRAFLERLPMMLQEQYAYEKPHVVCGDQLVHSPYMQCLASLAVGLHLDQLLCNPPVPPHHQNCLPDPASWNPNEWAWLECFSTTIKAAEALTNGAQFPESFTVPDLEPVPEDELVFLMDNSKWINGMDEQIMSWATSRPEDWHLGGKCDVYLWGAGRHGQLAEAGRNVMVPAAAPSFSQAQQVICGQNCTFVIQANGTVLACGEGSYGRLGQGNSDDLHVLTVISALQGFVVTQLVTSCGSDGHSMALTESGEVFSWGDGDYGKLGHGNSDRQRRPRQIEALQGEEVVQMSCGFKHSAVVTSDGKLFTFGNGDYGRLGLGNTSNKKLPERVTALEGYQIGQVACGLNHTLAVSADGSMVWAFGDGDYGKLGLGNSTAKSSPQKIDVLCGIGIKKVACGTQFSVALTKDGHVYTFGQDRLIGLPEGRARNHNRPQQIPVLAGVIIEDVAVGAEHTLALASNGDVYAWGSNSEGQLGLGHTNHVREPTLVTGLQGKNVRQISAGRCHSAAWTAPPVPPRAPGVSVPLQLGLPDTVPPQYGALREVSIHTVRARLRLLYHFSDLMYSSWRLLNLSPNNQNSTSHYNAGTWGIVQGQLRPLLAPRVYTLPMVRSIGKTMVQGKNYGPQITVKRISTRGRKCKPIFVQIARQVVKLNASDLRLPSRAWKVKLVGEGADDAGGVFDDTITEMCQELETGIVDLLIPSPNATAEVGYNRDRFLFNPSACLDEHLMQFKFLGILMGVAIRTKKPLDLHLAPLVWKQLCCVPLTLEDLEEVDLLYVQTLNSILHIEDSGITEESFHEMIPLDSFVGQSADGKMVPIIPGGNSIPLTFSNRKEYVERAIEYRLHEMDRQVAAVREGMSWIVPVPLLSLLTAKQLEQMVCGMPEISVEVLKKVVRYREVDEQHQLVQWFWHTLEEFSNEERVLFMRFVSGRSRLPANTADISQRFQIMKVDRPYDSLPTSQTCFFQLRLPPYSSQLVMAERLRYAINNCRSIDMDNYMLSRNVDNAEGSDTDY +>sp|O95714|HERC2_HUMAN E3 ubiquitin-protein ligase HERC2 OS=Homo sapiens OX=9606 GN=HERC2 PE=1 SV=2 
+MPSESFCLAAQARLDSKWLKTDIQLAFTRDGLCGLWNEMVKDGEIVYTGTESTQNGELPPRKDDSVEPSGTKKEDLNDKEKKDEEETPAPIYRAKSILDSWVWGKQPDVNELKECLSVLVKEQQALAVQSATTTLSALRLKQRLVILERYFIALNRTVFQENVKVKWKSSGISLPPVDKKSSRPAGKGVEGLARVGSRAALSFAFAFLRRAWRSGEDADLCSELLQESLDALRALPEASLFDESTVSSVWLEVVERATRFLRSVVTGDVHGTPATKGPGSIPLQDQHLALAILLELAVQRGTLSQMLSAILLLLQLWDSGAQETDNERSAQGTSAPLLPLLQRFQSIICRKDAPHSEGDMHLLSGPLSPNESFLRYLTLPQDNELAIDLRQTAVVVMAHLDRLATPCMPPLCSSPTSHKGSLQEVIGWGLIGWKYYANVIGPIQCEGLANLGVTQIACAEKRFLILSRNGRVYTQAYNSDTLAPQLVQGLASRNIVKIAAHSDGHHYLALAATGEVYSWGCGDGGRLGHGDTVPLEEPKVISAFSGKQAGKHVVHIACGSTYSAAITAEGELYTWGRGNYGRLGHGSSEDEAIPMLVAGLKGLKVIDVACGSGDAQTLAVTENGQVWSWGDGDYGKLGRGGSDGCKTPKLIEKLQDLDVVKVRCGSQFSIALTKDGQVYSWGKGDNQRLGHGTEEHVRYPKLLEGLQGKKVIDVAAGSTHCLALTEDSEVHSWGSNDQCQHFDTLRVTKPEPAALPGLDTKHIVGIACGPAQSFAWSSCSEWSIGLRVPFVVDICSMTFEQLDLLLRQVSEGMDGSADWPPPQEKECVAVATLNLLRLQLHAAISHQVDPEFLGLGLGSILLNSLKQTVVTLASSAGVLSTVQSAAQAVLQSGWSVLLPTAEERARALSALLPCAVSGNEVNISPGRRFMIDLLVGSLMADGGLESALHAAITAEIQDIEAKKEAQKEKEIDEQEANASTFHRSRTPLDKDLINTGICESSGKQCLPLVQLIQQLLRNIASQTVARLKDVARRISSCLDFEQHSRERSASLDLLLRFQRLLISKLYPGESIGQTSDISSPELMGVGSLLKKYTALLCTHIGDILPVAASIASTSWRHFAEVAYIVEGDFTGVLLPELVVSIVLLLSKNAGLMQEAGAVPLLGGLLEHLDRFNHLAPGKERDDHEELAWPGIMESFFTGQNCRNNEEVTLIRKADLENHNKDGGFWTVIDGKVYDIKDFQTQSLTGNSILAQFAGEDPVVALEAALQFEDTRESMHAFCVGQYLEPDQEIVTIPDLGSLSSPLIDTERNLGLLLGLHASYLAMSTPLSPVEIECAKWLQSSIFSGGLQTSQIHYSYNEEKDEDHCSSPGGTPASKSRLCSHRRALGDHSQAFLQAIADNNIQDHNVKDFLCQIERYCRQCHLTTPIMFPPEHPVEEVGRLLLCCLLKHEDLGHVALSLVHAGALGIEQVKHRTLPKSVVDVCRVVYQAKCSLIKTHQEQGRSYKEVCAPVIERLRFLFNELRPAVCNDLSIMSKFKLLSSLPRWRRIAQKIIRERRKKRVPKKPESTDDEEKIGNEESDLEEACILPHSPINVDKRPIAIKSPKDKWQPLLSTVTGVHKYKWLKQNVQGLYPQSPLLSTIAEFALKEEPVDVEKMRKCLLKQLERAEVRLEGIDTILKLASKNFLLPSVQYAMFCGWQRLIPEGIDIGEPLTDCLKDVDLIPPFNRMLLEVTFGKLYAWAVQNIRNVLMDASAKFKELGIQPVPLQTITNENPSGPSLGTIPQARFLLVMLSMLTLQHGANNLDLLLNSGMLALTQTALRLIGPSCDNVEEDMNASAQGASATVLEETRKETAPVQLPVSGPELAAMMKIGTRVMRGVDWKWGDQDGPPPGLGRVIGELGEDGWIRVQWDTGSTNSYRMGKEGKYDLKLAELPAAAQPSAEDSDTEDDSEAEQTERNIHPTAMMFTSTINLLQTLCLSAGVHAEIMQSEATKTLCGLLRM
LVESGTTDKTSSPNRLVYREQHRSWCTLGFVRSIALTPQVCGALSSPQWITLLMKVVEGHAPFTATSLQRQILAVHLLQAVLPSWDKTERARDMKCLVEKLFDFLGSLLTTCSSDVPLLRESTLRRRRVRPQASLTATHSSTLAEEVVALLRTLHSLTQWNGLINKYINSQLRSITHSFVGRPSEGAQLEDYFPDSENPEVGGLMAVLAVIGGIDGRLRLGGQVMHDEFGEGTVTRITPKGKITVQFSDMRTCRVCPLNQLKPLPAVAFNVNNLPFTEPMLSVWAQLVNLAGSKLEKHKIKKSTKQAFAGQVDLDLLRCQQLKLYILKAGRALLSHQDKLRQILSQPAVQETGTVHTDDGAVVSPDLGDMSPEGPQPPMILLQQLLASATQPSPVKAIFDKQELEAAALAVCQCLAVESTHPSSPGFEDCSSSEATTPVAVQHIRPARVKRRKQSPVPALPIVVQLMEMGFSRRNIEFALKSLTGASGNASSLPGVEALVGWLLDHSDIQVTELSDADTVSDEYSDEEVVEDVDDAAYSMSTGAVVTESQTYKKRADFLSNDDYAVYVRENIQVGMMVRCCRAYEEVCEGDVGKVIKLDRDGLHDLNVQCDWQQKGGTYWVRYIHVELIGYPPPSSSSHIKIGDKVRVKASVTTPKYKWGSVTHQSVGVVKAFSANGKDIIVDFPQQSHWTGLLSEMELVPSIHPGVTCDGCQMFPINGSRFKCRNCDDFDFCETCFKTKKHNTRHTFGRINEPGQSAVFCGRSGKQLKRCHSSQPGMLLDSWSRMVKSLNVSSSVNQASRLIDGSEPCWQSSGSQGKHWIRLEIFPDVLVHRLKMIVDPADSSYMPSLVVVSGGNSLNNLIELKTININPSDTTVPLLNDCTEYHRYIEIAIKQCRSSGIDCKIHGLILLGRIRAEEEDLAAVPFLASDNEEEEDEKGNSGSLIRKKAAGLESAATIRTKVFVWGLNDKDQLGGLKGSKIKVPSFSETLSALNVVQVAGGSKSLFAVTVEGKVYACGEATNGRLGLGISSGTVPIPRQITALSSYVVKKVAVHSGGRHATALTVDGKVFSWGEGDDGKLGHFSRMNCDKPRLIEALKTKRIRDIACGSSHSAALTSSGELYTWGLGEYGRLGHGDNTTQLKPKMVKVLLGHRVIQVACGSRDAQTLALTDEGLVFSWGDGDFGKLGRGGSEGCNIPQNIERLNGQGVCQIECGAQFSLALTKSGVVWTWGKGDYFRLGHGSDVHVRKPQVVEGLRGKKIVHVAVGALHCLAVTDSGQVYAWGDNDHGQQGNGTTTVNRKPTLVQGLEGQKITRVACGSSHSVAWTTVDVATPSVHEPVLFQTARDPLGASYLGVPSDADSSAASNKISGASNSKPNRPSLAKILLSLDGNLAKQQALSHILTALQIMYARDAVVGALMPAAMIAPVECPSFSSAAPSDASAMASPMNGEECMLAVDIEDRLSPNPWQEKREIVSSEDAVTPSAVTPSAPSASARPFIPVTDDLGAASIIAETMTKTKEDVESQNKAAGPEPQALDEFTSLLIADDTRVVVDLLKLSVCSRAGDRGRDVLSAVLSGMGTAYPQVADMLLELCVTELEDVATDSQSGRLSSQPVVVESSHPYTDDTSTSGTVKIPGAEGLRVEFDRQCSTERRHDPLTVMDGVNRIVSVRSGREWSDWSSELRIPGDELKWKFISDGSVNGWGWRFTVYPIMPAAGPKELLSDRCVLSCPSMDLVTCLLDFRLNLASNRSIVPRLAASLAACAQLSALAASHRMWALQRLRKLLTTEFGQSININRLLGENDGETRALSFTGSALAALVKGLPEALQRQFEYEDPIVRGGKQLLHSPFFKVLVALACDLELDTLPCCAETHKWAWFRRYCMASRVAVALDKRTPLPRLFLDEVAKKIRELMADSENMDVLHESHDIFKREQDEQLVQWMNRRPDDWTLSAGGSGTIYGWGHNHRGQLGGIEGAKVKVPTPCEALATLRPVQLIGGEQTLFA
VTADGKLYATGYGAGGRLGIGGTESVSTPTLLESIQHVFIKKVAVNSGGKHCLALSSEGEVYSWGEAEDGKLGHGNRSPCDRPRVIESLRGIEVVDVAAGGAHSACVTAAGDLYTWGKGRYGRLGHSDSEDQLKPKLVEALQGHRVVDIACGSGDAQTLCLTDDDTVWSWGDGDYGKLGRGGSDGCKVPMKIDSLTGLGVVKVECGSQFSVALTKSGAVYTWGKGDYHRLGHGSDDHVRRPRQVQGLQGKKVIAIATGSLHCVCCTEDGEVYTWGDNDEGQLGDGTTNAIQRPRLVAALQGKKVNRVACGSAHTLAWSTSKPASAGKLPAQVPMEYNHLQEIPIIALRNRLLLLHHLSELFCPCIPMFDLEGSLDETGLGPSVGFDTLRGILISQGKEAAFRKVVQATMVRDRQHGPVVELNRIQVKRSRSKGGLAGPDGTKSVFGQMCAKMSSFGPDSLLLPHRVWKVKFVGESVDDCGGGYSESIAEICEELQNGLTPLLIVTPNGRDESGANRDCYLLSPAARAPVHSSMFRFLGVLLGIAIRTGSPLSLNLAEPVWKQLAGMSLTIADLSEVDKDFIPGLMYIRDNEATSEEFEAMSLPFTVPSASGQDIQLSSKHTHITLDNRAEYVRLAINYRLHEFDEQVAAVREGMARVVPVPLLSLFTGYELETMVCGSPDIPLHLLKSVATYKGIEPSASLIQWFWEVMESFSNTERSLFLRFVWGRTRLPRTIADFRGRDFVIQVLDKYNPPDHFLPESYTCFFLLKLPRYSCKQVLEEKLKYAIHFCKSIDTDDYARIALTGEPAADDSSDDSDNEDVDSFASDSTQDYLTGH +>sp|Q6ZN18|AEBP2_HUMAN Zinc finger protein AEBP2 OS=Homo sapiens OX=9606 GN=AEBP2 PE=1 SV=2 +MAAAITDMADLEELSRLSPLPPGSPGSAARGRAEPPEEEEEEEEEEEEAEAEAVAALLLNGGSGGGGGGGGGGVGGGEAETMSEPSPESASQAGEDEDEEEDDEEEEDESSSSGGGEEESSAESLVGSSGGSSSDETRSLSPGAASSSSGDGDGKEGLEEPKGPRGSQGGGGGGSSSSSVVSSGGDEGYGTGGGGSSATSGGRRGSLEMSSDGEPLSRMDSEDSISSTIMDVDSTISSGRSTPAMMNGQGSTTSSSKNIAYNCCWDQCQACFNSSPDLADHIRSIHVDGQRGGVFVCLWKGCKVYNTPSTSQSWLQRHMLTHSGDKPFKCVVGGCNASFASQGGLARHVPTHFSQQNSSKVSSQPKAKEESPSKAGMNKRRKLKNKRRRSLPRPHDFFDAQTLDAIRHRAICFNLSAHIESLGKGHSVVFHSTVIAKRKEDSGKIKLLLHWMPEDILPDVWVNESERHQLKTKVVHLSKLPKDTALLLDPNIYRTMPQKRLKRTLIRKVFNLYLSKQ +>sp|Q6ZN18-2|AEBP2_HUMAN Isoform 2 of Zinc finger protein AEBP2 OS=Homo sapiens OX=9606 GN=AEBP2 
+MAAAITDMADLEELSRLSPLPPGSPGSAARGRAEPPEEEEEEEEEEEEAEAEAVAALLLNGGSGGGGGGGGGGVGGGEAETMSEPSPESASQAGEDEDEEEDDEEEEDESSSSGGGEEESSAESLVGSSGGSSSDETRSLSPGAASSSSGDGDGKEGLEEPKGPRGSQGGGGGGSSSSSVVSSGGDEGYGTGGGGSSATSGGRRGSLEMSSDGEPLSRMDSEDSISSTIMDVDSTISSGRSTPAMMNGQGSTTSSSKNIAYNCCWDQCQACFNSSPDLADHIRSIHVDGQRGGVFVCLWKGCKVYNTPSTSQSWLQRHMLTHSGDKPFKCVVGGCNASFASQGGLARHVPTHFSQQNSSKVSSQPKAKEESPSKAGMNKRRKLKNKRRRSLPRPHDFFDAQTLDAIRHRAICFNLSAHIESLGKGHSVVFHSTVIAKRKEDSGKIKLLLHWMPEDILPDVWVNESERHQLKTKVVHLSKLPKDTALLLDPNIYRTMPQKRLKR +>sp|Q6ZN18-3|AEBP2_HUMAN Isoform 3 of Zinc finger protein AEBP2 OS=Homo sapiens OX=9606 GN=AEBP2 +MYTRRYSSISSTIMDVDSTISSGRSTPAMMNGQGSTTSSSKNIAYNCCWDQCQACFNSSPDLADHIRSIHVDGQRGGVFVCLWKGCKVYNTPSTSQSWLQRHMLTHSGDKPFKCVVGGCNASFASQGGLARHVPTHFSQQNSSKVSSQPKAKEESPSKAGMNKRRKLKNKRRRSLPRPHDFFDAQTLDAIRHRAICFNLSAHIESLGKGHSVVFHSTVIAKRKEDSGKIKLLLHWMPEDILPDVWVNESERHQLKTKVVHLSKLPKDTALLLDPNIYRTMPQKRLKRTLIRKVFNLYLSKQ +>sp|O15083|ERC2_HUMAN ERC protein 2 OS=Homo sapiens OX=9606 GN=ERC2 PE=1 SV=3 +MYGSARTITNLEGSPSRSPRLPRSPRLGHRRTSSGGGGGTGKTLSMENIQSLNAAYATSGPMYLSDHEGVASTTYPKGTMTLGRATNRAVYGGRVTAMGSSPNIASAGLSHTDVLSYTDQHGGLTGSSHHHHHQVPSMLRQVRDSTMLDLQAQLKELQRENDLLRKELDIKDSKLGSSMNSIKTFWSPELKKERVLRKEEAARMSVLKEQMRVSHEENQHLQLTIQALQDELRTQRDLNHLLQQESGNRGAEHFTIELTEENFRRLQAEHDRQAKELFLLRKTLEEMELRIETQKQTLNARDESIKKLLEMLQSKGLPSKSLEDDNERTRRMAEAESQVSHLEVILDQKEKENIHLREELHRRSQLQPEPAKTKALQTVIEMKDTKIASLERNIRDLEDEIQMLKANGVLNTEDREEEIKQIEVYKSHSKFMKTKIDQLKQELSKKESELLALQTKLETLSNQNSDCKQHIEVLKESLTAKEQRAAILQTEVDALRLRLEEKESFLNKKTKQLQDLTEEKGTLAGEIRDMKDMLEVKERKINVLQKKIENLQEQLRDKDKQLTNLKDRVKSLQTDSSNTDTALATLEEALSEKERIIERLKEQRERDDRERLEEIESFRKENKDLKEKVNALQAELTEKESSLIDLKEHASSLASAGLKRDSKLKSLEIAIEQKKEECSKLEAQLKKAHNIEDDSRMNPEFADQIKQLDKEASYYRDECGKAQAEVDRLLEILKEVENEKNDKDKKIAELESLTLRHMKDQNKKVANLKHNQQLEKKKNAQLLEEVRRREDSMADNSQHLQIEELMNALEKTRQELDATKARLASTQQSLAEKEAHLANLRIERRKQLEEILEMKQEALLAAISEKDANIALLELSASKKKKTQEEVMALKREKDRLVHQLKQQTQNRMKLMADNYDDDHHHYHHHHHHHHHRSPGRSQHSNHRPSPDQDDEEGIWA +>sp|P23763|VAMP1_HUMAN_Vesicle-associated membrane 
protein 1 OS=Homo sapiens OX=9606 GN=VAMP1 PE=1 SV=1 +MSAPAQPPAEGTEGTAPGGGPPGPPPNMTSNRRLQQTQAQVEEVVDIIRVNVDKVLERDQKLSELDDRADALQAGASQFESSAAKLKRKYWWKNCKMMIMLGAICAIIVVVIVIYFFT +>sp|P23763-3|VAMP1_HUMAN_Isoform 2 of Vesicle-associated membrane protein 1 OS=Homo sapiens OX=9606 GN=VAMP1 +MSAPAQPPAEGTEGTAPGGGPPGPPPNMTSNRRLQQTQAQVEEVVDIIRVNVDKVLERDQKLSELDDRADALQAGASQFESSAAKLKRKYWWKNCKMMIMLGAICAIIVVVIVSKYR +>sp|P23763-2|VAMP1_HUMAN_Isoform 3 of Vesicle-associated membrane protein 1 OS=Homo sapiens OX=9606 GN=VAMP1 +MSAPAQPPAEGTEGTAPGGGPPGPPPNMTSNRRLQQTQAQVEEVVDIIRVNVDKVLERDQKLSELDDRADALQAGASQFESSAAKLKRKYWWKNCKMMIMLGAICAIIVVVIVRRD +>sp|Q15836|VAMP3_HUMAN_Vesicle-associated membrane protein 3 OS=Homo sapiens OX=9606 GN=VAMP3 PE=1 SV=3 +MSTGPTAATGSNRRLQQTQNQVDEVVDIMRVNVDKVLERDQKLSELDDRADALQAGASQFETSAAKLKRKYWWKNCKMWAIGITVLVIFIIIIIVWVVSS +>sp|P63027|VAMP2_HUMAN_Vesicle-associated membrane protein 2 OS=Homo sapiens OX=9606 GN=VAMP2 PE=1 SV=3 +MSATAATAPPAAPAGEGGPPAPPPNLTSNRRLQQTQAQVDEVVDIMRVNVDKVLERDQKLSELDDRADALQAGASQFETSAAKLKRKYWWKNLKMMIILGVICAIILIIIIVYFST +>sp|O75379|VAMP4_HUMAN_Vesicle-associated membrane protein 4 OS=Homo sapiens OX=9606 GN=VAMP4 PE=1 SV=2 +MPPKFKRHLNDDDVTGSVKSERRNLLEDDSDEEEDFFLRGPSGPRFGPRNDKIKHVQNQVDEVIDVMQENITKVIERGERLDELQDKSESLSDNATAFSNRSKQLRRQMWWRGCKIKAIMALVAAILLLVIIILIVMKYRT +>sp|O75379-2|VAMP4_HUMAN_Isoform 2 of Vesicle-associated membrane protein 4 OS=Homo sapiens OX=9606 GN=VAMP4 +MPPKFKRHLNDDDVTGSVKSERRNLLEDDSDEEEDFFLGPSGPRFGPRNDKIKHVQNQVDEVIDVMQENITKVIERGERLDELQDKSESLSDNATAFSNRSKQLRRQMWWRGCKIKAIMALVAAILLLVIIILIVMKYRT +>sp|O95183|VAMP5_HUMAN_Vesicle-associated membrane protein 5 OS=Homo sapiens OX=9606 GN=VAMP5 PE=1 SV=1 +MAGIELERCQQQANEVTEIMRNNFGKVLERGVKLAELQQRSDQLLDMSSTFNKTTQNLAQKKCWENIRYRICVGLVVVGVLLIILIVLLVVFLPQSSDSSSAPRTQDAGIASGPGN +>sp|P51809|VAMP7_HUMAN_Vesicle-associated membrane protein 7 OS=Homo sapiens OX=9606 GN=VAMP7 PE=1 SV=3 
+MAILFAVVARGTTILAKHAWCGGNFLEVTEQILAKIPSENNKLTYSHGNYLFHYICQDRIVYLCITDDDFERSRAFNFLNEIKKRFQTTYGSRAQTALPYAMNSEFSSVLAAQLKHHSENKGLDKVMETQAQVDELKGIMVRNIDLVAQRGERLELLIDKTENLVDSSVTFKTTSRNLARAMCMKNLKLTIIIIIVSIVFIYIIVSPLCGGFTWPSCVKK +>sp|P51809-2|VAMP7_HUMAN_Isoform 2 of Vesicle-associated membrane protein 7 OS=Homo sapiens OX=9606 GN=VAMP7 +MAILFAVVARGTTILAKHAWCGGNFLEVTEQILAKIPSENNKLTYSHGNYLFHYICQDRIVYLCITDDDFERSRAFNFLNEIKKRFQTTYGSRAQTALPYAMNSEFSSVLAAQLKHHSENKGLDKVMETQAQVDELKGIMVRNIVCHLQNYQQKSCSSHVYEEPQAHYYHHHRINCVHLYHCFTSLWWIYMAKLCEEIGKKKLPLTKDMREQGVKSNPCDSSLSHTDRWYLPVSSTLFSLFKILFHASRFIFVLSTSLFL +>sp|P51809-3|VAMP7_HUMAN_Isoform 3 of Vesicle-associated membrane protein 7 OS=Homo sapiens OX=9606 GN=VAMP7 +MAILFAVVARGTTILAKHAWCGGNFLEDFERSRAFNFLNEIKKRFQTTYGSRAQTALPYAMNSEFSSVLAAQLKHHSENKGLDKVMETQAQVDELKGIMVRNIDLVAQRGERLELLIDKTENLVDSSVTFKTTSRNLARAMCMKNLKLTIIIIIVSIVFIYIIVSPLCGGFTWPSCVKK +>sp|Q9BV40|VAMP8_HUMAN_Vesicle-associated membrane protein 8 OS=Homo sapiens OX=9606 GN=VAMP8 PE=1 SV=1 +MEEASEGGGNDRVRNLQSEVEGVKNIMTQNVERILARGENLEHLRNKTEDLEATSEHFKTTSQKVARKFWWKNVKMIVLICVIVFIIILFIVLFATGAFS +>sp|P54764|EPHA4_HUMAN Ephrin type-A receptor 4 OS=Homo sapiens OX=9606 GN=EPHA4 PE=1 SV=1 
+MAGIFYFALFSCLFGICDAVTGSRVYPANEVTLLDSRSVQGELGWIASPLEGGWEEVSIMDEKNTPIRTYQVCNVMEPSQNNWLRTDWITREGAQRVYIEIKFTLRDCNSLPGVMGTCKETFNLYYYESDNDKERFIRENQFVKIDTIAADESFTQVDIGDRIMKLNTEIRDVGPLSKKGFYLAFQDVGACIALVSVRVFYKKCPLTVRNLAQFPDTITGADTSSLVEVRGSCVNNSEEKDVPKMYCGADGEWLVPIGNCLCNAGHEERSGECQACKIGYYKALSTDATCAKCPPHSYSVWEGATSCTCDRGFFRADNDAASMPCTRPPSAPLNLISNVNETSVNLEWSSPQNTGGRQDISYNVVCKKCGAGDPSKCRPCGSGVHYTPQQNGLKTTKVSITDLLAHTNYTFEIWAVNGVSKYNPNPDQSVSVTVTTNQAAPSSIALVQAKEVTRYSVALAWLEPDRPNGVILEYEVKYYEKDQNERSYRIVRTAARNTDIKGLNPLTSYVFHVRARTAAGYGDFSEPLEVTTNTVPSRIIGDGANSTVLLVSVSGSVVLVVILIAAFVISRRRSKYSKAKQEADEEKHLNQGVRTYVDPFTYEDPNQAVREFAKEIDASCIKIEKVIGVGEFGEVCSGRLKVPGKREICVAIKTLKAGYTDKQRRDFLSEASIMGQFDHPNIIHLEGVVTKCKPVMIITEYMENGSLDAFLRKNDGRFTVIQLVGMLRGIGSGMKYLSDMSYVHRDLAARNILVNSNLVCKVSDFGMSRVLEDDPEAAYTTRGGKIPIRWTAPEAIAYRKFTSASDVWSYGIVMWEVMSYGERPYWDMSNQDVIKAIEEGYRLPPPMDCPIALHQLMLDCWQKERSDRPKFGQIVNMLDKLIRNPNSLKRTGTESSRPNTALLDPSSPEFSAVVSVGDWLQAIKMDRYKDNFTAAGYTTLEAVVHVNQEDLARIGITAITHQNKILSSVQAMRTQMQQMHGRMVPV +>sp|P54764-2|EPHA4_HUMAN Isoform 2 of Ephrin type-A receptor 4 OS=Homo sapiens OX=9606 GN=EPHA4 
+MKWEEVSIMDEKNTPIRTYQVCNVMEPSQNNWLRTDWITREGAQRVYIEIKFTLRDCNSLPGVMGTCKETFNLYYYESDNDKERFIRENQFVKIDTIAADESFTQVDIGDRIMKLNTEIRDVGPLSKKGFYLAFQDVGACIALVSVRVFYKKCPLTVRNLAQFPDTITGADTSSLVEVRGSCVNNSEEKDVPKMYCGADGEWLVPIGNCLCNAGHEERSGECQACKIGYYKALSTDATCAKCPPHSYSVWEGATSCTCDRGFFRADNDAASMPCTRPPSAPLNLISNVNETSVNLEWSSPQNTGGRQDISYNVVCKKCGAGDPSKCRPCGSGVHYTPQQNGLKTTKVSITDLLAHTNYTFEIWAVNGVSKYNPNPDQSVSVTVTTNQAAPSSIALVQAKEVTRYSVALAWLEPDRPNGVILEYEVKYYEKDQNERSYRIVRTAARNTDIKGLNPLTSYVFHVRARTAAGYGDFSEPLEVTTNTVPSRIIGDGANSTVLLVSVSGSVVLVVILIAAFVISRRRSKYSKAKQEADEEKHLNQGVRTYVDPFTYEDPNQAVREFAKEIDASCIKIEKVIGVGEFGEVCSGRLKVPGKREICVAIKTLKAGYTDKQRRDFLSEASIMGQFDHPNIIHLEGVVTKCKPVMIITEYMENGSLDAFLRKNDGRFTVIQLVGMLRGIGSGMKYLSDMSYVHRDLAARNILVNSNLVCKVSDFGMSRVLEDDPEAAYTTRGGKIPIRWTAPEAIAYRKFTSASDVWSYGIVMWEVMSYGERPYWDMSNQDVIKAIEEGYRLPPPMDCPIALHQLMLDCWQKERSDRPKFGQIVNMLDKLIRNPNSLKRTGTESSRPNTALLDPSSPEFSAVVSVGDWLQAIKMDRYKDNFTAAGYTTLEAVVHVNQEDLARIGITAITHQNKILSSVQAMRTQMQQMHGRMVPV
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/workflow/ppenrich_suite_wf.ga Mon Jul 11 19:22:25 2022 +0000 @@ -0,0 +1,904 @@ +{ + "a_galaxy_workflow": "true", + "annotation": "phoshpoproteomic enrichment data pre-processing and ANOVA", + "creator": [ + { + "class": "Person", + "identifier": "0000-0002-2882-0508", + "name": "Art Eschenlauer" + } + ], + "format-version": "0.1", + "license": "MIT", + "name": "ppenrich_suite_wf", + "steps": { + "0": { + "annotation": "The Phospho (STY)Sites.txt file produced by MaxQuant (found in the txt folder).", + "content_id": null, + "errors": null, + "id": 0, + "input_connections": {}, + "inputs": [ + { + "description": "The Phospho (STY)Sites.txt file produced by MaxQuant (found in the txt folder).", + "name": "Phospho (STY)Sites.txt" + } + ], + "label": "Phospho (STY)Sites.txt", + "name": "Input dataset", + "outputs": [], + "position": { + "bottom": 290.16561126708984, + "height": 82.1624984741211, + "left": 515.090576171875, + "right": 715.0874328613281, + "top": 208.00311279296875, + "width": 199.99685668945312, + "x": 515.090576171875, + "y": 208.00311279296875 + }, + "tool_id": null, + "tool_state": "{\"optional\": false, \"format\": [\"tabular\"], \"tag\": \"\"}", + "tool_version": null, + "type": "data_input", + "uuid": "c366566c-2a61-4918-b4ea-c1f565c4f2ca", + "workflow_outputs": [] + }, + "1": { + "annotation": "THIS IS pST BY DEFAULT. Change if your data are enriched for pY.", + "content_id": null, + "errors": null, + "id": 1, + "input_connections": {}, + "inputs": [ + { + "description": "THIS IS pST BY DEFAULT. 
Change if your data are enriched for pY.", + "name": "enrichmentType" + } + ], + "label": "enrichmentType", + "name": "Input parameter", + "outputs": [], + "position": { + "bottom": 375.7687225341797, + "height": 61.76249694824219, + "left": 531.1312255859375, + "right": 731.1280822753906, + "top": 314.0062255859375, + "width": 199.99685668945312, + "x": 531.1312255859375, + "y": 314.0062255859375 + }, + "tool_id": null, + "tool_state": "{\"restrictions\": [\"pST\", \"pY\"], \"parameter_type\": \"text\", \"optional\": false}", + "tool_version": null, + "type": "parameter_input", + "uuid": "5f31b776-9e2b-4f3a-a9e6-886ac2062e15", + "workflow_outputs": [ + { + "label": null, + "output_name": "output", + "uuid": "1ff7eb95-9dd3-4006-ab0b-03e4f84a1aa5" + } + ] + }, + "2": { + "annotation": "Pattern matching columns that have peptide intensity data (PERL-compatible regular expression matching column label)", + "content_id": null, + "errors": null, + "id": 2, + "input_connections": {}, + "inputs": [ + { + "description": "Pattern matching columns that have peptide intensity data (PERL-compatible regular expression matching column label)", + "name": "Intensity-column pattern" + } + ], + "label": "Intensity-column pattern", + "name": "Input parameter", + "outputs": [], + "position": { + "bottom": 576.2812118530273, + "height": 102.56249237060547, + "left": 590.1468505859375, + "right": 790.1437072753906, + "top": 473.7187194824219, + "width": 199.99685668945312, + "x": 590.1468505859375, + "y": 473.7187194824219 + }, + "tool_id": null, + "tool_state": "{\"default\": \"^Intensity[^_]\", \"parameter_type\": \"text\", \"optional\": true}", + "tool_version": null, + "type": "parameter_input", + "uuid": "86505e43-20be-40f5-ad66-eeb3527c6a60", + "workflow_outputs": [ + { + "label": null, + "output_name": "output", + "uuid": "ebb65015-b681-4798-9504-c8c948f82fee" + } + ] + }, + "3": { + "annotation": "Pattern extracting sample-names from names of columns that have peptide intensity 
data (PERL-compatible regular expression)", + "content_id": null, + "errors": null, + "id": 3, + "input_connections": {}, + "inputs": [ + { + "description": "Pattern extracting sample-names from names of columns that have peptide intensity data (PERL-compatible regular expression)", + "name": "Sample-extraction pattern" + } + ], + "label": "Sample-extraction pattern", + "name": "Input parameter", + "outputs": [], + "position": { + "bottom": 688.256217956543, + "height": 102.56249237060547, + "left": 606.2249755859375, + "right": 806.2218322753906, + "top": 585.6937255859375, + "width": 199.99685668945312, + "x": 606.2249755859375, + "y": 585.6937255859375 + }, + "tool_id": null, + "tool_state": "{\"default\": \"\\\\.\\\\d+[A-Z]$\", \"parameter_type\": \"text\", \"optional\": true}", + "tool_version": null, + "type": "parameter_input", + "uuid": "79f4b36c-dd9b-4d24-a9c8-e0084af50597", + "workflow_outputs": [ + { + "label": null, + "output_name": "output", + "uuid": "3f5f7c91-dc90-4e14-84d9-94db5e49a625" + } + ] + }, + "4": { + "annotation": "Pattern extracting sample-group from the sample-names that are extracted by 'Sample-extraction pattern' (PERL-compatible regular expression)", + "content_id": null, + "errors": null, + "id": 4, + "input_connections": {}, + "inputs": [ + { + "description": "Pattern extracting sample-group from the sample-names that are extracted by 'Sample-extraction pattern' (PERL-compatible regular expression)", + "name": "Group-extraction pattern" + } + ], + "label": "Group-extraction pattern", + "name": "Input parameter", + "outputs": [], + "position": { + "bottom": 804.2999801635742, + "height": 102.56249237060547, + "left": 610.2562255859375, + "right": 810.2530822753906, + "top": 701.7374877929688, + "width": 199.99685668945312, + "x": 610.2562255859375, + "y": 701.7374877929688 + }, + "tool_id": null, + "tool_state": "{\"default\": \"\\\\d+\", \"parameter_type\": \"text\", \"optional\": true}", + "tool_version": null, + "type": 
"parameter_input", + "uuid": "67f4321c-9b08-4dd2-b448-813f6fdb1b6a", + "workflow_outputs": [ + { + "label": null, + "output_name": "output", + "uuid": "4abd7c2f-9614-4b08-8ea1-8c5c19d69b7c" + } + ] + }, + "5": { + "annotation": "FASTA file of all human canonical isoforms, derived from Swiss-Prot (e.g., merge of https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot_varsplic.fasta.gz and https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz)", + "content_id": null, + "errors": null, + "id": 5, + "input_connections": {}, + "inputs": [ + { + "description": "FASTA file of all human canonical isoforms, derived from Swiss-Prot (e.g., merge of https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot_varsplic.fasta.gz and https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz)", + "name": "SwissProt_Human_Canonical_Isoform.fasta" + } + ], + "label": "SwissProt_Human_Canonical_Isoform.fasta", + "name": "Input dataset", + "outputs": [], + "position": { + "bottom": 1096.5749435424805, + "height": 102.56249237060547, + "left": 639.121826171875, + "right": 839.1186828613281, + "top": 994.012451171875, + "width": 199.99685668945312, + "x": 639.121826171875, + "y": 994.012451171875 + }, + "tool_id": null, + "tool_state": "{\"optional\": false, \"format\": [\"fasta\"], \"tag\": \"\"}", + "tool_version": null, + "type": "data_input", + "uuid": "870d3075-3ebb-4505-99a2-c3d01b51a86b", + "workflow_outputs": [] + }, + "6": { + "annotation": "Derived from https://networkin.info/download/networkin_human_predictions_3.1.tsv.xz (which is free for non-commercial use - for required citation, see https://networkin.info/)", + "content_id": null, + "errors": null, + "id": 6, + "input_connections": {}, + "inputs": [ + { + "description": "Derived from 
https://networkin.info/download/networkin_human_predictions_3.1.tsv.xz (which is free for non-commercial use - for required citation, see https://networkin.info/)", + "name": "NetworKIN_cutoffscore2.0.tabular" + } + ], + "label": "NetworKIN_cutoffscore2.0.tabular", + "name": "Input dataset", + "outputs": [], + "position": { + "bottom": 1227.581169128418, + "height": 102.56249237060547, + "left": 656.1561889648438, + "right": 856.1530456542969, + "top": 1125.0186767578125, + "width": 199.99685668945312, + "x": 656.1561889648438, + "y": 1125.0186767578125 + }, + "tool_id": null, + "tool_state": "{\"optional\": false, \"format\": [\"tabular\"], \"tag\": \"\"}", + "tool_version": null, + "type": "data_input", + "uuid": "0ecd2f07-9b2c-41c5-8bcf-fa45927f61ca", + "workflow_outputs": [] + }, + "7": { + "annotation": "Derived from http://hprd.org/serine_motifs, http://hprd.org/tyrosine_motifs, and http://pegasus.biochem.mpg.de/phosida/help/motifs.aspx", + "content_id": null, + "errors": null, + "id": 7, + "input_connections": {}, + "inputs": [ + { + "description": "Derived from http://hprd.org/serine_motifs, http://hprd.org/tyrosine_motifs, and http://pegasus.biochem.mpg.de/phosida/help/motifs.aspx", + "name": "pSTY_Motifs.tabular" + } + ], + "label": "pSTY_Motifs.tabular", + "name": "Input dataset", + "outputs": [], + "position": { + "bottom": 1336.2092514038086, + "height": 82.1624984741211, + "left": 673.1718139648438, + "right": 873.1686706542969, + "top": 1254.0467529296875, + "width": 199.99685668945312, + "x": 673.1718139648438, + "y": 1254.0467529296875 + }, + "tool_id": null, + "tool_state": "{\"optional\": false, \"format\": [\"tabular\"], \"tag\": \"\"}", + "tool_version": null, + "type": "data_input", + "uuid": "d8f605d8-4cf6-48dc-9ec5-ceda9f6ee4b2", + "workflow_outputs": [] + }, + "8": { + "annotation": "Derived from Kinase_Substrate_Dataset.gz found at https://www.phosphosite.org/staticDownloads (free for non-commercial use - see that link for citation.)", + 
"content_id": null, + "errors": null, + "id": 8, + "input_connections": {}, + "inputs": [ + { + "description": "Derived from Kinase_Substrate_Dataset.gz found at https://www.phosphosite.org/staticDownloads (free for non-commercial use - see that link for citation.)", + "name": "PSP_Kinase_Substrate_Dataset.tabular" + } + ], + "label": "PSP_Kinase_Substrate_Dataset.tabular", + "name": "Input dataset", + "outputs": [], + "position": { + "bottom": 1466.596794128418, + "height": 102.56249237060547, + "left": 673.1718139648438, + "right": 873.1686706542969, + "top": 1364.0343017578125, + "width": 199.99685668945312, + "x": 673.1718139648438, + "y": 1364.0343017578125 + }, + "tool_id": null, + "tool_state": "{\"optional\": false, \"format\": [\"tabular\"], \"tag\": \"\"}", + "tool_version": null, + "type": "data_input", + "uuid": "ed06b46c-d6b3-4d52-a6e6-fa5211da5a0a", + "workflow_outputs": [] + }, + "9": { + "annotation": "Derived from Regulatory_sites.gz found at https://www.phosphosite.org/staticDownloads (free for non-commercial use - see that link for citation.)", + "content_id": null, + "errors": null, + "id": 9, + "input_connections": {}, + "inputs": [ + { + "description": "Derived from Regulatory_sites.gz found at https://www.phosphosite.org/staticDownloads (free for non-commercial use - see that link for citation.)", + "name": "PSP_Regulatory_sites.tabular" + } + ], + "label": "PSP_Regulatory_sites.tabular", + "name": "Input dataset", + "outputs": [], + "position": { + "bottom": 1576.2092514038086, + "height": 82.1624984741211, + "left": 674.1561889648438, + "right": 874.1530456542969, + "top": 1494.0467529296875, + "width": 199.99685668945312, + "x": 674.1561889648438, + "y": 1494.0467529296875 + }, + "tool_id": null, + "tool_state": "{\"optional\": false, \"format\": [\"tabular\"], \"tag\": \"\"}", + "tool_version": null, + "type": "data_input", + "uuid": "47cf1ca8-315d-425f-bb32-0946cd866d5f", + "workflow_outputs": [] + }, + "10": { + "annotation": "List of 
alpha cutoff values for significance testing; text file having no header and a single line for each cutoff value.", + "content_id": null, + "errors": null, + "id": 10, + "input_connections": {}, + "inputs": [ + { + "description": "List of alpha cutoff values for significance testing; text file having no header and a single line for each cutoff value.", + "name": "alpha_levels.tabular" + } + ], + "label": "alpha_levels.tabular", + "name": "Input dataset", + "outputs": [], + "position": { + "bottom": 1835.699851989746, + "height": 82.1624984741211, + "left": 691.1249389648438, + "right": 891.1217956542969, + "top": 1753.537353515625, + "width": 199.99685668945312, + "x": 691.1249389648438, + "y": 1753.537353515625 + }, + "tool_id": null, + "tool_state": "{\"optional\": false, \"format\": [\"tabular\"], \"tag\": \"\"}", + "tool_version": null, + "type": "data_input", + "uuid": "5d66ff58-9c83-4edd-96c6-6132dc8377c7", + "workflow_outputs": [] + }, + "11": { + "annotation": "Transform the output of MaxQuant for phosphoproteome-enriched samples to prepare it for statistical anlaysis.", + "content_id": "mqppep_preproc", + "errors": null, + "id": 11, + "input_connections": { + "networkin": { + "id": 6, + "output_name": "output" + }, + "p_sty_motifs": { + "id": 7, + "output_name": "output" + }, + "phosphoSites": { + "id": 0, + "output_name": "output" + }, + "protein_fasta": { + "id": 5, + "output_name": "output" + }, + "psp_kinase_substrate": { + "id": 8, + "output_name": "output" + }, + "psp_regulatory_sites": { + "id": 9, + "output_name": "output" + }, + "pst_py_selector": { + "id": 1, + "output_name": "output" + }, + "startCol": { + "id": 2, + "output_name": "output" + } + }, + "inputs": [ + { + "description": "runtime parameter for tool MaxQuant Phosphopeptide Preprocessing", + "name": "networkin" + }, + { + "description": "runtime parameter for tool MaxQuant Phosphopeptide Preprocessing", + "name": "p_sty_motifs" + }, + { + "description": "runtime parameter for tool 
MaxQuant Phosphopeptide Preprocessing", + "name": "phosphoSites" + }, + { + "description": "runtime parameter for tool MaxQuant Phosphopeptide Preprocessing", + "name": "protein_fasta" + }, + { + "description": "runtime parameter for tool MaxQuant Phosphopeptide Preprocessing", + "name": "psp_kinase_substrate" + }, + { + "description": "runtime parameter for tool MaxQuant Phosphopeptide Preprocessing", + "name": "psp_regulatory_sites" + } + ], + "label": "Preprocess MaxQuant Phospho (STY)Sites", + "name": "MaxQuant Phosphopeptide Preprocessing", + "outputs": [ + { + "name": "phosphoPepIntensities", + "type": "tabular" + }, + { + "name": "enrichGraph", + "type": "pdf" + }, + { + "name": "locProbCutoffGraph", + "type": "pdf" + }, + { + "name": "enrichGraph_svg", + "type": "svg" + }, + { + "name": "locProbCutoffGraph_svg", + "type": "svg" + }, + { + "name": "filteredData_tabular", + "type": "tabular" + }, + { + "name": "quantData_tabular", + "type": "tabular" + }, + { + "name": "mapped_phophopeptides", + "type": "tabular" + }, + { + "name": "melted_phophopeptide_map", + "type": "tabular" + }, + { + "name": "mqppep_output_sqlite", + "type": "sqlite" + }, + { + "name": "preproc_tab", + "type": "tabular" + }, + { + "name": "preproc_csv", + "type": "csv" + }, + { + "name": "preproc_sqlite", + "type": "sqlite" + } + ], + "position": { + "bottom": 1652.2499389648438, + "height": 956.231201171875, + "left": 1336.60302734375, + "right": 1536.5998840332031, + "top": 696.0187377929688, + "width": 199.99685668945312, + "x": 1336.60302734375, + "y": 696.0187377929688 + }, + "post_job_actions": { + "HideDatasetActionfilteredData_tabular": { + "action_arguments": {}, + "action_type": "HideDatasetAction", + "output_name": "filteredData_tabular" + }, + "HideDatasetActionmapped_phophopeptides": { + "action_arguments": {}, + "action_type": "HideDatasetAction", + "output_name": "mapped_phophopeptides" + }, + "HideDatasetActionmelted_phophopeptide_map": { + "action_arguments": {}, + 
"action_type": "HideDatasetAction", + "output_name": "melted_phophopeptide_map" + }, + "HideDatasetActionmqppep_output_sqlite": { + "action_arguments": {}, + "action_type": "HideDatasetAction", + "output_name": "mqppep_output_sqlite" + }, + "HideDatasetActionpreproc_csv": { + "action_arguments": {}, + "action_type": "HideDatasetAction", + "output_name": "preproc_csv" + }, + "HideDatasetActionquantData_tabular": { + "action_arguments": {}, + "action_type": "HideDatasetAction", + "output_name": "quantData_tabular" + }, + "RenameDatasetActionenrichGraph": { + "action_arguments": { + "newname": "#{phosphoSites}.enrichGraph_pdf" + }, + "action_type": "RenameDatasetAction", + "output_name": "enrichGraph" + }, + "RenameDatasetActionenrichGraph_svg": { + "action_arguments": { + "newname": "#{phosphoSites}.enrichGraph_svg" + }, + "action_type": "RenameDatasetAction", + "output_name": "enrichGraph_svg" + }, + "RenameDatasetActionfilteredData_tabular": { + "action_arguments": { + "newname": "#{phosphoSites}.filteredData" + }, + "action_type": "RenameDatasetAction", + "output_name": "filteredData_tabular" + }, + "RenameDatasetActionlocProbCutoffGraph": { + "action_arguments": { + "newname": "#{phosphoSites}.locProbCutoffGraph_pdf" + }, + "action_type": "RenameDatasetAction", + "output_name": "locProbCutoffGraph" + }, + "RenameDatasetActionlocProbCutoffGraph_svg": { + "action_arguments": { + "newname": "#{phosphoSites}.locProbCutoffGraph_svg" + }, + "action_type": "RenameDatasetAction", + "output_name": "locProbCutoffGraph_svg" + }, + "RenameDatasetActionmapped_phophopeptides": { + "action_arguments": { + "newname": "#{phosphoSites}.ppep_map" + }, + "action_type": "RenameDatasetAction", + "output_name": "mapped_phophopeptides" + }, + "RenameDatasetActionmelted_phophopeptide_map": { + "action_arguments": { + "newname": "#{phosphoSites}.melted" + }, + "action_type": "RenameDatasetAction", + "output_name": "melted_phophopeptide_map" + }, + 
"RenameDatasetActionmqppep_output_sqlite": { + "action_arguments": { + "newname": "#{phosphoSites}.ppep_mapping_sqlite" + }, + "action_type": "RenameDatasetAction", + "output_name": "mqppep_output_sqlite" + }, + "RenameDatasetActionphosphoPepIntensities": { + "action_arguments": { + "newname": "#{phosphoSites}.ppep_intensities" + }, + "action_type": "RenameDatasetAction", + "output_name": "phosphoPepIntensities" + }, + "RenameDatasetActionpreproc_csv": { + "action_arguments": { + "newname": "#{phosphoSites}.preproc_csv" + }, + "action_type": "RenameDatasetAction", + "output_name": "preproc_csv" + }, + "RenameDatasetActionpreproc_sqlite": { + "action_arguments": { + "newname": "#{phosphoSites}.preproc_sqlite" + }, + "action_type": "RenameDatasetAction", + "output_name": "preproc_sqlite" + }, + "RenameDatasetActionpreproc_tab": { + "action_arguments": { + "newname": "#{phosphoSites}.preproc_tab" + }, + "action_type": "RenameDatasetAction", + "output_name": "preproc_tab" + }, + "RenameDatasetActionquantData_tabular": { + "action_arguments": { + "newname": "#{phosphoSites}.quantData" + }, + "action_type": "RenameDatasetAction", + "output_name": "quantData_tabular" + } + }, + "tool_id": "mqppep_preproc", + "tool_state": "{\"collapseFunc\": \"sum\", \"intervalCol\": \"1\", \"localProbCutoff\": \"0.75\", \"merge_function\": \"sum\", \"networkin\": {\"__class__\": \"RuntimeValue\"}, \"p_sty_motifs\": {\"__class__\": \"RuntimeValue\"}, \"phosphoCol\": \"^Number of Phospho [(]STY[)]$\", \"phosphoSites\": {\"__class__\": \"RuntimeValue\"}, \"protein_fasta\": {\"__class__\": \"RuntimeValue\"}, \"psp_kinase_substrate\": {\"__class__\": \"RuntimeValue\"}, \"psp_regulatory_sites\": {\"__class__\": \"RuntimeValue\"}, \"pst_py_selector\": {\"__class__\": \"ConnectedValue\"}, \"species\": \"human\", \"startCol\": {\"__class__\": \"ConnectedValue\"}, \"__page__\": null, \"__rerun_remap_job_id__\": null}", + "tool_version": null, + "type": "tool", + "uuid": 
"83ae6038-e871-4051-9544-181cdf2a5257", + "workflow_outputs": [ + { + "label": "locProbCutoffGraph_pdf", + "output_name": "locProbCutoffGraph", + "uuid": "c3840695-4deb-4347-94c4-1d60f0de3744" + }, + { + "label": "enrichGraph_svg", + "output_name": "enrichGraph_svg", + "uuid": "dff65302-dc37-4812-9ab1-10178d880412" + }, + { + "label": "locProbCutoffGraph_svg", + "output_name": "locProbCutoffGraph_svg", + "uuid": "b48535ab-ee39-44c3-bc37-5f4e79a147ee" + }, + { + "label": "preproc_tab", + "output_name": "preproc_tab", + "uuid": "ce6d767a-b24d-404c-9eeb-fa8f5156fa93" + }, + { + "label": "preproc_sqlite", + "output_name": "preproc_sqlite", + "uuid": "52c88bda-4863-47e1-afb0-46839fb1b601" + }, + { + "label": "ppep_intensities", + "output_name": "phosphoPepIntensities", + "uuid": "b1729d3e-b934-4e7e-a38f-23d963df3c22" + }, + { + "label": "enrichGraph_pdf", + "output_name": "enrichGraph", + "uuid": "72f605a1-a8a7-4e9e-99e8-0c1360303fc0" + } + ] + }, + "12": { + "annotation": "Perform ANOVA. For imputing missing values, use median of non-missing values from the same treatment group.", + "content_id": "mqppep_anova", + "errors": null, + "id": 12, + "input_connections": { + "alpha_file": { + "id": 10, + "output_name": "output" + }, + "input_file": { + "id": 11, + "output_name": "preproc_tab" + }, + "sample_grouping_regex": { + "id": 4, + "output_name": "output" + }, + "sample_names_regex": { + "id": 3, + "output_name": "output" + } + }, + "inputs": [ + { + "description": "runtime parameter for tool MaxQuant Phosphopeptide ANOVA", + "name": "alpha_file" + }, + { + "description": "runtime parameter for tool MaxQuant Phosphopeptide ANOVA", + "name": "input_file" + } + ], + "label": "ANOVA group-median imputed", + "name": "MaxQuant Phosphopeptide ANOVA", + "outputs": [ + { + "name": "imputed_data_file", + "type": "tabular" + }, + { + "name": "imp_qn_lt_file", + "type": "tabular" + }, + { + "name": "report_file", + "type": "pdf" + } + ], + "position": { + "bottom": 
2246.653045654297, + "height": 347.1187438964844, + "left": 1028.184326171875, + "right": 1228.1811828613281, + "top": 1899.5343017578125, + "width": 199.99685668945312, + "x": 1028.184326171875, + "y": 1899.5343017578125 + }, + "post_job_actions": { + "RenameDatasetActionimp_qn_lt_file": { + "action_arguments": { + "newname": "#{input_file}.intensities_group-mean-imputed_QN_LT" + }, + "action_type": "RenameDatasetAction", + "output_name": "imp_qn_lt_file" + }, + "RenameDatasetActionimputed_data_file": { + "action_arguments": { + "newname": "#{input_file}.intensities_group-mean-imputed" + }, + "action_type": "RenameDatasetAction", + "output_name": "imputed_data_file" + }, + "RenameDatasetActionreport_file": { + "action_arguments": { + "newname": "#{input_file}.intensities_group-mean-imputed_report" + }, + "action_type": "RenameDatasetAction", + "output_name": "report_file" + } + }, + "tool_id": "mqppep_anova", + "tool_state": "{\"alpha_file\": {\"__class__\": \"RuntimeValue\"}, \"imputation\": {\"imputation_method\": \"group-median\", \"__current_case__\": 0}, \"input_file\": {\"__class__\": \"RuntimeValue\"}, \"intensity_column_regex\": \"^Intensity[^_]\", \"sample_grouping_regex\": {\"__class__\": \"ConnectedValue\"}, \"sample_names_regex\": {\"__class__\": \"ConnectedValue\"}, \"__page__\": null, \"__rerun_remap_job_id__\": null}", + "tool_version": null, + "type": "tool", + "uuid": "71cbf127-f9d1-4a10-be61-53daea7ff1f6", + "workflow_outputs": [ + { + "label": "intensities_group-mean-imputed_QN_LT", + "output_name": "imp_qn_lt_file", + "uuid": "3ad9495d-0b38-4527-a0bf-7b2c62eb9dc9" + }, + { + "label": "intensities_group-mean-imputed", + "output_name": "imputed_data_file", + "uuid": "933baff0-3c19-4363-822c-2bce5d436ac1" + }, + { + "label": "intensities_group-mean-imputed_report", + "output_name": "report_file", + "uuid": "792cacc0-e202-44e4-9048-9e1186ea5ba9" + } + ] + }, + "13": { + "annotation": "Perform ANOVA. 
For imputing missing values, create random values.", + "content_id": "mqppep_anova", + "errors": null, + "id": 13, + "input_connections": { + "alpha_file": { + "id": 10, + "output_name": "output" + }, + "input_file": { + "id": 11, + "output_name": "preproc_tab" + }, + "sample_grouping_regex": { + "id": 4, + "output_name": "output" + }, + "sample_names_regex": { + "id": 3, + "output_name": "output" + } + }, + "inputs": [ + { + "description": "runtime parameter for tool MaxQuant Phosphopeptide ANOVA", + "name": "alpha_file" + }, + { + "description": "runtime parameter for tool MaxQuant Phosphopeptide ANOVA", + "name": "input_file" + } + ], + "label": "MaxQuant Phosphopeptide ANOVA randomly imputed", + "name": "MaxQuant Phosphopeptide ANOVA", + "outputs": [ + { + "name": "imputed_data_file", + "type": "tabular" + }, + { + "name": "imp_qn_lt_file", + "type": "tabular" + }, + { + "name": "report_file", + "type": "pdf" + } + ], + "position": { + "bottom": 2106.0374145507812, + "height": 367.51873779296875, + "left": 1399.153076171875, + "right": 1599.1499328613281, + "top": 1738.5186767578125, + "width": 199.99685668945312, + "x": 1399.153076171875, + "y": 1738.5186767578125 + }, + "post_job_actions": { + "RenameDatasetActionimp_qn_lt_file": { + "action_arguments": { + "newname": "#{input_file}.intensities_randomly-imputed_QN_LT" + }, + "action_type": "RenameDatasetAction", + "output_name": "imp_qn_lt_file" + }, + "RenameDatasetActionimputed_data_file": { + "action_arguments": { + "newname": "#{input_file}.intensities_randomly-imputed" + }, + "action_type": "RenameDatasetAction", + "output_name": "imputed_data_file" + }, + "RenameDatasetActionreport_file": { + "action_arguments": { + "newname": "#{input_file}.intensities_randomly-imputed_report" + }, + "action_type": "RenameDatasetAction", + "output_name": "report_file" + } + }, + "tool_id": "mqppep_anova", + "tool_state": "{\"alpha_file\": {\"__class__\": \"RuntimeValue\"}, \"imputation\": {\"imputation_method\": 
\"random\", \"__current_case__\": 3, \"meanPercentile\": \"1\", \"sdPercentile\": \"1.0\"}, \"input_file\": {\"__class__\": \"RuntimeValue\"}, \"intensity_column_regex\": \"^Intensity[^_]\", \"sample_grouping_regex\": {\"__class__\": \"ConnectedValue\"}, \"sample_names_regex\": {\"__class__\": \"ConnectedValue\"}, \"__page__\": null, \"__rerun_remap_job_id__\": null}", + "tool_version": null, + "type": "tool", + "uuid": "e71562a7-c941-429d-99a8-e14721df3670", + "workflow_outputs": [ + { + "label": "intensities_randomly-imputed", + "output_name": "imputed_data_file", + "uuid": "e27c540b-07d0-496f-8b11-b4c1472dce12" + }, + { + "label": "intensities_randomly-imputed_report", + "output_name": "report_file", + "uuid": "abe2dbf4-956d-4625-a0e1-ad1c6c988a7c" + }, + { + "label": "intensities_randomly-imputed_QN_LT", + "output_name": "imp_qn_lt_file", + "uuid": "cb5b1d8f-905b-453a-a479-507e01a8f8f7" + } + ] + } + }, + "tags": [ + "ppenrich" + ], + "uuid": "234db768-520c-4eaa-a5be-061e3d858682", + "version": 2 +}