Repository 'saint_preprocessing'
hg clone https://toolshed.g2.bx.psu.edu/repos/bornea/saint_preprocessing

Changeset 28:dbd1af88f060 (2016-04-26)
Previous changeset 27:2d78642361c3 (2016-04-18) Next changeset 29:bd71998aec8d (2016-04-26)
Commit message:
Uploaded
modified:
pre_process_protein_name_set.R
b
diff -r 2d78642361c3 -r dbd1af88f060 pre_process_protein_name_set.R
--- a/pre_process_protein_name_set.R Mon Apr 18 10:25:06 2016 -0400
+++ b/pre_process_protein_name_set.R Tue Apr 26 14:42:16 2016 -0400
[
b'@@ -1,128 +1,132 @@\n-#######################################################################################\n-# R-code: Protein Name and Tukey\'s Normalization\n-# Author: Adam L Borne\n-# Contributers: Paul A Stewart, Brent Kuenzi\n-#######################################################################################\n-# Assigns uniprot id from MaxQuant peptides file. Filters and normalizes the\n-# intensities of each proteins. Resulting in a one to one list of intensities to \n-# uniprot id.\n-#######################################################################################\n-# Copyright (C)  Adam Borne.\n-# Permission is granted to copy, distribute and/or modify this document\n-# under the terms of the GNU Free Documentation License, Version 1.3\n-# or any later version published by the Free Software Foundation;\n-# with no Invariant Sections, no Front-Cover Texts, and no Back-Cover Texts.\n-# A copy of the license is included in the section entitled "GNU\n-# Free Documentation License".\n-#######################################################################################\n-## REQUIRED INPUT ##\n-\n-# 1) peptides_file: MaxQuant peptides file. \n-#######################################################################################\n-\n-\n-ins_check_run <- function() {\n-  if ("affy" %in% rownames(installed.packages())){}\n-  else {\n-    source("https://bioconductor.org/biocLite.R")\n-    biocLite(c(\'mygene\',\'affy\'))\n-  }\n-  if (\'data.table\' %in% rownames(installed.packages())){}\n-  else {\n-    install.packages(\'data.table\', repos=\'http://cran.us.r-project.org\')\n-  }\n-  if (\'stringr\' %in% rownames(installed.packages())){}\n-  else {\n-    install.packages(\'stringr\', repos=\'http://cran.us.r-project.org\')\n-  }\n-  if (\'VennDiagram\' %in% rownames(installed.packages())){}\n-  else {\n-    install.packages(\'VennDiagram\', repos=\'http://cran.us.r-project.org\')\n-  }\n-}\n-\n-ins_check_run()\n-library(data.table)\n-library(affy)\n-library(stringr)\n-library(mygene)\n-library(VennDiagram)\n-\n-\n-main <- function(peptides_file, db_path) {\n-\tpeptides_file = read.delim(peptides_file,header=TRUE,stringsAsFactors=FALSE,fill=TRUE)\n-  peptides_txt = peptides_file\n-\tintensity_columns = names(peptides_txt[,str_detect(names(peptides_txt),"Intensity\\\\.*")])\n-  # Pulls out all lines with Intensity in them.\n-\tintensity_columns = intensity_columns[2:length(intensity_columns)]\n-  # Removes the first column that does not have a bait. \n-\tpeptides_txt_mapped = as.data.frame(map_peptides_proteins(peptides_txt))\n-  # This function as below sets every line to a 1 to 1 intensity to each possible protein.\n-\tpeptides_txt_mapped$Uniprot = str_extract(peptides_txt_mapped$mapped_protein, "[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}") #Pulls out just Uniprot id from the script.\n-\tpeptides_txt_mapped = subset(peptides_txt_mapped,!is.na(Uniprot))\n-  # Removes reverse sequences and any that didn\'t match a uniprot accession.\n-\tcolumns_comb = c("Uniprot", intensity_columns) \n-\tpeptides_mapped_intensity = subset(peptides_txt_mapped, select = columns_comb)\n-  # Subsets out only the needed cloumns for Tukeys (Uniprot IDS and baited intensities)/\n-\tswissprot_fasta = scan(db_path, what="character")\n-\tpeptides_txt_mapped_log2 = peptides_mapped_intensity\n-  # Takes the log2 of the intensities. \n-\tfor (i in intensity_columns) { \n-\t\tpeptides_txt_mapped_log2[,i] = log2(subset(peptides_txt_mapped_log2, select = i))\n-\t}\n-  # Get the minimum from each column while ignoring the -Inf; get the min of these mins for the \n-  # global min; breaks when there\'s only one intensity column.\n-\tglobal_min = min(apply(peptides_txt_mapped_log2[,2:ncol(peptides_txt_mapped_log2)],2,function(x) {\n-\t  min(x[x != -Inf])\n-\t}))\n-\tpeptides_txt_mapped_log2[peptides_txt_mapped_log2 == -Inf] <- 0\n-  #uniprot accessions WITHOUT isoforms; it looks like only contaminants contain isoforms anyways.\n-\tmapped_protein_uniprotonly = str_extract(peptides_txt_mapped_log2$Uniprot,"[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-'..b'Uniprot id from the script.\r\n+\tpeptides_txt_mapped = subset(peptides_txt_mapped,!is.na(Uniprot))\r\n+  # Removes reverse sequences and any that didn\'t match a uniprot accession.\r\n+\tcolumns_comb = c("Uniprot", intensity_columns) \r\n+\tpeptides_mapped_intensity = subset(peptides_txt_mapped, select = columns_comb)\r\n+  # Subsets out only the needed cloumns for Tukeys (Uniprot IDS and baited intensities)/\r\n+\tswissprot_fasta = scan(db_path, what="character")\r\n+\tpeptides_txt_mapped_log2 = peptides_mapped_intensity\r\n+  # Takes the log2 of the intensities. \r\n+\tfor (i in intensity_columns) { \r\n+\t\tpeptides_txt_mapped_log2[,i] = log2(subset(peptides_txt_mapped_log2, select = i))\r\n+\t}\r\n+  # Get the minimum from each column while ignoring the -Inf; get the min of these mins for the \r\n+  # global min; breaks when there\'s only one intensity column.\r\n+\tglobal_min = min(apply(peptides_txt_mapped_log2[,2:ncol(peptides_txt_mapped_log2)],2,function(x) {\r\n+\t  min(x[x != -Inf])\r\n+\t}))\r\n+\tpeptides_txt_mapped_log2[peptides_txt_mapped_log2 == -Inf] <- NA\r\n+  #uniprot accessions WITHOUT isoforms; it looks like only contaminants contain isoforms anyways.\r\n+\tmapped_protein_uniprotonly = str_extract(peptides_txt_mapped_log2$Uniprot,"[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}") \r\n+\tmapped_protein_uniprot_accession = str_extract(peptides_txt_mapped_log2$Uniprot,"[OPQ][0-9][A-Z0-9]{3}[0-9](-[0-9]+)?|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}(-[0-9]+)?|[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}")\r\n+\tpeptides_txt_mapped_log2$mapped_protein = mapped_protein_uniprotonly\r\n+  # Runs the Tukey function returning completed table.\r\n+  peptides_txt_mapped_log2 = subset(peptides_txt_mapped_log2,mapped_protein %in% swissprot_fasta)\r\n+  if (nrow(peptides_txt_mapped_log2) == 0) {\r\n+    print("Uniprot Database does not have any of the proteins in the peptides file")\r\n+    quit()\r\n+  }\r\n+\tprotein_intensities_tukeys = get_protein_values(peptides_txt_mapped_log2,intensity_columns)\r\n+  protein_intensities_tukeys[protein_intensities_tukeys == 1] <- 0\r\n+  write.table(protein_intensities_tukeys, "./tukeys_output.txt", row.names = FALSE, col.names = TRUE, quote = FALSE, sep = "\\t")\t\r\n+\r\n+}\r\n+\r\n+map_peptides_proteins = function(peptides_in) {\r\n+    peptides_in = subset(peptides_in,peptides_in$Proteins != "")\r\n+    results_list = list()\r\n+    k = 1\r\n+    for (i in 1:nrow(peptides_in)) {\r\n+        protein_names = peptides_in[i,"Proteins"]\r\n+        protein_names_split = unlist(str_split(protein_names,";"))\r\n+        for (j in 1:length(protein_names_split)) {\r\n+            peptides_mapped_proteins = data.frame(peptides_in[i,],mapped_protein=protein_names_split[j],stringsAsFactors=FALSE)\r\n+            results_list[[k]] = peptides_mapped_proteins\r\n+            k = k+1\r\n+            \r\n+        }\r\n+    }\r\n+    return(rbindlist(results_list))\r\n+}\r\n+\r\n+get_protein_values = function(mapped_peptides_in,intensity_columns_list) {\r\n+  unique_mapped_proteins_list = unique(mapped_peptides_in$mapped_protein) \r\n+  # Gets list of all peptides listed.\r\n+  # Generates a blank data frame with clomns of Intensities and rows of Uniprots.\r\n+  Tukeys_df = data.frame(mapped_protein = unique_mapped_proteins_list, stringsAsFactors = FALSE ) \r\n+  for (q in intensity_columns_list) {Tukeys_df[,q] = NA}\r\n+  for (i in 1:length(unique_mapped_proteins_list)) {\r\n+    mapped_peptides_unique_subset = subset(mapped_peptides_in, mapped_protein == unique_mapped_proteins_list[i])\r\n+    # Calculate Tukey\'s Biweight from library(affy); returns a single numeric.\r\n+    # Results_list[[i]] = data.frame(Protein=unique_mapped_proteins_list[i],Peptides_per_protein=nrow(mapped_peptides_unique_subset)).\r\n+    for (j in intensity_columns_list) {\r\n+      # Populates with new Tukeys values.\r\n+      Tukeys_df[i,j] = 2^(tukey.biweight(na.omit(mapped_peptides_unique_subset[,j])))\r\n+    }\r\n+  }\r\n+  return(Tukeys_df)\r\n+}\r\n+\r\n+\r\n+args <- commandArgs(trailingOnly = TRUE)\r\n+main(args[1], args[2])\r\n'