annotate pre_process_protein_name_set.R @ 31:cb56479f7aca draft

Uploaded
author bornea
date Thu, 28 Jan 2016 13:53:56 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
31
cb56479f7aca Uploaded
bornea
parents:
diff changeset
1 #######################################################################################
cb56479f7aca Uploaded
bornea
parents:
diff changeset
2 # R-code: Protein Name and Tukey's Normalization
cb56479f7aca Uploaded
bornea
parents:
diff changeset
3 # Author: Adam L Borne
cb56479f7aca Uploaded
bornea
parents:
diff changeset
4 # Contributors: Paul A Stewart, Brent Kuenzi
cb56479f7aca Uploaded
bornea
parents:
diff changeset
5 #######################################################################################
cb56479f7aca Uploaded
bornea
parents:
diff changeset
6 # Assigns uniprot id from MaxQuant peptides file. Filters and normalizes the
cb56479f7aca Uploaded
bornea
parents:
diff changeset
7 # intensities of each protein, resulting in a one-to-one list of intensities to
cb56479f7aca Uploaded
bornea
parents:
diff changeset
8 # uniprot id.
cb56479f7aca Uploaded
bornea
parents:
diff changeset
9 #######################################################################################
cb56479f7aca Uploaded
bornea
parents:
diff changeset
10 # Copyright (C) Adam L Borne.
cb56479f7aca Uploaded
bornea
parents:
diff changeset
11 # Permission is granted to copy, distribute and/or modify this document
cb56479f7aca Uploaded
bornea
parents:
diff changeset
12 # under the terms of the GNU Free Documentation License, Version 1.3
cb56479f7aca Uploaded
bornea
parents:
diff changeset
13 # or any later version published by the Free Software Foundation;
cb56479f7aca Uploaded
bornea
parents:
diff changeset
14 # with no Invariant Sections, no Front-Cover Texts, and no Back-Cover Texts.
cb56479f7aca Uploaded
bornea
parents:
diff changeset
15 # A copy of the license is included in the section entitled "GNU
cb56479f7aca Uploaded
bornea
parents:
diff changeset
16 # Free Documentation License".
cb56479f7aca Uploaded
bornea
parents:
diff changeset
17 #######################################################################################
cb56479f7aca Uploaded
bornea
parents:
diff changeset
18 ## REQUIRED INPUT ##
cb56479f7aca Uploaded
bornea
parents:
diff changeset
19
cb56479f7aca Uploaded
bornea
parents:
diff changeset
20 # 1) peptides_file: MaxQuant peptides file.
cb56479f7aca Uploaded
bornea
parents:
diff changeset
21 #######################################################################################
cb56479f7aca Uploaded
bornea
parents:
diff changeset
# Installs any packages this script loads that are not yet installed.
#
# Bioconductor packages (mygene, affy) and CRAN packages (data.table, stringr,
# VennDiagram) are each checked individually, so a machine that has affy but
# lacks mygene still gets mygene installed before library(mygene) is called.
#
# Takes no arguments; called for its side effect. Returns NULL invisibly.
ins_check_run <- function() {
  installed <- rownames(installed.packages())

  bioc_missing <- setdiff(c("mygene", "affy"), installed)
  if (length(bioc_missing) > 0) {
    # NOTE(review): biocLite is the legacy Bioconductor installer; newer
    # Bioconductor releases use BiocManager -- confirm the target R version.
    source("https://bioconductor.org/biocLite.R")
    biocLite(bioc_missing)
  }

  cran_missing <- setdiff(c("data.table", "stringr", "VennDiagram"), installed)
  if (length(cran_missing) > 0) {
    install.packages(cran_missing, repos = "http://cran.us.r-project.org")
  }

  invisible(NULL)
}
cb56479f7aca Uploaded
bornea
parents:
diff changeset
41
cb56479f7aca Uploaded
bornea
parents:
diff changeset
42 ins_check_run()
cb56479f7aca Uploaded
bornea
parents:
diff changeset
43 library(data.table)
cb56479f7aca Uploaded
bornea
parents:
diff changeset
44 library(affy)
cb56479f7aca Uploaded
bornea
parents:
diff changeset
45 library(stringr)
cb56479f7aca Uploaded
bornea
parents:
diff changeset
46 library(mygene)
cb56479f7aca Uploaded
bornea
parents:
diff changeset
47 library(VennDiagram)
cb56479f7aca Uploaded
bornea
parents:
diff changeset
48 #####
cb56479f7aca Uploaded
bornea
parents:
diff changeset
49 #data
cb56479f7aca Uploaded
bornea
parents:
diff changeset
50
cb56479f7aca Uploaded
bornea
parents:
diff changeset
51 #We should chat a bit more about using Tukey's and handling 0's/missing values with Brent.
cb56479f7aca Uploaded
bornea
parents:
diff changeset
52 #Ask me about some updates for doing a bit more filtering of TMT data.
cb56479f7aca Uploaded
bornea
parents:
diff changeset
53
cb56479f7aca Uploaded
bornea
parents:
diff changeset
# Reads a MaxQuant peptides file, maps every peptide to each of its candidate
# proteins, summarises the per-bait intensities of each protein, and writes
# the protein-level table to "./tukeys_output.txt" (tab separated).
#
# Args:
#   peptides_file: path to the MaxQuant peptides file (tab delimited, with a
#     "Proteins" column and one or more "Intensity..." columns).
#   db_path: path to a whitespace-separated text file read with scan(); only
#     proteins whose accession appears among its tokens are kept.
#
# Stops with an error when no Intensity column remains after the first
# matching column is dropped.
main <- function(peptides_file, db_path) {
  # Accession pattern used to pull the Uniprot id (without isoform suffix)
  # out of a protein name.
  uniprot_pattern <- "[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}"

  peptides_txt <- read.delim(peptides_file, header = TRUE,
                             stringsAsFactors = FALSE, fill = TRUE)

  # All column names containing "Intensity". Indexing names() directly keeps
  # this a character vector even when only one column matches.
  column_names <- names(peptides_txt)
  intensity_columns <- column_names[str_detect(column_names, "Intensity\\.*")]
  # Drop the first match, which is the Intensity column that has no bait.
  # Negative indexing is safe when fewer than two columns matched.
  intensity_columns <- intensity_columns[-1]
  if (length(intensity_columns) == 0) {
    stop("No per-bait Intensity columns found in the peptides file.",
         call. = FALSE)
  }

  # One row per (peptide, candidate protein) pair.
  peptides_txt_mapped <- as.data.frame(map_peptides_proteins(peptides_txt))
  peptides_txt_mapped$Uniprot <- str_extract(peptides_txt_mapped$mapped_protein,
                                             uniprot_pattern)
  # Removes reverse sequences and anything that did not match an accession.
  peptides_txt_mapped <- peptides_txt_mapped[!is.na(peptides_txt_mapped$Uniprot), ,
                                             drop = FALSE]

  # Keep only the Uniprot ids and the baited intensities.
  peptides_txt_mapped_log2 <- peptides_txt_mapped[, c("Uniprot", intensity_columns),
                                                  drop = FALSE]
  swissprot_fasta <- scan(db_path, what = "character")

  # log2-transform the intensities; zero intensities become -Inf and are
  # reset to 0 right after.
  for (column in intensity_columns) {
    peptides_txt_mapped_log2[[column]] <- log2(peptides_txt_mapped_log2[[column]])
  }
  peptides_txt_mapped_log2[peptides_txt_mapped_log2 == -Inf] <- 0

  # Uniprot accessions WITHOUT isoforms.
  peptides_txt_mapped_log2$mapped_protein <-
    str_extract(peptides_txt_mapped_log2$Uniprot, uniprot_pattern)
  peptides_txt_mapped_log2 <- subset(peptides_txt_mapped_log2,
                                     mapped_protein %in% swissprot_fasta)

  # Summarise peptides to one value per protein and intensity column.
  protein_intensities_tukeys <- get_protein_values(peptides_txt_mapped_log2,
                                                   intensity_columns)
  # A summarised value of exactly 1 is 2^0, i.e. the log2 value was 0;
  # report it as 0 intensity.
  protein_intensities_tukeys[protein_intensities_tukeys == 1] <- 0
  write.table(protein_intensities_tukeys, "./tukeys_output.txt",
              row.names = FALSE, col.names = TRUE, quote = FALSE, sep = "\t")
}
cb56479f7aca Uploaded
bornea
parents:
diff changeset
86
cb56479f7aca Uploaded
bornea
parents:
diff changeset
# Expands a peptides table so every row maps to exactly one protein.
#
# The "Proteins" column holds ";"-separated protein names. Each input row is
# repeated once per name, and the name is stored in a new "mapped_protein"
# column. Rows with an empty "Proteins" value are dropped first (reverse
# sequences are blank there).
#
# Args:
#   peptides_in: data.frame with a character "Proteins" column.
#
# Returns:
#   The result of rbindlist() on the expanded table: all input columns plus
#   "mapped_protein". Empty input yields a zero-row table that still has the
#   "mapped_protein" column.
map_peptides_proteins <- function(peptides_in) {
  # Reverse sequences are blank but have a razor protein indicating that they
  # are reverse; exclude these for now.
  peptides_in <- subset(peptides_in, peptides_in$Proteins != "")

  protein_names_split <- strsplit(peptides_in$Proteins, ";", fixed = TRUE)

  # Repeat each row index once per protein name instead of building one
  # data.frame per (row, protein) pair; seq_len() keeps empty input empty.
  row_index <- rep(seq_len(nrow(peptides_in)), lengths(protein_names_split))
  peptides_mapped_proteins <- peptides_in[row_index, , drop = FALSE]
  # as.character() turns the NULL from unlist(list()) into character(0) so
  # the column is still created for empty input.
  peptides_mapped_proteins$mapped_protein <-
    as.character(unlist(protein_names_split))

  rbindlist(list(peptides_mapped_proteins))
}
cb56479f7aca Uploaded
bornea
parents:
diff changeset
104
cb56479f7aca Uploaded
bornea
parents:
diff changeset
# Collapses peptide-level log2 intensities to one value per protein.
#
# For every unique value of "mapped_protein" and every intensity column, the
# peptide values of that protein are summarised with tukey.biweight() (from
# library(affy)) and transformed back with 2^x.
#
# Args:
#   mapped_peptides_in: data.frame with a "mapped_protein" column and the
#     columns named in intensity_columns_list.
#   intensity_columns_list: character vector of intensity column names.
#
# Returns:
#   data.frame with a "mapped_protein" column (one row per unique protein, in
#   order of first appearance) and one numeric column per intensity column.
#   Empty input yields a zero-row data.frame with the same columns.
get_protein_values <- function(mapped_peptides_in, intensity_columns_list) {
  protein_column <- mapped_peptides_in$mapped_protein
  unique_mapped_proteins_list <- unique(protein_column)

  # Row positions of each protein, computed once instead of once per column.
  # which() drops NA comparisons, matching subset().
  rows_by_protein <- lapply(unique_mapped_proteins_list, function(protein) {
    which(protein_column == protein)
  })

  tukeys_df <- data.frame(mapped_protein = unique_mapped_proteins_list,
                          stringsAsFactors = FALSE)
  for (column in intensity_columns_list) {
    column_values <- mapped_peptides_in[[column]]
    # vapply() returns numeric(0) when there are no proteins, so a zero-row
    # result is built without error.
    tukeys_df[[column]] <- vapply(rows_by_protein, function(rows) {
      2^(tukey.biweight(column_values[rows]))
    }, numeric(1))
  }
  tukeys_df
}
cb56479f7aca Uploaded
bornea
parents:
diff changeset
121
cb56479f7aca Uploaded
bornea
parents:
diff changeset
122
cb56479f7aca Uploaded
bornea
parents:
diff changeset
# Script entry point: the first two trailing command-line arguments are the
# peptides file path and the database path, in that order.
args <- commandArgs(trailingOnly = TRUE)
if (length(args) < 2) {
  # Fail with a usage message instead of passing NA paths on to main().
  stop("Usage: Rscript pre_process_protein_name_set.R <peptides_file> <db_path>",
       call. = FALSE)
}
main(args[1], args[2])