3
|
1 #######################################################################################
|
|
2 # R-code: Protein Name and Tukey's Normalization
|
|
3 # Author: Adam L Borne
|
|
4 # Contributers: Paul A Stewart, Brent Kuenzi
|
|
5 #######################################################################################
|
|
6 # Assigns uniprot id from MaxQuant peptides file. Filters and normalizes the
|
|
7 # intensities of each proteins. Resulting in a one to one list of intensities to
|
|
8 # uniprot id.
|
|
9 #######################################################################################
|
|
10 # Copyright (C) Adam Borne.
|
|
11 # Permission is granted to copy, distribute and/or modify this document
|
|
12 # under the terms of the GNU Free Documentation License, Version 1.3
|
|
13 # or any later version published by the Free Software Foundation;
|
|
14 # with no Invariant Sections, no Front-Cover Texts, and no Back-Cover Texts.
|
|
15 # A copy of the license is included in the section entitled "GNU
|
|
16 # Free Documentation License".
|
|
17 #######################################################################################
|
|
18 ## REQUIRED INPUT ##
|
|
19
|
|
20 # 1) peptides_file: MaxQuant peptides file.
|
|
21 #######################################################################################
|
|
22
|
|
23
|
|
24 ins_check_run <- function() {
|
|
25 if ("affy" %in% rownames(installed.packages())){}
|
|
26 else {
|
|
27 source("https://bioconductor.org/biocLite.R")
|
|
28 biocLite(c('mygene','affy'))
|
|
29 }
|
|
30 if ('data.table' %in% rownames(installed.packages())){}
|
|
31 else {
|
|
32 install.packages('data.table', repos='http://cran.us.r-project.org')
|
|
33 }
|
|
34 if ('stringr' %in% rownames(installed.packages())){}
|
|
35 else {
|
|
36 install.packages('stringr', repos='http://cran.us.r-project.org')
|
|
37 }
|
|
38 if ('VennDiagram' %in% rownames(installed.packages())){}
|
|
39 else {
|
|
40 install.packages('VennDiagram', repos='http://cran.us.r-project.org')
|
|
41 }
|
|
42 }
|
|
43
|
|
44 ins_check_run()
|
|
45 library(data.table)
|
|
46 library(affy)
|
|
47 library(stringr)
|
|
48 library(mygene)
|
|
49 library(VennDiagram)
|
|
50
|
|
51
|
|
52 main <- function(peptides_file, db_path) {
|
|
53 peptides_file = read.delim(peptides_file,header=TRUE,stringsAsFactors=FALSE,fill=TRUE)
|
|
54 peptides_txt = peptides_file
|
|
55 intensity_columns = names(peptides_txt[,str_detect(names(peptides_txt),"Intensity\\.*")])
|
|
56 # Pulls out all lines with Intensity in them.
|
|
57 intensity_columns = intensity_columns[2:length(intensity_columns)]
|
|
58 # Removes the first column that does not have a bait.
|
|
59 peptides_txt_mapped = as.data.frame(map_peptides_proteins(peptides_txt))
|
|
60 # This function as below sets every line to a 1 to 1 intensity to each possible protein.
|
|
61 peptides_txt_mapped$Uniprot = str_extract(peptides_txt_mapped$mapped_protein, "[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}") #Pulls out just Uniprot id from the script.
|
|
62 peptides_txt_mapped = subset(peptides_txt_mapped,!is.na(Uniprot))
|
|
63 # Removes reverse sequences and any that didn't match a uniprot accession.
|
|
64 columns_comb = c("Uniprot", intensity_columns)
|
|
65 peptides_mapped_intensity = subset(peptides_txt_mapped, select = columns_comb)
|
|
66 # Subsets out only the needed cloumns for Tukeys (Uniprot IDS and baited intensities)/
|
|
67 swissprot_fasta = scan(db_path, what="character")
|
|
68 peptides_txt_mapped_log2 = peptides_mapped_intensity
|
|
69 # Takes the log2 of the intensities.
|
|
70 for (i in intensity_columns) {
|
|
71 peptides_txt_mapped_log2[,i] = log2(subset(peptides_txt_mapped_log2, select = i))
|
|
72 }
|
|
73 # Get the minimum from each column while ignoring the -Inf; get the min of these mins for the
|
|
74 # global min; breaks when there's only one intensity column.
|
|
75 global_min = min(apply(peptides_txt_mapped_log2[,2:ncol(peptides_txt_mapped_log2)],2,function(x) {
|
|
76 min(x[x != -Inf])
|
|
77 }))
|
|
78 peptides_txt_mapped_log2[peptides_txt_mapped_log2 == -Inf] <- 0
|
|
79 #uniprot accessions WITHOUT isoforms; it looks like only contaminants contain isoforms anyways.
|
|
80 mapped_protein_uniprotonly = str_extract(peptides_txt_mapped_log2$Uniprot,"[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}")
|
|
81 mapped_protein_uniprot_accession = str_extract(peptides_txt_mapped_log2$Uniprot,"[OPQ][0-9][A-Z0-9]{3}[0-9](-[0-9]+)?|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}(-[0-9]+)?|[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}")
|
|
82 peptides_txt_mapped_log2$mapped_protein = mapped_protein_uniprotonly
|
|
83 # Runs the Tukey function returning completed table.
|
|
84 peptides_txt_mapped_log2 = subset(peptides_txt_mapped_log2,mapped_protein %in% swissprot_fasta)
|
|
85 protein_intensities_tukeys = get_protein_values(peptides_txt_mapped_log2,intensity_columns)
|
|
86 protein_intensities_tukeys[protein_intensities_tukeys == 1] <- 0
|
|
87 write.table(protein_intensities_tukeys, "./tukeys_output.txt", row.names = FALSE, col.names = TRUE, quote = FALSE, sep = "\t")
|
|
88
|
|
89 }
|
|
90
|
|
91 map_peptides_proteins = function(peptides_in) {
|
|
92 peptides_in = subset(peptides_in,peptides_in$Proteins != "")
|
|
93 results_list = list()
|
|
94 k = 1
|
|
95 for (i in 1:nrow(peptides_in)) {
|
|
96 protein_names = peptides_in[i,"Proteins"]
|
|
97 protein_names_split = unlist(strsplit(protein_names,";"))
|
|
98 for (j in 1:length(protein_names_split)) {
|
|
99 peptides_mapped_proteins = data.frame(peptides_in[i,],mapped_protein=protein_names_split[j],stringsAsFactors=FALSE)
|
|
100 results_list[[k]] = peptides_mapped_proteins
|
|
101 k = k+1
|
|
102
|
|
103 }
|
|
104 }
|
|
105 return(rbindlist(results_list))
|
|
106 }
|
|
107
|
|
108 get_protein_values = function(mapped_peptides_in,intensity_columns_list) {
|
|
109 unique_mapped_proteins_list = unique(mapped_peptides_in$mapped_protein)
|
|
110 # Gets list of all peptides listed.
|
|
111 # Generates a blank data frame with clomns of Intensities and rows of Uniprots.
|
|
112 Tukeys_df = data.frame(mapped_protein = unique_mapped_proteins_list, stringsAsFactors = FALSE )
|
|
113 for (q in intensity_columns_list) {Tukeys_df[,q] = NA}
|
|
114 for (i in 1:length(unique_mapped_proteins_list)) {
|
|
115 mapped_peptides_unique_subset = subset(mapped_peptides_in, mapped_protein == unique_mapped_proteins_list[i])
|
|
116 # Calculate Tukey's Biweight from library(affy); returns a single numeric.
|
|
117 # Results_list[[i]] = data.frame(Protein=unique_mapped_proteins_list[i],Peptides_per_protein=nrow(mapped_peptides_unique_subset)).
|
|
118 for (j in intensity_columns_list) {
|
|
119 # Populates with new Tukeys values.
|
|
120 Tukeys_df[i,j] = 2^(tukey.biweight(mapped_peptides_unique_subset[,j]))
|
|
121 }
|
|
122 }
|
|
123 return(Tukeys_df)
|
|
124 }
|
|
125
|
|
126
|
|
127 args <- commandArgs(trailingOnly = TRUE)
|
|
128 main(args[1], args[2])
|