Mercurial > repos > bornea > saint_preprocessing
comparison pre_process_protein_name_set.R @ 3:945f600f34cb draft
Uploaded
author | bornea |
---|---|
date | Tue, 15 Mar 2016 15:59:16 -0400 |
parents | |
children | dbd1af88f060 |
comparison
equal
deleted
inserted
replaced
2:ddc092714127 | 3:945f600f34cb |
---|---|
1 ####################################################################################### | |
2 # R-code: Protein Name and Tukey's Normalization | |
3 # Author: Adam L Borne | |
4 # Contributers: Paul A Stewart, Brent Kuenzi | |
5 ####################################################################################### | |
6 # Assigns uniprot id from MaxQuant peptides file. Filters and normalizes the | |
7 # intensities of each proteins. Resulting in a one to one list of intensities to | |
8 # uniprot id. | |
9 ####################################################################################### | |
10 # Copyright (C) Adam Borne. | |
11 # Permission is granted to copy, distribute and/or modify this document | |
12 # under the terms of the GNU Free Documentation License, Version 1.3 | |
13 # or any later version published by the Free Software Foundation; | |
14 # with no Invariant Sections, no Front-Cover Texts, and no Back-Cover Texts. | |
15 # A copy of the license is included in the section entitled "GNU | |
16 # Free Documentation License". | |
17 ####################################################################################### | |
18 ## REQUIRED INPUT ## | |
19 | |
20 # 1) peptides_file: MaxQuant peptides file. | |
21 ####################################################################################### | |
22 | |
23 | |
24 ins_check_run <- function() { | |
25 if ("affy" %in% rownames(installed.packages())){} | |
26 else { | |
27 source("https://bioconductor.org/biocLite.R") | |
28 biocLite(c('mygene','affy')) | |
29 } | |
30 if ('data.table' %in% rownames(installed.packages())){} | |
31 else { | |
32 install.packages('data.table', repos='http://cran.us.r-project.org') | |
33 } | |
34 if ('stringr' %in% rownames(installed.packages())){} | |
35 else { | |
36 install.packages('stringr', repos='http://cran.us.r-project.org') | |
37 } | |
38 if ('VennDiagram' %in% rownames(installed.packages())){} | |
39 else { | |
40 install.packages('VennDiagram', repos='http://cran.us.r-project.org') | |
41 } | |
42 } | |
43 | |
44 ins_check_run() | |
45 library(data.table) | |
46 library(affy) | |
47 library(stringr) | |
48 library(mygene) | |
49 library(VennDiagram) | |
50 | |
51 | |
52 main <- function(peptides_file, db_path) { | |
53 peptides_file = read.delim(peptides_file,header=TRUE,stringsAsFactors=FALSE,fill=TRUE) | |
54 peptides_txt = peptides_file | |
55 intensity_columns = names(peptides_txt[,str_detect(names(peptides_txt),"Intensity\\.*")]) | |
56 # Pulls out all lines with Intensity in them. | |
57 intensity_columns = intensity_columns[2:length(intensity_columns)] | |
58 # Removes the first column that does not have a bait. | |
59 peptides_txt_mapped = as.data.frame(map_peptides_proteins(peptides_txt)) | |
60 # This function as below sets every line to a 1 to 1 intensity to each possible protein. | |
61 peptides_txt_mapped$Uniprot = str_extract(peptides_txt_mapped$mapped_protein, "[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}") #Pulls out just Uniprot id from the script. | |
62 peptides_txt_mapped = subset(peptides_txt_mapped,!is.na(Uniprot)) | |
63 # Removes reverse sequences and any that didn't match a uniprot accession. | |
64 columns_comb = c("Uniprot", intensity_columns) | |
65 peptides_mapped_intensity = subset(peptides_txt_mapped, select = columns_comb) | |
66 # Subsets out only the needed cloumns for Tukeys (Uniprot IDS and baited intensities)/ | |
67 swissprot_fasta = scan(db_path, what="character") | |
68 peptides_txt_mapped_log2 = peptides_mapped_intensity | |
69 # Takes the log2 of the intensities. | |
70 for (i in intensity_columns) { | |
71 peptides_txt_mapped_log2[,i] = log2(subset(peptides_txt_mapped_log2, select = i)) | |
72 } | |
73 # Get the minimum from each column while ignoring the -Inf; get the min of these mins for the | |
74 # global min; breaks when there's only one intensity column. | |
75 global_min = min(apply(peptides_txt_mapped_log2[,2:ncol(peptides_txt_mapped_log2)],2,function(x) { | |
76 min(x[x != -Inf]) | |
77 })) | |
78 peptides_txt_mapped_log2[peptides_txt_mapped_log2 == -Inf] <- 0 | |
79 #uniprot accessions WITHOUT isoforms; it looks like only contaminants contain isoforms anyways. | |
80 mapped_protein_uniprotonly = str_extract(peptides_txt_mapped_log2$Uniprot,"[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}") | |
81 mapped_protein_uniprot_accession = str_extract(peptides_txt_mapped_log2$Uniprot,"[OPQ][0-9][A-Z0-9]{3}[0-9](-[0-9]+)?|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}(-[0-9]+)?|[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}") | |
82 peptides_txt_mapped_log2$mapped_protein = mapped_protein_uniprotonly | |
83 # Runs the Tukey function returning completed table. | |
84 peptides_txt_mapped_log2 = subset(peptides_txt_mapped_log2,mapped_protein %in% swissprot_fasta) | |
85 protein_intensities_tukeys = get_protein_values(peptides_txt_mapped_log2,intensity_columns) | |
86 protein_intensities_tukeys[protein_intensities_tukeys == 1] <- 0 | |
87 write.table(protein_intensities_tukeys, "./tukeys_output.txt", row.names = FALSE, col.names = TRUE, quote = FALSE, sep = "\t") | |
88 | |
89 } | |
90 | |
91 map_peptides_proteins = function(peptides_in) { | |
92 peptides_in = subset(peptides_in,peptides_in$Proteins != "") | |
93 results_list = list() | |
94 k = 1 | |
95 for (i in 1:nrow(peptides_in)) { | |
96 protein_names = peptides_in[i,"Proteins"] | |
97 protein_names_split = unlist(strsplit(protein_names,";")) | |
98 for (j in 1:length(protein_names_split)) { | |
99 peptides_mapped_proteins = data.frame(peptides_in[i,],mapped_protein=protein_names_split[j],stringsAsFactors=FALSE) | |
100 results_list[[k]] = peptides_mapped_proteins | |
101 k = k+1 | |
102 | |
103 } | |
104 } | |
105 return(rbindlist(results_list)) | |
106 } | |
107 | |
108 get_protein_values = function(mapped_peptides_in,intensity_columns_list) { | |
109 unique_mapped_proteins_list = unique(mapped_peptides_in$mapped_protein) | |
110 # Gets list of all peptides listed. | |
111 # Generates a blank data frame with clomns of Intensities and rows of Uniprots. | |
112 Tukeys_df = data.frame(mapped_protein = unique_mapped_proteins_list, stringsAsFactors = FALSE ) | |
113 for (q in intensity_columns_list) {Tukeys_df[,q] = NA} | |
114 for (i in 1:length(unique_mapped_proteins_list)) { | |
115 mapped_peptides_unique_subset = subset(mapped_peptides_in, mapped_protein == unique_mapped_proteins_list[i]) | |
116 # Calculate Tukey's Biweight from library(affy); returns a single numeric. | |
117 # Results_list[[i]] = data.frame(Protein=unique_mapped_proteins_list[i],Peptides_per_protein=nrow(mapped_peptides_unique_subset)). | |
118 for (j in intensity_columns_list) { | |
119 # Populates with new Tukeys values. | |
120 Tukeys_df[i,j] = 2^(tukey.biweight(mapped_peptides_unique_subset[,j])) | |
121 } | |
122 } | |
123 return(Tukeys_df) | |
124 } | |
125 | |
126 | |
127 args <- commandArgs(trailingOnly = TRUE) | |
128 main(args[1], args[2]) |