Mercurial > repos > bornea > saint_preprocessing
comparison pre_process_protein_name_set.R @ 28:dbd1af88f060 draft
Uploaded
author | bornea |
---|---|
date | Tue, 26 Apr 2016 14:42:16 -0400 |
parents | 945f600f34cb |
children | e6e456d3ac14 |
comparison
equal
deleted
inserted
replaced
27:2d78642361c3 | 28:dbd1af88f060 |
---|---|
73 # Get the minimum from each column while ignoring the -Inf; get the min of these mins for the | 73 # Get the minimum from each column while ignoring the -Inf; get the min of these mins for the |
74 # global min; breaks when there's only one intensity column. | 74 # global min; breaks when there's only one intensity column. |
75 global_min = min(apply(peptides_txt_mapped_log2[,2:ncol(peptides_txt_mapped_log2)],2,function(x) { | 75 global_min = min(apply(peptides_txt_mapped_log2[,2:ncol(peptides_txt_mapped_log2)],2,function(x) { |
76 min(x[x != -Inf]) | 76 min(x[x != -Inf]) |
77 })) | 77 })) |
78 peptides_txt_mapped_log2[peptides_txt_mapped_log2 == -Inf] <- 0 | 78 peptides_txt_mapped_log2[peptides_txt_mapped_log2 == -Inf] <- NA |
79 #uniprot accessions WITHOUT isoforms; it looks like only contaminants contain isoforms anyways. | 79 #uniprot accessions WITHOUT isoforms; it looks like only contaminants contain isoforms anyways. |
80 mapped_protein_uniprotonly = str_extract(peptides_txt_mapped_log2$Uniprot,"[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}") | 80 mapped_protein_uniprotonly = str_extract(peptides_txt_mapped_log2$Uniprot,"[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}") |
81 mapped_protein_uniprot_accession = str_extract(peptides_txt_mapped_log2$Uniprot,"[OPQ][0-9][A-Z0-9]{3}[0-9](-[0-9]+)?|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}(-[0-9]+)?|[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}") | 81 mapped_protein_uniprot_accession = str_extract(peptides_txt_mapped_log2$Uniprot,"[OPQ][0-9][A-Z0-9]{3}[0-9](-[0-9]+)?|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}(-[0-9]+)?|[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}") |
82 peptides_txt_mapped_log2$mapped_protein = mapped_protein_uniprotonly | 82 peptides_txt_mapped_log2$mapped_protein = mapped_protein_uniprotonly |
83 # Runs the Tukey function returning completed table. | 83 # Runs the Tukey function returning completed table. |
84 peptides_txt_mapped_log2 = subset(peptides_txt_mapped_log2,mapped_protein %in% swissprot_fasta) | 84 peptides_txt_mapped_log2 = subset(peptides_txt_mapped_log2,mapped_protein %in% swissprot_fasta) |
85 if (nrow(peptides_txt_mapped_log2) == 0) { | |
86 print("Uniprot Database does not have any of the proteins in the peptides file") | |
87 quit() | |
88 } | |
85 protein_intensities_tukeys = get_protein_values(peptides_txt_mapped_log2,intensity_columns) | 89 protein_intensities_tukeys = get_protein_values(peptides_txt_mapped_log2,intensity_columns) |
86 protein_intensities_tukeys[protein_intensities_tukeys == 1] <- 0 | 90 protein_intensities_tukeys[protein_intensities_tukeys == 1] <- 0 |
87 write.table(protein_intensities_tukeys, "./tukeys_output.txt", row.names = FALSE, col.names = TRUE, quote = FALSE, sep = "\t") | 91 write.table(protein_intensities_tukeys, "./tukeys_output.txt", row.names = FALSE, col.names = TRUE, quote = FALSE, sep = "\t") |
88 | 92 |
89 } | 93 } |
92 peptides_in = subset(peptides_in,peptides_in$Proteins != "") | 96 peptides_in = subset(peptides_in,peptides_in$Proteins != "") |
93 results_list = list() | 97 results_list = list() |
94 k = 1 | 98 k = 1 |
95 for (i in 1:nrow(peptides_in)) { | 99 for (i in 1:nrow(peptides_in)) { |
96 protein_names = peptides_in[i,"Proteins"] | 100 protein_names = peptides_in[i,"Proteins"] |
97 protein_names_split = unlist(strsplit(protein_names,";")) | 101 protein_names_split = unlist(str_split(protein_names,";")) |
98 for (j in 1:length(protein_names_split)) { | 102 for (j in 1:length(protein_names_split)) { |
99 peptides_mapped_proteins = data.frame(peptides_in[i,],mapped_protein=protein_names_split[j],stringsAsFactors=FALSE) | 103 peptides_mapped_proteins = data.frame(peptides_in[i,],mapped_protein=protein_names_split[j],stringsAsFactors=FALSE) |
100 results_list[[k]] = peptides_mapped_proteins | 104 results_list[[k]] = peptides_mapped_proteins |
101 k = k+1 | 105 k = k+1 |
102 | 106 |
115 mapped_peptides_unique_subset = subset(mapped_peptides_in, mapped_protein == unique_mapped_proteins_list[i]) | 119 mapped_peptides_unique_subset = subset(mapped_peptides_in, mapped_protein == unique_mapped_proteins_list[i]) |
116 # Calculate Tukey's Biweight from library(affy); returns a single numeric. | 120 # Calculate Tukey's Biweight from library(affy); returns a single numeric. |
117 # Results_list[[i]] = data.frame(Protein=unique_mapped_proteins_list[i],Peptides_per_protein=nrow(mapped_peptides_unique_subset)). | 121 # Results_list[[i]] = data.frame(Protein=unique_mapped_proteins_list[i],Peptides_per_protein=nrow(mapped_peptides_unique_subset)). |
118 for (j in intensity_columns_list) { | 122 for (j in intensity_columns_list) { |
119 # Populates with new Tukeys values. | 123 # Populates with new Tukeys values. |
120 Tukeys_df[i,j] = 2^(tukey.biweight(mapped_peptides_unique_subset[,j])) | 124 Tukeys_df[i,j] = 2^(tukey.biweight(na.omit(mapped_peptides_unique_subset[,j]))) |
121 } | 125 } |
122 } | 126 } |
123 return(Tukeys_df) | 127 return(Tukeys_df) |
124 } | 128 } |
125 | 129 |