Mercurial > repos > bornea > saint_preproc
comparison pre_process_protein_name_set.R @ 3:6571324e3d2c draft
Uploaded
| author | bornea |
|---|---|
| date | Tue, 10 Nov 2015 13:13:22 -0500 |
| parents | |
| children | c56ebf254e89 |
comparison
equal
deleted
inserted
replaced
| 2:593c4e2b8cf5 | 3:6571324e3d2c |
|---|---|
| 1 library(data.table) | |
| 2 library(affy) | |
| 3 library(stringr) | |
| 4 library(mygene) | |
| 5 library(VennDiagram) | |
| 6 ##### | |
| 7 #data | |
| 8 main <- function(peptides_file) { | |
| 9 peptides_file = read.delim(peptides_file,header=TRUE,stringsAsFactors=FALSE,fill=TRUE) | |
| 10 peptides_txt = peptides_file | |
| 11 intensity_columns = names(peptides_txt[,str_detect(names(peptides_txt),"Intensity\\.*")]) #Pulls out all lines with Intensity in them. | |
| 12 intensity_columns = intensity_columns[2:length(intensity_columns)] #Removes the first column that does not have a bait. | |
| 13 peptides_txt_mapped = as.data.frame(map_peptides_proteins(peptides_txt)) #This function as below sets every line to a 1 to 1 intensity to each possible protein. | |
| 14 peptides_txt_mapped$Uniprot = str_extract(peptides_txt_mapped$mapped_protein, "[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}") #Pulls out just Uniprot id from the script. | |
| 15 peptides_txt_mapped = subset(peptides_txt_mapped,!is.na(Uniprot)) #removes reverse sequences and any that didn't match a uniprot accession | |
| 16 columns_comb = c("Uniprot", intensity_columns) | |
| 17 peptides_mapped_intensity = subset(peptides_txt_mapped, select = columns_comb) #Subsets out only the needed cloumns for Tukeys (Uniprot IDS and baited intensities) | |
| 18 swissprot_fasta = scan("/home/philip/galaxy/tools/Moffitt_Tools/uniprot_names.txt",what="character") | |
| 19 peptides_txt_mapped_log2 = peptides_mapped_intensity | |
| 20 # Takes the log2 of the intensities. | |
| 21 for (i in intensity_columns) { | |
| 22 peptides_txt_mapped_log2[,i] = log2(subset(peptides_txt_mapped_log2, select = i)) | |
| 23 } | |
| 24 #get the minimum from each column while ignoring the -Inf; get the min of these mins for the global min; breaks when there's only one intensity column | |
| 25 global_min = min(apply(peptides_txt_mapped_log2[,2:ncol(peptides_txt_mapped_log2)],2,function(x) { | |
| 26 min(x[x != -Inf]) | |
| 27 })) | |
| 28 peptides_txt_mapped_log2[peptides_txt_mapped_log2 == -Inf] <- 0 | |
| 29 #uniprot accessions WITHOUT isoforms; it looks like only contaminants contain isoforms anyways | |
| 30 mapped_protein_uniprotonly = str_extract(peptides_txt_mapped_log2$Uniprot,"[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}") | |
| 31 mapped_protein_uniprot_accession = str_extract(peptides_txt_mapped_log2$Uniprot,"[OPQ][0-9][A-Z0-9]{3}[0-9](-[0-9]+)?|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}(-[0-9]+)?|[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}") | |
| 32 peptides_txt_mapped_log2$mapped_protein = mapped_protein_uniprotonly | |
| 33 # Runs the Tukey function returning completed table | |
| 34 peptides_txt_mapped_log2 = subset(peptides_txt_mapped_log2,mapped_protein %in% swissprot_fasta) | |
| 35 protein_intensities_tukeys = get_protein_values(peptides_txt_mapped_log2,intensity_columns) | |
| 36 protein_intensities_tukeys[protein_intensities_tukeys == 1] <- 0 | |
| 37 write.table(protein_intensities_tukeys, "./tukeys_output.txt", row.names = FALSE, col.names = TRUE, quote = FALSE, sep = "\t") | |
| 38 | |
| 39 } | |
| 40 | |
| 41 map_peptides_proteins = function(peptides_in) { | |
| 42 #reverse sequences are blank but have a razor protein indicating that they are reverse; exclude these for now | |
| 43 peptides_in = subset(peptides_in,peptides_in$Proteins != "") | |
| 44 results_list = list() | |
| 45 k = 1 | |
| 46 for (i in 1:nrow(peptides_in)) { | |
| 47 protein_names = peptides_in[i,"Proteins"] | |
| 48 protein_names_split = unlist(strsplit(protein_names,";")) | |
| 49 for (j in 1:length(protein_names_split)) { | |
| 50 peptides_mapped_proteins = data.frame(peptides_in[i,],mapped_protein=protein_names_split[j],stringsAsFactors=FALSE) | |
| 51 results_list[[k]] = peptides_mapped_proteins | |
| 52 k = k+1 | |
| 53 | |
| 54 } | |
| 55 } | |
| 56 return(rbindlist(results_list)) | |
| 57 } | |
| 58 | |
| 59 get_protein_values = function(mapped_peptides_in,intensity_columns_list) { | |
| 60 unique_mapped_proteins_list = unique(mapped_peptides_in$mapped_protein) # Gets list of all peptides listed. | |
| 61 # Generates a blank data frame with clomns of Intensities and rows of Uniprots. | |
| 62 Tukeys_df = data.frame(mapped_protein = unique_mapped_proteins_list, stringsAsFactors = FALSE ) | |
| 63 for (q in intensity_columns_list) {Tukeys_df[,q] = NA} | |
| 64 for (i in 1:length(unique_mapped_proteins_list)) { | |
| 65 mapped_peptides_unique_subset = subset(mapped_peptides_in, mapped_protein == unique_mapped_proteins_list[i]) | |
| 66 #calculate Tukey's Biweight from library(affy); returns a single numeric | |
| 67 #results_list[[i]] = data.frame(Protein=unique_mapped_proteins_list[i],Peptides_per_protein=nrow(mapped_peptides_unique_subset)) | |
| 68 for (j in intensity_columns_list) { | |
| 69 #Populates with new Tukeys values. | |
| 70 Tukeys_df[i,j] = 2^(tukey.biweight(mapped_peptides_unique_subset[,j])) | |
| 71 } | |
| 72 } | |
| 73 return(Tukeys_df) | |
| 74 } | |
| 75 | |
| 76 args <- commandArgs(trailingOnly = TRUE) | |
| 77 main(args[1]) |
