annotate CRAPomeQuery.py @ 1:7a0b06a1cabd draft

Uploaded
author bornea
date Mon, 18 Apr 2016 12:17:05 -0400
parents 4d47d78b193a
children d1a26feef9de
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
4d47d78b193a Uploaded
bornea
parents:
diff changeset
1 # -*- coding: utf-8 -*-
4d47d78b193a Uploaded
bornea
parents:
diff changeset
2 """
4d47d78b193a Uploaded
bornea
parents:
diff changeset
3 Created on Thu Apr 14 16:58:05 2016
4d47d78b193a Uploaded
bornea
parents:
diff changeset
4
4d47d78b193a Uploaded
bornea
parents:
diff changeset
5 @author: brentkuenzi
4d47d78b193a Uploaded
bornea
parents:
diff changeset
6 """
4d47d78b193a Uploaded
bornea
parents:
diff changeset
7 ################################################################################
4d47d78b193a Uploaded
bornea
parents:
diff changeset
8 # This program will read in a SAINT formatted 'prey.txt' file or a file
4d47d78b193a Uploaded
bornea
parents:
diff changeset
9 # containing a single column list of uniprot accessions (e.g. "P00533" or
4d47d78b193a Uploaded
bornea
parents:
diff changeset
10 # "EGFR_HUMAN"), query the CRAPome database (v1.1), and return a file specifying
4d47d78b193a Uploaded
bornea
parents:
diff changeset
11 # the prevalence of each protein in the CRAPome.
4d47d78b193a Uploaded
bornea
parents:
diff changeset
12 ################################################################################
4d47d78b193a Uploaded
bornea
parents:
diff changeset
13 # Copyright (C) Brent Kuenzi.
4d47d78b193a Uploaded
bornea
parents:
diff changeset
14 # Permission is granted to copy, distribute and/or modify this document
4d47d78b193a Uploaded
bornea
parents:
diff changeset
15 # under the terms of the GNU Free Documentation License, Version 1.3
4d47d78b193a Uploaded
bornea
parents:
diff changeset
16 # or any later version published by the Free Software Foundation;
4d47d78b193a Uploaded
bornea
parents:
diff changeset
17 # with no Invariant Sections, no Front-Cover Texts, and no Back-Cover Texts.
4d47d78b193a Uploaded
bornea
parents:
diff changeset
18 # A copy of the license is included in the section entitled "GNU
4d47d78b193a Uploaded
bornea
parents:
diff changeset
19 # Free Documentation License".
4d47d78b193a Uploaded
bornea
parents:
diff changeset
20 ################################################################################
4d47d78b193a Uploaded
bornea
parents:
diff changeset
21 ## REQUIRED INPUT ##
4d47d78b193a Uploaded
bornea
parents:
diff changeset
# Command-line arguments (all positional):
#   argv[1]  crappyData : Prey.txt or single column list of Uniprot accessions
#   argv[2]  species    : HUMAN or YEAST
#   argv[3]  output path: the finished report is moved here at the end of the run
#   argv[4]  db_path    : location passed in by the caller (see NOTE below)
import sys  # must be bound before sys.argv is read; the dependency section comes later

if len(sys.argv) < 5:
    # Fail with a usage message instead of a bare IndexError.
    sys.exit("usage: CRAPomeQuery.py <prey_or_accession_file> <HUMAN|YEAST> "
             "<output_file> <db_path>")

# 1) crappyData: Prey.txt or single column list of Uniprot accessions
crappyData = sys.argv[1] # Prey file or File with single column of accessions
# 2) Species: HUMAN or YEAST
species = sys.argv[2] # HUMAN or YEAST
# NOTE(review): db_path is read here but nothing visible in this file uses it.
db_path = sys.argv[4]
4d47d78b193a Uploaded
bornea
parents:
diff changeset
27 ################################################################################
4d47d78b193a Uploaded
bornea
parents:
diff changeset
28 ## Dependencies ##
4d47d78b193a Uploaded
bornea
parents:
diff changeset
29 import urllib2
4d47d78b193a Uploaded
bornea
parents:
diff changeset
30 import sys
4d47d78b193a Uploaded
bornea
parents:
diff changeset
31 import numpy
4d47d78b193a Uploaded
bornea
parents:
diff changeset
32 import os
4d47d78b193a Uploaded
bornea
parents:
diff changeset
33 ################################################################################
4d47d78b193a Uploaded
bornea
parents:
diff changeset
34 ## Global Variables ##
4d47d78b193a Uploaded
bornea
parents:
diff changeset
# Select the CRAPome table that matches the requested species. An unsupported
# species stops the run here with a clear message rather than surfacing later
# as a NameError on `database`.
# NOTE(review): the file name is resolved relative to the working directory;
# db_path is not applied to it -- confirm whether that is intended.
if species == "HUMAN":
    database = "Human_CRAPome_v1-1.txt"
elif species == "YEAST":
    database = "Yeast_CRAPome_v1-1.txt"
else:
    sys.exit("Unsupported species '%s': expected HUMAN or YEAST" % species)
4d47d78b193a Uploaded
bornea
parents:
diff changeset
39 ################################################################################
4d47d78b193a Uploaded
bornea
parents:
diff changeset
40 ## CRAPomeQuery ##
4d47d78b193a Uploaded
bornea
parents:
diff changeset
class ReturnValue1(object):
    """Plain record holding the identifiers looked up for one protein.

    Attributes:
        up: UniProt accession.
        gn: gene name.
        sp: Swiss-Prot entry name; defaults to 'NA' so that callers which
            only know the first two values can still build a record.
    """

    def __init__(self, uniprot_acc, gene, swissprot='NA'):
        self.up = uniprot_acc
        self.gn = gene
        self.sp = swissprot

    def __repr__(self):
        return "ReturnValue1(%r, %r, %r)" % (self.up, self.gn, self.sp)
4d47d78b193a Uploaded
bornea
parents:
diff changeset
def _parse_fasta_header(header):
    """Return (accession, gene name, entry name) from a FASTA header line.

    The header is split on '|': field 1 is taken as the accession and the
    first space-delimited token of field 2 as the entry name. The gene name
    is the token following 'GN=', or 'NA' when the header has no 'GN='.
    Returns None when the header has fewer than three '|'-separated fields.
    """
    header = header.strip()
    fields = header.split('|')
    if len(fields) < 3:
        return None
    uniprot_acc = fields[1]
    swissprot = fields[2].split(' ')[0]
    genename = 'NA'
    if 'GN=' in header:
        genename = header.split('GN=')[1].split(' ')[0]
    return uniprot_acc, genename, swissprot


def get_info(uniprot_accession_in): # look up accession, gene name and entry name
    """Fetch the FASTA record for an identifier from UniProt and parse its header.

    Args:
        uniprot_accession_in: identifier inserted into the UniProt FASTA URL.

    Returns:
        ReturnValue1 with .up (accession), .gn (gene name) and .sp (entry
        name). All three are 'NA' when UniProt answers 404, when the response
        is empty, or when the header cannot be parsed; each of those cases is
        appended to 'error proteins.txt'.

    Exits the program (sys.exit) on any HTTP error other than 404.
    """
    url = "http://www.uniprot.org/uniprot/" + uniprot_accession_in + ".fasta"
    # The log is opened in append mode for every call (so it always exists
    # after a lookup) and is closed on every exit path by the with-block.
    with open('error proteins.txt', 'a+') as error:
        try:
            data = urllib2.urlopen(url)
        except urllib2.HTTPError as err:
            if err.code == 404:
                error.write(uniprot_accession_in + '\t' + "Invalid URL. Check protein" + '\n')
                return ReturnValue1('NA', 'NA', 'NA')
            if err.code == 302:
                sys.exit("Request timed out. Check connection and try again.")
            sys.exit("Uniprot had some other error")

        try:
            lines = data.readlines()
        finally:
            data.close()

        # Check for an empty body before touching lines[0].
        if not lines:
            error.write(uniprot_accession_in + '\t' + "Blank Fasta" + '\n')
            return ReturnValue1('NA', 'NA', 'NA')

        parsed = _parse_fasta_header(lines[0])
        if parsed is None:
            error.write(uniprot_accession_in + '\t' + "Unrecognized Fasta header" + '\n')
            return ReturnValue1('NA', 'NA', 'NA')
        return ReturnValue1(*parsed)
4d47d78b193a Uploaded
bornea
parents:
diff changeset
def readtab(infile): # read in tab-delim text
    """Read a tab-delimited text file into a list of rows.

    Each line has its leading and trailing whitespace stripped and is then
    split on tab characters, so a blank line yields [''].

    Args:
        infile: path of the file to read.

    Returns:
        A list with one list of string fields per line, in file order.
    """
    with open(infile, 'r') as handle:
        return [raw.strip().split('\t') for raw in handle]
4d47d78b193a Uploaded
bornea
parents:
diff changeset
def crapome(infile): # Query CRAPome
    """Look up every identifier of `infile` in the CRAPome table and write a report.

    The first column of each input row is the user identifier. Identifiers
    containing "_<species>" are matched directly against column 2 of the
    CRAPome table; any other identifier is first resolved through get_info()
    and its entry name (.sp) is matched instead. Only the first CRAPome row
    for an identifier is used. Blank input lines are skipped.

    For every matched row with more than five columns the report holds the
    user identifier, column 0 of the CRAPome row (gene name), the number of
    non-'0' values in columns 3+ over the number of those columns, and the
    mean (one decimal) and maximum of the non-'0' values. Identifiers with no
    match get an "Invalid identifier / gene not available" row.

    Args:
        infile: path of the prey file / accession list.

    Side effects:
        Writes the tab-delimited report to "Crappy Data.txt" in the working
        directory. Returns nothing.
    """
    queries = readtab(infile)

    # Index the database once: accession -> first row carrying it. Rows too
    # short to hold an accession column (e.g. blank lines) are ignored.
    first_row = {}
    for db_row in readtab(database):
        if len(db_row) > 2 and db_row[2] not in first_row:
            first_row[db_row[2]] = db_row

    hits = []
    for entry in queries: # Filter CRAPome database on our data
        user_id = entry[0]
        if not user_id:
            continue # blank line in the input file
        is_entry_name = ("_" + species) in user_id
        if is_entry_name:
            accession = user_id
        else:
            accession = get_info(user_id).sp # query swissprot if not _SPECIES
        match = first_row.get(accession)
        if match is None:
            # protein is not present in CRAPome database: add a placeholder
            hits.append(["\t", "\t", user_id, "Invalid identifier / gene not available"])
        elif is_entry_name:
            hits.append(match)
        else:
            # Report under the identifier the user supplied. Work on a copy
            # so the shared database row keeps its own accession and later
            # queries for the same protein still match.
            relabelled = list(match)
            relabelled[2] = user_id
            hits.append(relabelled)

    report = []
    for row in hits: # Create CRAPome file as list
        if len(row) > 5:
            experiments = row[3:]
            # Ave.SC / Max.SC are computed only over experiments with an ID
            counts = [int(value) for value in experiments if value != '0']
            line = [
                row[2], # accession
                row[0], # gene name
                str(len(counts)) + " / " + str(len(experiments)), # found/total
            ]
            if counts:
                line.append(str(round(numpy.mean(counts), 1))) # Ave.SC
                line.append(str(max(counts))) # Max.SC
            else:
                # Strings, not ints: every field is passed to "\t".join below.
                line.extend(["0", "0"])
        else:
            line = [row[2], row[3]]
        report.append(line)

    header = ["User Input","Mapped Gene Symbol","Num of Expt. (found/total)","Ave SC","Max SC"]
    with open("Crappy Data.txt","wt") as out: # write file
        out.write("\t".join(header) + "\n")
        for line in report:
            out.write("\t".join(line) + "\n")
4d47d78b193a Uploaded
bornea
parents:
diff changeset
if __name__ == '__main__':
    # Local import keeps this entry block self-contained.
    import shutil

    crapome(crappyData)

    # Hand the report over under the caller-supplied output path (argv[3]).
    # shutil.move also works when the destination is on another filesystem,
    # where os.rename raises OSError.
    shutil.move("Crappy Data.txt", sys.argv[3])
4d47d78b193a Uploaded
bornea
parents:
diff changeset
162 ## END ##