annotate polyphen2_web/polyphen2_web.py @ 0:09f68bdd1999 draft default tip

Uploaded
author saket-choudhary
date Tue, 07 Oct 2014 19:21:15 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
09f68bdd1999 Uploaded
saket-choudhary
parents:
diff changeset
1 #!/usr/bin/python
09f68bdd1999 Uploaded
saket-choudhary
parents:
diff changeset
2 from bs4 import BeautifulSoup
09f68bdd1999 Uploaded
saket-choudhary
parents:
diff changeset
3 import argparse
09f68bdd1999 Uploaded
saket-choudhary
parents:
diff changeset
4 import sys
09f68bdd1999 Uploaded
saket-choudhary
parents:
diff changeset
5 import time
09f68bdd1999 Uploaded
saket-choudhary
parents:
diff changeset
6 import os
09f68bdd1999 Uploaded
saket-choudhary
parents:
diff changeset
7 import tempfile
09f68bdd1999 Uploaded
saket-choudhary
parents:
diff changeset
8 import requests
09f68bdd1999 Uploaded
saket-choudhary
parents:
diff changeset
9 import shutil
09f68bdd1999 Uploaded
saket-choudhary
parents:
diff changeset
10 import csv
09f68bdd1999 Uploaded
saket-choudhary
parents:
diff changeset
# Endpoint that receives both the batch submission and the status-refresh
# POST requests.
submission_url = 'http://genetics.bwh.harvard.edu/cgi-bin/ggi/ggi2.cgi'
# Host prefix prepended to the relative result paths scraped from the
# status page.
result_url = 'http://genetics.bwh.harvard.edu'

# Default polling limits.
TIMEOUT = 60 * 60 * 24  # 86400, i.e. one day when read as seconds
TIME_DELAY = 30  # passed to time.sleep() between two status polls
MAX_TRIES = 900000000  # upper bound on the number of status polls

# Genome assembly version used for chromosome
# coordinates of the SNPs in user input
UCSCDB = ['hg19', 'hg18']
# Classifier model used for predictions.
MODELNAME = ['HumDiv', 'HumVar']

# Set of transcripts on which genomic SNPs will be mapped.
# Maps the command-line choice to the value sent in the SNPFILTER form field.
SNPFILTER = {
    'All': 0,
    'Canonical': 1,
    'CCDS': 3,
}
# Functional SNP categories to include in genomic SNPs annotation report
SNPFUNCTION = ['c', 'm', '']
09f68bdd1999 Uploaded
saket-choudhary
parents:
diff changeset
32
09f68bdd1999 Uploaded
saket-choudhary
parents:
diff changeset
33
09f68bdd1999 Uploaded
saket-choudhary
parents:
diff changeset
def stop_err(msg, err=1):
    """Write ``msg`` to stderr and terminate the process.

    Args:
        msg: Message to print; a trailing newline is appended.
        err: Exit status handed to ``sys.exit`` (defaults to 1).

    Raises:
        SystemExit: always, carrying ``err`` as its code.
    """
    sys.stderr.write('%s\n' % msg)
    sys.exit(err)
09f68bdd1999 Uploaded
saket-choudhary
parents:
diff changeset
37
09f68bdd1999 Uploaded
saket-choudhary
parents:
diff changeset
38
09f68bdd1999 Uploaded
saket-choudhary
parents:
diff changeset
class Polyphen2Web:
    """Small client for the PolyPhen-2 batch-query web form.

    Typical use is ``make_request()`` to submit a batch and obtain a session
    id, ``poll_for_files(sid)`` to wait for the result links, and
    ``save_to_files(url_dict, destinations)`` to download them.
    """

    def __init__(self, ucscdb=None, model_name=None, snp_filter=None,
                 snp_function=None, file_location=None, email=None):
        """Store the submission settings.

        Args:
            ucscdb: Value sent as the ``UCSCDB`` form field.
            model_name: Value sent as the ``MODELNAME`` form field.
            snp_filter: Key into the module-level ``SNPFILTER`` mapping.
            snp_function: Value sent as ``SNPFUNC``; ``'All'`` is translated
                to the empty string at submission time.
            file_location: Path of the batch query file to upload.
            email: Optional address sent as ``NOTIFYME``.
        """
        self.ucscdb = ucscdb
        self.model_name = model_name
        self.snp_filter = snp_filter
        self.snp_function = snp_function
        self.file_location = file_location
        self.notify_me = email

    def soupify(self, string):
        """Parse ``string`` (an HTML document) into a BeautifulSoup tree."""
        # NOTE(review): no parser is named, so bs4 picks the best one that is
        # installed; pin one explicitly if results differ between machines.
        return BeautifulSoup(string)

    def make_request(self):
        """Submit the batch file and return the server-side session id.

        The file at ``self.file_location`` is read as text, tabs are turned
        into single spaces and any ``'::::::::::::::'`` marker is removed
        before the text is posted as the ``_ggi_batch`` field.

        Returns:
            The ``value`` of the ``<input name="sid">`` element in the
            response, or ``None`` when that element or attribute is absent.
        """
        with open(self.file_location, 'r') as handle:
            contents = handle.read()
        contents = contents.replace('\t', ' ').replace('::::::::::::::', '')

        # The form expects an empty SNPFUNC value for "all categories".
        if self.snp_function == 'All':
            self.snp_function = ''

        payload = {
            '_ggi_project': 'PPHWeb2',
            '_ggi_origin': 'query',
            '_ggi_batch': contents,
            '_ggi_target_pipeline': '1',
            'MODELNAME': self.model_name,
            'UCSCDB': self.ucscdb,
            'SNPFILTER': SNPFILTER[self.snp_filter],
            'SNPFUNC': self.snp_function,
            'NOTIFYME': self.notify_me or '',
        }
        response = requests.post(submission_url, data=payload)
        soup = self.soupify(response.content)
        sid_input = soup.find('input', {'name': 'sid'})
        if sid_input is None:
            return None
        return sid_input.get('value')

    def _extract_result_urls(self, soup):
        """Return the result-file URLs found in a status page, else ``None``.

        The second-to-last table of the page is inspected; URLs are produced
        only when it has exactly one row carrying at least three links.
        """
        tables = soup.find_all('table')
        if len(tables) < 2:
            return None
        rows = tables[-2].find_all('tr')
        if len(rows) != 1:
            return None
        links = rows[0].find_all('a')
        if len(links) < 3:
            return None
        href = links[0].get('href')
        if not href:
            return None
        # Everything before the first '-' is the prefix shared by all four
        # result files; only the suffix differs.
        prefix = result_url + href.split('-')[0]
        return {
            'full_file': prefix + '-full.txt',
            'snps_file': prefix + '-snps.txt',
            'log_file': prefix + '-log.txt',
            'short_file': prefix + '-short.txt',
        }

    def poll_for_files(self, sid,
                       max_tries=MAX_TRIES,
                       time_delay=TIME_DELAY,
                       timeout=TIMEOUT):
        """Poll the status page until the result links appear.

        Args:
            sid: Session id returned by ``make_request``.
            max_tries: Maximum number of status requests; exceeding it ends
                the process through ``stop_err``.
            time_delay: Seconds to sleep between two requests.
            timeout: Overall wall-clock budget in seconds, or ``None`` for no
                budget. At least one request is always made.

        Returns:
            A dict with the keys ``full_file``, ``snps_file``, ``log_file``
            and ``short_file`` mapping to result URLs, or ``None`` when the
            budget ran out first.
        """
        payload = {
            '_ggi_project': 'PPHWeb2',
            '_ggi_origin': 'manage',
            '_ggi_target_manage': 'Refresh',
            'sid': sid,
        }
        deadline = None if timeout is None else time.time() + timeout
        tries = 0
        while True:
            tries += 1
            if tries > max_tries:
                stop_err('Number of tries exceeded!')
            response = requests.post(submission_url, data=payload)
            url_dict = self._extract_result_urls(
                self.soupify(response.content))
            if url_dict:
                return url_dict
            if deadline is not None and time.time() >= deadline:
                return None
            time.sleep(time_delay)

    def save_to_files(self, url_dict, args):
        """Download every URL in ``url_dict`` to its destination path.

        Args:
            url_dict: Mapping of result name to URL.
            args: Mapping of the same result names to destination paths.

        Returns:
            ``True``. A URL that does not answer with HTTP 200 is skipped.
        """
        tmp_dir = tempfile.mkdtemp()
        try:
            for key, url in url_dict.items():
                response = requests.get(url, stream=True)
                try:
                    if response.status_code == 200:
                        # Stage the download in the temp dir so the
                        # destination never holds a partial file.
                        staging_path = os.path.join(tmp_dir, key)
                        with open(staging_path, 'wb') as out:
                            for chunk in response.iter_content(128):
                                out.write(chunk)
                        shutil.move(staging_path, args[key])
                finally:
                    response.close()
        finally:
            if os.path.exists(tmp_dir):
                shutil.rmtree(tmp_dir)
        return True
09f68bdd1999 Uploaded
saket-choudhary
parents:
diff changeset
153
09f68bdd1999 Uploaded
saket-choudhary
parents:
diff changeset
154
09f68bdd1999 Uploaded
saket-choudhary
parents:
diff changeset
def main(args):
    """Command-line entry point: submit, wait, and download the results.

    Args:
        args: Argument list without the program name (e.g. ``sys.argv[1:]``).

    Returns:
        ``True`` once the result files have been saved. Exits through
        ``stop_err`` when no session id or no result links are obtained.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-u', '--ucscdb',
                        dest='ucscdb', choices=UCSCDB,
                        required=True, type=str)
    parser.add_argument('-m', '--model',
                        dest='modelname', choices=MODELNAME,
                        required=True, type=str)
    parser.add_argument('-fl', '--filter', '--snpfilter',
                        dest='snpfilter', choices=SNPFILTER.keys(),
                        required=True, type=str)
    # The value is opened as a file path, so it must always be given.
    parser.add_argument('-i', '--input',
                        dest='input',
                        required=True, type=str)
    parser.add_argument('-e', '--email',
                        dest='email',
                        required=False, default=None)
    # Destination paths for the four result files; the dest names match the
    # keys of the dict returned by poll_for_files.
    parser.add_argument('--log', dest='log_file',
                        required=True, default=None, type=str)
    parser.add_argument('--short', dest='short_file',
                        required=True, default=None, type=str)
    parser.add_argument('--full', dest='full_file',
                        required=True, default=None, type=str)
    parser.add_argument('--snp', dest='snps_file',
                        required=True, default=None, type=str)
    parser.add_argument('--function', dest='snpfunction',
                        required=True, type=str)
    args_s = vars(parser.parse_args(args))

    polyphen2_web = Polyphen2Web(ucscdb=args_s['ucscdb'],
                                 model_name=args_s['modelname'],
                                 snp_filter=args_s['snpfilter'],
                                 snp_function=args_s['snpfunction'],
                                 file_location=args_s['input'],
                                 email=args_s['email'])
    sid = polyphen2_web.make_request()
    if not sid:
        stop_err(
            'Something went wrong! The tracking id could not be retrieved.')

    url_dict = polyphen2_web.poll_for_files(sid)
    if not url_dict:
        stop_err('There was error downloading the output files!')

    locations = {key: args_s[key] for key in url_dict}
    polyphen2_web.save_to_files(url_dict, locations)
    return True
09f68bdd1999 Uploaded
saket-choudhary
parents:
diff changeset
205
09f68bdd1999 Uploaded
saket-choudhary
parents:
diff changeset
# Run the command-line interface only when executed as a script.
if __name__ == '__main__':
    main(sys.argv[1:])