Mercurial > repos > saket-choudhary > polyphen2_web
comparison polyphen2_web/polyphen2_web.py @ 0:09f68bdd1999 draft default tip
Uploaded
author | saket-choudhary |
---|---|
date | Tue, 07 Oct 2014 19:21:15 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:09f68bdd1999 |
---|---|
1 #!/usr/bin/python | |
2 from bs4 import BeautifulSoup | |
3 import argparse | |
4 import sys | |
5 import time | |
6 import os | |
7 import tempfile | |
8 import requests | |
9 import shutil | |
10 import csv | |
# PolyPhen-2 batch-query web service endpoints: jobs are POSTed to
# submission_url; result files are served relative to result_url.
submission_url = 'http://genetics.bwh.harvard.edu/cgi-bin/ggi/ggi2.cgi'
result_url = 'http://genetics.bwh.harvard.edu'

# Polling parameters for poll_for_files():
TIMEOUT = 60 * 60 * 24  # overall wait budget, seconds (24 hours)
TIME_DELAY = 30         # seconds to sleep between polls
MAX_TRIES = 900000000   # cap on poll iterations (effectively unlimited)

# Genome assembly version used for chromosome
# coordinates of the SNPs in user input
UCSCDB = ['hg19', 'hg18']
# Classifier model used for predictions.
MODELNAME = ['HumDiv', 'HumVar']

# Set of transcripts on which genomic SNPs will be mapped
# (values are the numeric codes the web form expects).
SNPFILTER = {
    'All': 0,
    'Canonical': 1,
    'CCDS': 3,
}
# Functional SNP categories to include in genomic SNPs annotation report
# ('' appears to mean "all categories" — see make_request(); the meaning of
# 'c' and 'm' is defined by the PolyPhen-2 form, TODO confirm).
SNPFUNCTION = ['c', 'm', '']
33 | |
def stop_err(msg, err=1):
    """Report *msg* on stderr, then terminate the process with status *err*."""
    message = '%s\n' % msg
    sys.stderr.write(message)
    sys.exit(err)
37 | |
38 | |
class Polyphen2Web:
    """Thin client for the PolyPhen-2 batch-query web service.

    Workflow: make_request() submits the variant list and returns a server
    session id ('sid'); poll_for_files() polls the job-management page until
    the result-file URLs appear; save_to_files() downloads them to
    caller-supplied output paths.
    """

    def __init__(self, ucscdb=None, model_name=None, snp_filter=None,
                 snp_function=None, file_location=None, email=None):
        # ucscdb:        genome assembly (one of UCSCDB)
        # model_name:    classifier model (one of MODELNAME)
        # snp_filter:    key into the SNPFILTER mapping
        # snp_function:  functional-category code (see SNPFUNCTION)
        # file_location: path of the variant-list input file
        # email:         optional notification address sent as NOTIFYME
        self.ucscdb = ucscdb
        self.model_name = model_name
        self.snp_filter = snp_filter
        self.snp_function = snp_function
        self.file_location = file_location
        self.notify_me = email

    def soupify(self, string):
        """Parse an HTML document (string or bytes) into a BeautifulSoup tree."""
        return BeautifulSoup(string)

    def make_request(self):
        """Submit the batch query and return the server session id, or None.

        Returns None when the response page carries no 'sid' input field,
        i.e. the submission was not accepted.
        """
        # Read the variant list once; the service expects space-separated
        # columns, so tabs are normalised and csplit-style separator lines
        # removed.  (An earlier version also wrote an unused CSV copy of the
        # input to a temporary directory; that dead code has been removed,
        # and the file handle is now closed deterministically.)
        with open(self.file_location, 'r') as handle:
            contents = handle.read().replace(
                '\t', ' ').replace('::::::::::::::', '')
        if self.snp_function == 'All':
            # The web form encodes "all categories" as an empty string.
            self.snp_function = ''
        payload = {
            '_ggi_project': 'PPHWeb2',
            '_ggi_origin': 'query',
            '_ggi_batch': contents,
            '_ggi_target_pipeline': '1',
            'MODELNAME': self.model_name,
            'UCSCDB': self.ucscdb,
            'SNPFILTER': SNPFILTER[self.snp_filter],
            'SNPFUNC': self.snp_function,
            'NOTIFYME': '',
        }
        if self.notify_me:
            payload['NOTIFYME'] = self.notify_me
        response = requests.post(submission_url, data=payload)
        soup = self.soupify(response.content)
        sid_tag = soup.find('input', {'name': 'sid'})
        try:
            return sid_tag['value']
        except (TypeError, KeyError):
            # sid_tag is None (no such input) or lacks a 'value' attribute.
            return None

    def poll_for_files(self, sid,
                       max_tries=MAX_TRIES,
                       time_delay=TIME_DELAY,
                       timeout=TIMEOUT):
        """Poll the job-management page until the result files are linked.

        sid:        session id returned by make_request()
        max_tries:  abort via stop_err() after this many polls
        time_delay: seconds to sleep between polls
        timeout:    overall wait budget in seconds (previously accepted but
                    ignored; now honored — returns None once exceeded)

        Returns a dict mapping 'full_file'/'snps_file'/'log_file'/'short_file'
        to their download URLs, or None on timeout.
        """
        payload = {
            '_ggi_project': 'PPHWeb2',
            '_ggi_origin': 'manage',
            '_ggi_target_manage': 'Refresh',
            'sid': sid,
        }
        start_time = time.time()
        tries = 0
        while True:
            tries += 1
            if tries > max_tries:
                stop_err('Number of tries exceeded!')
            if time.time() - start_time > timeout:
                return None
            response = requests.post(submission_url, data=payload)
            soup = self.soupify(response.content)
            all_tables = soup.findAll('table')
            if all_tables:
                # The second-to-last table lists running jobs; with fewer
                # than two tables there is nothing to inspect yet.
                try:
                    running_jobs_table = all_tables[-2]
                except IndexError:
                    running_jobs_table = None
                if running_jobs_table:
                    rows = running_jobs_table.findAll('tr')
                    # A single row with >= 3 links marks a finished job whose
                    # first link encodes the shared result-path prefix.
                    if len(rows) == 1:
                        hrefs = rows[0].findAll('a')
                        if len(hrefs) >= 3:
                            path = hrefs[0]['href'].split('-')[0]
                            return {
                                'full_file': result_url + path + '-full.txt',
                                'snps_file': result_url + path + '-snps.txt',
                                'log_file': result_url + path + '-log.txt',
                                'short_file': result_url + path + '-short.txt',
                            }
            time.sleep(time_delay)

    def save_to_files(self, url_dict, args):
        """Download each result URL into the destination path args[key].

        url_dict: mapping of logical names ('full_file', ...) to URLs
        args:     mapping of the same names to output file paths

        Files are streamed into a temp dir, then moved into place.  The temp
        dir is now removed even if a download raises.  Returns True.
        """
        tmp_dir = tempfile.mkdtemp()
        try:
            # .items() instead of Python-2-only .iteritems().
            for key, url in url_dict.items():
                response = requests.get(url, stream=True)
                if response.status_code == 200:
                    tmp_path = os.path.join(tmp_dir, key)
                    with open(tmp_path, 'wb') as out:
                        for chunk in response.iter_content(128):
                            out.write(chunk)
                    shutil.move(tmp_path, args[key])
        finally:
            if os.path.exists(tmp_dir):
                shutil.rmtree(tmp_dir)
        return True
153 | |
154 | |
def main(args):
    """Command-line entry point: submit a PolyPhen-2 batch job and save results.

    args: list of command-line tokens, e.g. sys.argv[1:].  Exits via
    stop_err() if submission or polling fails; returns True on success.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-u', '--ucscdb', dest='ucscdb', choices=UCSCDB,
                        required=True, type=str)
    parser.add_argument('-m', '--model', dest='modelname', choices=MODELNAME,
                        required=True, type=str)
    parser.add_argument('-fl', '--filter', '--snpfilter', dest='snpfilter',
                        choices=SNPFILTER.keys(), required=True, type=str)
    parser.add_argument('-i', '--input', dest='input', nargs='?',
                        required=True, type=str, default=sys.stdin)
    parser.add_argument('-e', '--email', dest='email', required=False,
                        default=None)
    parser.add_argument('--log', dest='log_file', required=True,
                        default=None, type=str)
    parser.add_argument('--short', dest='short_file', required=True,
                        default=None, type=str)
    parser.add_argument('--full', dest='full_file', required=True,
                        default=None, type=str)
    parser.add_argument('--snp', dest='snps_file', required=True,
                        default=None, type=str)
    parser.add_argument('--function', dest='snpfunction', required=True,
                        type=str)
    opts = vars(parser.parse_args(args))

    client = Polyphen2Web(ucscdb=opts['ucscdb'],
                          model_name=opts['modelname'],
                          snp_filter=opts['snpfilter'],
                          snp_function=opts['snpfunction'],
                          file_location=opts['input'],
                          email=opts['email'])
    sid = client.make_request()
    if not sid:
        stop_err(
            'Something went wrong! The tracking id could not be retrieved.')
    url_dict = client.poll_for_files(sid)
    if not url_dict:
        stop_err('There was error downloading the output files!')
    # Map each logical result name to the user-chosen output path.
    locations = dict((key, opts[key]) for key in url_dict)
    client.save_to_files(url_dict, locations)
    return True
205 | |
if __name__ == '__main__':
    # Forward only the program arguments (argv[0] is the script path).
    main(sys.argv[1:])