0
|
1 #!/usr/bin/python
|
|
2 from bs4 import BeautifulSoup
|
|
3 import argparse
|
|
4 import sys
|
|
5 import time
|
|
6 import os
|
|
7 import tempfile
|
|
8 import requests
|
|
9 import shutil
|
|
10 import csv
|
|
# PolyPhen-2 batch-query endpoint (GGI CGI gateway).
submission_url = 'http://genetics.bwh.harvard.edu/cgi-bin/ggi/ggi2.cgi'
# Host prefix prepended to the relative result-file paths scraped
# from the job manager page.
result_url = 'http://genetics.bwh.harvard.edu'

# Overall polling deadline in seconds (24 hours).
# NOTE(review): accepted by poll_for_files() but never consulted there
# -- confirm whether a hard deadline was intended.
TIMEOUT = 60 * 60 * 24
# Seconds to sleep between successive polls of the job manager page.
TIME_DELAY = 30
# Upper bound on polling attempts before giving up.
MAX_TRIES = 900000000

# Genome assembly version used for chromosome
# coordinates of the SNPs in user input.
UCSCDB = ['hg19', 'hg18']
# Classifier model used for predictions.
MODELNAME = ['HumDiv', 'HumVar']

# Set of transcripts on which genomic SNPs will be mapped;
# values are the numeric codes the PolyPhen-2 web form expects.
SNPFILTER = {
    'All': 0,
    'Canonical': 1,
    'CCDS': 3,
}
# Functional SNP categories to include in genomic SNPs annotation report
# ('c' = coding, 'm' = missense, '' = all -- TODO confirm against the form).
SNPFUNCTION = ['c', 'm', '']
|
|
def stop_err(msg, err=1):
    """Write *msg* (plus a newline) to stderr and exit with status *err*."""
    sys.stderr.write('{0}\n'.format(msg))
    sys.exit(err)
|
|
37
|
|
38
|
|
class Polyphen2Web:
    """Client for the PolyPhen-2 batch-query web service.

    Submits a variant list to the GGI gateway, polls the job manager
    page until the result file links appear, and downloads each result
    file to a caller-chosen location.
    """

    def __init__(self, ucscdb=None, model_name=None, snp_filter=None,
                 snp_function=None, file_location=None, email=None):
        # Genome assembly for input coordinates (see module UCSCDB).
        self.ucscdb = ucscdb
        # Classifier model (see module MODELNAME).
        self.model_name = model_name
        # Transcript-set key into the module-level SNPFILTER map.
        self.snp_filter = snp_filter
        # Functional category code ('c', 'm', '' or the sentinel 'All').
        self.snp_function = snp_function
        # Path of the tab-separated variant input file.
        self.file_location = file_location
        # Optional e-mail address for server-side completion notification.
        self.notify_me = email

    def soupify(self, string):
        """Parse an HTML string/bytes into a BeautifulSoup tree."""
        return BeautifulSoup(string)

    def make_request(self):
        """Submit the batch query form.

        Returns:
            The 'sid' session-tracking token scraped from the response
            form, or None if it could not be found.
        """
        # The service expects space-separated fields: translate tabs and
        # strip csplit-style ':::...' separator lines from the input.
        # (The previous version also wrote an unused CSV copy of the
        # input to a temp dir and leaked two file handles; that dead
        # code has been removed.)
        with open(self.file_location, 'r') as fh:
            contents = fh.read().replace(
                '\t', ' ').replace('::::::::::::::', '')
        if self.snp_function == 'All':
            # 'All' is represented by an empty SNPFUNC value on the form.
            self.snp_function = ''
        payload = {
            '_ggi_project': 'PPHWeb2',
            '_ggi_origin': 'query',
            '_ggi_batch': contents,
            '_ggi_target_pipeline': '1',
            'MODELNAME': self.model_name,
            'UCSCDB': self.ucscdb,
            'SNPFILTER': SNPFILTER[self.snp_filter],
            'SNPFUNC': self.snp_function,
            'NOTIFYME': '',
        }
        if self.notify_me:
            payload['NOTIFYME'] = self.notify_me
        request = requests.post(submission_url, data=payload)
        soup = self.soupify(request.content)
        sid_soup = soup.find('input', {'name': 'sid'})
        try:
            sid = sid_soup['value']
        except (TypeError, KeyError):
            # find() returned None, or the input tag had no 'value'.
            sid = None
        return sid

    def poll_for_files(self, sid,
                       max_tries=MAX_TRIES,
                       time_delay=TIME_DELAY,
                       timeout=TIMEOUT):
        """Poll the job manager page until result links are published.

        Args:
            sid: session id returned by make_request().
            max_tries: abort (via stop_err) after this many polls.
            time_delay: seconds to sleep between polls.
            timeout: accepted for interface compatibility; currently
                unused -- TODO confirm whether a deadline was intended.

        Returns:
            Dict mapping {'full_file', 'snps_file', 'log_file',
            'short_file'} to absolute result URLs.
        """
        payload = {
            '_ggi_project': 'PPHWeb2',
            '_ggi_origin': 'manage',
            '_ggi_target_manage': 'Refresh',
            'sid': sid,
        }
        tries = 0
        while True:
            tries += 1
            if tries > max_tries:
                stop_err('Number of tries exceeded!')
            request = requests.post(submission_url, data=payload)
            soup = self.soupify(request.content)
            all_tables = soup.findAll('table')
            if all_tables:
                try:
                    # Second-to-last table holds this session's jobs.
                    running_jobs_table = all_tables[-2]
                except IndexError:
                    running_jobs_table = None
                if running_jobs_table:
                    rows = running_jobs_table.findAll('tr')
                    # A single row with >= 3 links means the job has
                    # finished and the links point at the result files.
                    if len(rows) == 1:
                        hrefs = rows[0].findAll('a')
                        if len(hrefs) >= 3:
                            # Links share a common '<path>-<suffix>.txt'
                            # stem; rebuild every result URL from it.
                            path = hrefs[0]['href'].split('-')[0]
                            return {
                                'full_file': result_url + path + '-full.txt',
                                'snps_file': result_url + path + '-snps.txt',
                                'log_file': result_url + path + '-log.txt',
                                'short_file': result_url + path + '-short.txt',
                            }
            time.sleep(time_delay)

    def save_to_files(self, url_dict, args):
        """Download each result URL into the destination given in *args*.

        Args:
            url_dict: mapping of result key -> URL (from poll_for_files).
            args: mapping of the same keys -> local destination paths.

        Returns:
            True. URLs answering anything other than HTTP 200 are
            silently skipped (best-effort, as before).
        """
        tmp_dir = tempfile.mkdtemp()
        try:
            # items() instead of iteritems(): identical here, and keeps
            # the method working under Python 3.
            for key, value in url_dict.items():
                r = requests.get(value, stream=True)
                if r.status_code == 200:
                    path = os.path.join(tmp_dir, key)
                    with open(path, 'wb') as f:
                        for chunk in r.iter_content(128):
                            f.write(chunk)
                    shutil.move(path, args[key])
        finally:
            # Clean the scratch dir even if a download raises.
            if os.path.exists(tmp_dir):
                shutil.rmtree(tmp_dir)
        return True
|
|
153
|
|
154
|
|
def main(args):
    """Command-line entry point: submit a batch, poll, and save results.

    Args:
        args: argv-style list of option strings (without the program name).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-u', '--ucscdb', dest='ucscdb',
                        choices=UCSCDB, required=True, type=str)
    parser.add_argument('-m', '--model', dest='modelname',
                        choices=MODELNAME, required=True, type=str)
    parser.add_argument('-fl', '--filter', '--snpfilter', dest='snpfilter',
                        choices=SNPFILTER.keys(), required=True, type=str)
    parser.add_argument('-i', '--input', dest='input', nargs='?',
                        required=True, type=str, default=sys.stdin)
    parser.add_argument('-e', '--email', dest='email',
                        required=False, default=None)
    parser.add_argument('--log', dest='log_file',
                        required=True, default=None, type=str)
    parser.add_argument('--short', dest='short_file',
                        required=True, default=None, type=str)
    parser.add_argument('--full', dest='full_file',
                        required=True, default=None, type=str)
    parser.add_argument('--snp', dest='snps_file',
                        required=True, default=None, type=str)
    parser.add_argument('--function', dest='snpfunction',
                        required=True, type=str)
    opts = vars(parser.parse_args(args))
    client = Polyphen2Web(ucscdb=opts['ucscdb'],
                          model_name=opts['modelname'],
                          snp_filter=opts['snpfilter'],
                          snp_function=opts['snpfunction'],
                          file_location=opts['input'],
                          email=opts['email'])
    sid = client.make_request()
    if not sid:
        stop_err(
            'Something went wrong! The tracking id could not be retrieved.')
    url_dict = client.poll_for_files(sid)
    if not url_dict:
        stop_err('There was error downloading the output files!')
    # Route each result key to the destination path the user supplied.
    locations = dict((key, opts[key]) for key in url_dict.keys())
    client.save_to_files(url_dict, locations)
    return True
|
|
205
|
|
# Run as a script: forward the CLI arguments (sans program name) to main().
if __name__ == '__main__':
    main(sys.argv[1:])
|