Mercurial > repos > card > rgi
comparison data_managers/data_manager_rgi_build_db/data_manager/import_data.py @ 0:715bc9aeef69 draft
planemo upload for repository https://github.com/arpcard/rgi commit 7a78289be23c5a14ae39f454610fa8eca3f05188
author | card |
---|---|
date | Wed, 27 Feb 2019 09:08:21 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:715bc9aeef69 |
---|---|
1 import argparse | |
2 import datetime | |
3 import json | |
4 import os | |
5 import shutil | |
6 import sys | |
7 import tarfile | |
8 import urllib.request, urllib.error, urllib.parse | |
9 import zipfile | |
10 import logging | |
11 | |
# Directory (inside the current working directory) that receives card.json,
# the generated FASTA file and the BLAST/DIAMOND databases.
path = os.path.join(os.getcwd(), 'rgi-database')
# The CARD data is read from the same directory it is written to.
data_path = path

# Module logger; only warnings and errors are emitted.
logger = logging.getLogger(__name__)
level = logging.WARNING
logger.setLevel(level)
18 | |
def _save_card_json(stream, target):
    '''Copy the archive member *stream* to *target*, closing the stream.'''
    print('[import_data] extracting file: {}'.format('card.json'))
    with stream, open(target, 'wb') as out:
        shutil.copyfileobj(stream, out)


def url_download(url, workdir):
    '''Download the archive at *url* and unpack its card.json into *workdir*.

    The payload is saved as ``download.dat`` inside *workdir* (created when
    missing). If it is a tar or zip archive, every regular member whose base
    name is ``card.json`` is written to ``<workdir>/card.json`` (directory
    components inside the archive are dropped) and ``download.dat`` is
    removed afterwards. A payload that is not an archive is left in place.

    Download errors are reported on stderr and are not raised; when nothing
    could be downloaded the function returns without touching the archive
    logic. Returns None in every case.
    '''
    file_path = os.path.join(workdir, 'download.dat')
    os.makedirs(workdir, exist_ok=True)

    try:
        with urllib.request.urlopen(urllib.request.Request(url)) as src, \
                open(file_path, 'wb') as dst:
            shutil.copyfileobj(src, dst)
    except Exception as e:
        # Best effort: report the problem and carry on with whatever was
        # written to disk, as callers do not expect an exception here.
        print(str(e), file=sys.stderr)

    if not os.path.isfile(file_path):
        # Nothing was downloaded; probing a missing file would raise.
        return

    target = os.path.join(workdir, 'card.json')
    # extract only one file : card.json
    if tarfile.is_tarfile(file_path):
        with tarfile.open(file_path, 'r:*') as archive:
            for member in archive.getmembers():
                # skip anything that is not a regular file
                if member.isreg() and os.path.basename(member.name) == 'card.json':
                    _save_card_json(archive.extractfile(member), target)
    elif zipfile.is_zipfile(file_path):
        # ZipFile has no getmembers(); its members are listed by infolist().
        with zipfile.ZipFile(file_path, 'r') as archive:
            for info in archive.infolist():
                if not info.is_dir() and os.path.basename(info.filename) == 'card.json':
                    _save_card_json(archive.open(info), target)
    else:
        return
    os.remove(file_path)
56 | |
def checkKeyExisted(key, my_dict):
    '''Return True when *key* is present in *my_dict* with a non-None value.

    A missing key (KeyError) and a key mapped to None both give False.
    '''
    try:
        value = my_dict[key]
    except KeyError:
        return False
    return value is not None
63 | |
def data_version():
    '''Return the value stored under ``_version`` in card.json.

    The file is read from the module-level ``data_path`` directory. An empty
    string is returned when the document has no ``_version`` key.
    '''
    # JSON documents are UTF-8; do not depend on the locale's default encoding.
    with open(os.path.join(data_path, 'card.json'), encoding='utf-8') as json_file:
        json_data = json.load(json_file)
    # Direct lookup replaces the scan over every key; the file is already
    # closed by the with-statement.
    return json_data.get('_version', '')
73 | |
def makeBlastDB():
    '''Build a protein BLAST database from ``proteindb.fsa`` with makeblastdb.

    Does nothing when ``proteindb.fsa`` is absent from the module-level
    ``path`` directory. The database is written with the prefix
    ``<path>/protein.db`` and the tool's stdout/stderr are discarded.
    A missing executable or a non-zero exit status is logged as a warning
    and never raised. Returns None.
    '''
    # Local import keeps this function self-contained.
    import subprocess

    fasta = os.path.join(path, 'proteindb.fsa')
    if not os.path.isfile(fasta):
        return
    print('[import_data] create blast DB.')
    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact.
    command = [
        'makeblastdb',
        '-in', fasta,
        '-dbtype', 'prot',
        '-out', os.path.join(path, 'protein.db'),
    ]
    try:
        status = subprocess.call(
            command, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    except OSError as e:
        logger.warning('makeblastdb could not be started: %s', e)
        return
    if status != 0:
        logger.warning('makeblastdb exited with status %s', status)
78 | |
def makeDiamondDB():
    '''Build a DIAMOND database from ``proteindb.fsa`` with ``diamond makedb``.

    Does nothing when ``proteindb.fsa`` is absent from the module-level
    ``path`` directory. The database is written with the prefix
    ``<path>/protein.db``. A missing executable or a non-zero exit status is
    logged as a warning and never raised. Returns None.
    '''
    # Local import keeps this function self-contained.
    import subprocess

    fasta = os.path.join(path, 'proteindb.fsa')
    if not os.path.isfile(fasta):
        return
    print('[import_data] create diamond DB.')
    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact.
    command = [
        'diamond', 'makedb', '--quiet',
        '--in', fasta,
        '--db', os.path.join(path, 'protein.db'),
    ]
    try:
        status = subprocess.call(command)
    except OSError as e:
        logger.warning('diamond could not be started: %s', e)
        return
    if status != 0:
        logger.warning('diamond makedb exited with status %s', status)
83 | |
def _card_model_fasta_lines(model_key, model):
    '''Return the FASTA lines for one CARD model, or [] when it is omitted.

    Only model types 40292 (protein homolog), 40293 (protein variant) and
    41091 (protein overexpression) produce output. A model lacking its
    bit score, its SNP list (variant/overexpression only) or any part of its
    sequences is logged and omitted as a whole, so no partial record and no
    SNP list from another model can end up in the output.
    '''
    model_type = model.get('model_type_id')
    if model_type not in ('40292', '40293', '41091'):
        return []
    model_id = model.get('model_id')
    model_name = model.get('model_name')

    try:
        pass_bit_score = model['model_param']['blastp_bit_score']['param_value']
    except (KeyError, TypeError):
        logger.warning('No bitscore for model (%s, %s). RGI will omit this model and keep running.',
                       model_id, model_name)
        logger.info('Please let the CARD Admins know! Email: card@mcmaster.ca')
        return []

    snps = ''
    if model_type != '40292':
        try:
            snp_values = model['model_param']['snp']['param_value']
            snps = ','.join([snp_values[k] for k in snp_values])
        except (KeyError, TypeError, IndexError):
            logger.warning('No snp for model (%s, %s). RGI will omit this model and keep running.',
                           model_id, model_name)
            logger.info('Please let the CARD Admins know! Email: card@mcmaster.ca')
            return []

    lines = []
    try:
        aro_name = model['ARO_name']
        sequences = model['model_sequences']['sequence']
        for seq in sequences:
            # Resolve the sequence before building the header so that a
            # header is never emitted without its sequence line.
            protein = sequences[seq]['protein_sequence']['sequence']
            if model_type == '40292':
                header = '>%s_%s | model_type_id: 40292 | pass_bitscore: %s | %s\n' \
                    % (model_key, seq, pass_bit_score, aro_name)
            else:
                header = '>%s_%s | model_type_id: %s | pass_bit_score: %s | SNP: %s | %s\n' \
                    % (model_key, seq, model_type, pass_bit_score, snps, aro_name)
            lines.append(header)
            lines.append('%s\n' % protein)
    except (KeyError, TypeError, IndexError):
        logger.warning('No model sequences for model (%s, %s). RGI will omit this model and keep running.',
                       model_id, model_name)
        logger.info('Please let the CARD Admins know! Email: card@mcmaster.ca')
        return []
    return lines


def write_fasta_from_json():
    '''Creates a fasta file from card.json file.

    Reads ``card.json`` from the module-level ``data_path`` directory and
    writes ``proteindb.fsa`` into the module-level ``path`` directory. Every
    top-level entry whose key is all digits is treated as a model; see
    ``_card_model_fasta_lines`` for which models are written.

    Returns immediately when ``proteindb.fsa`` already exists. When
    card.json cannot be read or parsed, the error is logged and the process
    exits with status 1 (SystemExit).
    '''
    fasta_path = os.path.join(path, 'proteindb.fsa')
    if os.path.isfile(fasta_path):
        return

    try:
        with open(os.path.join(data_path, 'card.json'), 'r', encoding='utf-8') as jfile:
            j = json.load(jfile)
    except Exception as e:
        logger.error(e)
        # A non-zero status tells the caller that the import failed.
        sys.exit(1)

    with open(fasta_path, 'w', encoding='utf-8') as fout:
        for model_key in j:
            if not model_key.isdigit():
                continue
            fout.writelines(_card_model_fasta_lines(model_key, j[model_key]))
169 | |
def _main(args):
    '''Download the CARD data and build the RGI databases.

    *args* is the parsed command line; only ``args.url`` is read. When it is
    unset or empty, the default CARD download URL is used. Everything is
    written to the module-level ``path`` directory, which is created when
    missing. Returns the data version read from card.json.
    '''
    if not os.path.exists(path):
        print('[import_data] mkdir: {}'.format(path))
        os.makedirs(path)
    print('[import_data] path: {}'.format(path))
    print(args)

    # Fall back to the public CARD endpoint when no URL was supplied.
    url = args.url or 'https://card.mcmaster.ca/latest/data'
    print('[import_data] url: {}'.format(url))
    # Download into the directory the other steps read from, so the two can
    # never diverge if the current directory changed after import.
    workdir = path
    print('[import_data] working directory: {}'.format(workdir))
    url_download(url, workdir)
    write_fasta_from_json()
    makeBlastDB()
    makeDiamondDB()
    version = data_version()
    print('[import_data] data version: {}'.format(version))
    return version
191 | |
def run():
    '''Parse the command line (optional ``--url``) and run the import.'''
    cli = argparse.ArgumentParser(description='Create data manager json.')
    cli.add_argument('--url', dest='url', action='store', help='Url for CARD data')
    _main(cli.parse_args())
197 | |
# Run the import only when this file is executed as a script.
if __name__ == '__main__':
    run()