Mercurial > repos > bgruening > get_online_data
annotate get_online_data.py @ 0:2538366eb8fb draft default tip
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/data_source/get_online_data commit aed18d7d09e332efe57d00b33c2b8249abefaedb
author | bgruening |
---|---|
date | Wed, 22 May 2019 07:43:41 -0400 |
parents | |
children |
rev | line source |
---|---|
0
2538366eb8fb
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/data_source/get_online_data commit aed18d7d09e332efe57d00b33c2b8249abefaedb
bgruening
parents:
diff
changeset
|
1 import os |
2538366eb8fb
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/data_source/get_online_data commit aed18d7d09e332efe57d00b33c2b8249abefaedb
bgruening
parents:
diff
changeset
|
2 import urllib.request |
2538366eb8fb
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/data_source/get_online_data commit aed18d7d09e332efe57d00b33c2b8249abefaedb
bgruening
parents:
diff
changeset
|
3 import gzip, tempfile |
2538366eb8fb
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/data_source/get_online_data commit aed18d7d09e332efe57d00b33c2b8249abefaedb
bgruening
parents:
diff
changeset
|
4 import zipfile |
2538366eb8fb
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/data_source/get_online_data commit aed18d7d09e332efe57d00b33c2b8249abefaedb
bgruening
parents:
diff
changeset
|
5 import subprocess |
2538366eb8fb
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/data_source/get_online_data commit aed18d7d09e332efe57d00b33c2b8249abefaedb
bgruening
parents:
diff
changeset
|
6 import shutil |
2538366eb8fb
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/data_source/get_online_data commit aed18d7d09e332efe57d00b33c2b8249abefaedb
bgruening
parents:
diff
changeset
|
7 import argparse |
2538366eb8fb
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/data_source/get_online_data commit aed18d7d09e332efe57d00b33c2b8249abefaedb
bgruening
parents:
diff
changeset
|
8 from io import BytesIO |
2538366eb8fb
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/data_source/get_online_data commit aed18d7d09e332efe57d00b33c2b8249abefaedb
bgruening
parents:
diff
changeset
|
9 |
2538366eb8fb
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/data_source/get_online_data commit aed18d7d09e332efe57d00b33c2b8249abefaedb
bgruening
parents:
diff
changeset
|
def unescape(cond_text):
    """Turn ``__xx__`` placeholder tokens back into the characters they stand for.

    The placeholders are substituted one after another in the order listed
    below, so the result for overlapping tokens depends on that order.

    :param cond_text: text that may contain escaped placeholder tokens
    :return: the text with every known placeholder replaced by its character
    """
    # (placeholder, literal character) pairs, applied top to bottom.
    replacements = (
        ('__gt__', '>'),
        ('__lt__', '<'),
        ('__sq__', "'"),
        ('__dq__', '"'),
        ('__ob__', '['),
        ('__cb__', ']'),
        ('__oc__', '{'),
        ('__cc__', '}'),
        ('__at__', '@'),
        ('__cn__', '\n'),
        ('__cr__', '\r'),
        ('__tc__', '\t'),
    )
    for placeholder, literal in replacements:
        cond_text = cond_text.replace(placeholder, literal)
    return cond_text
2538366eb8fb
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/data_source/get_online_data commit aed18d7d09e332efe57d00b33c2b8249abefaedb
bgruening
parents:
diff
changeset
|
28 |
2538366eb8fb
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/data_source/get_online_data commit aed18d7d09e332efe57d00b33c2b8249abefaedb
bgruening
parents:
diff
changeset
|
def _copy_zip_members(archive_bytes, allowed_extensions, out):
    """Append the whitelisted members of an in-memory zip archive to ``out``.

    The archive is unpacked exactly once into a temporary directory, which is
    removed again even when copying fails. Files are appended in the order
    ``os.walk`` yields them.

    :param archive_bytes: raw bytes of the zip archive
    :param allowed_extensions: list of lower-case extensions (with leading
        dot) to keep; an empty list keeps every file
    :param out: binary file object the member contents are appended to
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        with zipfile.ZipFile(BytesIO(archive_bytes), allowZip64=True) as zf:
            zf.extractall(tmpdir)
        for root, _dirs, files in os.walk(tmpdir):
            for filename in files:
                extension = os.path.splitext(filename)[-1].lower()
                if not allowed_extensions or extension in allowed_extensions:
                    # Close each member right after copying instead of
                    # leaving the handle to the garbage collector.
                    with open(os.path.join(root, filename), 'rb') as member:
                        shutil.copyfileobj(member, out)


def get_files(options):
    """Download all URLs in ``options.url`` and concatenate them into ``options.out``.

    ``options.url`` holds one URL per line (possibly escaped, see
    ``unescape``). Each payload is inspected by its magic number:

    * gzip data is decompressed and appended,
    * zip archives are unpacked and only members whose extension is in the
      whitelist are appended,
    * anything else is appended unchanged.

    :param options: object with the attributes ``url``, ``whitelist`` and
        ``out``. ``whitelist`` is a newline-separated list of extensions;
        when it is empty or ``None`` the default
        ``.sdf``/``.smi``/``.inchi``/``.mol`` set is used.
    :raises urllib.error.URLError: if a download fails
    """
    urls = unescape(options.url)
    with open(options.out, 'wb+') as out:
        if options.whitelist:
            allowed_extensions = [ext.strip() for ext in unescape(options.whitelist).split('\n')]
        else:
            allowed_extensions = ['.sdf', '.smi', '.inchi', '.mol']

        for url in urls.split('\n'):
            # Tolerate trailing newlines, '\r' from CRLF input and blank
            # lines, which would otherwise make urlopen raise ValueError.
            url = url.strip()
            if not url:
                continue

            # Download once and release the connection as soon as it is read.
            with urllib.request.urlopen(urllib.request.Request(url)) as response:
                payload = response.read()

            if payload[:2] == b'\x1f\x8b':  # magic number of gzipped files
                out.write(gzip.decompress(payload))
            elif payload[:2] == b'PK':  # magic number of zipped files
                _copy_zip_members(payload, allowed_extensions, out)
            else:
                out.write(payload)
2538366eb8fb
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/data_source/get_online_data commit aed18d7d09e332efe57d00b33c2b8249abefaedb
bgruening
parents:
diff
changeset
|
64 |
2538366eb8fb
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/data_source/get_online_data commit aed18d7d09e332efe57d00b33c2b8249abefaedb
bgruening
parents:
diff
changeset
|
if __name__ == "__main__":
    # Command-line entry point: parse the arguments and hand them to
    # get_files(), which reads options.url, options.whitelist and options.out.
    parser = argparse.ArgumentParser(
        description="Download compressed files and extract files with the chosen extensions"
    )
    # --url and --out are mandatory for get_files(); declaring them required
    # yields a usage error instead of a traceback when one is missing.
    parser.add_argument('--url', dest='url', required=True, help='URL')
    parser.add_argument('--whitelist', dest='whitelist', default=None, help='whitelist')
    parser.add_argument('--out', dest='out', required=True, help='output')

    options = parser.parse_args()
    get_files(options)