annotate get_online_data.py @ 0:2538366eb8fb draft default tip

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/data_source/get_online_data commit aed18d7d09e332efe57d00b33c2b8249abefaedb
author bgruening
date Wed, 22 May 2019 07:43:41 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
2538366eb8fb planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/data_source/get_online_data commit aed18d7d09e332efe57d00b33c2b8249abefaedb
bgruening
parents:
diff changeset
1 import os
2538366eb8fb planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/data_source/get_online_data commit aed18d7d09e332efe57d00b33c2b8249abefaedb
bgruening
parents:
diff changeset
2 import urllib.request
2538366eb8fb planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/data_source/get_online_data commit aed18d7d09e332efe57d00b33c2b8249abefaedb
bgruening
parents:
diff changeset
3 import gzip, tempfile
2538366eb8fb planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/data_source/get_online_data commit aed18d7d09e332efe57d00b33c2b8249abefaedb
bgruening
parents:
diff changeset
4 import zipfile
2538366eb8fb planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/data_source/get_online_data commit aed18d7d09e332efe57d00b33c2b8249abefaedb
bgruening
parents:
diff changeset
5 import subprocess
2538366eb8fb planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/data_source/get_online_data commit aed18d7d09e332efe57d00b33c2b8249abefaedb
bgruening
parents:
diff changeset
6 import shutil
2538366eb8fb planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/data_source/get_online_data commit aed18d7d09e332efe57d00b33c2b8249abefaedb
bgruening
parents:
diff changeset
7 import argparse
2538366eb8fb planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/data_source/get_online_data commit aed18d7d09e332efe57d00b33c2b8249abefaedb
bgruening
parents:
diff changeset
8 from io import BytesIO
2538366eb8fb planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/data_source/get_online_data commit aed18d7d09e332efe57d00b33c2b8249abefaedb
bgruening
parents:
diff changeset
9
2538366eb8fb planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/data_source/get_online_data commit aed18d7d09e332efe57d00b33c2b8249abefaedb
bgruening
parents:
diff changeset
def unescape(cond_text):
    """Replace ``__xx__`` placeholder tokens with the characters they encode.

    The input is expected to be text in which special characters were
    substituted by tokens such as ``__gt__`` (``>``) or ``__cn__`` (newline)
    -- presumably by the calling framework's parameter sanitizer; confirm
    against the caller.

    Args:
        cond_text: The possibly escaped string. ``None`` and the empty
            string are returned unchanged.

    Returns:
        The text with every known token replaced by its character.
    """
    if not cond_text:
        # Nothing to unescape; avoids AttributeError on a missing argument.
        return cond_text

    # (token, character) pairs. Replacements run sequentially in this order,
    # which matters for overlapping input such as "__lt__gt__".
    token_to_char = (
        ('__gt__', '>'),
        ('__lt__', '<'),
        ('__sq__', "'"),
        ('__dq__', '"'),
        ('__ob__', '['),
        ('__cb__', ']'),
        ('__oc__', '{'),
        ('__cc__', '}'),
        ('__at__', '@'),
        ('__cn__', '\n'),
        ('__cr__', '\r'),
        ('__tc__', '\t'),
    )
    for token, char in token_to_char:
        cond_text = cond_text.replace(token, char)
    return cond_text
2538366eb8fb planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/data_source/get_online_data commit aed18d7d09e332efe57d00b33c2b8249abefaedb
bgruening
parents:
diff changeset
28
2538366eb8fb planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/data_source/get_online_data commit aed18d7d09e332efe57d00b33c2b8249abefaedb
bgruening
parents:
diff changeset
def _append_zip_members(payload, allowed_extensions, out):
    """Extract a zip archive held in memory and append matching members to ``out``.

    Args:
        payload: Raw bytes of the zip archive.
        allowed_extensions: Lower-case extensions (with leading dot) to keep;
            an empty list keeps every file.
        out: Binary file object the selected members are appended to.
    """
    tmpdir = tempfile.mkdtemp()
    try:
        with zipfile.ZipFile(BytesIO(payload), allowZip64=True) as zf:
            # One call extracts every member; no need to loop over namelist().
            zf.extractall(tmpdir)

        for root, dirs, files in os.walk(tmpdir):
            # Sort in place so the traversal (and therefore the order of the
            # concatenated output) does not depend on filesystem listing order.
            dirs.sort()
            for filename in sorted(files):
                extension = os.path.splitext(filename)[-1].lower()
                if not allowed_extensions or extension in allowed_extensions:
                    with open(os.path.join(root, filename), 'rb') as member:
                        shutil.copyfileobj(member, out)
    finally:
        # Always remove the extraction directory, even if a step above failed.
        shutil.rmtree(tmpdir)


def get_files(options):
    """Download the given URLs and concatenate their contents into one file.

    ``options.url`` holds one URL per line (after unescaping). Each payload is
    inspected by its leading magic bytes:

    * gzip (``1f 8b``): decompressed and appended to the output;
    * zip (``PK``): extracted, and only members whose extension is in the
      whitelist are appended;
    * anything else: appended unchanged.

    Args:
        options: Object with the attributes ``url`` (escaped, newline
            separated URLs), ``whitelist`` (escaped, newline separated
            extensions such as ``.sdf``; falsy selects the default list) and
            ``out`` (path of the output file, which is overwritten).
    """
    urls = unescape(options.url)

    if options.whitelist:
        # Drop blank lines (a stray '' would match extension-less files) and
        # lower-case entries because member extensions are lower-cased too.
        allowed_extensions = [
            ext.strip().lower()
            for ext in unescape(options.whitelist).split('\n')
            if ext.strip()
        ]
    else:
        allowed_extensions = ['.sdf', '.smi', '.inchi', '.mol']

    with open(options.out, 'wb+') as out:
        for url in urls.split('\n'):
            url = url.strip()
            if not url:
                # Skip empty lines instead of failing on an invalid URL.
                continue

            request = urllib.request.Request(url)
            with urllib.request.urlopen(request) as response:
                resp_read = response.read()

            if resp_read[:2] == b'\x1f\x8b':  # magic number of gzip files
                out.write(gzip.decompress(resp_read))
            elif resp_read[:2] == b'PK':  # magic number of zip files
                _append_zip_members(resp_read, allowed_extensions, out)
            else:
                out.write(resp_read)
2538366eb8fb planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/data_source/get_online_data commit aed18d7d09e332efe57d00b33c2b8249abefaedb
bgruening
parents:
diff changeset
64
2538366eb8fb planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/data_source/get_online_data commit aed18d7d09e332efe57d00b33c2b8249abefaedb
bgruening
parents:
diff changeset
if __name__ == "__main__":
    # Command-line entry point: parse the arguments and hand them to get_files().
    parser = argparse.ArgumentParser(
        description="Download compressed files and extract files with chosen extensions"
    )
    # --url and --out are mandatory: without them get_files() cannot run, so
    # let argparse report a usage error rather than failing with a traceback.
    parser.add_argument('--url', dest='url', required=True, help='URL')
    # Optional newline separated list of extensions; when omitted get_files()
    # falls back to its built-in default list.
    parser.add_argument('--whitelist', dest='whitelist', default=None, help='whitelist')
    parser.add_argument('--out', dest='out', required=True, help='output')

    options = parser.parse_args()
    get_files(options)