Mercurial > repos > bgruening > chemical_data_sources
comparison get_online_data/get_online_data.py @ 0:f653fd06f055 draft
Uploaded
author | bgruening |
---|---|
date | Thu, 15 Aug 2013 03:23:17 -0400 |
parents | |
children | c2055dd1927b |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:f653fd06f055 |
---|---|
1 #!/usr/bin/env python | |
2 | |
# Module metadata.
__author__ = 'Bjoern Gruening'
__version__ = '0.1'
__date__ = '2012'
# Was spelled 'GLP3+', a transposition of the GPL abbreviation.
__license__ = 'GPL3+'
7 | |
8 import os, sys | |
9 import urllib2 | |
10 import gzip, tempfile | |
11 import zipfile | |
12 import subprocess | |
13 import shutil | |
14 | |
def unescape(cond_text):
    """Replace ``__xx__`` escape tokens in ``cond_text`` with their characters.

    Each token listed below is substituted by the single character it stands
    for (for example ``__gt__`` becomes ``>`` and ``__cn__`` becomes a
    newline). Text that contains no token is returned unchanged.

    :param cond_text: string that may contain escape tokens
    :returns: the string with every known token replaced
    """
    # An ordered tuple of pairs instead of a dict: tokens can overlap
    # (``__gt__lt__`` contains both ``__gt__`` and ``__lt__``), so the
    # substitution order must be fixed to give the same result on every run.
    # Python 2 dicts do not guarantee any iteration order.
    escaped_tokens = (
        ('__gt__', '>'),
        ('__lt__', '<'),
        ('__sq__', "'"),
        ('__dq__', '"'),
        ('__ob__', '['),
        ('__cb__', ']'),
        ('__oc__', '{'),
        ('__cc__', '}'),
        ('__at__', '@'),
        ('__cn__', '\n'),
        ('__cr__', '\r'),
        ('__tc__', '\t'),
    )
    for token, char in escaped_tokens:
        cond_text = cond_text.replace(token, char)
    return cond_text
33 | |
def _save_response(response):
    """Spool the body of ``response`` into a temporary file; return its path.

    The caller owns the returned file and must delete it.
    """
    temp = tempfile.NamedTemporaryFile(delete=False)
    try:
        with temp:
            shutil.copyfileobj(response, temp)
    except Exception:
        # Do not leave a half-written temp file behind on a failed download.
        os.remove(temp.name)
        raise
    return temp.name


def _append_gzip(response, out):
    """Download a gzip stream and append its decompressed content to ``out``."""
    path = _save_response(response)
    try:
        # Named ``gz_handle`` on purpose: the original bound this object to
        # the name ``zipfile``, which shadowed the zipfile module and broke
        # every zip URL processed after a gzip URL.
        gz_handle = gzip.open(path, 'rb')
        try:
            shutil.copyfileobj(gz_handle, out)
        finally:
            gz_handle.close()
    finally:
        os.remove(path)


def _append_zip(response, out, allowed_extensions):
    """Download a zip archive and append the matching members to ``out``.

    A member is appended when its lower-cased file extension is listed in
    ``allowed_extensions``; an empty list accepts every member. Members are
    appended in ``os.walk`` order of the extracted tree.
    """
    path = _save_response(response)
    tmpdir = tempfile.mkdtemp()
    try:
        archive = zipfile.ZipFile(path, allowZip64=True)
        try:
            # A single call extracts every member; the original repeated the
            # full extraction once per member.
            archive.extractall(tmpdir)
        finally:
            archive.close()
        for root, dirs, files in os.walk(tmpdir):
            for filename in files:
                extension = os.path.splitext(filename)[-1].lower()
                if not allowed_extensions or extension in allowed_extensions:
                    with open(os.path.join(root, filename), 'rb') as member:
                        shutil.copyfileobj(member, out)
    finally:
        shutil.rmtree(tmpdir)
        os.remove(path)


def _append_rar(response, out):
    """Download a rar archive and append ``unrar p`` output to ``out``.

    Requires an ``unrar`` executable on the PATH.
    """
    path = _save_response(response)
    try:
        # The child writes straight to the file descriptor behind ``out``, so
        # anything still sitting in Python's buffer must reach the file first.
        out.flush()
        # ``call`` waits for unrar to finish before the archive is deleted;
        # the original removed the file while unrar could still be reading it.
        # An argument list avoids passing the path through a shell. The exit
        # status is ignored, as it was in the original.
        subprocess.call(['unrar', 'p', '-inul', path], stdout=out)
    finally:
        os.remove(path)


def _fetch(url, out, allowed_extensions):
    """Retrieve ``url`` and append its (unpacked) content to ``out``.

    The unpacking method is chosen from the ``Content-Encoding`` response
    header or, failing that, from the file extension of the URL. Anything
    that is not recognised as gzip, zip or rar is copied verbatim.
    """
    request = urllib2.Request(url)
    # ``add_header`` overwrites an earlier value for the same header, so the
    # original pair of calls only ever sent the non-standard value 'gz'.
    request.add_header('Accept-encoding', 'gzip')
    response = urllib2.urlopen(request)
    try:
        encoding = response.info().get('Content-Encoding')
        extension = os.path.splitext(url)[-1]
        if encoding in ('gz', 'gzip') or extension in ('.gz', '.gzip'):
            _append_gzip(response, out)
        elif encoding == 'zip' or extension == '.zip':
            _append_zip(response, out, allowed_extensions)
        elif encoding == 'rar' or extension == '.rar':
            _append_rar(response, out)
        else:
            shutil.copyfileobj(response, out)
    finally:
        response.close()


def main(argv):
    """Download every URL given on the command line into one output file.

    ``argv[1]`` holds the escaped, newline-separated URLs, ``argv[2]`` the
    output path and the optional ``argv[3]`` an escaped, newline-separated
    list of file extensions to keep when unpacking zip archives (default:
    ``.sdf``, ``.smi`` and ``.inchi``).
    """
    urls = unescape(argv[1])
    if len(argv) > 3:
        allowed_extensions = [ext.strip() for ext in unescape(argv[3]).split('\n')]
    else:
        allowed_extensions = ['.sdf', '.smi', '.inchi']

    out = open(argv[2], 'wb')
    try:
        for url in urls.split('\n'):
            url = url.strip()
            if not url:
                # Blank lines (e.g. a trailing newline) are not URLs.
                continue
            _fetch(url, out, allowed_extensions)
    finally:
        out.close()


if __name__ == '__main__':
    main(sys.argv)