comparison get_online_data/get_online_data.py @ 0:f653fd06f055 draft

Uploaded
author bgruening
date Thu, 15 Aug 2013 03:23:17 -0400
parents
children c2055dd1927b
comparison
equal deleted inserted replaced
-1:000000000000 0:f653fd06f055
1 #!/usr/bin/env python
2
3 __author__ = 'Bjoern Gruening'
4 __version__ = '0.1'
5 __date__ = '2012'
6 __license__ = 'GLP3+'
7
8 import os, sys
9 import urllib2
10 import gzip, tempfile
11 import zipfile
12 import subprocess
13 import shutil
14
# Placeholder tokens and the characters they stand for.  Kept as an ordered
# tuple (not a dict) so replacements always run in the same sequence; with a
# dict the order is arbitrary on Python 2, which makes inputs containing
# overlapping tokens (e.g. '__gt__lt__') decode unpredictably.
_ESCAPE_TOKENS = (
    ('__gt__', '>'),
    ('__lt__', '<'),
    ('__sq__', "'"),
    ('__dq__', '"'),
    ('__ob__', '['),
    ('__cb__', ']'),
    ('__oc__', '{'),
    ('__cc__', '}'),
    ('__at__', '@'),
    ('__cn__', '\n'),
    ('__cr__', '\r'),
    ('__tc__', '\t'),
)


def unescape(cond_text):
    """Return *cond_text* with every escape token replaced by its character.

    Tokens such as ``__gt__`` or ``__cn__`` are substituted by the character
    they encode (``>`` and a newline in these two cases).  Text without any
    token is returned unchanged.

    Replacements are applied one token after another in the order listed in
    ``_ESCAPE_TOKENS``, so the result is deterministic.
    """
    for token, char in _ESCAPE_TOKENS:
        cond_text = cond_text.replace(token, char)
    return cond_text
33
# Script body.
#
# Command line:
#   argv[1]  escaped, newline-separated list of URLs to download
#   argv[2]  path of the output file; all downloaded data is concatenated
#            into it
#   argv[3]  optional escaped, newline-separated list of file extensions
#            that may be taken out of zip archives
#
# gzip, zip and rar payloads are unpacked before being appended to the
# output; anything else is appended as-is.


def _save_response(response):
    """Spool the body of *response* to a temporary file and return its path.

    The caller owns the file and is responsible for removing it.
    """
    temp = tempfile.NamedTemporaryFile(delete=False)
    try:
        shutil.copyfileobj(response, temp)
    except Exception:
        # Do not leave a half-written temp file behind.
        temp.close()
        os.remove(temp.name)
        raise
    temp.close()
    return temp.name


def _append_gzip(path, destination):
    """Decompress the gzip file at *path* and append it to *destination*."""
    # Deliberately not named 'zipfile': that would shadow the zipfile module
    # and break every later zip download in the same run.
    compressed = gzip.open(path, 'rb')
    try:
        shutil.copyfileobj(compressed, destination)
    finally:
        compressed.close()


def _append_zip(path, destination):
    """Append the allowed members of the zip archive at *path* to *destination*.

    A member is used when its lower-cased extension is listed in the
    module-level ``allowed_extensions``, or when that list is empty.
    """
    tmpdir = tempfile.mkdtemp()
    try:
        archive = zipfile.ZipFile(path, allowZip64=True)
        try:
            # One call unpacks every member; no need to loop over namelist().
            archive.extractall(tmpdir)
        finally:
            archive.close()

        for root, dirs, files in os.walk(tmpdir):
            for filename in files:
                extension = os.path.splitext(filename)[-1].lower()
                if not allowed_extensions or extension in allowed_extensions:
                    with open(os.path.join(root, filename), 'rb') as member:
                        shutil.copyfileobj(member, destination)
    finally:
        shutil.rmtree(tmpdir)


def _append_rar(path, destination):
    """Print the contents of the rar archive at *path* into *destination*.

    Relies on an external ``unrar`` executable being available.
    """
    # unrar writes straight to the underlying file descriptor, so anything
    # still sitting in Python's buffer must be written out first to keep the
    # output in order.
    destination.flush()
    # call() waits for unrar to finish; the archive must not be deleted (and
    # the output must not be closed) while unrar is still running.  An
    # argument list avoids passing the path through a shell.
    status = subprocess.call(['unrar', 'p', '-inul', path], stdout=destination)
    if status != 0:
        sys.stderr.write('unrar exited with status %s for %s\n' % (status, path))


urls = unescape(sys.argv[1])
out = open(sys.argv[2], 'wb')

if len(sys.argv) > 3:
    # Blank entries are dropped so that an empty argument produces an empty
    # list, which means "accept every extension" when filtering zip members.
    allowed_extensions = [
        ext.strip()
        for ext in unescape(sys.argv[3]).split('\n')
        if ext.strip()
    ]
else:
    allowed_extensions = ['.sdf', '.smi', '.inchi']

try:
    for url in urls.split('\n'):
        url = url.strip()
        if not url:
            # Ignore blank lines (e.g. a trailing newline in the URL list).
            continue

        request = urllib2.Request(url)
        # NOTE(review): both calls set the same header name, so presumably
        # only the second value ('gz') is actually sent - confirm which one
        # is intended before changing it, since advertising 'gzip' may alter
        # what servers return for archive URLs.
        request.add_header('Accept-encoding', 'gzip')
        request.add_header('Accept-encoding', 'gz')
        response = urllib2.urlopen(request)

        try:
            encoding = response.info().get('Content-Encoding')
            extension = os.path.splitext(url)[-1]

            # Choose how to unpack the payload; None means "copy verbatim".
            if encoding in ['gz', 'gzip'] or extension in ['.gz', '.gzip']:
                unpack = _append_gzip
            elif encoding in ['zip'] or extension in ['.zip']:
                unpack = _append_zip
            elif encoding == 'rar' or extension in ['.rar']:
                unpack = _append_rar
            else:
                unpack = None

            if unpack is None:
                shutil.copyfileobj(response, out)
            else:
                archive_path = _save_response(response)
                try:
                    unpack(archive_path, out)
                finally:
                    os.remove(archive_path)
        finally:
            response.close()
finally:
    out.close()