annotate get_online_data/get_online_data.py @ 0:f653fd06f055 draft

Uploaded
author bgruening
date Thu, 15 Aug 2013 03:23:17 -0400
parents
children c2055dd1927b
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
f653fd06f055 Uploaded
bgruening
parents:
diff changeset
1 #!/usr/bin/env python
f653fd06f055 Uploaded
bgruening
parents:
diff changeset
2
f653fd06f055 Uploaded
bgruening
parents:
diff changeset
3 __author__ = 'Bjoern Gruening'
f653fd06f055 Uploaded
bgruening
parents:
diff changeset
4 __version__ = '0.1'
f653fd06f055 Uploaded
bgruening
parents:
diff changeset
5 __date__ = '2012'
f653fd06f055 Uploaded
bgruening
parents:
diff changeset
6 __license__ = 'GLP3+'
f653fd06f055 Uploaded
bgruening
parents:
diff changeset
7
f653fd06f055 Uploaded
bgruening
parents:
diff changeset
8 import os, sys
f653fd06f055 Uploaded
bgruening
parents:
diff changeset
9 import urllib2
f653fd06f055 Uploaded
bgruening
parents:
diff changeset
10 import gzip, tempfile
f653fd06f055 Uploaded
bgruening
parents:
diff changeset
11 import zipfile
f653fd06f055 Uploaded
bgruening
parents:
diff changeset
12 import subprocess
f653fd06f055 Uploaded
bgruening
parents:
diff changeset
13 import shutil
f653fd06f055 Uploaded
bgruening
parents:
diff changeset
14
f653fd06f055 Uploaded
bgruening
parents:
diff changeset
def unescape(cond_text):
    """Return cond_text with escape tokens turned back into characters.

    Every occurrence of a token such as '__gt__' or '__cn__' is replaced
    by the literal character it stands for ('>' or a newline).
    Text that contains no tokens is returned unchanged.
    """
    # literal character -> token that represents it in escaped input
    token_for_char = {
        '>': '__gt__',
        '<': '__lt__',
        "'": '__sq__',
        '"': '__dq__',
        '[': '__ob__',
        ']': '__cb__',
        '{': '__oc__',
        '}': '__cc__',
        '@': '__at__',
        '\n': '__cn__',
        '\r': '__cr__',
        '\t': '__tc__',
    }
    restored = cond_text
    for char in token_for_char:
        restored = restored.replace(token_for_char[char], char)
    return restored
f653fd06f055 Uploaded
bgruening
parents:
diff changeset
33
f653fd06f055 Uploaded
bgruening
parents:
diff changeset
def _spool_to_tempfile(stream):
    """Copy the file-like object `stream` into a new temporary file.

    Returns the path of the temporary file; the caller must delete it.
    """
    handle = tempfile.NamedTemporaryFile(delete=False)
    try:
        try:
            shutil.copyfileobj(stream, handle)
        finally:
            handle.close()
    except Exception:
        # Do not leave a half-written temp file behind on a failed download.
        os.remove(handle.name)
        raise
    return handle.name


def _append_gzip(archive_path, sink):
    """Decompress the gzip file at `archive_path` and append it to `sink`."""
    # Named `gz_file` on purpose: the original bound this handle to the name
    # `zipfile`, which shadowed the zipfile module for all later URLs.
    gz_file = gzip.open(archive_path, 'rb')
    try:
        shutil.copyfileobj(gz_file, sink)
    finally:
        gz_file.close()


def _append_zip(archive_path, sink, extensions):
    """Append the members of the zip file at `archive_path` to `sink`.

    Only members whose lower-cased extension is listed in `extensions` are
    written; an empty `extensions` list lets every member through.
    """
    tmpdir = tempfile.mkdtemp()
    try:
        zf = zipfile.ZipFile(archive_path, allowZip64=True)
        try:
            # One call extracts every member; no per-member loop is needed.
            zf.extractall(tmpdir)
        finally:
            zf.close()
        for root, dirs, files in os.walk(tmpdir):
            for filename in files:
                extension = os.path.splitext(filename)[-1].lower()
                if extensions and extension not in extensions:
                    continue
                member = open(os.path.join(root, filename), 'rb')
                try:
                    shutil.copyfileobj(member, sink)
                finally:
                    member.close()
    finally:
        shutil.rmtree(tmpdir)


def _append_rar(archive_path, sink):
    """Append the contents of the rar file at `archive_path` to `sink`.

    Runs the external `unrar` program and waits for it to finish, so the
    caller can safely delete the archive afterwards.
    """
    # unrar writes straight to the file descriptor, so push out anything
    # still buffered in Python first to keep the output in order.
    sink.flush()
    try:
        status = subprocess.call(['unrar', 'p', '-inul', archive_path], stdout=sink)
    except OSError as err:
        sys.stderr.write('Could not run unrar: %s\n' % err)
        return
    if status != 0:
        sys.stderr.write('unrar exited with status %d\n' % status)


# Command line: <escaped, newline-separated URLs> <output file>
#               [<escaped, newline-separated extension whitelist>]
urls = unescape(sys.argv[1])
out = open(sys.argv[2], 'wb')

if len(sys.argv) > 3:
    # Drop blank entries so that an empty whitelist really is an empty list,
    # which the zip handler treats as "accept every file".
    allowed_extensions = [
        ext.strip().lower()
        for ext in unescape(sys.argv[3]).split('\n')
        if ext.strip()
    ]
else:
    allowed_extensions = ['.sdf', '.smi', '.inchi']

try:
    for url in urls.split('\n'):
        url = url.strip()
        if not url:
            # Blank lines in the URL list are not requests.
            continue

        request = urllib2.Request(url)
        # A second add_header() with the same key replaces the first, so
        # only the valid content-coding name is sent.
        request.add_header('Accept-encoding', 'gzip')
        response = urllib2.urlopen(request)
        try:
            encoding = response.info().get('Content-Encoding')
            extension = os.path.splitext(url)[-1]

            if encoding in ('gz', 'gzip') or extension in ('.gz', '.gzip'):
                archive = _spool_to_tempfile(response)
                try:
                    _append_gzip(archive, out)
                finally:
                    os.remove(archive)
            elif encoding == 'zip' or extension == '.zip':
                archive = _spool_to_tempfile(response)
                try:
                    _append_zip(archive, out, allowed_extensions)
                finally:
                    os.remove(archive)
            elif encoding == 'rar' or extension == '.rar':
                archive = _spool_to_tempfile(response)
                try:
                    _append_rar(archive, out)
                finally:
                    os.remove(archive)
            else:
                # Not a recognised archive: stream the body through unchanged.
                shutil.copyfileobj(response, out)
        finally:
            response.close()
finally:
    out.close()