annotate get_pubchem/get_pubchem_as_smiles.py @ 2:b65518a007fa draft

Uploaded
author bgruening
date Sun, 08 Sep 2013 09:42:57 -0400
parents f653fd06f055
children 021f0ef9474f
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
f653fd06f055 Uploaded
bgruening
parents:
diff changeset
1 #!/usr/bin/env python
f653fd06f055 Uploaded
bgruening
parents:
diff changeset
2
f653fd06f055 Uploaded
bgruening
parents:
diff changeset
# Module metadata.
__author__ = 'Bjoern Gruening'
__version__ = '0.1'
__date__ = '2012'
# Fixed typo: the licence identifier was misspelled as 'GLP3+'.
__license__ = 'GPL3+'
f653fd06f055 Uploaded
bgruening
parents:
diff changeset
7
f653fd06f055 Uploaded
bgruening
parents:
diff changeset
8 import ftplib
f653fd06f055 Uploaded
bgruening
parents:
diff changeset
9 import os, sys
f653fd06f055 Uploaded
bgruening
parents:
diff changeset
10 import argparse
f653fd06f055 Uploaded
bgruening
parents:
diff changeset
11 import subprocess
f653fd06f055 Uploaded
bgruening
parents:
diff changeset
12 from multiprocessing import Pool
f653fd06f055 Uploaded
bgruening
parents:
diff changeset
13 import tempfile
f653fd06f055 Uploaded
bgruening
parents:
diff changeset
14 import shutil
f653fd06f055 Uploaded
bgruening
parents:
diff changeset
15
2
b65518a007fa Uploaded
bgruening
parents: 0
diff changeset
def main(output, processors = 4):
    """
    Download every file listed in the PubChem ``CURRENT-Full/SDF`` FTP
    directory, convert each one with ``fetch_convert`` and concatenate the
    converted files into a single output file.

    output     -- path of the result file; it is created or truncated.
    processors -- number of worker processes that run ``fetch_convert``
                  in parallel.

    The converted files are collected in a temporary directory that is
    removed again before the function returns, even if an error occurs.
    """
    td = tempfile.mkdtemp()
    try:
        # Binary mode: the converted files are copied as raw bytes, which a
        # text-mode handle rejects on Python 3.
        with open(output, 'wb') as output_handle:
            # The FTP connection is only needed for the directory listing.
            ftp = ftplib.FTP('ftp.ncbi.nih.gov')
            try:
                ftp.login()
                ftp.cwd('/pubchem/Compound/CURRENT-Full/SDF/')
                filelist = ftp.nlst()
            finally:
                ftp.close()

            # Pool workers take a single argument, so pair every file name
            # with the directory the converted file has to be written to.
            jobs = [(filename, td) for filename in filelist]

            pool = Pool(processes = processors)
            try:
                pool.map_async(fetch_convert, jobs).get()
            finally:
                # Either all results are in or a job failed: in both cases
                # the workers are no longer needed.
                pool.terminate()
                pool.join()

            # os.listdir returns names in arbitrary order; sort them so the
            # concatenated output is reproducible between runs.
            for filename in sorted(os.listdir(td)):
                path = os.path.join(td, filename)
                with open(path, 'rb') as converted:
                    shutil.copyfileobj(converted, output_handle)
    finally:
        shutil.rmtree( td )
f653fd06f055 Uploaded
bgruening
parents:
diff changeset
37
f653fd06f055 Uploaded
bgruening
parents:
diff changeset
def fetch_convert(args):
    """
    Download one file from the PubChem ``CURRENT-Full/SDF`` FTP directory
    with ``wget`` and convert it to canonical SMILES with ``obabel``.

    args -- a ``(filename, td)`` tuple, packed into one argument so the
            function can be used with ``Pool.map_async``:
            filename is the name of the remote file,
            td is the directory that receives the converted file, which is
            stored under the same name as the remote file.

    A failing ``wget`` or ``obabel`` call is reported on stderr instead of
    being silently ignored; a file that could not be downloaded is not
    converted. The temporary download is always removed.
    """
    (filename, td) = args

    # URLs always use '/' as separator, so do not build them with
    # os.path.join, which uses the separator of the local platform.
    url = 'ftp://ftp.ncbi.nih.gov/pubchem/Compound/CURRENT-Full/SDF/' + filename
    tmp_name = os.path.join( tempfile.gettempdir(), filename)
    output = os.path.join(td, filename)

    try:
        if subprocess.call( ['wget', '-O', tmp_name, url] ) != 0:
            sys.stderr.write('Download of %s failed, skipping conversion.\n' % url)
            return
        if subprocess.call(["obabel", "-isdf", tmp_name, "-ocan", '-O', output]) != 0:
            sys.stderr.write('Conversion of %s failed.\n' % filename)
    finally:
        # wget may or may not have created the file when it failed.
        if os.path.exists(tmp_name):
            os.remove(tmp_name)
f653fd06f055 Uploaded
bgruening
parents:
diff changeset
46
f653fd06f055 Uploaded
bgruening
parents:
diff changeset
47
f653fd06f055 Uploaded
bgruening
parents:
diff changeset
if __name__ == '__main__':
    # Command line entry point: read the output path and the number of
    # worker processes, then start the download/conversion run.
    cli = argparse.ArgumentParser(
        description='Download the whole PubChem and converts it to canonical SMILES on the fly.')
    cli.add_argument(
        "-o", "--output",
        dest="output",
        help="Path to the output file.",
        required=True)
    cli.add_argument(
        "-p", "--processors",
        dest="processors",
        help="How many processors you want to use.",
        default=10,
        type=int)

    arguments = cli.parse_args()
    main(output=arguments.output, processors=arguments.processors)
f653fd06f055 Uploaded
bgruening
parents:
diff changeset
59