0
|
1 #!/usr/bin/env python
|
|
2
|
|
3 __author__ = 'Bjoern Gruening'
|
|
4 __version__ = '0.1'
|
|
5 __date__ = '2012'
|
|
6 __license__ = 'GLP3+'
|
|
7
|
|
8 import ftplib
|
|
9 import os, sys
|
|
10 import argparse
|
|
11 import subprocess
|
|
12 from multiprocessing import Pool
|
|
13 import tempfile
|
|
14 import shutil
|
|
15
|
2
|
def main(output, processors = 4):
    """
    Fetch every file listed in the PubChem 'CURRENT-Full/SDF' FTP directory,
    convert each one with fetch_convert() in a pool of worker processes and
    concatenate all converted files into a single output file.

    output     -- path of the file that receives the concatenated results;
                  it is created/truncated before any download starts
    processors -- number of worker processes in the pool

    Any exception raised by the FTP listing or by a worker propagates to the
    caller; the temporary working directory is removed in every case.
    """
    # Binary mode: the converted files are copied as raw bytes, which a
    # text-mode handle rejects on Python 3.
    with open(output, 'wb') as output_handle:
        td = tempfile.mkdtemp()
        try:
            # The FTP session is only needed for the directory listing; the
            # downloads themselves are done by the workers.
            ftp = ftplib.FTP('ftp.ncbi.nih.gov')
            try:
                ftp.login()
                ftp.cwd('/pubchem/Compound/CURRENT-Full/SDF/')
                filelist = ftp.nlst()
            finally:
                ftp.close()

            # Each worker receives one (filename, target directory) tuple.
            filenames = [(filename, td) for filename in filelist]

            pool = Pool(processes = processors)
            try:
                result = pool.map_async(fetch_convert, filenames)
                result.get()
            finally:
                # All tasks are finished once get() returns; on an error this
                # stops the remaining workers instead of leaving them behind.
                pool.terminate()
                pool.join()

            # Concatenate in whatever order os.listdir() yields the files.
            for filename in os.listdir(td):
                path = os.path.join(td, filename)
                with open(path, 'rb') as converted:
                    shutil.copyfileobj(converted, output_handle)
        finally:
            shutil.rmtree( td )
|
|
37
|
|
def fetch_convert(args):
    """
    Download one file from the PubChem 'CURRENT-Full/SDF' FTP directory with
    wget and convert it with obabel (SDF input, 'can' output).

    args -- tuple (filename, td): the remote file name and the directory the
            converted file is written to (under the same file name).
            A single tuple is used so the function can be mapped over a
            list of tuples by a multiprocessing pool.

    The download is stored in the system temp directory and removed again
    afterwards, whether or not the conversion succeeded. A failing wget or
    obabel call is reported on stderr and does not raise.
    """
    (filename, td) = args

    tmp_name = os.path.join( tempfile.gettempdir(), filename)
    # Plain concatenation: os.path.join is meant for filesystem paths and
    # would use the platform separator instead of '/'.
    url = 'ftp://ftp.ncbi.nih.gov/pubchem/Compound/CURRENT-Full/SDF/' + filename
    try:
        if subprocess.call( ['wget', '-O', tmp_name, url] ) != 0:
            # Nothing usable was downloaded, so do not feed it to obabel.
            sys.stderr.write('Download of %s failed, skipping conversion.\n' % filename)
            return

        output = os.path.join(td, filename)
        if subprocess.call(["obabel", "-isdf", tmp_name, "-ocan", '-O', output]) != 0:
            sys.stderr.write('Conversion of %s failed.\n' % filename)
    finally:
        # wget may not have created the file at all (e.g. wget is missing).
        if os.path.exists(tmp_name):
            os.remove(tmp_name)
|
|
46
|
|
47
|
|
if __name__ == '__main__':
    # Command-line entry point: read the output path and the number of
    # worker processes, then hand both to main().
    cli = argparse.ArgumentParser(
        description='Download the whole PubChem and converts it to canonical SMILES on the fly.'
    )
    cli.add_argument(
        "-o", "--output",
        dest="output",
        required=True,
        help="Path to the output file.",
    )
    cli.add_argument(
        "-p", "--processors",
        dest="processors",
        type=int,
        default=10,
        help="How many processors you want to use.",
    )

    parsed = cli.parse_args()
    main(parsed.output, parsed.processors)
|
|
59
|