# HG changeset patch
# User bgruening
# Date 1376551397 14400
# Node ID f653fd06f055f7a3a85a92a0e5a30b22398e078b
Uploaded
diff -r 000000000000 -r f653fd06f055 get_online_data/get_online_data.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/get_online_data/get_online_data.py Thu Aug 15 03:23:17 2013 -0400
@@ -0,0 +1,89 @@
+#!/usr/bin/env python
+
+__author__ = 'Bjoern Gruening'
+__version__ = '0.1'
+__date__ = '2012'
+__license__ = 'GPL3+'
+
+import os, sys
+import urllib2
+import gzip, tempfile
+import zipfile
+import subprocess
+import shutil
+
+def unescape(cond_text):
+    # Unescape if input has been escaped
+    mapped_chars = { '>'  : '__gt__',
+                     '<'  : '__lt__',
+                     "'"  : '__sq__',
+                     '"'  : '__dq__',
+                     '['  : '__ob__',
+                     ']'  : '__cb__',
+                     '{'  : '__oc__',
+                     '}'  : '__cc__',
+                     '@'  : '__at__',
+                     '\n' : '__cn__',
+                     '\r' : '__cr__',
+                     '\t' : '__tc__'
+                   }
+    for key, value in mapped_chars.items():
+        cond_text = cond_text.replace( value, key )
+    return cond_text
+
+urls = unescape(sys.argv[1])
+out = open(sys.argv[2], 'wb')
+
+if len(sys.argv) > 3:
+    allowed_extensions = [ ext.strip() for ext in unescape(sys.argv[3]).split('\n') ]
+else:
+    allowed_extensions = ['.sdf', '.smi', '.inchi']
+
+for url in urls.split('\n'):
+    url = url.strip()
+    request = urllib2.Request( url )
+    # ask the server for gzip-compressed content where available
+    request.add_header('Accept-encoding', 'gzip')
+    response = urllib2.urlopen( request )
+
+    if response.info().get('Content-Encoding') in ['gz','gzip'] or os.path.splitext(url)[-1] in ['.gz','.gzip']:
+        temp = tempfile.NamedTemporaryFile( delete=False )
+        temp.write( response.read() )
+        temp.close()
+        gzfile = gzip.open(temp.name, 'rb')
+        out.write( gzfile.read() )
+        os.remove(temp.name)
+    elif response.info().get('Content-Encoding') in ['zip'] or os.path.splitext(url)[-1] in ['.zip']:
+        temp = tempfile.NamedTemporaryFile(delete=False)
+        temp.close()
+        with open(temp.name, 'wb') as fp:
+            shutil.copyfileobj(response, fp)
+
+        zf = zipfile.ZipFile(temp.name, allowZip64=True)
+        tmpdir = tempfile.mkdtemp( )
+
+        # extract the whole archive once into a temporary directory
+        zf.extractall( tmpdir )
+
+        os.remove( temp.name )
+        molfiles = []
+        for root, dirs, files in os.walk(tmpdir):
+            for filename in files:
+                if os.path.splitext(filename)[-1].lower() in allowed_extensions or allowed_extensions == []:
+                    mfile = os.path.join( root, filename)
+                    molfiles.append( mfile )
+
+        for filename in molfiles:
+            shutil.copyfileobj(open(filename, 'rb'), out)
+        shutil.rmtree( tmpdir )
+        zf.close()
+    elif response.info().get('Content-Encoding') == 'rar' or os.path.splitext(url)[-1] in ['.rar']:
+        temp = tempfile.NamedTemporaryFile(delete=False)
+        temp.close()
+        with open(temp.name, 'wb') as fp:
+            shutil.copyfileobj(response, fp)
+        subprocess.check_call('unrar p -inul %s' % temp.name, stdout=out, shell=True)  # wait for unrar before deleting the archive
+        os.remove( temp.name )
+    else:
+        out.write( response.read() )
+out.close()
diff -r 000000000000 -r f653fd06f055 get_online_data/get_online_data.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/get_online_data/get_online_data.xml Thu Aug 15 03:23:17 2013 -0400
@@ -0,0 +1,37 @@
+
+
+ fetching ...
+
+
+ get_online_data.py "$url_paste" $output $whitelist
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+**What this tool does**
+
+Fetches data via FTP or HTTP and stores it in your history.
+
+-----
+
+.. class:: infomark
+
+**Input**
+
+Supported filetypes are:
+ - gz/gzip
+ - rar
+
+ZIP is supported with recursive extraction of specific file types (selected via the extension whitelist).
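
For illustration, here is one way the underlying script could be called outside Galaxy (the URLs and the extension whitelist below are made-up; Galaxy builds this call itself from the command template above, escaping embedded newlines as ``__cn__``)::

    python get_online_data.py \
        "http://example.org/compounds.sdf.gz__cn__ftp://example.org/library.zip" \
        downloaded.sdf \
        ".sdf__cn__.smi"

The script unescapes both text arguments (``__cn__`` becomes a newline), downloads each URL in turn, decompresses gz/zip/rar content, and, for ZIP archives, keeps only the files whose extensions are on the whitelist.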
+
+
+
diff -r 000000000000 -r f653fd06f055 get_pubchem/get_pubchem_as_smiles.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/get_pubchem/get_pubchem_as_smiles.py Thu Aug 15 03:23:17 2013 -0400
@@ -0,0 +1,59 @@
+#!/usr/bin/env python
+
+__author__ = 'Bjoern Gruening'
+__version__ = '0.1'
+__date__ = '2012'
+__license__ = 'GPL3+'
+
+import ftplib
+import os, sys
+import argparse
+import subprocess
+from multiprocessing import Pool
+import tempfile
+import shutil
+
+def main(output, processors = 10):
+    output_handle = open(output,'w+')
+
+    # list all SDF archives currently on the PubChem FTP server
+    td = tempfile.mkdtemp()
+    ftp = ftplib.FTP('ftp.ncbi.nih.gov')
+    ftp.login()
+    ftp.cwd('/pubchem/Compound/CURRENT-Full/SDF/')
+    filelist = ftp.nlst()
+
+    # download and convert the archives in parallel
+    pool = Pool(processes = processors)
+    filenames = zip(filelist, [td]*len(filelist))
+
+    result = pool.map_async(fetch_convert, filenames)
+    result.get()
+
+    # concatenate the per-archive SMILES files into the final output
+    for filename in os.listdir(td):
+        path = os.path.join(td, filename)
+        shutil.copyfileobj(open(path, 'rb'), output_handle)
+
+    output_handle.close()
+    shutil.rmtree( td )
+
+def fetch_convert(args):
+    (filename, td) = args
+
+    # download one SDF archive and convert it to canonical SMILES with Open Babel
+    tmp_name = os.path.join( tempfile.gettempdir(), filename)
+    subprocess.call( ['wget', '-O', tmp_name, os.path.join('ftp://ftp.ncbi.nih.gov/pubchem/Compound/CURRENT-Full/SDF/', filename)] )
+    output = os.path.join(td, filename)
+    subprocess.call(["obabel", "-isdf", tmp_name, "-ocan", '-O', output])
+    os.remove(tmp_name)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Download the whole of PubChem and convert it to canonical SMILES on the fly.')
+    parser.add_argument("-o", "--output", dest="output",
+                        required=True,
+                        help="Path to the output file.")
+    parser.add_argument("-p", "--processors", dest="processors",
+                        type=int, default=10,
+                        help="How many processors to use.")
+
+    options = parser.parse_args()
+    main( options.output, options.processors )
+
diff -r 000000000000 -r f653fd06f055 get_pubchem/get_pubchem_as_smiles.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/get_pubchem/get_pubchem_as_smiles.xml Thu Aug 15 03:23:17 2013 -0400
@@ -0,0 +1,36 @@
+
+ as canonical SMILES
+
+ get_pubchem_as_smiles.py
+ -o $pubchem_smi
+ -p 10
+ 2>&1
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+**What this tool does**
+
+This tool fetches the PubChem_ SDF files one after another and converts each to canonical SMILES.
+
+.. _PubChem: http://pubchem.ncbi.nlm.nih.gov/
+
+-----
+
+.. class:: infomark
+
+**Output**
+
+The output will be one large SMILES file.
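
For reference, a stand-alone run of the underlying script looks like this (the output filename is chosen arbitrarily); for each downloaded archive it essentially runs a ``wget`` download followed by an ``obabel`` conversion::

    python get_pubchem_as_smiles.py -o pubchem.can.smi -p 10

    # roughly what happens per archive (the archive name is illustrative):
    wget -O Compound_000000001_000025000.sdf.gz \
        ftp://ftp.ncbi.nih.gov/pubchem/Compound/CURRENT-Full/SDF/Compound_000000001_000025000.sdf.gz
    obabel -isdf Compound_000000001_000025000.sdf.gz -ocan -O Compound_000000001_000025000.smi

Note that the archives are downloaded in parallel; ``-p`` controls the size of the worker pool.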
+
+
+
diff -r 000000000000 -r f653fd06f055 repository_dependencies.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/repository_dependencies.xml Thu Aug 15 03:23:17 2013 -0400
@@ -0,0 +1,4 @@
+
+
+
+