changeset 0:f653fd06f055 draft

Uploaded
author bgruening
date Thu, 15 Aug 2013 03:23:17 -0400
parents
children 17a3f755d472
files get_online_data/get_online_data.py get_online_data/get_online_data.xml get_pubchem/get_pubchem_as_smiles.py get_pubchem/get_pubchem_as_smiles.xml repository_dependencies.xml
diffstat 5 files changed, 225 insertions(+), 0 deletions(-) [+]
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/get_online_data/get_online_data.py	Thu Aug 15 03:23:17 2013 -0400
@@ -0,0 +1,89 @@
+#!/usr/bin/env python
+
+__author__ = 'Bjoern Gruening'
+__version__ = '0.1'
+__date__ = '2012'
+__license__ = 'GPL3+'
+
+import os, sys
+import urllib2
+import gzip, tempfile
+import zipfile
+import subprocess
+import shutil
+
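+# Galaxy escapes special characters in text parameters before handing them to
+# a tool; unescape() below reverses that mapping. Illustrative example:
+#   unescape('http://a.example__cn__http://b.example')
+#   -> 'http://a.example\nhttp://b.example'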
+def unescape(cond_text):
+    # Unescape if input has been escaped
+    mapped_chars = { '>' :'__gt__', 
+                 '<' :'__lt__', 
+                 "'" :'__sq__',
+                 '"' :'__dq__',
+                 '[' :'__ob__',
+                 ']' :'__cb__',
+                 '{' :'__oc__',
+                 '}' :'__cc__',
+                 '@' : '__at__',
+                 '\n' : '__cn__',
+                 '\r' : '__cr__',
+                 '\t' : '__tc__'
+                 }
+    for key, value in mapped_chars.items():
+        cond_text = cond_text.replace( value, key )
+    return cond_text
+
+urls = unescape(sys.argv[1])
+out = open(sys.argv[2], 'wb')
+
+if len(sys.argv) > 3:
+    # Normalise the whitelist: drop blank lines, lowercase, and make sure each
+    # extension carries a leading dot, since os.path.splitext() returns one.
+    exts = [ ext.strip().lower() for ext in unescape(sys.argv[3]).split('\n') ]
+    allowed_extensions = [ ext if ext.startswith('.') else '.' + ext for ext in exts if ext ]
+else:
+    allowed_extensions = ['.sdf', '.smi', '.inchi']
+
+for url in urls.split('\n'):
+    url = url.strip()
+    if not url:
+        continue
+    request = urllib2.Request( url )
+    # 'gz' is not a registered content coding, and add_header() keeps only one
+    # value per header name, so a single 'gzip' header suffices.
+    request.add_header('Accept-Encoding', 'gzip')
+    response = urllib2.urlopen( request )
+
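+    # Dispatch on the Content-Encoding header, with the URL's file extension
+    # as a fallback, to decide how the download needs to be unpacked.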
+    if response.info().get('Content-Encoding') in ['gz','gzip'] or os.path.splitext(url)[-1] in ['.gz','.gzip']:
+        temp = tempfile.NamedTemporaryFile( delete=False )
+        temp.write( response.read() )
+        temp.close()
+        # Don't name this 'zipfile': that would shadow the zipfile module
+        # needed by the ZIP branch below.
+        gz = gzip.open(temp.name, 'rb')
+        out.write( gz.read() )
+        gz.close()
+        os.remove(temp.name)
+    elif response.info().get('Content-Encoding') in ['zip'] or os.path.splitext(url)[-1] in ['.zip']:
+        temp = tempfile.NamedTemporaryFile(delete=False)
+        temp.close()
+        with open(temp.name, 'wb') as fp:
+            shutil.copyfileobj(response, fp)
+
+        zf = zipfile.ZipFile(temp.name, allowZip64=True)
+        tmpdir = tempfile.mkdtemp( )
+
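+        # Extract the whole archive, then walk it for whitelisted files.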
+        zf.extractall( tmpdir )
+
+        os.remove( temp.name )
+        molfiles = []
+        for root, dirs, files in os.walk(tmpdir):
+            for filename in files:
+                if not allowed_extensions or os.path.splitext(filename)[-1].lower() in allowed_extensions:
+                    mfile = os.path.join( root, filename)
+                    molfiles.append( mfile )
+
+        for filename in molfiles:
+            with open(filename, 'rb') as mol_handle:
+                shutil.copyfileobj(mol_handle, out)
+        shutil.rmtree( tmpdir )
+        zf.close()
+    elif response.info().get('Content-Encoding') == 'rar' or os.path.splitext(url)[-1] in ['.rar']:
+        temp = tempfile.NamedTemporaryFile(delete=False)
+        temp.close()
+        with open(temp.name, 'wb') as fp:
+            shutil.copyfileobj(response, fp)
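+        # 'unrar p' prints every archive member to stdout (redirected into the
+        # output file) without unpacking to disk; '-inul' suppresses messages.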
+        cmd = subprocess.Popen('unrar p -inul %s' % temp.name, stdout=out, shell=True)
+        cmd.wait()
+        os.remove( temp.name )
+    else:
+        out.write( response.read() )
+out.close()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/get_online_data/get_online_data.xml	Thu Aug 15 03:23:17 2013 -0400
@@ -0,0 +1,37 @@
+<tool id="ctb_online_data_fetch" name="Online data" version="0.2">
+  <description>
+    fetching ...
+  </description>
+  <command interpreter="python">
+      get_online_data.py "$url_paste" $output $whitelist
+  </command>
+  <inputs>
+      <param name="url_paste" type="text" area="true" size="5x55" label="URL/Text" help="Here you may specify a list of URLs (one per line) or paste the contents of a file."/>
+      <param name="whitelist" type="text" area="true" size="10x20" label="Whitlist of filename extensions" help="Please specify a list of file extensions witch should be extracted, for example sdf, mol, smi. Every line one extension."/> 
+  </inputs>
+  <outputs>
+     <data format="txt" name="output" />
+  </outputs>
+
+  <help>
+
+.. class:: infomark
+
+**What this tool does**
+
+Fetch data via FTP or HTTP and store them in your history.
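+
+For example, pasting two lines such as the following (hypothetical URLs, one
+per line) downloads both files and concatenates them into a single dataset::
+
+	http://example.org/compounds.sdf.gz
+	ftp://example.org/more_compounds.smi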
+
+-----
+
+.. class:: infomark
+
+**Input**
+
+Supported filetypes are:
+	- gz/gzip
+	- rar
+
+ZIP archives are also supported, with recursive extraction of the whitelisted filetypes.
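+
+A whitelist such as the following (one extension per line) keeps only those
+files from an extracted archive::
+
+	sdf
+	smi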
+
+  </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/get_pubchem/get_pubchem_as_smiles.py	Thu Aug 15 03:23:17 2013 -0400
@@ -0,0 +1,59 @@
+#!/usr/bin/env python
+
+__author__ = 'Bjoern Gruening'
+__version__ = '0.1'
+__date__ = '2012'
+__license__ = 'GPL3+'
+
+import ftplib
+import os, sys
+import argparse
+import subprocess
+from multiprocessing import Pool
+import tempfile
+import shutil
+
+def main(output, processors = 10):
+    output_handle = open(output,'w+')
+
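+    # Collect the names of all SDF archives in PubChem's CURRENT-Full FTP
+    # directory.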
+    td = tempfile.mkdtemp()
+    ftp = ftplib.FTP('ftp.ncbi.nih.gov')
+    ftp.login()
+    ftp.cwd('/pubchem/Compound/CURRENT-Full/SDF/')
+    filelist = ftp.nlst()
+
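+    # Fan the downloads out over a worker pool; each worker fetches one
+    # archive and converts it with Open Babel (see fetch_convert below).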
+    pool = Pool(processes = processors)
+    filenames = zip(filelist, [td]*len(filelist))
+
+    result = pool.map_async(fetch_convert, filenames)
+    result.get()
+
+    for filename in os.listdir(td):
+        path = os.path.join(td, filename)
+        shutil.copyfileobj(open(path, 'rb'), output_handle)
+
+    output_handle.close()
+    shutil.rmtree( td )
+
+def fetch_convert(args):
+    (filename, td) = args
+
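+    # Download one SDF archive with wget, convert it to canonical SMILES
+    # ('-ocan') with obabel, then drop the temporary download.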
+    tmp_name = os.path.join( tempfile.gettempdir(), filename)
+    subprocess.call( ['wget', '-O', tmp_name, os.path.join('ftp://ftp.ncbi.nih.gov/pubchem/Compound/CURRENT-Full/SDF/', filename)] )
+    output = os.path.join(td, filename)
+    subprocess.call(["obabel", "-isdf", tmp_name, "-ocan", '-O', output])
+    os.remove(tmp_name)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Download all of PubChem and convert it to canonical SMILES on the fly.')
+    parser.add_argument("-o", "--output", dest="output",
+                    required=True,
+                    help="Path to the output file.")
+    parser.add_argument("-p", "--processors", dest="processors",
+                    type=int, default=10,
+                    help="How many processors you want to use.")
+
+    options = parser.parse_args()
+    main( options.output, options.processors )
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/get_pubchem/get_pubchem_as_smiles.xml	Thu Aug 15 03:23:17 2013 -0400
@@ -0,0 +1,36 @@
+<tool id="ctb_pubchem_download_as_smiles" name="PubChem Download" Version="0.1" >
+  <description>as canonical SMILES</description>
+  <command interpreter="python">
+        get_pubchem_as_smiles.py 
+            -o $pubchem_smi 
+            -p 10
+            2>&#38;1
+  </command>
+  <inputs>
+     <param name="infile" type="select" display="radio" size="250" label="Load all pubchem files and convert them to canonical smiles." />
+  </inputs>
+  <outputs>
+     <data format="smi" name="pubchem_smi" />
+  </outputs>
+  <tests>
+  </tests>
+  <help>
+
+.. class:: infomark
+
+**What this tool does**
+
+This tool fetches the PubChem_ compound files one after another and converts each to canonical SMILES.
+
+.. _PubChem: http://pubchem.ncbi.nlm.nih.gov/
+
+-----
+
+.. class:: infomark
+
+**Output**
+
+The output will be one large SMILES file.
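+
+Each line holds one canonical SMILES string followed by the record's title
+(for PubChem entries, the compound CID), for example (illustrative)::
+
+	CC(=O)OC1=CC=CC=C1C(=O)O	2244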
+  
+  </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/repository_dependencies.xml	Thu Aug 15 03:23:17 2013 -0400
@@ -0,0 +1,4 @@
+<?xml version="1.0"?>
+<repositories description="This requires the Molecule datatype definitions (e.g. SMILES, InChI, SD-format).">
+    <repository changeset_revision="85eca06eefc6" name="molecule_datatypes" owner="iuc" toolshed="http://toolshed.g2.bx.psu.edu" />
+</repositories>