Mercurial > repos > bgruening > chemical_data_sources

--- a/get_online_data/get_online_data.py	Fri Sep 27 15:51:46 2013 -0400
+++ b/get_online_data/get_online_data.py	Thu Apr 24 13:19:33 2014 -0400
@@ -37,7 +37,7 @@
 if len(sys.argv) > 3:
     allowed_extensions = [ ext.strip() for ext in unescape(sys.argv[3]).split('\n') ]
 else:
-    allowed_extensions = ['.sdf', '.smi', '.inchi']
+    allowed_extensions = ['.sdf', '.smi', '.inchi', '.mol']

 for url in urls.split('\n'):
     url = url.strip()
--- a/get_online_data/get_online_data.xml	Fri Sep 27 15:51:46 2013 -0400
+++ b/get_online_data/get_online_data.xml	Thu Apr 24 13:19:33 2014 -0400
@@ -6,8 +6,10 @@
       get_online_data.py "$url_paste" $output $whitelist
   </command>
   <inputs>
-      <param name="url_paste" type="text" area="true" size="5x55" label="URL/Text" help="Here you may specify a list of URLs (one per line) or paste the contents of a file."/>
-      <param name="whitelist" type="text" area="true" size="10x20" label="Whitlist of filename extensions" help="Please specify a list of file extensions witch should be extracted, for example sdf, mol, smi. Every line one extension."/>
+      <param name="url_paste" type="text" area="true" size="5x55" label="URL" help="Here you may specify a list of URLs (one per line)."/>
+      <param name="whitelist" type="text" area="true" size="10x20"
+        label="Whitelist of filename extensions"
+        help="Please specify a list of file extensions which should be extracted (default: sdf, mol, smi, inchi). One extension per line."/>
   </inputs>
   <outputs>
      <data format="txt" name="output" />
--- a/get_pubchem/get_pubchem_as_smiles.xml	Fri Sep 27 15:51:46 2013 -0400
+++ b/get_pubchem/get_pubchem_as_smiles.xml	Thu Apr 24 13:19:33 2014 -0400
@@ -6,7 +6,7 @@
     <command interpreter="python">
         get_pubchem_as_smiles.py
             -o $pubchem_smi
-            -p 4
+            -p "\${GALAXY_SLOTS:-4}"
             ## temporary hack until my Galaxy patch is committed
             > /dev/null 2>&#38;1
     </command>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/get_pubchem/get_pubchem_assays.py	Thu Apr 24 13:19:33 2014 -0400
@@ -0,0 +1,135 @@
+#!/usr/bin/env python
+"""
+    Download the PubChem BioAssay CSV archives and merge them into one
+    tab-separated file, keeping only rows with a white-listed
+    PUBCHEM_ACTIVITY_OUTCOME.
+"""
+
+__author__ = 'Bjoern Gruening'
+__version__ = '0.1'
+__date__ = '2014'
+__license__ = 'GPL3+'
+
+import ftplib
+import os, sys
+import argparse
+import subprocess
+from multiprocessing import Pool
+import tempfile
+import shutil
+import urllib
+import zipfile
+import gzip
+import csv
+
+
+# cwd() of ftplib needs host and path separately, urlretrieve the whole URL
+PUBCHEM_HOST = "ftp.ncbi.nlm.nih.gov"
+PUBCHEM_PATH = "/pubchem/Bioassay/CSV/Data/"
+PUBCHEM_URL = "ftp://" + PUBCHEM_HOST + PUBCHEM_PATH
+
+def main(output, processors = 4, white_list = ('Active', 'Inconclusive', 'Inactive')):
+    """
+        Starting multiple processes to download and extract PubChem Assay data.
+
+        output     -- path of the tab-separated file that is written
+        processors -- number of worker processes that are started
+        white_list -- PUBCHEM_ACTIVITY_OUTCOME values of the rows to keep
+    """
+    ftp = ftplib.FTP( PUBCHEM_HOST )
+    try:
+        ftp.login()
+        ftp.cwd( PUBCHEM_PATH )
+        # fetch_convert can only handle zip archives, skip everything else
+        filelist = [name for name in ftp.nlst() if name.endswith('.zip')]
+    finally:
+        ftp.close()
+
+    td = tempfile.mkdtemp()
+    try:
+        pool = Pool(processes = processors)
+        try:
+            triplestore = [(name, td, white_list) for name in filelist]
+            # get() re-raises the first exception that occurred in a worker
+            pool.map_async(fetch_convert, triplestore).get()
+        finally:
+            pool.terminate()
+            pool.join()
+
+        with open(output, 'wb') as output_handle:
+            # sorted, to get the same order of assays in every run
+            for filename in sorted( os.listdir( td ) ):
+                if not filename.endswith('.tsv'):
+                    continue
+                path = os.path.join( td, filename )
+                with open(path, 'rb') as part_handle:
+                    shutil.copyfileobj(part_handle, output_handle)
+    finally:
+        shutil.rmtree( td )
+
+def fetch_convert(args):
+    """
+        Download one PubChem BioAssay archive and convert it to a TSV file.
+
+        args is one (filename, td, white_list) tuple, as Pool.map hands
+        exactly one argument to its workers: the name of the zip archive
+        below PUBCHEM_URL, the directory in which `filename`.tsv is created
+        and the PUBCHEM_ACTIVITY_OUTCOME values of the rows to keep.
+    """
+    (filename, td, white_list) = args
+    tmp_name = os.path.join( td, filename)
+    output = tmp_name + '.tsv'
+    # unpack outside of td, only the TSV files should be left in there
+    temp_dir = tempfile.mkdtemp()
+    try:
+        # plain concatenation, PUBCHEM_URL already ends with a slash
+        urllib.urlretrieve(PUBCHEM_URL + filename, tmp_name)
+        with zipfile.ZipFile(tmp_name, "r") as z:
+            z.extractall(temp_dir)
+
+        with open(output, 'wb') as out_handle:
+            for root, dirs, files in os.walk( temp_dir ):
+                dirs.sort()
+                for gz_name in sorted( files ):
+                    if not gz_name.endswith('.gz'):
+                        continue
+                    # gz_name encodes the assay_id, it looks like 1.csv.gz
+                    # extract the assay id and insert it as column one
+                    assay_id = gz_name.split('.', 1)[0]
+                    with gzip.open(os.path.join( root, gz_name ), 'rb') as gzfile:
+                        # csv.reader copes with commas inside of quoted fields
+                        rows = csv.reader(gzfile)
+                        next(rows, None) # skip first line
+                        for cols in rows:
+                            if len(cols) < 5:
+                                continue # empty or truncated line
+                            # NOTE(review): assumes that the outcome is spelled
+                            # out (Active, ...) and not given as code -- confirm
+                            if cols[2] not in white_list:
+                                continue
+                            cols.pop(4) # removing the URL column
+                            cols.insert(0, assay_id) # insert assay_id as first column
+                            out_handle.write( '\t'.join(cols) + '\n' )
+    finally:
+        shutil.rmtree( temp_dir )
+        if os.path.exists( tmp_name ):
+            os.remove(tmp_name)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Download the PubChem BioAssay data and convert it to one tab-separated file.')
+    parser.add_argument("-o", "--output", dest="output",
+                    required=True,
+                    help="Path to the output file.")
+    parser.add_argument("-p", "--processors", dest="processors",
+                    type=int, default=10,
+                    help="How many processors you want to use.")
+    parser.add_argument("-w", "--white-list", dest="white_list",
+                    default="Active,Inconclusive,Inactive",
+                    help="List of comma separated PUBCHEM_ACTIVITY_OUTCOME values that should be fetched.")
+
+    options = parser.parse_args()
+    # tolerate blanks around the commas and a trailing comma
+    white_list = [value.strip() for value in options.white_list.split(',') if value.strip()]
+    main( options.output, options.processors, white_list )
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/get_pubchem/get_pubchem_assays.xml	Thu Apr 24 13:19:33 2014 -0400
@@ -0,0 +1,68 @@
+<tool id="ctb_pubchem_download_assays" name="PubChem Assay Downloader" version="0.1" >
+    <description>as tabular file</description>
+    <command interpreter="python">
+        get_pubchem_assays.py
+            -o $pubchem_assay_tsv
+            -p "\${GALAXY_SLOTS:-4}"
+            --white-list "$white_list"
+    </command>
+    <stdio>
+        <!-- Anything other than zero is an error -->
+        <exit_code range="1:" />
+        <exit_code range=":-1" />
+        <!-- In case the return code has not been set properly check stderr too -->
+        <regex match="Error:" />
+        <regex match="Exception:" />
+    </stdio>
+    <inputs>
+        <param name="white_list" type="select" multiple="true" label="PubChem activity outcomes to keep">
+            <option value="Active" selected="true">Active</option>
+            <option value="Inconclusive" selected="true">Inconclusive</option>
+            <option value="Inactive">Inactive</option>
+            <option value="Unspecified">Unspecified</option>
+            <option value="Probe">Probe</option>
+        </param>
+    </inputs>
+    <outputs>
+        <data format="tabular" name="pubchem_assay_tsv" />
+    </outputs>
+    <tests>
+    </tests>
+    <help>
+
+.. class:: infomark
+
+**What this tool does**
+
+This tool fetches one PubChem_ Assay file after another and concatenates them.
+It is possible to optionally filter by PUBCHEM_ACTIVITY_OUTCOME.
+
+Columns in the result file:
+
+ - column 1: PubChem AID (assay id)
+ - column 2: PubChem SID (substance id)
+ - column 3: PubChem CID (compound id)
+ - column 4: PubChem Activity Outcome
+            1-Inactive
+            2-Active
+            3-Inconclusive
+            4-Unspecified
+            5-Probe
+ - column 5: PubChem activity score, the higher value, the more active
+ - column 6: Test result specific comment
+ - column 7 and beyond: All remaining columns are the TID "names" defined in the associated assay description given by the XML file under the corresponding Description/ directory. These "names" can also be found in the "Result Definitions" section of the assay summary page: e.g. http://pubchem.ncbi.nlm.nih.gov/assay/assay.cgi?aid=2244#aDefinitions
+
+
+
+.. _PubChem: http://pubchem.ncbi.nlm.nih.gov/
+
+-----
+
+.. class:: infomark
+
+**Output**
+
+The output will be one large tabular file.
+
+    </help>
+</tool>
--- a/jmoleditor/jmoleditor.xml	Fri Sep 27 15:51:46 2013 -0400
+++ b/jmoleditor/jmoleditor.xml	Thu Apr 24 13:19:33 2014 -0400
@@ -13,7 +13,7 @@
         <request_param galaxy_name="data_type" remote_name="data_type" missing="txt" />
         <request_param galaxy_name="output_label" remote_name="data_type" missing="txt" >
             <value_translation>
-                <value galaxy_value="Molecule" remote_value="mol" />
+                <value galaxy_value="Molecule" remote_value="sdf" />
                 <value galaxy_value="SMILES" remote_value="smi" />
             </value_translation>
         </request_param>
--- a/tool_dependencies.xml	Fri Sep 27 15:51:46 2013 -0400
+++ b/tool_dependencies.xml	Thu Apr 24 13:19:33 2014 -0400
@@ -1,6 +1,6 @@
 <?xml version="1.0"?>
 <tool_dependency>
     <package name="openbabel" version="2.3.2">
-        <repository changeset_revision="99a10425de93" name="package_openbabel_2_3" owner="iuc" toolshed="http://toolshed.g2.bx.psu.edu" />
+        <repository changeset_revision="e5ef70185d24" name="package_openbabel_2_3" owner="iuc" toolshed="http://toolshed.g2.bx.psu.edu" />
     </package>
 </tool_dependency>