# HG changeset patch
# User bgruening
# Date 1398359973 14400
# Node ID c2055dd1927bbc9a86e30b0d325553ecd6a10034
# Parent 7c1f9962ac07deeba4259ad68cdc344007a6f550
Uploaded
diff -r 7c1f9962ac07 -r c2055dd1927b get_online_data/get_online_data.py
--- a/get_online_data/get_online_data.py Fri Sep 27 15:51:46 2013 -0400
+++ b/get_online_data/get_online_data.py Thu Apr 24 13:19:33 2014 -0400
@@ -37,7 +37,7 @@
if len(sys.argv) > 3:
allowed_extensions = [ ext.strip() for ext in unescape(sys.argv[3]).split('\n') ]
else:
- allowed_extensions = ['.sdf', '.smi', '.inchi']
+ allowed_extensions = ['.sdf', '.smi', '.inchi', '.mol']
for url in urls.split('\n'):
url = url.strip()
diff -r 7c1f9962ac07 -r c2055dd1927b get_online_data/get_online_data.xml
--- a/get_online_data/get_online_data.xml Fri Sep 27 15:51:46 2013 -0400
+++ b/get_online_data/get_online_data.xml Thu Apr 24 13:19:33 2014 -0400
@@ -6,8 +6,10 @@
get_online_data.py "$url_paste" $output $whitelist
-
-
+
+
diff -r 7c1f9962ac07 -r c2055dd1927b get_pubchem/get_pubchem_as_smiles.xml
--- a/get_pubchem/get_pubchem_as_smiles.xml Fri Sep 27 15:51:46 2013 -0400
+++ b/get_pubchem/get_pubchem_as_smiles.xml Thu Apr 24 13:19:33 2014 -0400
@@ -6,7 +6,7 @@
get_pubchem_as_smiles.py
-o $pubchem_smi
- -p 4
+ -p "\${GALAXY_SLOTS:-4}"
## temporary hack until my Galaxy patch is committed
> /dev/null 2>&1
diff -r 7c1f9962ac07 -r c2055dd1927b get_pubchem/get_pubchem_assays.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/get_pubchem/get_pubchem_assays.py Thu Apr 24 13:19:33 2014 -0400
@@ -0,0 +1,88 @@
+#!/usr/bin/env python
+
+__author__ = 'Bjoern Gruening'
+__version__ = '0.1'
+__date__ = '2014'
+__license__ = 'GPL3+'
+
+import ftplib
+import os, sys
+import argparse
+import subprocess
+from multiprocessing import Pool
+import tempfile
+import shutil
+import urllib
+import zipfile
+import gzip
+
+
+PUBCHEM_URL = "ftp://ftp.ncbi.nlm.nih.gov/pubchem/Bioassay/CSV/Data/" # base URL the per-archive download URLs are built from
+
+def main(output, processors = 4, white_list = ['Active','Inconclusive', 'Inactive']):
+    """
+    Download all PubChem BioAssay archives with `processors` worker processes
+    and write the rows whose outcome is in `white_list` to the file `output`.
+    """
+    td = tempfile.mkdtemp()
+    try:
+        ftp = ftplib.FTP('ftp.ncbi.nih.gov')
+        ftp.login()
+        # CWD expects a server-side path, not a URL: drop 'ftp://<host>'.
+        ftp.cwd( '/' + PUBCHEM_URL.split('/', 3)[3] )
+        filelist = ftp.nlst()
+        ftp.quit()
+        pool = Pool(processes = processors)
+        pool.map(fetch_convert, [(name, td, white_list) for name in filelist])
+        pool.close()
+        with open(output, 'wb') as output_handle:
+            for filename in os.listdir( td ):
+                with open(os.path.join( td, filename ), 'rb') as part:
+                    shutil.copyfileobj(part, output_handle)
+    finally:
+        shutil.rmtree( td )
+
+def fetch_convert(args):
+    """
+    Worker for Pool.map: `args` is (filename, td, white_list). Downloads the
+    archive into `td` and writes the white-listed rows to `<filename>.tsv`.
+    """
+    (filename, td, white_list) = args
+    tmp_name = os.path.join( td, filename)
+    urllib.urlretrieve(os.path.join(PUBCHEM_URL, filename), tmp_name)
+    temp_dir = tempfile.mkdtemp()
+    try:
+        with zipfile.ZipFile(tmp_name, "r") as z:
+            z.extractall(temp_dir)
+        with open(tmp_name + '.tsv', 'w+') as out_handle:
+            for root, dirs, files in os.walk( temp_dir ):
+                for gz_name in files:
+                    # gz_name looks like 1.csv.gz; the part before the first dot is the assay id
+                    assay_id = gz_name.split('.', 1)[0]
+                    with gzip.open(os.path.join( root, gz_name ), 'rb') as gzfile:
+                        gzfile.readline() # skip the header line
+                        for line in gzfile:
+                            cols = line.split(',')
+                            if len(cols) > 4 and cols[2] in white_list: # skip short/filtered rows
+                                cols.pop(4) # drop the URL column
+                                out_handle.write( '\t'.join([assay_id] + cols) )
+    finally:
+        shutil.rmtree( temp_dir )
+        os.remove(tmp_name)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Download the PubChem BioAssay data and convert it to one tab-separated file on the fly.')
+    parser.add_argument("-o", "--output", dest="output",
+                    required=True,
+                    help="Path to the output file.")
+    parser.add_argument("-p", "--processors", dest="processors",
+                    type=int, default=10,
+                    help="How many processors you want to use.")
+    parser.add_argument("-w", "--white-list", dest="white_list",
+                    default="Active,Inconclusive,Inactive",
+                    help="List of comma separated PUBCHEM_ACTIVITY_OUTCOME values that should be fetched.")
+    # --white-list arrives as one comma separated string; main() takes a list
+    options = parser.parse_args()
+    main( options.output, options.processors, options.white_list.split(',') )
+
diff -r 7c1f9962ac07 -r c2055dd1927b get_pubchem/get_pubchem_assays.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/get_pubchem/get_pubchem_assays.xml Thu Apr 24 13:19:33 2014 -0400
@@ -0,0 +1,68 @@
+
+ as tab-separated text
+
+ get_pubchem_assays.py
+ -o $pubchem_assay_tsv
+ -p "\${GALAXY_SLOTS:-4}"
+ --white-list $white_list
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+**What this tool does**
+
+This tool fetches one PubChem_ Assay file after another and concatenates them.
+It is possible to optionally filter by PUBCHEM_ACTIVITY_OUTCOME.
+
+Columns in the result file:
+
+ - column 1: PubChem AID (assay id)
+ - column 2: PubChem SID (substance id)
+ - column 3: PubChem CID (compound id)
+ - column 4: PubChem Activity Outcome
+ 1-Inactive
+ 2-Active
+ 3-Inconclusive
+ 4-Unspecified
+ 5-Probe
+ - column 5: PubChem activity score; the higher the value, the more active
+ - column 6: Test result specific comment
+ - column 7 and beyond: All remaining columns are the TID "names" defined in the associated assay description given by the XML file under the corresponding Description/ directory. These "names" can also be found in the "Result Definitions" section of the assay summary page: e.g. http://pubchem.ncbi.nlm.nih.gov/assay/assay.cgi?aid=2244#aDefinitions
+
+
+
+.. _PubChem: http://pubchem.ncbi.nlm.nih.gov/
+
+-----
+
+.. class:: infomark
+
+**Output**
+
+The output will be one large tab-separated file.
+
+
+
diff -r 7c1f9962ac07 -r c2055dd1927b jmoleditor/jmoleditor.xml
--- a/jmoleditor/jmoleditor.xml Fri Sep 27 15:51:46 2013 -0400
+++ b/jmoleditor/jmoleditor.xml Thu Apr 24 13:19:33 2014 -0400
@@ -13,7 +13,7 @@
-
+
diff -r 7c1f9962ac07 -r c2055dd1927b tool_dependencies.xml
--- a/tool_dependencies.xml Fri Sep 27 15:51:46 2013 -0400
+++ b/tool_dependencies.xml Thu Apr 24 13:19:33 2014 -0400
@@ -1,6 +1,6 @@
-
+