Mercurial > repos > bgruening > chemical_data_sources
changeset 5:c2055dd1927b draft default tip
Uploaded
author | bgruening |
---|---|
date | Thu, 24 Apr 2014 13:19:33 -0400 |
parents | 7c1f9962ac07 |
children | |
files | get_online_data/get_online_data.py get_online_data/get_online_data.xml get_pubchem/get_pubchem_as_smiles.xml get_pubchem/get_pubchem_assays.py get_pubchem/get_pubchem_assays.xml jmoleditor/jmoleditor.xml tool_dependencies.xml |
diffstat | 7 files changed, 164 insertions(+), 6 deletions(-) [+] |
line wrap: on
line diff
--- a/get_online_data/get_online_data.py Fri Sep 27 15:51:46 2013 -0400 +++ b/get_online_data/get_online_data.py Thu Apr 24 13:19:33 2014 -0400 @@ -37,7 +37,7 @@ if len(sys.argv) > 3: allowed_extensions = [ ext.strip() for ext in unescape(sys.argv[3]).split('\n') ] else: - allowed_extensions = ['.sdf', '.smi', '.inchi'] + allowed_extensions = ['.sdf', '.smi', '.inchi', '.mol'] for url in urls.split('\n'): url = url.strip()
--- a/get_online_data/get_online_data.xml Fri Sep 27 15:51:46 2013 -0400 +++ b/get_online_data/get_online_data.xml Thu Apr 24 13:19:33 2014 -0400 @@ -6,8 +6,10 @@ get_online_data.py "$url_paste" $output $whitelist </command> <inputs> - <param name="url_paste" type="text" area="true" size="5x55" label="URL/Text" help="Here you may specify a list of URLs (one per line) or paste the contents of a file."/> - <param name="whitelist" type="text" area="true" size="10x20" label="Whitlist of filename extensions" help="Please specify a list of file extensions witch should be extracted, for example sdf, mol, smi. Every line one extension."/> + <param name="url_paste" type="text" area="true" size="5x55" label="URL" help="Here you may specify a list of URLs (one per line)."/> + <param name="whitelist" type="text" area="true" size="10x20" + label="Whitlist of filename extensions" + help="Please specify a list of file extensions witch should be extracted. (default: sdf, mol, smi, inchi). Every line one extension."/> </inputs> <outputs> <data format="txt" name="output" />
--- a/get_pubchem/get_pubchem_as_smiles.xml Fri Sep 27 15:51:46 2013 -0400 +++ b/get_pubchem/get_pubchem_as_smiles.xml Thu Apr 24 13:19:33 2014 -0400 @@ -6,7 +6,7 @@ <command interpreter="python"> get_pubchem_as_smiles.py -o $pubchem_smi - -p 4 + -p "\${GALAXY_SLOTS:-4}" ## temporary hack until my Galaxy patch is committed > /dev/null 2>&1 </command>
#!/usr/bin/env python
"""
Download the PubChem BioAssay CSV archives and merge them into one
tab-separated file.

File: get_pubchem/get_pubchem_assays.py

Every archive listed in the PubChem BioAssay CSV data directory is fetched
and unpacked in a worker process.  Each contained ``<assay_id>.csv.gz`` file
is filtered by its PUBCHEM_ACTIVITY_OUTCOME column, the URL column is dropped
and the assay id is inserted as the first column.  All per-archive results
are finally concatenated into the requested output file.
"""

__author__ = 'Bjoern Gruening'
__version__ = '0.1'
__date__ = '2014'
__license__ = 'GPL3+'

import argparse
import csv
import ftplib
import gzip
import os
import shutil
import subprocess
import sys
import tempfile
import urllib
import zipfile
from multiprocessing import Pool

try:
    from urllib.request import urlretrieve  # Python 3
except ImportError:
    from urllib import urlretrieve  # Python 2


PUBCHEM_HOST = "ftp.ncbi.nlm.nih.gov"
PUBCHEM_PATH = "/pubchem/Bioassay/CSV/Data/"
PUBCHEM_URL = "ftp://" + PUBCHEM_HOST + PUBCHEM_PATH

DEFAULT_WHITE_LIST = ('Active', 'Inconclusive', 'Inactive')

# Zero-based column positions in the PubChem assay CSV files.
OUTCOME_COLUMN = 2
URL_COLUMN = 4

# Numeric outcome codes as listed in the help of the accompanying Galaxy
# tool.  Accepting them lets the name based white list also match data files
# that store the code instead of the name.
OUTCOME_CODES = {
    '1': 'Inactive',
    '2': 'Active',
    '3': 'Inconclusive',
    '4': 'Unspecified',
    '5': 'Probe',
}


def _to_text(line):
    """Return *line* as ``str``; gzip hands out ``bytes`` on Python 3."""
    if isinstance(line, str):
        return line
    return line.decode('utf-8')


def filter_assay_rows(lines, assay_id, white_list):
    """
    Convert CSV data lines of one assay into tab-separated output lines.

    lines      -- iterable of CSV lines (str or bytes) without the header
    assay_id   -- value inserted as the first output column
    white_list -- PUBCHEM_ACTIVITY_OUTCOME names that should be kept

    Yields one newline-terminated, tab-separated line per kept row.  The URL
    column is removed.  Rows too short to contain the URL column (e.g. blank
    lines) are skipped.
    """
    allowed = set(white_list)
    # csv.reader keeps quoted fields that contain commas in one piece.
    for cols in csv.reader(_to_text(line) for line in lines):
        if len(cols) <= URL_COLUMN:
            continue
        outcome = cols[OUTCOME_COLUMN]
        if OUTCOME_CODES.get(outcome, outcome) not in allowed:
            continue
        del cols[URL_COLUMN]  # removing the URL column
        cols.insert(0, assay_id)  # insert assay_id as first column
        yield '\t'.join(cols) + '\n'


def main(output, processors=4, white_list=None):
    """
    Starting multiple processes to download and extract PubChem Assay data.

    output     -- path of the resulting tab-separated file
    processors -- number of worker processes
    white_list -- PUBCHEM_ACTIVITY_OUTCOME names to keep; defaults to
                  Active, Inconclusive and Inactive
    """
    if white_list is None:
        white_list = list(DEFAULT_WHITE_LIST)

    td = tempfile.mkdtemp()
    try:
        ftp = ftplib.FTP(PUBCHEM_HOST)
        try:
            ftp.login()
            # cwd() expects a path on the server, not a complete URL
            ftp.cwd(PUBCHEM_PATH)
            filelist = [name for name in ftp.nlst() if name.endswith('.zip')]
        finally:
            ftp.close()

        jobs = [(name, td, white_list) for name in filelist]
        pool = Pool(processes=processors)
        try:
            pool.map_async(fetch_convert, jobs).get()
            pool.close()
        except Exception:
            pool.terminate()
            raise
        finally:
            pool.join()

        # Concatenate bytewise; sorted() keeps the result reproducible.
        with open(output, 'wb') as output_handle:
            for filename in sorted(os.listdir(td)):
                with open(os.path.join(td, filename), 'rb') as part:
                    shutil.copyfileobj(part, output_handle)
    finally:
        shutil.rmtree(td)


def fetch_convert(args):
    """
    Download one archive and write its filtered content to ``<archive>.tsv``.

    args -- tuple ``(filename, td, white_list)``: name of the archive on the
            server, directory receiving the result, and the outcome names to
            keep.  A single tuple is used because the function is mapped
            over a process pool.
    """
    (filename, td, white_list) = args
    tmp_name = os.path.join(td, filename)
    urlretrieve(PUBCHEM_URL + filename, tmp_name)

    temp_dir = tempfile.mkdtemp()
    try:
        with zipfile.ZipFile(tmp_name, "r") as z:
            z.extractall(temp_dir)

        output = tmp_name + '.tsv'
        with open(output, 'w') as out_handle:
            for root, dirs, files in os.walk(temp_dir):
                dirs.sort()
                for member in sorted(files):
                    # the member name encodes the assay_id, it looks like
                    # 1.csv.gz; extract the assay id
                    assay_id = member.split('.', 1)[0]
                    gzfile_path = os.path.join(root, member)
                    with gzip.open(gzfile_path, 'rb') as gzfile:
                        gzfile.readline()  # skip first line
                        out_handle.writelines(
                            filter_assay_rows(gzfile, assay_id, white_list))
    finally:
        # neither the unpacked files nor the archive may end up in the
        # concatenated result or be left behind on disk
        shutil.rmtree(temp_dir)
        os.remove(tmp_name)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Download the PubChem BioAssay data and concatenate it into one tab-separated file.')
    parser.add_argument("-o", "--output", dest="output",
                    required=True,
                    help="Path to the output file.")
    parser.add_argument("-p", "--processors", dest="processors",
                    type=int, default=10,
                    help="How many processors you want to use.")
    parser.add_argument("-w", "--white-list", dest="white_list",
                    default="Active,Inconclusive,Inactive",
                    help="List of comma separated PUBCHEM_ACTIVITY_OUTCOME values that should be fetched.")

    options = parser.parse_args()
    main( options.output, options.processors, options.white_list.split(',') )
<!-- File: get_pubchem/get_pubchem_assays.xml
     Galaxy tool wrapper for get_pubchem_assays.py: downloads the PubChem
     BioAssay CSV data and merges it into one tabular dataset. -->
<tool id="ctb_pubchem_download_assays" name="PubChem Assay Downloader" version="0.1">
    <description>as tabular data</description>
    <command interpreter="python">
        get_pubchem_assays.py
            -o $pubchem_assay_tsv
            -p "\${GALAXY_SLOTS:-4}"
            --white-list $white_list
    </command>
    <stdio>
        <!-- Anything other than zero is an error -->
        <exit_code range="1:" />
        <exit_code range=":-1" />
        <!-- In case the return code has not been set properly check stderr too -->
        <regex match="Error:" />
        <regex match="Exception:" />
    </stdio>
    <inputs>
        <!-- The selected values are handed to the script via the white-list option -->
        <param name="white_list" type="select" multiple="true" label="PUBCHEM_ACTIVITY_OUTCOME values to keep">
            <option value="Active" selected="true">Active</option>
            <option value="Inconclusive" selected="true">Inconclusive</option>
            <option value="Inactive">Inactive</option>
            <option value="Unspecified">Unspecified</option>
            <option value="Probe">Probe</option>
        </param>
    </inputs>
    <outputs>
        <data format="tabular" name="pubchem_assay_tsv" />
    </outputs>
    <tests>
    </tests>
    <help>

.. class:: infomark

**What this tool does**

This tool will fetch one PubChem_ Assay file after another and concatenate them.
It is possible to optionally filter by PUBCHEM_ACTIVITY_OUTCOME.

Columns in the result file:

    - column 1: PubChem AID (assay id)
    - column 2: PubChem SID (substance id)
    - column 3: PubChem CID (compound id)
    - column 4: PubChem Activity Outcome
        1-Inactive
        2-Active
        3-Inconclusive
        4-Unspecified
        5-Probe
    - column 5: PubChem activity score, the higher value, the more active
    - column 6: Test result specific comment
    - column 7 and beyond: All remaining columns are the TID "names" defined in the associated assay description given by the XML file under the corresponding Description/ directory.
      These "names" can also be found in the "Result Definitions" section of the assay summary page: e.g. http://pubchem.ncbi.nlm.nih.gov/assay/assay.cgi?aid=2244#aDefinitions



.. _PubChem: http://pubchem.ncbi.nlm.nih.gov/

-----

.. class:: infomark

**Output**

The output will be one large tab-separated file.

    </help>
</tool>
--- a/jmoleditor/jmoleditor.xml Fri Sep 27 15:51:46 2013 -0400 +++ b/jmoleditor/jmoleditor.xml Thu Apr 24 13:19:33 2014 -0400 @@ -13,7 +13,7 @@ <request_param galaxy_name="data_type" remote_name="data_type" missing="txt" /> <request_param galaxy_name="output_label" remote_name="data_type" missing="txt" > <value_translation> - <value galaxy_value="Molecule" remote_value="mol" /> + <value galaxy_value="Molecule" remote_value="sdf" /> <value galaxy_value="SMILES" remote_value="smi" /> </value_translation> </request_param>
--- a/tool_dependencies.xml Fri Sep 27 15:51:46 2013 -0400 +++ b/tool_dependencies.xml Thu Apr 24 13:19:33 2014 -0400 @@ -1,6 +1,6 @@ <?xml version="1.0"?> <tool_dependency> <package name="openbabel" version="2.3.2"> - <repository changeset_revision="99a10425de93" name="package_openbabel_2_3" owner="iuc" toolshed="http://toolshed.g2.bx.psu.edu" /> + <repository changeset_revision="e5ef70185d24" name="package_openbabel_2_3" owner="iuc" toolshed="http://toolshed.g2.bx.psu.edu" /> </package> </tool_dependency>