Mercurial > repos > bgruening > chemical_data_sources
changeset 0:f653fd06f055 draft
Uploaded
author | bgruening |
---|---|
date | Thu, 15 Aug 2013 03:23:17 -0400 |
parents | |
children | 17a3f755d472 |
files | get_online_data/get_online_data.py get_online_data/get_online_data.xml get_pubchem/get_pubchem_as_smiles.py get_pubchem/get_pubchem_as_smiles.xml repository_dependencies.xml |
diffstat | 5 files changed, 225 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/get_online_data/get_online_data.py Thu Aug 15 03:23:17 2013 -0400 @@ -0,0 +1,89 @@ +#!/usr/bin/env python + +__author__ = 'Bjoern Gruening' +__version__ = '0.1' +__date__ = '2012' +__license__ = 'GLP3+' + +import os, sys +import urllib2 +import gzip, tempfile +import zipfile +import subprocess +import shutil + +def unescape(cond_text): + # Unescape if input has been escaped + mapped_chars = { '>' :'__gt__', + '<' :'__lt__', + "'" :'__sq__', + '"' :'__dq__', + '[' :'__ob__', + ']' :'__cb__', + '{' :'__oc__', + '}' :'__cc__', + '@' : '__at__', + '\n' : '__cn__', + '\r' : '__cr__', + '\t' : '__tc__' + } + for key, value in mapped_chars.items(): + cond_text = cond_text.replace( value, key ) + return cond_text + +urls = unescape(sys.argv[1]) +out = open(sys.argv[2], 'wb') + +if len(sys.argv) > 3: + allowed_extensions = [ ext.strip() for ext in unescape(sys.argv[3]).split('\n') ] +else: + allowed_extensions = ['.sdf', '.smi', '.inchi'] + +for url in urls.split('\n'): + url = url.strip() + request = urllib2.Request( url ) + request.add_header('Accept-encoding', 'gzip') + request.add_header('Accept-encoding', 'gz') + response = urllib2.urlopen( request ) + + if response.info().get('Content-Encoding') in ['gz','gzip'] or os.path.splitext(url)[-1] in ['.gz','.gzip']: + temp = tempfile.NamedTemporaryFile( delete=False ) + temp.write( response.read() ) + temp.close() + zipfile = gzip.open(temp.name, 'rb') + out.write( zipfile.read() ) + os.remove(temp.name) + elif response.info().get('Content-Encoding') in ['zip'] or os.path.splitext(url)[-1] in ['.zip']: + temp = tempfile.NamedTemporaryFile(delete=False) + temp.close() + with open(temp.name, 'wb') as fp: + shutil.copyfileobj(response, fp) + + zf = zipfile.ZipFile(temp.name, allowZip64=True) + tmpdir = tempfile.mkdtemp( ) + + for filename in zf.namelist(): + zf.extractall( tmpdir ) + + os.remove( temp.name ) + molfiles = [] + for root, dirs, files in os.walk(tmpdir): + for 
filename in files: + if os.path.splitext(filename)[-1].lower() in allowed_extensions or allowed_extensions == []: + mfile = os.path.join( root, filename) + molfiles.append( mfile ) + + for filename in molfiles: + shutil.copyfileobj(open(filename, 'rb'), out) + shutil.rmtree( tmpdir ) + zf.close() + elif response.info().get('Content-Encoding') == 'rar' or os.path.splitext(url)[-1] in ['.rar']: + temp = tempfile.NamedTemporaryFile(delete=False) + temp.close() + with open(temp.name, 'wb') as fp: + shutil.copyfileobj(response, fp) + cmd = subprocess.Popen('unrar p -inul %s' % temp.name, stdout=out, shell=True) + os.remove( temp.name ) + else: + out.write( response.read() ) +out.close()
<tool id="ctb_online_data_fetch" name="Online data" version="0.2">
    <description>
        fetching ...
    </description>
    <command interpreter="python">
        get_online_data.py "$url_paste" $output $whitelist
    </command>
    <inputs>
        <param name="url_paste" type="text" area="true" size="5x55" label="URL/Text" help="Here you may specify a list of URLs (one per line) or paste the contents of a file."/>
        <!-- Typo fixes: 'Whitlist' -> 'Whitelist', 'witch' -> 'which' -->
        <param name="whitelist" type="text" area="true" size="10x20" label="Whitelist of filename extensions" help="Please specify a list of file extensions which should be extracted, for example sdf, mol, smi. One extension per line."/>
    </inputs>
    <outputs>
        <data format="txt" name="output" />
    </outputs>

    <help>

.. class:: infomark

**What this tool does**

Fetch data via FTP or HTTP and store them in your history.

-----

.. class:: infomark

**Input**

Supported filetypes are:
 - gz/gzip
 - rar

ZIP is supported with recursive extracting of specific filetypes.

    </help>
</tool>
#!/usr/bin/env python
"""Download all of PubChem Compound and convert it to canonical SMILES.

Lists every SDF archive on the NCBI FTP server, fetches and converts each
one in a worker pool (wget + obabel), then concatenates the per-archive
SMILES files into a single output file.
"""

__author__ = 'Bjoern Gruening'
__version__ = '0.1'
__date__ = '2012'
__license__ = 'GPL3+'

import ftplib
import os, sys
import argparse
import subprocess
from multiprocessing import Pool
import tempfile
import shutil

# FTP location of the full PubChem Compound SDF dump.
PUBCHEM_DIR = '/pubchem/Compound/CURRENT-Full/SDF/'
PUBCHEM_URL = 'ftp://ftp.ncbi.nih.gov' + PUBCHEM_DIR


def main(output, processors=10):
    """Fetch+convert every PubChem SDF archive and merge results into *output*.

    processors: size of the multiprocessing worker pool.
    """
    td = tempfile.mkdtemp()
    ftp = ftplib.FTP('ftp.ncbi.nih.gov')
    ftp.login()
    ftp.cwd(PUBCHEM_DIR)
    filelist = ftp.nlst()
    # Close the listing connection; the workers download via wget.
    ftp.quit()

    pool = Pool(processes=processors)
    try:
        # zip() materialized as a list so it works on Python 3 too.
        result = pool.map_async(fetch_convert,
                                list(zip(filelist, [td] * len(filelist))))
        # .get() re-raises any worker exception here.
        result.get()
    finally:
        # BUGFIX: the original never closed/joined the pool.
        pool.close()
        pool.join()

    # BUGFIX: copy bytes-to-bytes ('wb'/'rb'); the original opened the
    # output in text mode ('w+') while reading inputs in binary, and
    # leaked one file handle per archive.
    with open(output, 'wb') as output_handle:
        for filename in os.listdir(td):
            path = os.path.join(td, filename)
            with open(path, 'rb') as fh:
                shutil.copyfileobj(fh, output_handle)

    shutil.rmtree(td)


def fetch_convert(args):
    """Worker: download one SDF archive and convert it to canonical SMILES.

    args is a (filename, target_dir) tuple — a single argument because
    Pool.map_async passes exactly one object per task.
    """
    (filename, td) = args

    tmp_name = os.path.join(tempfile.gettempdir(), filename)
    subprocess.call(['wget', '-O', tmp_name, PUBCHEM_URL + filename])
    output = os.path.join(td, filename)
    subprocess.call(['obabel', '-isdf', tmp_name, '-ocan', '-O', output])
    os.remove(tmp_name)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Download the whole PubChem and converts it to canonical SMILES on the fly.')
    parser.add_argument("-o", "--output", dest="output",
                        required=True,
                        help="Path to the output file.")
    parser.add_argument("-p", "--processors", dest="processors",
                        type=int, default=10,
                        help="How many processors you want to use.")

    options = parser.parse_args()
    main(options.output, options.processors)
<!-- BUGFIX: the version attribute must be lowercase ('Version' is not a
     valid tool attribute), and the shell redirection inside <command>
     must be XML-escaped to keep the document well-formed. -->
<tool id="ctb_pubchem_download_as_smiles" name="PubChem Download" version="0.1">
    <description>as canonical SMILES</description>
    <command interpreter="python">
        get_pubchem_as_smiles.py
            -o $pubchem_smi
            -p 10
        2&gt;&amp;1
    </command>
    <inputs>
        <param name="infile" type="select" display="radio" size="250" label="Load all pubchem files and convert them to canonical smiles." />
    </inputs>
    <outputs>
        <data format="smi" name="pubchem_smi" />
    </outputs>
    <tests>
    </tests>
    <help>

.. class:: infomark

**What this tool does**

This tool will fetch one PubChem_ file after another and convert them to canonical SMILES.

.. _PubChem: http://pubchem.ncbi.nlm.nih.gov/

-----

.. class:: infomark

**Output**

The output will be one large SMILES file.

    </help>
</tool>
<?xml version="1.0"?>
<!-- Tool Shed repository dependency: pulls in the molecule datatype
     definitions (SMILES, InChI, SD-format) that the tools in this
     repository emit, pinned to a specific changeset of the iuc-owned
     'molecule_datatypes' repository on the main Galaxy Tool Shed. -->
<repositories description="This requires the Molecule datatype definitions (e.g. SMILES, InChI, SD-format).">
    <repository changeset_revision="85eca06eefc6" name="molecule_datatypes" owner="iuc" toolshed="http://toolshed.g2.bx.psu.edu" />
</repositories>