# HG changeset patch
# User bgruening
# Date 1558525443 14400
# Node ID cd19c3fab3a60dcdf03859701b6c1f5ddb6e76ce
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/data_source/get_pubchem commit aed18d7d09e332efe57d00b33c2b8249abefaedb

diff -r 000000000000 -r cd19c3fab3a6 get_pubchem_as_smiles.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/get_pubchem_as_smiles.py Wed May 22 07:44:03 2019 -0400
@@ -0,0 +1,92 @@
+#!/usr/bin/env python
+
+__author__ = 'Bjoern Gruening'
+__version__ = '0.1'
+__date__ = '2012'
+__license__ = 'GLP3+'
+
+import ftplib
+import os, sys
+import argparse
+import subprocess
+from multiprocessing import Pool
+import tempfile
+import shutil
+
+
+def main(output, processors = 4):
+    """
+    Download every file of the PubChem Compound SDF directory, convert each
+    one to canonical SMILES in parallel and concatenate the results.
+
+    output     -- path of the file receiving the concatenated SMILES
+    processors -- number of worker processes for download and conversion
+    """
+    ftp = ftplib.FTP('ftp.ncbi.nih.gov')
+    try:
+        ftp.login()
+        ftp.cwd('/pubchem/Compound/CURRENT-Full/SDF/')
+        filelist = ftp.nlst()
+    finally:
+        # the connection is only needed for the directory listing
+        ftp.close()
+
+    td = tempfile.mkdtemp()
+    pool = Pool(processes = processors)
+    try:
+        filenames = [(filename, td) for filename in filelist]
+        result = pool.map_async(fetch_convert, filenames)
+        result.get()
+        pool.close()
+        pool.join()
+
+        # The parts are read as bytes, so the target has to be opened in
+        # binary mode too; a text handle rejects bytes on Python 3.
+        with open(output, 'wb') as output_handle:
+            for filename in os.listdir(td):
+                path = os.path.join(td, filename)
+                with open(path, 'rb') as part:
+                    shutil.copyfileobj(part, output_handle)
+    finally:
+        # never leave worker processes or the scratch directory behind
+        pool.terminate()
+        shutil.rmtree(td, ignore_errors=True)
+
+
+def fetch_convert(args):
+    """
+    Worker: download one file into the scratch directory and convert it to
+    canonical SMILES with Open Babel.
+
+    args -- tuple (filename, td) of remote file name and scratch directory,
+            packed into one value because the pool passes a single argument
+    """
+    (filename, td) = args
+    tmp_name = os.path.join(td, filename)
+    output = tmp_name + '.smi'
+    # plain concatenation: os.path.join is for file system paths, not URLs
+    url = 'ftp://ftp.ncbi.nih.gov/pubchem/Compound/CURRENT-Full/SDF/' + filename
+    try:
+        if subprocess.call(['wget', '-O', tmp_name, url]) != 0:
+            sys.stderr.write('Download of %s failed, skipping.\n' % url)
+            return
+        if subprocess.call(["obabel", "-isdf", tmp_name, "-ocan", '-O', output]) != 0:
+            sys.stderr.write('Conversion of %s failed.\n' % filename)
+    finally:
+        # the raw download must not end up in the concatenated output
+        if os.path.exists(tmp_name):
+            os.remove(tmp_name)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Download the whole PubChem and converts it to canonical SMILES on the fly.')
+    parser.add_argument("-o", "--output", dest="output",
+                    required=True,
+                    help="Path to the output file.")
+    parser.add_argument("-p", "--processors", dest="processors",
+                    type=int, default=10,
+                    help="How many processors you want to use.")
+
+    options = parser.parse_args()
+    main( options.output, options.processors )
+
diff -r 000000000000 -r cd19c3fab3a6 get_pubchem_as_smiles.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/get_pubchem_as_smiles.xml Wed May 22 07:44:03 2019 -0400
@@ -0,0 +1,58 @@
+
+ as canonical SMILES
+
+ openbabel
+ python
+
+
+
+
+
+
+
+ /dev/null 2>&1
+]]>
+
+
+
+
+
+
+
+
+
+
+
+
diff -r 000000000000 -r cd19c3fab3a6 get_pubchem_assays.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/get_pubchem_assays.py Wed May 22 07:44:03 2019 -0400
@@ -0,0 +1,127 @@
+#!/usr/bin/env python
+
+__author__ = 'Bjoern Gruening'
+__version__ = '0.1'
+__date__ = '2014'
+__license__ = 'GLP3+'
+
+import ftplib
+import os, sys
+import argparse
+import subprocess
+from multiprocessing import Pool
+import tempfile
+import shutil
+import urllib
+import zipfile
+import gzip
+import io
+
+try:
+    from urllib.request import urlretrieve  # Python 3
+except ImportError:
+    from urllib import urlretrieve  # Python 2
+
+
+PUBCHEM_URL = "ftp://ftp.ncbi.nlm.nih.gov/pubchem/Bioassay/CSV/Data/"
+# ftplib wants the bare directory on the server, not a complete URL
+PUBCHEM_PATH = "/pubchem/Bioassay/CSV/Data/"
+
+def main(output, processors = 4, white_list = ('Active', 'Inconclusive', 'Inactive')):
+    """
+    Starting multiple processes to download and extract PubChem Assay data.
+
+    output     -- path of the tab-separated file that receives all rows
+    processors -- number of worker processes
+    white_list -- PUBCHEM_ACTIVITY_OUTCOME values of the rows to keep
+    """
+    ftp = ftplib.FTP('ftp.ncbi.nih.gov')
+    try:
+        ftp.login()
+        ftp.cwd( PUBCHEM_PATH )
+        filelist = ftp.nlst()
+    finally:
+        # the connection is only needed for the directory listing
+        ftp.close()
+
+    td = tempfile.mkdtemp()
+    pool = Pool(processes = processors)
+    try:
+        triplestore = [(filename, td, white_list) for filename in filelist]
+        result = pool.map_async(fetch_convert, triplestore)
+        result.get()
+        pool.close()
+        pool.join()
+
+        # The parts are read as bytes, so the target has to be opened in
+        # binary mode too; a text handle rejects bytes on Python 3.
+        with open(output, 'wb') as output_handle:
+            for filename in os.listdir( td ):
+                path = os.path.join( td, filename )
+                with open(path, 'rb') as part:
+                    shutil.copyfileobj(part, output_handle)
+    finally:
+        # never leave worker processes or the scratch directory behind
+        pool.terminate()
+        shutil.rmtree( td, ignore_errors=True )
+
+
+def fetch_convert(args):
+    """
+    Worker: download one zip archive, unpack it and write the white-listed
+    rows of every contained gzipped CSV file to <td>/<filename>.tsv.
+
+    args -- tuple (filename, td, white_list), packed into one value because
+            the pool passes a single argument
+    """
+    (filename, td, white_list) = args
+    tmp_name = os.path.join( td, filename)
+    output = tmp_name + '.tsv'
+    temp_dir = tempfile.mkdtemp()
+    try:
+        # plain concatenation: os.path.join is for file system paths, not URLs
+        urlretrieve(PUBCHEM_URL + filename, tmp_name)
+        with zipfile.ZipFile(tmp_name, "r") as z:
+            z.extractall(temp_dir)
+
+        with io.open(output, 'w', encoding='utf-8') as out_handle:
+            for root, dirs, files in os.walk( temp_dir ):
+                for gz_name in files:
+                    # gz_name encodes the assay_id, it looks like 1.csv.gz
+                    # extract the assay id and insert it as column one
+                    assay_id = gz_name.split('.', 1)[0]
+                    gzfile_path = os.path.join( root, gz_name )
+                    with gzip.open(gzfile_path, 'rb') as gzfile:
+                        gzfile.readline() # skip first line
+                        for raw_line in gzfile:
+                            cols = raw_line.decode('utf-8', 'replace').rstrip('\r\n').split(',')
+                            if len(cols) < 5:
+                                continue # blank or truncated row
+                            if cols[2] not in white_list:
+                                continue
+                            cols.pop(4) # removing the URL column
+                            cols.insert(0, assay_id) # insert assay_id as first column
+                            out_handle.write( u'\t'.join(cols) + u'\n' )
+    finally:
+        # drop the unpacked archive and the raw download in any case
+        shutil.rmtree(temp_dir, ignore_errors=True)
+        if os.path.exists(tmp_name):
+            os.remove(tmp_name)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Download the PubChem BioAssay data and convert it to one tab-separated table.')
+    parser.add_argument("-o", "--output", dest="output",
+                    required=True,
+                    help="Path to the output file.")
+    parser.add_argument("-p", "--processors", dest="processors",
+                    type=int, default=10,
+                    help="How many processors you want to use.")
+    parser.add_argument("-w", "--white-list",
+                    dest="white_list",
+                    default="Active,Inconclusive,Inactive",
+                    help="List of comma separated PUBCHEM_ACTIVITY_OUTCOME values that should be fetched.")
+
+    options = parser.parse_args()
+    main( options.output, options.processors, options.white_list.split(',') )
+
diff -r 000000000000 -r cd19c3fab3a6 get_pubchem_assays.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/get_pubchem_assays.xml Wed May 22 07:44:03 2019 -0400
@@ -0,0 +1,67 @@
+
+ as table
+
+ python
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+