# HG changeset patch
# User bgruening
# Date 1558525443 14400
# Node ID cd19c3fab3a60dcdf03859701b6c1f5ddb6e76ce
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/data_source/get_pubchem commit aed18d7d09e332efe57d00b33c2b8249abefaedb
diff -r 000000000000 -r cd19c3fab3a6 get_pubchem_as_smiles.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/get_pubchem_as_smiles.py Wed May 22 07:44:03 2019 -0400
@@ -0,0 +1,58 @@
+#!/usr/bin/env python
+
+__author__ = 'Bjoern Gruening'
+__version__ = '0.1'
+__date__ = '2012'
+__license__ = 'GLP3+'
+
+import ftplib
+import os, sys
+import argparse
+import subprocess
+from multiprocessing import Pool
+import tempfile
+import shutil
+
+
+def main(output, processors = 4):
+    # Download every SDF archive of PubChem's CURRENT-Full compound set,
+    # convert each to canonical SMILES in a worker pool, and concatenate
+    # the per-archive results into the single `output` file.
+    #
+    # output:     path of the combined SMILES file to write.
+    # processors: number of parallel download/convert workers.
+    output_handle = open(output,'w+')
+
+    # Work inside a throwaway directory so partial downloads never pollute
+    # the final output; it is removed at the end.
+    td = tempfile.mkdtemp()
+    ftp = ftplib.FTP('ftp.ncbi.nih.gov')
+    ftp.login()
+    ftp.cwd('/pubchem/Compound/CURRENT-Full/SDF/')
+    filelist = ftp.nlst()
+
+    # Pool.map passes a single argument, so pair each remote filename with
+    # the shared temp dir. map_async().get() blocks until all workers are
+    # done and re-raises any worker exception in this process.
+    pool = Pool(processes = processors)
+    filenames = zip(filelist, [td]*len(filelist))
+    result = pool.map_async(fetch_convert, filenames)
+    result.get()
+
+    # Concatenate every converted per-archive .smi file into the output.
+    for filename in os.listdir(td):
+        path = os.path.join(td, filename)
+        shutil.copyfileobj(open(path, 'rb'), output_handle)
+
+    output_handle.close()
+    shutil.rmtree(td)
+
+def fetch_convert(args):
+    # Worker body: download one SDF archive with wget, convert it to
+    # canonical SMILES with Open Babel, and delete the downloaded archive
+    # so only the .smi result remains for the caller to concatenate.
+    # `args` is a (filename, tempdir) tuple because Pool.map passes a
+    # single argument to the worker.
+    # NOTE(review): depends on external `wget` and `obabel` binaries on
+    # PATH; subprocess.call return codes are not checked, so a failed
+    # download/conversion is silently skipped.
+    (filename, td) = args
+    tmp_name = os.path.join( td, filename)
+    subprocess.call( ['wget', '-O', tmp_name, os.path.join('ftp://ftp.ncbi.nih.gov/pubchem/Compound/CURRENT-Full/SDF/', filename)] )
+    output = os.path.join(td, filename) + '.smi'
+    subprocess.call(["obabel", "-isdf", tmp_name, "-ocan", '-O', output])
+    os.remove(tmp_name)
+
+
+if __name__ == '__main__':
+    # Command-line entry point: parse the output path and worker count,
+    # then run the download/convert pipeline.
+    parser = argparse.ArgumentParser(description='Download the whole PubChem and converts it to canonical SMILES on the fly.')
+    parser.add_argument("-o", "--output", dest="output",
+        required=True,
+        help="Path to the output file.")
+    # NOTE(review): CLI default is 10 workers, while main() itself defaults
+    # to 4 when called programmatically.
+    parser.add_argument("-p", "--processors", dest="processors",
+        type=int, default=10,
+        help="How many processors you want to use.")
+
+    options = parser.parse_args()
+    main( options.output, options.processors )
+
diff -r 000000000000 -r cd19c3fab3a6 get_pubchem_as_smiles.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/get_pubchem_as_smiles.xml Wed May 22 07:44:03 2019 -0400
@@ -0,0 +1,58 @@
+
+ as canonical SMILES
+
+ openbabel
+ python
+
+
+
+
+
+
+
+ /dev/null 2>&1
+]]>
+
+
+
+
+
+
+
+
+
+
+
+
diff -r 000000000000 -r cd19c3fab3a6 get_pubchem_assays.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/get_pubchem_assays.py Wed May 22 07:44:03 2019 -0400
@@ -0,0 +1,88 @@
+#!/usr/bin/env python
+
+__author__ = 'Bjoern Gruening'
+__version__ = '0.1'
+__date__ = '2014'
+__license__ = 'GLP3+'
+
+import ftplib
+import os, sys
+import argparse
+import subprocess
+from multiprocessing import Pool
+import tempfile
+import shutil
+import urllib
+import zipfile
+import gzip
+
+
+PUBCHEM_URL = "ftp://ftp.ncbi.nlm.nih.gov/pubchem/Bioassay/CSV/Data/"
+
+def main(output, processors = 4, white_list = ['Active','Inconclusive', 'Inactive']):
+    """
+    Starting multiple processes to download and extract PubChem Assay data.
+
+    output:     path of the combined TSV file to write.
+    processors: number of parallel download/extract workers.
+    white_list: PUBCHEM_ACTIVITY_OUTCOME values to keep in the output.
+    """
+    # All per-archive .tsv results are staged in a temp dir, then
+    # concatenated into `output` and cleaned up.
+    td = tempfile.mkdtemp()
+    ftp = ftplib.FTP('ftp.ncbi.nih.gov')
+    ftp.login()
+    ftp.cwd( PUBCHEM_URL )
+    filelist = ftp.nlst()
+
+    # Pool.map passes one argument per call, so bundle (filename, tempdir,
+    # white_list) triples. map_async().get() blocks until all workers finish
+    # and re-raises any worker exception here.
+    pool = Pool(processes = processors)
+    triplestore = zip(filelist, [td]*len(filelist), [white_list]*len(filelist))
+
+    result = pool.map_async(fetch_convert, triplestore)
+    result.get()
+
+    # Concatenate every per-archive TSV into the final output file.
+    with open(output,'w+') as output_handle:
+        for filename in os.listdir( td ):
+            path = os.path.join( td, filename )
+            shutil.copyfileobj(open(path, 'rb'), output_handle)
+
+    shutil.rmtree( td )
+
+def fetch_convert(args):
+ (filename, td, white_list) = args
+ tmp_name = os.path.join( td, filename)
+ urllib.urlretrieve(os.path.join(PUBCHEM_URL, filename), tmp_name)
+
+ temp_dir = tempfile.mkdtemp()
+ with zipfile.ZipFile(tmp_name, "r") as z:
+ z.extractall(temp_dir)
+
+ output = os.path.join(td, filename) + '.tsv'
+ with open(output, 'w+') as out_handle:
+ for root, dirs, files in os.walk( temp_dir ):
+ for filename in files:
+ # filename encodes the assay_id, it looks like 1.csv.gz
+ # extract the assay id and insert it as column one
+ assay_id = filename.split('.', 1)
+ gzfile_path = os.path.join( root, filename )
+ with gzip.open(gzfile_path, 'rb') as gzfile:
+ gzfile.readline() # skip first line
+ for line in gzfile:
+ cols = line.split(',')
+ PUBCHEM_ACTIVITY_OUTCOME = cols[2]
+ cols = line.pop(4) # removing the URL column
+ cols.insert(0, assay_id) # insert assay_id as first column
+ if PUBCHEM_ACTIVITY_OUTCOME in white_list:
+ out_handle.write( '%s' % line.replace(',', '\t') )
+ os.remove(tmp_name)
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser(description='Download the whole PubChem and converts it to canonical SMILES on the fly.')
+ parser.add_argument("-o", "--output", dest="output",
+ required=True,
+ help="Path to the output file.")
+ parser.add_argument("-p", "--processors", dest="processors",
+ type=int, default=10,
+ help="How many processors you want to use.")
+ parser.add_argument("-w", "--white-list", dest="white_list",
+ default="Active,Inconclusive,Inactive",
+ help="List of comma separated PUBCHEM_ACTIVITY_OUTCOME values that should be fetched.")
+
+ options = parser.parse_args()
+ main( options.output, options.processors, options.white_list.split(',') )
+
diff -r 000000000000 -r cd19c3fab3a6 get_pubchem_assays.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/get_pubchem_assays.xml Wed May 22 07:44:03 2019 -0400
@@ -0,0 +1,67 @@
+
+ as table
+
+ python
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+