# HG changeset patch # User iuc # Date 1616328454 0 # Node ID 5558f74bd2968b08699a66d85580de9e048f9f36 # Parent 5a0d0bee4f8dd1c344c8ff1ae05ec6a0ee08ceb2 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_diamond_database_builder commit 75abf7d4b23ed7ae8abce80609d81b20bc882863" diff -r 5a0d0bee4f8d -r 5558f74bd296 data_manager/data_manager_diamond_database_builder.py --- a/data_manager/data_manager_diamond_database_builder.py Tue Dec 03 17:39:48 2019 -0500 +++ b/data_manager/data_manager_diamond_database_builder.py Sun Mar 21 12:07:34 2021 +0000 @@ -1,17 +1,19 @@ #!/usr/bin/env python +import bz2 +import gzip import json -import sys -import os -import tempfile -import shutil import optparse -import urllib2 +import os +import shutil import subprocess -from ftplib import FTP +import sys import tarfile +import tempfile +import urllib.error +import urllib.parse +import urllib.request import zipfile -import gzip -import bz2 +from ftplib import FTP CHUNK_SIZE = 2**20 # 1mb @@ -21,11 +23,6 @@ shutil.rmtree(tmp_dir) -def stop_err(msg): - sys.stderr.write(msg) - sys.exit(1) - - def _get_files_in_ftp_path(ftp, path): path_contents = [] ftp.retrlines('MLSD %s' % (path), path_contents.append) @@ -54,12 +51,17 @@ return [bz2.BZ2File(file_obj.name, 'rb')] -def download_from_ncbi(data_manager_dict, params, target_directory, database_id, database_name): +def download_from_ncbi(data_manager_dict, params, target_directory, + database_id, database_name): NCBI_FTP_SERVER = 'ftp.ncbi.nlm.nih.gov' NCBI_DOWNLOAD_PATH = '/blast/db/FASTA/' - COMPRESSED_EXTENSIONS = [('.tar.gz', _get_stream_readers_for_tar), ('.tar.bz2', _get_stream_readers_for_tar), ('.zip', _get_stream_readers_for_zip), ('.gz', _get_stream_readers_for_gzip), ('.bz2', _get_stream_readers_for_bz2)] + COMPRESSED_EXTENSIONS = [('.tar.gz', _get_stream_readers_for_tar), + ('.tar.bz2', _get_stream_readers_for_tar), + ('.zip', _get_stream_readers_for_zip), + ('.gz', _get_stream_readers_for_gzip), + ('.bz2', _get_stream_readers_for_bz2)] - ncbi_identifier = params['param_dict']['reference_source']['requested_identifier'] + ncbi_identifier = params['reference_source']['requested_identifier'] ftp = FTP(NCBI_FTP_SERVER) ftp.login() @@ -79,9 +81,9 @@ tmp_dir = tempfile.mkdtemp(prefix='tmp-data-manager-ncbi-') ncbi_fasta_filename = os.path.join(tmp_dir, "%s%s" % (ncbi_identifier, ext)) - fasta_base_filename = "%s.fa" % database_id - fasta_filename = os.path.join(target_directory, fasta_base_filename) - fasta_writer = open(fasta_filename, 'wb+') + # fasta_base_filename = "%s.fa" % database_id + # fasta_filename = os.path.join(target_directory, fasta_base_filename) + # fasta_writer = open(fasta_filename, 'wb+') tmp_extract_dir = os.path.join(tmp_dir, 'extracted_fasta') os.mkdir(tmp_extract_dir) @@ -106,8 +108,8 @@ def download_from_url(data_manager_dict, params, target_directory, database_id, database_name): # TODO: we should automatically do decompression here - urls = filter(bool, map(lambda x: x.strip(), params['param_dict']['reference_source']['user_url'].split('\n'))) - fasta_reader = [urllib2.urlopen(url) for url in urls] + urls = list(filter(bool, [x.strip() for x in params['reference_source']['user_url'].split('\n')])) + fasta_reader = [urllib.request.urlopen(url) for url in urls] data_table_entry = _stream_fasta_to_file(fasta_reader, target_directory, database_id, database_name, params) _add_data_table_entry(data_manager_dict, data_table_entry) @@ -115,19 +117,19 @@ def download_from_history(data_manager_dict, params, target_directory, database_id, database_name): # TODO: allow multiple FASTA input files - input_filename = params['param_dict']['reference_source']['input_fasta'] + input_filename = params['reference_source']['input_fasta'] if isinstance(input_filename, list): fasta_reader = [open(filename, 'rb') for filename in input_filename] else: - fasta_reader = open(input_filename) + fasta_reader = open(input_filename, 'rb') data_table_entry = _stream_fasta_to_file(fasta_reader, target_directory, database_id, database_name, params) _add_data_table_entry(data_manager_dict, data_table_entry) def copy_from_directory(data_manager_dict, params, target_directory, database_id, database_name): - input_filename = params['param_dict']['reference_source']['fasta_filename'] - create_symlink = params['param_dict']['reference_source']['create_symlink'] == 'create_symlink' + input_filename = params['reference_source']['fasta_filename'] + create_symlink = params['reference_source']['create_symlink'] == 'create_symlink' if create_symlink: data_table_entry = _create_symlink(input_filename, target_directory, database_id, database_name) else: @@ -146,7 +148,8 @@ return data_manager_dict -def _stream_fasta_to_file(fasta_stream, target_directory, database_id, database_name, params, close_stream=True): +def _stream_fasta_to_file(fasta_stream, target_directory, database_id, + database_name, params, close_stream=True): fasta_base_filename = "%s.fa" % database_id fasta_filename = os.path.join(target_directory, fasta_base_filename) @@ -154,53 +157,70 @@ temp_fasta.close() fasta_writer = open(temp_fasta.name, 'wb+') - if isinstance(fasta_stream, list) and len(fasta_stream) == 1: - fasta_stream = fasta_stream[0] + if not isinstance(fasta_stream, list): + fasta_stream = [fasta_stream] - if isinstance(fasta_stream, list): - last_char = None - for fh in fasta_stream: - if last_char not in [None, '\n', '\r']: - fasta_writer.write('\n') - while True: - data = fh.read(CHUNK_SIZE) - if data: - fasta_writer.write(data) - last_char = data[-1] - else: - break - if close_stream: - fh.close() - else: + last_char = None + for fh in fasta_stream: + if last_char not in [None, '\n', '\r']: + fasta_writer.write('\n') while True: - data = fasta_stream.read(CHUNK_SIZE) + data = fh.read(CHUNK_SIZE) if data: fasta_writer.write(data) + last_char = data[-1] else: break if close_stream: - fasta_stream.close() + fh.close() fasta_writer.close() - args = ['diamond', 'makedb', '--in', temp_fasta.name, '--db', fasta_filename] + args = ['diamond', 'makedb', + '--in', temp_fasta.name, + '--db', fasta_filename] + if params['tax_cond']['tax_select'] == "history": + for i in ["taxonmap", "taxonnodes", "taxonnames"]: + args.extend(['--' + i, params['tax_cond'][i]]) + elif params['tax_cond']['tax_select'] == "ncbi": + if os.path.isfile(os.path.join(params['tax_cond']['ncbi_tax'], 'prot.accession2taxid.FULL.gz')): + args.extend(['--taxonmap', + os.path.join(params['tax_cond']['ncbi_tax'], 'prot.accession2taxid.FULL.gz')]) + elif os.path.isfile(os.path.join(params['tax_cond']['ncbi_tax'], 'prot.accession2taxid.FULL')): + args.extend(['--taxonmap', + os.path.join(params['tax_cond']['ncbi_tax'], 'prot.accession2taxid.FULL')]) + elif os.path.isfile(os.path.join(params['tax_cond']['ncbi_tax'], 'prot.accession2taxid.gz')): + args.extend(['--taxonmap', + os.path.join(params['tax_cond']['ncbi_tax'], 'prot.accession2taxid.gz')]) + elif os.path.isfile(os.path.join(params['tax_cond']['ncbi_tax'], 'prot.accession2taxid')): + args.extend(['--taxonmap', + os.path.join(params['tax_cond']['ncbi_tax'], 'prot.accession2taxid')]) + else: + raise Exception('Unable to find prot.accession2taxid file in %s' % (params['tax_cond']['ncbi_tax'])) + + args.extend(['--taxonnodes', + os.path.join(params['tax_cond']['ncbi_tax'], 'nodes.dmp')]) + args.extend(['--taxonnames', + os.path.join(params['tax_cond']['ncbi_tax'], 'names.dmp')]) tmp_stderr = tempfile.NamedTemporaryFile(prefix="tmp-data-manager-diamond-database-builder-stderr") - proc = subprocess.Popen(args=args, shell=False, cwd=target_directory, stderr=tmp_stderr.fileno()) + proc = subprocess.Popen(args=args, shell=False, cwd=target_directory, + stderr=tmp_stderr.fileno()) return_code = proc.wait() if return_code: tmp_stderr.flush() tmp_stderr.seek(0) - print >> sys.stderr, "Error building diamond database:" + print("Error building diamond database:", file=sys.stderr) while True: chunk = tmp_stderr.read(CHUNK_SIZE) if not chunk: break - sys.stderr.write(chunk) + sys.stderr.write(chunk.decode('utf-8')) sys.exit(return_code) tmp_stderr.close() os.remove(temp_fasta.name) - return dict(value=database_id, name=database_name, db_path="%s.dmnd" % fasta_base_filename) + return dict(value=database_id, name=database_name, + db_path="%s.dmnd" % fasta_base_filename) def _create_symlink(input_filename, target_directory, database_id, database_name): @@ -210,27 +230,36 @@ return dict(value=database_id, name=database_name, db_path=fasta_base_filename) -REFERENCE_SOURCE_TO_DOWNLOAD = dict(ncbi=download_from_ncbi, url=download_from_url, history=download_from_history, directory=copy_from_directory) +REFERENCE_SOURCE_TO_DOWNLOAD = dict(ncbi=download_from_ncbi, + url=download_from_url, + history=download_from_history, + directory=copy_from_directory) def main(): # Parse Command Line parser = optparse.OptionParser() - parser.add_option('-d', '--dbkey_description', dest='dbkey_description', action='store', type="string", default=None, help='dbkey_description') + parser.add_option('-d', '--dbkey_description', dest='dbkey_description', + action='store', type="string", default=None, + help='dbkey_description') (options, args) = parser.parse_args() filename = args[0] - params = json.loads(open(filename).read()) + with open(filename) as fp: + params = json.load(fp) target_directory = params['output_data'][0]['extra_files_path'] os.mkdir(target_directory) data_manager_dict = {} - database_id = params['param_dict']['database_id'] - database_name = params['param_dict']['database_name'] + param_dict = params['param_dict'] + database_id = param_dict['database_id'] + database_name = param_dict['database_name'] + if param_dict['tax_cond']['tax_select'] == "ncbi": + param_dict['tax_cond']['ncbi_tax'] = args[1] # Fetch the FASTA - REFERENCE_SOURCE_TO_DOWNLOAD[params['param_dict']['reference_source']['reference_source_selector']](data_manager_dict, params, target_directory, database_id, database_name) + REFERENCE_SOURCE_TO_DOWNLOAD[param_dict['reference_source']['reference_source_selector']](data_manager_dict, param_dict, target_directory, database_id, database_name) # save info to json file open(filename, 'w').write(json.dumps(data_manager_dict, sort_keys=True)) diff -r 5a0d0bee4f8d -r 5558f74bd296 data_manager/data_manager_diamond_database_builder.xml --- a/data_manager/data_manager_diamond_database_builder.xml Tue Dec 03 17:39:48 2019 -0500 +++ b/data_manager/data_manager_diamond_database_builder.xml Sun Mar 21 12:07:34 2021 +0000 @@ -1,36 +1,60 @@ - + Database builder - diamond - python + diamond + python - + + #if $tax_cond.tax_select == "ncbi" + '$tax_cond.ncbi_tax.fields.path' + #end if + ]]> - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -44,16 +68,41 @@ + + + + + + + + + + + + + + + + + + + .. class:: infomark -NCBI databases can be downded from ftp://ftp.ncbi.nlm.nih.gov/blast/db/FASTA/ +NCBI databases can be downloaded from ftp://ftp.ncbi.nlm.nih.gov/blast/db/FASTA/ For example the NR database can be downloaded from ftp://ftp.ncbi.nlm.nih.gov/blast/db/FASTA/nr.gz + +- taxonmap: Path to mapping file that maps NCBI protein accession numbers to taxon ids (gzip compressed). This parameter is optional and needs to be supplied in order to provide taxonomy features. The file can be downloaded from NCBI: ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/prot.accession2taxid.FULL.gz + +- taxonnames: Path to the names.dmp file from the NCBI taxonomy. This parameter is optional and needs to be supplied in order to provide taxonomy features. The file is contained within this archive downloadable at NCBI: ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdmp.zip + +- taxonnodes: Path to the nodes.dmp file from the NCBI taxonomy. This parameter is optional and needs to be supplied in order to provide taxonomy features. The file is contained within this archive downloadable at NCBI: ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdmp.zip 10.1038/nmeth.3176 diff -r 5a0d0bee4f8d -r 5558f74bd296 data_manager_conf.xml --- a/data_manager_conf.xml Tue Dec 03 17:39:48 2019 -0500 +++ b/data_manager_conf.xml Sun Mar 21 12:07:34 2021 +0000 @@ -1,13 +1,12 @@ - + - diamond_database/${value} ${GALAXY_DATA_MANAGER_DATA_PATH}/diamond_database/${value}/${db_path} diff -r 5a0d0bee4f8d -r 5558f74bd296 test-data/diamond_data_manager_ncbi.json --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/diamond_data_manager_ncbi.json Sun Mar 21 12:07:34 2021 +0000 @@ -0,0 +1,1 @@ +{"data_tables": {"diamond_database": [{"db_path": "pdbaa.fa.dmnd", "name": "pdbaa", "value": "pdbaa"}]}} \ No newline at end of file diff -r 5a0d0bee4f8d -r 5558f74bd296 test-data/ncbi_accession2taxid.loc --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/ncbi_accession2taxid.loc Sun Mar 21 12:07:34 2021 +0000 @@ -0,0 +1,6 @@ +# Tab separated fields where +# value is unique key +# name is descriptive name +# path is path to directory containing accession2taxid files +#value name path +2021-03-19 2021-03-19 ${__HERE__}/taxonomy diff -r 5a0d0bee4f8d -r 5558f74bd296 test-data/taxonomy/names.dmp --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/taxonomy/names.dmp Sun Mar 21 12:07:34 2021 +0000 @@ -0,0 +1,11 @@ +1 | all | | synonym | +1 | root | | scientific name | +2 | Bacteria | Bacteria | scientific name | +2 | bacteria | | blast name | +2 | eubacteria | | genbank common name | +2 | Monera | Monera | in-part | +3 | Procaryotae | Procaryotae | in-part | +3 | Prokaryotae | Prokaryotae | in-part | +3 | Prokaryota | Prokaryota | in-part | +3 | prokaryote | prokaryote | in-part | +3 | prokaryotes | prokaryotes | in-part | diff -r 5a0d0bee4f8d -r 5558f74bd296 test-data/taxonomy/nodes.dmp --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/taxonomy/nodes.dmp Sun Mar 21 12:07:34 2021 +0000 @@ -0,0 +1,3 @@ +1 | 1 | no rank | | 8 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | | +2 | 1 | species | AC | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | +3 | 1 | species | AC | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | | diff -r 5a0d0bee4f8d -r 5558f74bd296 test-data/taxonomy/prot.accession2taxid --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/taxonomy/prot.accession2taxid Sun Mar 21 12:07:34 2021 +0000 @@ -0,0 +1,4 @@ +accession accession.version taxid gi +AAD44166 AAD44166.1 2 5524211 +AAD44167 AAD44167.1 3 5524212 + diff -r 5a0d0bee4f8d -r 5558f74bd296 tool_data_table_conf.xml.sample --- a/tool_data_table_conf.xml.sample Tue Dec 03 17:39:48 2019 -0500 +++ b/tool_data_table_conf.xml.sample Sun Mar 21 12:07:34 2021 +0000 @@ -4,4 +4,9 @@ value, name, db_path + + + value, name, path + +
diff -r 5a0d0bee4f8d -r 5558f74bd296 tool_data_table_conf.xml.test --- a/tool_data_table_conf.xml.test Tue Dec 03 17:39:48 2019 -0500 +++ b/tool_data_table_conf.xml.test Sun Mar 21 12:07:34 2021 +0000 @@ -4,4 +4,8 @@ value, name, db_path + + value, name, path + +