Mercurial > repos > jjohnson > data_manager_cat
changeset 0:f59e7e242bde draft
planemo upload commit f80f020c77d04c2e13b89aaea3d784314b940931-dirty
author | jjohnson |
---|---|
date | Sun, 24 Nov 2019 21:54:57 -0500 |
parents | |
children | 2bec6d7877fc |
files | data_manager/data_manager_cat.py data_manager/data_manager_cat.xml data_manager/macros.xml data_manager_conf.xml tool-data/cat_database.loc.sample tool_data_table_conf.xml.sample |
diffstat | 6 files changed, 455 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager/data_manager_cat.py Sun Nov 24 21:54:57 2019 -0500 @@ -0,0 +1,103 @@ +#!/usr/bin/env python +from __future__ import print_function + +import argparse +import json +import os.path +import subprocess +import tarfile +import tempfile +import zipfile +try: + # For Python 3.0 and later + from urllib.request import urlopen +except ImportError: + # Fall back to Python 2 imports + from urllib2 import urlopen + + +def url_download(url, workdir): + file_path = os.path.join(workdir, 'download.dat') + src = None + dst = None + try: + src = urlopen(url) + with open(file_path, 'wb') as dst: + while True: + chunk = src.read(2**10) + if chunk: + dst.write(chunk) + else: + break + finally: + if src: + src.close() + if tarfile.is_tarfile(file_path): + fh = tarfile.open(file_path, 'r:*') + elif zipfile.is_zipfile(file_path): + fh = zipfile.ZipFile(file_path, 'r') + else: + return + fh.extractall(workdir) + os.remove(file_path) + + +def cat_prepare(install_dir): + cmd = ['CAT', 'prepare' '--fresh', '-q'] + cmd_stdout = tempfile.NamedTemporaryFile() + cmd_stderr = tempfile.NamedTemporaryFile() + return_code = subprocess.call(cmd, shell=True, cwd=install_dir, + stdout=cmd_stdout, stderr=cmd_stderr) + if return_code: + msg = "stdout:\n%s\nstderr:\n%s" % (cmd_stdout.read(), + cmd_stderr.read()) + cmd_stdout.close() + cmd_stderr.close() + raise Exception('Error: (%s), returncode=%s %s' + % (' '.join(cmd), return_code, msg)) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('--config_file') + parser.add_argument('--install_path') + parser.add_argument('--db_url', default=None) + args = parser.parse_args() + + if not os.path.exists(args.install_path): + os.makedirs(args.install_path) + if args.db_url: + url_download(args.db_url, args.install_path) + else: + cat_prepare(args.install_path) + + cat_path = None + cat_db = None + tax_db = None + for root, dirs, files in os.walk(args.install_path): + for 
dname in dirs: + if dname.endswith('CAT_database'): + cat_db = dname + elif dname.endswith('taxonomy'): + tax_db = dname + if cat_db and tax_db: + cat_path = root + break + + cat_dir = os.path.basename(cat_path) + # params = json.loads(open(args.config_file).read()) + dm_dict = {} + dm_dict['data_tables'] = dm_dict.get('data_tables', {}) + data_table = 'cat_database' + dm_dict['data_tables'][data_table]\ + = dm_dict['data_tables'].get(data_table, []) + data_table_entry = dict(value=cat_dir, name=cat_dir, + database_folder=os.path.join(cat_dir, cat_db), + taxonomy_folder=os.path.join(cat_dir, tax_db)) + dm_dict['data_tables'][data_table].append(data_table_entry) + # save info to json file + open(args.config_file, 'wb').write(json.dumps(dm_dict)) + + +if __name__ == "__main__": + main()
<tool id="data_manager_cat" name="CAT DB" version="@VERSION@.0" tool_type="manage_data">
    <description>Install a new CAT database</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="requirements" />
    <expand macro="version_command" />
    <!--
        The data manager JSON output ($out_file) is read first to learn the
        extra_files_path where the database has to be installed; the script
        then overwrites $out_file with the new data table entry.
    -->
    <command detect_errors="exit_code"><![CDATA[
#import json, os
#set params = json.loads(open(str($out_file)).read())
## Keep the path as text: encoding it to bytes renders as b'...' on Python 3.
#set install_path = $params['output_data'][0]['extra_files_path']
mkdir -p '$install_path' &&
python '${__tool_directory__}/data_manager_cat.py' --config_file '$out_file' --install_path '$install_path'
#if $db.src == 'download'
    --db_url '$db.db_url'
#end if
    ]]></command>
    <inputs>
        <conditional name="db">
            <param name="src" type="select" label="Download or Build DBs">
                <option value="download">download</option>
                <option value="build">build</option>
            </param>
            <when value="download">
                <param name="db_url" type="text" label="DB URL at https://tbb.bio.uu.nl/bastiaan/CAT_prepare/"
                       help="example: https://tbb.bio.uu.nl/bastiaan/CAT_prepare/CAT_prepare_20190719.tar.gz">
                </param>
            </when>
            <when value="build">
            </when>
        </conditional>
    </inputs>
    <outputs>
        <data name="out_file" format="data_manager_json" label="${tool.name}"/>
    </outputs>
    <tests>
    </tests>
    <help><![CDATA[
This tool prepares reference data for CAT, the Contig Annotation Tool.
It can either download prebuilt reference data from https://tbb.bio.uu.nl/bastiaan/CAT_prepare/
or build new reference data using the CAT prepare application.
    ]]></help>
    <expand macro="citations" />
</tool>
<macros>
    <!-- Shared macros and Cheetah tokens for the CAT/BAT tools and the CAT data manager. -->
    <token name="@VERSION@">5.0.3</token>
    <xml name="requirements">
        <requirements>
            <requirement type="package" version="@VERSION@">cat</requirement>
            <yield/>
        </requirements>
    </xml>
    <xml name="version_command">
        <version_command>CAT --version</version_command>
    </xml>
    <token name="@DATABASE_FOLDER@">CAT_database</token>
    <token name="@TAXONOMY_FOLDER@">taxonomy</token>

    <!-- Database selection: a cached data table entry or a history dataset. -->
    <xml name="cat_db">
        <conditional name="db">
            <param name="db_src" type="select" label="CAT database from">
                <option value="cached">local cached database</option>
                <option value="history">history</option>
            </param>
            <when value="cached">
                <param name="cat_builtin" type="select" label="Use a built-in CAT database" help="If the CAT database of interest is not listed, contact your Galaxy administrator">
                    <!-- Table name must match tool_data_table_conf.xml.sample and data_manager_conf.xml. -->
                    <options from_data_table="cat_database">
                        <filter type="sort_by" column="2" />
                        <validator type="no_options" message="No CAT database is available." />
                    </options>
                </param>
            </when>
            <when value="history">
                <param name="cat_db" type="data" format="txt" label="A history dataset from CAT prepare tool"/>
            </when>
        </conditional>
    </xml>
    <token name="@CAT_DB@"><![CDATA[
    #if $db.db_src == 'cached':
        --database_folder '$db.cat_builtin.fields.database_folder'
        --taxonomy_folder '$db.cat_builtin.fields.taxonomy_folder'
    #else
        #import os.path
        #set $catdb = $db.cat_db.extra_files_path
        --database_folder '$os.path.join($catdb,"@DATABASE_FOLDER@")'
        --taxonomy_folder '$os.path.join($catdb,"@TAXONOMY_FOLDER@")'
    #end if
]]></token>
    <token name="@CAT_TAXONOMY@"><![CDATA[
    #if $db.db_src == 'cached':
        --taxonomy_folder '$db.cat_builtin.fields.taxonomy_folder'
    #else
        #import os.path
        #set $catdb = $db.cat_db.extra_files_path
        --taxonomy_folder '$os.path.join($catdb,"@TAXONOMY_FOLDER@")'
    #end if
]]></token>
    <xml name="test_catdb">
        <conditional name="db">
            <param name="db_src" value="cached"/>
            <param name="cat_builtin" value="CAT_prepare_test"/>
        </conditional>
    </xml>

    <!-- Optionally reuse gene predictions and alignments from a previous run. -->
    <xml name="use_intermediates">
        <conditional name="previous">
            <param name="use_previous" type="select" label="Use previous gene prediction and diamond alignment">
                <option value="no">No</option>
                <option value="yes">Yes</option>
            </param>
            <when value="no"/>
            <when value="yes">
                <param argument="--proteins_fasta" type="data" format="fasta" label="predicted proteins fasta"/>
                <param argument="--diamond_alignment" type="data" format="fasta" label="alignments file"/>
            </when>
        </conditional>
    </xml>
    <token name="@USE_INTERMEDIATES@"><![CDATA[
    #if $previous.use_previous == 'yes'
        --proteins_fasta '$previous.proteins_fasta'
        --diamond_alignment '$previous.diamond_alignment'
    #end if
    --out_prefix 'cat_output'
]]></token>

    <xml name="custom_settings">
        <param argument="--range" type="integer" value="10" min="0" max="49" label="range"/>
        <param argument="--fraction" type="float" value="0.5" min="0" max="0.99" label="fraction"/>
    </xml>
    <token name="@CUSTOM_SETTINGS@"><![CDATA[
    --range $range
    --fraction $fraction
]]></token>
    <xml name="add_names_options">
        <param argument="--only_official" type="boolean" truevalue="--only_official" falsevalue="" checked="true"
               label="Only output official level names."/>
        <param argument="--exclude_scores" type="boolean" truevalue="--exclude_scores" falsevalue="" checked="false"
               label="Exclude bit-score support scores in the lineage."/>
    </xml>
    <token name="@ADD_NAMES_OPTIONS@"><![CDATA[
    $only_official $exclude_scores
]]></token>
    <xml name="add_names">
        <conditional name="names">
            <param name="add_names" type="select" label="add_names">
                <option value="no">No</option>
                <option value="orf2lca">ORF2LCA.txt</option>
                <option value="classification">classification.txt</option>
                <option value="both">ORF2LCA.txt and classification.txt</option>
            </param>
            <when value="no"/>
            <when value="orf2lca">
                <expand macro="add_names_options"/>
            </when>
            <when value="classification">
                <expand macro="add_names_options"/>
            </when>
            <when value="both">
                <expand macro="add_names_options"/>
            </when>
        </conditional>
    </xml>
    <token name="@ADD_NAMES@"><![CDATA[
    #if $names.add_names in ['classification','both']:
        && CAT add_names $names.only_official $names.exclude_scores
        @CAT_TAXONOMY@
        #if $bcat == 'CAT'
        ## file name kept consistent with the from_work_dir of the contig2classification output
        -i cat_output.contig2classification.tsv
        #else
        -i cat_output.bin2classification.tsv
        #end if
        -o classification_names.txt
        ## the input must be the file written by the add_names call above
        && @TXT2TSV@ -i classification_names.txt -o '$classification_names'
    #end if
    #if $names.add_names in ['orf2lca','both']:
        && CAT add_names $names.only_official $names.exclude_scores
        @CAT_TAXONOMY@
        -i cat_output.ORF2LCA.tsv
        -o orf2lca_names.txt
        && @TXT2TSV@ -i orf2lca_names.txt -o '$orf2lca_names'
    #end if
]]></token>
    <xml name="summarise">
        <param name="summarise" type="select" label="summarise">
            <option value="no">No</option>
            <option value="classification">classification.txt</option>
        </param>
    </xml>
    <token name="@SUMMARISE@"><![CDATA[
    #if $summarise in ['classification']:
        #if $names.add_names in ['classification','both'] and $names.only_official:
            #set $summary_input = $classification_names
        #else
            ## must be a quoted string: a bare name is evaluated as a (missing) Python variable
            #set $summary_input = 'classification_official_names.txt'
            && CAT add_names --only_official
            @CAT_TAXONOMY@
            #if $bcat == 'CAT'
            -i cat_output.contig2classification.tsv
            #else
            -i cat_output.bin2classification.tsv
            #end if
            -o '$summary_input'
        #end if
        && CAT summarise
        #if $bcat == 'CAT'
        -c '$contigs_fasta'
        #end if
        -i '$summary_input'
        -o classification_summary.txt
        && @TXT2TSV@ -i classification_summary.txt -o '$classification_summary'
    #end if
]]></token>

    <xml name="select_outputs">
        <param name="select_outputs" type="select" multiple="true" optional="false" label="Select outputs">
            <option value="log" selected="true">CAT.log</option>
            <option value="predicted_proteins_faa" selected="true">predicted_proteins.faa</option>
            <option value="predicted_proteins_gff">predicted_proteins.gff</option>
            <option value="alignment_diamond">alignment.diamond</option>
            <option value="orf2lca" selected="true">ORF2LCA.txt</option>
            <yield/>
        </param>
    </xml>
    <xml name="select_cat_outputs">
        <param name="bcat" type="hidden" value="CAT"/>
        <param name="seqtype" type="hidden" value="contig"/>
        <expand macro="select_outputs">
            <option value="contig2classification" selected="true">contig2classification.txt</option>
        </expand>
    </xml>
    <xml name="select_bat_outputs">
        <param name="bcat" type="hidden" value="BAT"/>
        <param name="seqtype" type="hidden" value="bin"/>
        <expand macro="select_outputs">
            <option value="bin2classification" selected="true">bin2classification.txt</option>
        </expand>
    </xml>

    <token name="@TXT2TSV@"><![CDATA[
    $__tool_directory__/tabpad.py
]]></token>
    <xml name="outputs">
        <data name="log" format="txt" label="${bcat}.log" from_work_dir="cat_output.log">
            <filter>'log' in select_outputs or not select_outputs</filter>
        </data>
        <data name="predicted_proteins_faa" format="fasta" label="${bcat}.predicted_proteins.faa" from_work_dir="cat_output.predicted_proteins.faa">
            <filter>'predicted_proteins_faa' in select_outputs</filter>
        </data>
        <data name="predicted_proteins_gff" format="gff" label="${bcat}.predicted_proteins.gff" from_work_dir="cat_output.predicted_proteins.gff">
            <filter>'predicted_proteins_gff' in select_outputs</filter>
        </data>
        <data name="alignment_diamond" format="tabular" label="${bcat}.alignment.diamond" from_work_dir="cat_output.alignment.diamond">
            <filter>'alignment_diamond' in select_outputs</filter>
            <actions>
                <action name="comment_lines" type="metadata" default="1" />
                <action name="column_names" type="metadata" default="qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore" />
            </actions>
        </data>
        <data name="orf2lca" format="tabular" label="${bcat}.ORF2LCA.txt" from_work_dir="cat_output.ORF2LCA.tsv">
            <filter>'orf2lca' in select_outputs</filter>
            <actions>
                <action name="comment_lines" type="metadata" default="1" />
                <action name="column_names" type="metadata" default="ORF,lineage,bit-score" />
            </actions>
        </data>
        <data name="contig2classification" format="tabular" label="${bcat}.contig2classification.txt" from_work_dir="cat_output.contig2classification.tsv">
            <filter>'contig2classification' in select_outputs</filter>
            <actions>
                <action name="comment_lines" type="metadata" default="1" />
                <action name="column_names" type="metadata" default="contig,classification,reason,lineage,lineage scores" />
            </actions>
        </data>
        <data name="bin2classification" format="tabular" label="${bcat}.bin2classification.txt" from_work_dir="cat_output.bin2classification.tsv">
            <filter>'bin2classification' in select_outputs</filter>
            <actions>
                <action name="comment_lines" type="metadata" default="1" />
                <action name="column_names" type="metadata" default="bin,classification,reason,lineage,lineage scores" />
            </actions>
        </data>
        <!-- The names outputs are written whenever add_names is the specific choice or "both". -->
        <data name="orf2lca_names" format="tabular" label="${bcat}.ORF2LCA.names.txt">
            <filter>names['add_names'] in ('orf2lca', 'both')</filter>
            <actions>
                <action name="comment_lines" type="metadata" default="1" />
                <action name="column_names" type="metadata" default="ORF,lineage,bit-score,superkingdom,phylum,class,order,family,genus,species" />
            </actions>
        </data>
        <data name="classification_names" format="tabular" label="${bcat}.${seqtype}2classification.names.txt">
            <filter>names['add_names'] in ('classification', 'both')</filter>
            <actions>
                <action name="comment_lines" type="metadata" default="1" />
                <action name="column_names" type="metadata" default="${seqtype},classification,reason,lineage,lineage scores,superkingdom,phylum,class,order,family,genus,species" />
            </actions>
        </data>
        <data name="classification_summary" format="tabular" label="${bcat}.${seqtype}2classification.summary.txt">
            <filter>'classification' in summarise</filter>
            <actions>
                <action name="comment_lines" type="metadata" default="1" />
                <action name="column_names" type="metadata" default="rank,clade,number of contigs,number of ORFs,number of positions" />
            </actions>
        </data>
    </xml>
    <token name="@COMMON_HELP@"><![CDATA[
The CAT/BAT workflow is described at: https://github.com/dutilh/CAT
]]></token>
    <xml name="citations">
        <citations>
            <!-- type="doi" takes the bare DOI, not the resolver URL -->
            <citation type="doi">10.1101/072868</citation>
            <citation type="doi">10.1186/s13059-019-1817-x</citation>
            <yield />
        </citations>
    </xml>
</macros>
<?xml version="1.0"?>
<data_managers>
    <data_manager tool_file="data_manager/data_manager_cat.xml" id="data_manager_cat" >
        <data_table name="cat_database">  <!-- Defines a Data Table to be modified. -->
            <output> <!-- Handle the output of the Data Manager Tool -->
                <column name="value" />  <!-- columns that are going to be specified by the Data Manager Tool -->
                <column name="name" />  <!-- columns that are going to be specified by the Data Manager Tool -->
                <!--
                    Each folder reported by the tool is moved below
                    ${GALAXY_DATA_MANAGER_DATA_PATH}/CAT and the column value
                    is rewritten to the absolute path of its new location.
                -->
                <column name="database_folder" output_ref="out_file" >
                    <move type="directory" relativize_symlinks="True">
                        <source>${database_folder}</source>
                        <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">CAT/${database_folder}</target>
                    </move>
                    <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/CAT/${database_folder}</value_translation>
                    <value_translation type="function">abspath</value_translation>
                </column>
                <column name="taxonomy_folder" output_ref="out_file" >
                    <move type="directory" relativize_symlinks="True">
                        <source>${taxonomy_folder}</source>
                        <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">CAT/${taxonomy_folder}</target>
                    </move>
                    <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/CAT/${taxonomy_folder}</value_translation>
                    <value_translation type="function">abspath</value_translation>
                </column>
            </output>
        </data_table>
    </data_manager>
</data_managers>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/cat_database.loc.sample Sun Nov 24 21:54:57 2019 -0500 @@ -0,0 +1,7 @@ +## A typical download from https://tbb.bio.uu.nl/bastiaan/CAT_prepare/CAT_prepare_20190719.tar.gz +# ls CAT_prepare_20190719/ +# 2019-07-19.CAT_prepare.fresh.log +# 2019-07-19_CAT_database +# 2019-07-19_taxonomy +#value name database_folder taxonomy_folder +#2019-07-19_CAT_database 2019-07-19_CAT_database /opt/galaxy/tool-data/cat_database/CAT_prepare_20190719/2019-07-19_CAT_database /opt/galaxy/tool-data/cat_database/CAT_prepare_20190719/2019-07-19_taxonomy
<tables>
    <!--
        Locations of CAT databases.
        One row per installed database: an identifier (value), a display
        name, and the database and taxonomy folder paths.
    -->
    <table name="cat_database" comment_char="#" allow_duplicate_entries="False">
        <columns>value, name, database_folder, taxonomy_folder</columns>
        <file path="tool-data/cat_database.loc" />
    </table>
</tables>