Mercurial > repos > iuc > drep_dereplicate
view macros.xml @ 3:a0713a10c9f6 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/drep commit 91d5bae878afa54d8264c1ca89c79b2d26ef35ad
author | iuc |
---|---|
date | Wed, 15 Mar 2023 11:05:04 +0000 |
parents | 368cb4bef9d8 |
children | 1faea6e8c92b |
line wrap: on
line source
<?xml version="1.0"?>
<macros>
    <!-- Version tokens shared by every tool that imports this macro file. -->
    <token name="@TOOL_VERSION@">3.4.2</token>
    <token name="@VERSION_SUFFIX@">0</token>
    <token name="@PROFILE@">20.01</token>

    <!-- Cross reference, requirements and citation boilerplate. -->
    <xml name="biotools">
        <xrefs>
            <xref type="bio.tools">drep</xref>
        </xrefs>
    </xml>
    <xml name="requirements">
        <requirements>
            <requirement type="package" version="@TOOL_VERSION@">drep</requirement>
            <yield/>
        </requirements>
    </xml>
    <xml name="citations">
        <citations>
            <citation type="doi">10.1038/ismej.2017.126</citation>
            <yield/>
        </citations>
    </xml>

    <!-- Genome inputs. -->
    <xml name="genomes">
        <param argument="--genomes" type="data" format="fasta" multiple="true" label="Genomes to filter"/>
    </xml>
    <!--
        Addition of ".fasta" after names to avoid string to be read as integer.
        Bug in dRep: probably fixed in next version.
        Each input is symlinked under a sanitised name (characters outside
        [A-Za-z0-9_.] and the hyphen become "_"); the names are collected in
        $genomefiles, which @GENOMES@ iterates over afterwards.
    -->
    <token name="@PREPARE_GENOMES@"><![CDATA[
#import re
#set $genomefiles = []
#for $genome in $genomes
    #set $input_name = $re.sub('[^\w\-_.]', '_',str($genome.element_identifier.split('/')[-1]))
    ln -s '${genome}' '${input_name}.fasta' &&
    $genomefiles.append($input_name)
#end for
    ]]></token>
    <token name="@GENOMES@"><![CDATA[
-g
#for $genomefile in $genomefiles
    '${genomefile}.fasta'
#end for
    ]]></token>

    <!-- Genome filtering: UI section, test values and command line fragment. -->
    <xml name="filtering_options">
        <section name="filter" title="Genome filtering" expanded="true">
            <param argument="--length" type="integer" value="50000" label="Minimum genome length"/>
            <param argument="--completeness" type="integer" value="75" min="0" max="100" label="Minimum genome completeness percent"/>
            <param argument="--contamination" type="integer" value="25" min="0" max="100" label="Maximum genome contamination percent"/>
        </section>
    </xml>
    <xml name="test_default_filtering_options">
        <section name="filter">
            <param name="length" value="50000"/>
            <param name="completeness" value="75"/>
            <param name="contamination" value="100"/>
        </section>
    </xml>
    <token name="@FILTER_OPTIONS@"><![CDATA[
--length $filter.length
--completeness $filter.completeness
--contamination $filter.contamination
    ]]></token>

    <!-- Quality assessment: run checkM, supply a genome information CSV, or skip quality filtering. -->
    <xml name="quality_assessment_options">
        <conditional name="quality">
            <param name="source" type="select" label="Genome quality filtering" help="No checkM or quality filtering is not recommended but with bacteriophages or eukaryotes or things where checkM scoring does not work. Will only choose genomes based on length and N50.">
                <option value="checkm" selected="true">Run checkM</option>
                <option value="genomeInfo">Provide quality information on the genome (CSV file)</option>
                <option value="ignoreGenomeQuality">Don't run checkM or do any quality filtering (--ignoreGenomeQuality) - NOT RECOMMENDED!</option>
            </param>
            <when value="checkm">
                <param argument="--checkM_method" type="select" label="CheckM method">
                    <option value="lineage_wf" selected="true">lineage_wf: Lineage-specific Workflow - quality estimates with lineage-specific markers (more accurate)</option>
                    <option value="taxonomy_wf">taxonomy_wf: Taxonomic-specific Workflow - quality estimates with taxonomic-specific markers (faster)</option>
                </param>
                <param argument="--set_recursion" type="integer" optional="true" label="Increases the python recursion limit" help="NOT RECOMMENDED unless checkM is crashing due to recursion issues. Recommended to set to 2000 if needed, but setting this could crash Python"/>
                <param argument="--checkm_group_size" type="integer" value="2000" min="1" label="Number of genomes passed to checkM at a time" help="Increasing this increases RAM but makes checkM faster"/>
            </when>
            <when value="genomeInfo">
                <param argument="--genomeInfo" type="data" format="csv" label="Quality information on the genomes">
                    <help><![CDATA[
                        A CSV dataset that must contain: [
                        "genome"(history dataset name of .fasta dataset of that genome),
                        "completeness"(0-100 value for completeness of the genome),
                        "contamination"(0-100 value of the contamination of the genome)]
                    ]]></help>
                </param>
            </when>
            <when value="ignoreGenomeQuality"/>
        </conditional>
    </xml>
    <xml name="test_default_quality_assessment_options">
        <conditional name="quality">
            <param name="source" value="checkm"/>
            <param name="checkM_method" value="taxonomy_wf"/>
            <param name="checkm_group_size" value="2000"/>
        </conditional>
    </xml>
    <!--
        The optional recursion limit is only passed when the user supplied a value.
        It is read from the "quality" conditional, where the parameter is declared.
    -->
    <token name="@QUALITY_ASSESSMENT_OPTIONS@"><![CDATA[
#if $quality.source == 'checkm'
    --checkM_method '$quality.checkM_method'
    #if str($quality.set_recursion) != ''
        --set_recursion $quality.set_recursion
    #end if
    --checkm_group_size $quality.checkm_group_size
#else if $quality.source == 'genomeInfo'
    --genomeInfo '$quality.genomeInfo'
#else if $quality.source == 'ignoreGenomeQuality'
    --ignoreGenomeQuality
#end if
    ]]></token>

    <!-- Primary (MASH) clustering parameters. -->
    <xml name="mash">
        <param argument="--MASH_sketch" type="integer" value="1000" min="0" label="MASH sketch size"/>
        <param argument="--P_ani" type="float" value="0.9" min="0." max="1." label="ANI threshold to form primary clusters"/>
        <param argument="--multiround_primary_clustering" type="boolean" checked="false" truevalue="--multiround_primary_clustering" falsevalue="" label="Cluster each primary clunk separately and merge at the end with single linkage?" help="Decreases RAM usage and increases speed, and the cost of a minor loss in precision and the inability to plot primary_clustering_dendrograms. Especially helpful when clustering 5000+ genomes. Will be done with single linkage clustering"/>
        <param argument="--primary_chunksize" type="integer" value="5000" min="1" label="Impacts multiround_primary_clusterings" help=" If you have more than this many genomes, process them in chunks of this size"/>
    </xml>
    <xml name="test_default_mash">
        <param name="MASH_sketch" value="1000"/>
        <param name="P_ani" value="0.9"/>
        <param name="multiround_primary_clustering" value=""/>
        <param name="primary_chunksize" value="5000"/>
    </xml>
    <token name="@MASH@"><![CDATA[
--MASH_sketch '$comp_clust.steps.MASH_sketch'
--P_ani $comp_clust.steps.P_ani
$comp_clust.steps.multiround_primary_clustering
--primary_chunksize $comp_clust.steps.primary_chunksize
    ]]></token>

    <!-- nucmer preset, offered for the ANImf and ANIn secondary algorithms. -->
    <xml name="nucmer">
        <param argument="--n_PRESET" type="select" label="Presets to pass to nucmer">
            <option value="normal" selected="true">normal: default ANIn parameters</option>
            <option value="tight">tight: only align highly conserved regions</option>
        </param>
    </xml>
    <xml name="test_default_nucmer">
        <param name="n_PRESET" value="normal"/>
    </xml>
    <token name="@NUCMER@"><![CDATA[
--n_PRESET '$comp_clust.steps.clustering.n_PRESET'
    ]]></token>

    <!-- Alignment coverage method, offered for the ANImf and ANIn secondary algorithms. -->
    <xml name="coverage_method">
        <param argument="--coverage_method" type="select" label="Method to calculate coverage of an alignment">
            <option value="larger" selected="true">Larger = max((aligned length / genome 1), (aligned_length / genome2))</option>
            <option value="total">Total = 2*(aligned length) / (sum of total genome lengths)</option>
        </param>
    </xml>
    <xml name="test_default_coverage_method">
        <param name="coverage_method" value="larger"/>
    </xml>
    <token name="@COVERAGE_METHOD@"><![CDATA[
--coverage_method '$comp_clust.steps.clustering.coverage_method'
    ]]></token>

    <!-- Secondary clustering: algorithm choice plus algorithm specific parameters. -->
    <xml name="secondary_clustering">
        <conditional name="clustering">
            <param argument="--S_algorithm" type="select" label="Algorithm for secondary clustering comparisons">
                <option value="fastANI">fastANI: Kmer-based approach - very fast</option>
                <option value="ANImf" selected="true">ANImf: Align whole genomes with nucmer; filter alignment; compare aligned regions - RECOMMENDED</option>
                <option value="ANIn">ANIn: Align whole genomes with nucmer; compare aligned regions</option>
                <option value="gANI">gANI: Identify and align ORFs; compare aligned ORFS</option>
                <option value="goANI">Open source version of gANI; requires nsmimscan</option>
            </param>
            <when value="fastANI">
                <param argument="--greedy_secondary_clustering" type="boolean" checked="false" truevalue="--greedy_secondary_clustering" falsevalue="" label="Use a heuristic to avoid pair-wise comparisons when doing secondary clustering?" help="Will be done with single linkage clustering"/>
            </when>
            <when value="ANImf">
                <expand macro="nucmer"/>
                <expand macro="coverage_method"/>
            </when>
            <when value="ANIn">
                <expand macro="nucmer"/>
                <expand macro="coverage_method"/>
            </when>
            <when value="gANI"/>
            <when value="goANI"/>
        </conditional>
        <param argument="--S_ani" type="float" value="0.99" min="0." max="1." label="ANI threshold to form secondary clusters"/>
        <param argument="--cov_thresh" type="float" value="0.1" min="0." max="1." label="Minimum level of overlap between genomes when doing secondary comparisons"/>
    </xml>
    <xml name="test_default_secondary_clustering">
        <conditional name="clustering">
            <param name="S_algorithm" value="ANImf"/>
            <expand macro="test_default_nucmer"/>
            <expand macro="test_default_coverage_method"/>
        </conditional>
        <param name="S_ani" value="0.99"/>
        <param name="cov_thresh" value="0.1"/>
    </xml>
    <token name="@SECONDARY_CLUSTERING@"><![CDATA[
--S_algorithm '$comp_clust.steps.clustering.S_algorithm'
#if $comp_clust.steps.clustering.S_algorithm == 'fastANI'
    $comp_clust.steps.clustering.greedy_secondary_clustering
#else if $comp_clust.steps.clustering.S_algorithm == 'ANImf'
    @NUCMER@
    @COVERAGE_METHOD@
#else if $comp_clust.steps.clustering.S_algorithm == 'ANIn'
    @NUCMER@
    @COVERAGE_METHOD@
#end if
--S_ani $comp_clust.steps.S_ani
--cov_thresh $comp_clust.steps.cov_thresh
    ]]></token>

    <!-- Genome comparison section: selects which clustering steps run and how clusters are linked. -->
    <xml name="comparison_clustering_options">
        <section name="comp_clust" title="Genome comparison and clustering" expanded="false">
            <conditional name="steps">
                <param name="select" type="select" label="Steps in genome comparison">
                    <option value="default" selected="true">Default: Run MASH clustering and a secondary clustering</option>
                    <option value="SkipMash">Skip MASH clustering, just do secondary clustering on all genomes</option>
                    <option value="SkipSecondary">Skip secondary clustering, just perform MASH clustering</option>
                </param>
                <when value="default">
                    <expand macro="mash"/>
                    <expand macro="secondary_clustering"/>
                </when>
                <when value="SkipMash">
                    <expand macro="secondary_clustering"/>
                </when>
                <when value="SkipSecondary">
                    <expand macro="mash"/>
                </when>
            </conditional>
            <param argument="--clusterAlg" type="select" label="Algorithm used to cluster genomes" help="Passed to scipy.cluster.hierarchy.linkage">
                <option value="average" selected="true">average</option>
                <option value="ward">ward</option>
                <option value="single">single</option>
                <option value="median">median</option>
                <option value="centroid">centroid</option>
                <option value="weighted">weighted</option>
            </param>
            <param argument="--run_tertiary_clustering" type="boolean" checked="false" truevalue="--run_tertiary_clustering" falsevalue="" label="Run an additional round of clustering on the final genome set?" help="This is especially useful when greedy clustering is performed and/or to handle cases where similar genomes end up in different primary clusters."/>
        </section>
    </xml>
    <xml name="test_default_comparison_clustering_options">
        <section name="comp_clust">
            <conditional name="steps">
                <param name="select" value="default"/>
                <expand macro="test_default_mash"/>
                <expand macro="test_default_secondary_clustering"/>
            </conditional>
            <param name="clusterAlg" value="average"/>
            <param name="run_tertiary_clustering" value=""/>
        </section>
    </xml>
    <token name="@COMPARISON_CLUSTERING_OPTIONS@"><![CDATA[
#if $comp_clust.steps.select == 'default'
    @MASH@
    @SECONDARY_CLUSTERING@
#else if $comp_clust.steps.select == 'SkipMash'
    --SkipMash
    @SECONDARY_CLUSTERING@
#else
    @MASH@
    --SkipSecondary
#end if
--clusterAlg '$comp_clust.clusterAlg'
$comp_clust.run_tertiary_clustering
    ]]></token>

    <!-- Scoring criteria used to pick the representative genome of each cluster. -->
    <xml name="scoring_options">
        <section name="scoring" title="Scoring criteria" expanded="false" help="Based off of the formula: A*Completeness - B*Contamination + C*(Contamination * (strain_heterogeneity/100)) + D*log(N50) + E*log(size) + F*(centrality - S_ani). With A = completeness_weight; B = contamination_weight; C = strain_heterogeneity_weight; D = N50_weight; E = size_weight; F = cent_weight">
            <param argument="--completeness_weight" type="float" value="1" label="Completeness weight"/>
            <param argument="--contamination_weight" type="float" value="5" label="Contamination weight"/>
            <param argument="--strain_heterogeneity_weight" type="float" value="1" min="0." max="1." label="Strain heterogeneity weight"/>
            <param argument="--N50_weight" type="float" value=".5" label="Weight of log(genome N50)"/>
            <param argument="--size_weight" type="float" value="0" label="Weight of log(genome size)"/>
            <param argument="--centrality_weight" type="float" value="1" label="Weight of (centrality - S_ani)"/>
            <param argument="--extra_weight_table" type="data" format="tabular" multiple="false" optional="true" label="Genome specific extra weight"/>
        </section>
    </xml>
    <xml name="test_default_scoring_options">
        <section name="scoring">
            <param name="completeness_weight" value="1"/>
            <param name="contamination_weight" value="5"/>
            <param name="strain_heterogeneity_weight" value="1"/>
            <param name="N50_weight" value=".5"/>
            <param name="size_weight" value="0"/>
            <param name="centrality_weight" value="1"/>
        </section>
    </xml>
    <xml name="test_extra_weight_table_scoring_options">
        <section name="scoring">
            <param name="completeness_weight" value="1"/>
            <param name="contamination_weight" value="5"/>
            <param name="strain_heterogeneity_weight" value="1"/>
            <param name="N50_weight" value=".5"/>
            <param name="size_weight" value="0"/>
            <param name="centrality_weight" value="1"/>
            <param name="extra_weight_table" ftype="tabular" value="extra_weight_table_test"/>
        </section>
    </xml>
    <!--
        The optional extra weight table is declared inside the "scoring" section,
        so it is addressed through that section like the other scoring options.
    -->
    <token name="@SCORING_OPTIONS@"><![CDATA[
--completeness_weight $scoring.completeness_weight
--contamination_weight $scoring.contamination_weight
--strain_heterogeneity_weight $scoring.strain_heterogeneity_weight
--N50_weight $scoring.N50_weight
--size_weight $scoring.size_weight
--centrality_weight $scoring.centrality_weight
#if str($scoring.extra_weight_table) != 'None'
    --extra_weight_table '$scoring.extra_weight_table'
#end if
    ]]></token>

    <!-- Warning thresholds. -->
    <xml name="warning_options">
        <section name="warning" title="Warnings" expanded="false">
            <param argument="--warn_dist" type="float" value="0.25" min="0" max="1" label="How far from the threshold to throw cluster warnings"/>
            <param argument="--warn_sim" type="float" value="0.98" min="0" max="1" label="Similarity threshold for warnings between dereplicated genomes"/>
            <param argument="--warn_aln" type="float" value="0.25" min="0" max="1" label="Minimum aligned fraction for warnings between dereplicated genomes (ANIn)"/>
        </section>
    </xml>
    <xml name="test_default_warning_options">
        <section name="warning">
            <param name="warn_dist" value="0.25"/>
            <param name="warn_sim" value="0.98"/>
            <param name="warn_aln" value="0.25"/>
        </section>
    </xml>
    <token name="@WARNING_OPTIONS@"><![CDATA[
--warn_dist $warning.warn_dist
--warn_sim $warning.warn_sim
--warn_aln $warning.warn_aln
    ]]></token>

    <!-- Output selection: a shared option list, extended by "select_drep_outputs" through the yield. -->
    <xml name="select_outputs">
        <param name="select_outputs" type="select" multiple="true" optional="false" label="Select outputs">
            <option value="log" selected="true">log</option>
            <option value="warnings" selected="true">Warnings</option>
            <option value="Primary_clustering_dendrogram" selected="true">Primary_clustering_dendrogram.pdf</option>
            <option value="Secondary_clustering_dendrograms">Secondary_clustering_dendrograms.pdf</option>
            <option value="Secondary_clustering_MDS">Secondary_clustering_MDS.pdf</option>
            <option value="Clustering_scatterplots" selected="true">Clustering_scatterplots.pdf</option>
            <yield/>
        </param>
    </xml>
    <xml name="select_drep_outputs">
        <expand macro="select_outputs">
            <option value="Cluster_scoring">Cluster_scoring.pdf</option>
            <option value="Winning_genomes">Winning_genomes.pdf</option>
            <option value="Widb">Widb.csv</option>
            <option value="Chdb">Chdb.tsv</option>
        </expand>
    </xml>
    <xml name="test_default_select_drep_outputs">
        <param name="select_outputs" value="log,warnings,Primary_clustering_dendrogram,Clustering_scatterplots,Cluster_scoring,Winning_genomes,Widb"/>
    </xml>
    <xml name="test_default_select_outputs">
        <param name="select_outputs" value="log,warnings,Primary_clustering_dendrogram,Clustering_scatterplots"/>
    </xml>

    <!-- Output datasets; each one is kept only when its name was chosen in "select_outputs". -->
    <xml name="common_outputs">
        <data name="log" format="txt" label="${tool.name} on ${on_string}: Log" from_work_dir="outdir/log/logger.log">
            <filter>'log' in select_outputs or not select_outputs</filter>
        </data>
        <data name="warnings" format="txt" label="${tool.name} on ${on_string}: Warnings" from_work_dir="outdir/log/warnings.txt">
            <filter>'warnings' in select_outputs</filter>
        </data>
        <data name="Primary_clustering_dendrogram" format="pdf" label="${tool.name} on ${on_string}: Primary_clustering_dendrogram.pdf" from_work_dir="outdir/figures/Primary_clustering_dendrogram.pdf">
            <filter>'Primary_clustering_dendrogram' in select_outputs</filter>
        </data>
        <data name="Secondary_clustering_dendrograms" format="pdf" label="${tool.name} on ${on_string}: Secondary_clustering_dendrograms.pdf" from_work_dir="outdir/figures/Secondary_clustering_dendrograms.pdf">
            <filter>'Secondary_clustering_dendrograms' in select_outputs</filter>
        </data>
        <data name="Secondary_clustering_MDS" format="pdf" label="${tool.name} on ${on_string}: Secondary_clustering_MDS.pdf" from_work_dir="outdir/figures/Secondary_clustering_MDS.pdf">
            <filter>'Secondary_clustering_MDS' in select_outputs</filter>
        </data>
        <data name="Clustering_scatterplots" format="pdf" label="${tool.name} on ${on_string}: Clustering_scatterplots.pdf" from_work_dir="outdir/figures/Clustering_scatterplots.pdf">
            <filter>'Clustering_scatterplots' in select_outputs</filter>
        </data>
    </xml>
    <xml name="drep_outputs">
        <expand macro="common_outputs"/>
        <data name="Cluster_scoring" format="pdf" label="${tool.name} on ${on_string}: Cluster_scoring.pdf" from_work_dir="outdir/figures/Cluster_scoring.pdf">
            <filter>'Cluster_scoring' in select_outputs</filter>
        </data>
        <data name="Winning_genomes" format="pdf" label="${tool.name} on ${on_string}: Winning_genomes.pdf" from_work_dir="outdir/figures/Winning_genomes.pdf">
            <filter>'Winning_genomes' in select_outputs</filter>
        </data>
        <data name="Widb" format="csv" label="${tool.name} on ${on_string}: Widb.csv" from_work_dir="outdir/data_tables/Widb.csv">
            <filter>'Widb' in select_outputs</filter>
        </data>
        <data name="Chdb" format="tabular" label="${tool.name} on ${on_string}: Chdb.tsv" from_work_dir="outdir/data/checkM/checkM_outdir/Chdb.tsv">
            <filter>'Chdb' in select_outputs</filter>
        </data>
    </xml>

    <!-- Shared test fixtures: genome inputs with textual and with purely numeric names, and a log assertion wrapper. -->
    <xml name="test_string_inputs">
        <param name="genomes" ftype="fasta" value="Enterococcus_casseliflavus_EC20.fasta,Enterococcus_faecalis_T2.fna,Enterococcus_faecalis_TX0104.fa"/>
    </xml>
    <xml name="test_integer_inputs">
        <param name="genomes" ftype="fasta" value="001,002,003"/>
    </xml>
    <xml name="test_log_output">
        <output name="log">
            <assert_contents>
                <yield/>
            </assert_contents>
        </output>
    </xml>

    <!-- Help text fragments mirroring the dRep command line help. -->
    <token name="@GENOMES_HELP@"><![CDATA[
    I/O PARAMETERS:
      -g [GENOMES [GENOMES ...]], --genomes [GENOMES [GENOMES ...]]
                            genomes to cluster in .fasta format (default: None)
    ]]></token>
    <token name="@FILTERING_HELP@"><![CDATA[
    FILTERING OPTIONS:
      -l LENGTH, --length LENGTH
                            Minimum genome length (default: 50000)
      -comp COMPLETENESS, --completeness COMPLETENESS
                            Minimum genome completeness (default: 75)
      -con CONTAMINATION, --contamination CONTAMINATION
                            Maximum genome contamination (default: 25)
      --ignoreGenomeQuality
                            Don't run checkM or do any quality filtering. NOT
                            RECOMMENDED! This is useful for use with
                            bacteriophages or eukaryotes or things where checkM
                            scoring does not work. Will only choose genomes based
                            on length and N50 (default: False)
    ]]></token>
    <token name="@GENOME_COMPARISON_HELP@"><![CDATA[
    GENOME COMPARISON PARAMETERS:
      -ms MASH_SKETCH, --MASH_sketch MASH_SKETCH
                            MASH sketch size (default: 1000)
      --S_algorithm {goANI,ANIn,ANImf,gANI}
                            Algorithm for secondary clustering comparisons:
                            ANImf = (RECOMMENDED) Align whole genomes with nucmer; filter alignment; compare aligned regions
                            ANIn = Align whole genomes with nucmer; compare aligned regions
                            gANI = Identify and align ORFs; compare aligned ORFS
                            (default: ANImf)
      -n_PRESET {normal,tight}
                            Presets to pass to nucmer
                            tight = only align highly conserved regions
                            normal = default ANIn parameters (default: normal)
    ]]></token>
    <token name="@CLUSTERING_HELP@"><![CDATA[
    CLUSTERING PARAMETERS:
      -pa P_ANI, --P_ani P_ANI
                            ANI threshold to form primary (MASH) clusters
                            (default: 0.9)
      -sa S_ANI, --S_ani S_ANI
                            ANI threshold to form secondary clusters (default: 0.99)
      --SkipMash            Skip MASH clustering, just do secondary clustering on
                            all genomes (default: False)
      --SkipSecondary       Skip secondary clustering, just perform MASH
                            clustering (default: False)
      -nc COV_THRESH, --cov_thresh COV_THRESH
                            Minimum level of overlap between genomes when doing
                            secondary comparisons (default: 0.1)
      -cm {total,larger}, --coverage_method {total,larger}
                            Method to calculate coverage of an alignment
                            (for ANIn/ANImf only; gANI can only do larger method)
                            total = 2*(aligned length) / (sum of total genome lengths)
                            larger = max((aligned length / genome 1), (aligned_length / genome2))
                            (default: larger)
      --clusterAlg CLUSTERALG
                            Algorithm used to cluster genomes (passed to
                            scipy.cluster.hierarchy.linkage (default: average)
    ]]></token>
    <token name="@SCORING_HELP@"><![CDATA[
    SCORING CRITERIA
      Based off of the formula:
      A*Completeness - B*Contamination + C*(Contamination * (strain_heterogeneity/100)) + D*log(N50) + E*log(size)

      A = completeness_weight; B = contamination_weight; C = strain_heterogeneity_weight; D = N50_weight; E = size_weight:

      -comW COMPLETENESS_WEIGHT, --completeness_weight COMPLETENESS_WEIGHT
                            completeness weight (default: 1)
      -conW CONTAMINATION_WEIGHT, --contamination_weight CONTAMINATION_WEIGHT
                            contamination weight (default: 5)
      -strW STRAIN_HETEROGENEITY_WEIGHT, --strain_heterogeneity_weight STRAIN_HETEROGENEITY_WEIGHT
                            strain heterogeneity weight (default: 1)
      -N50W N50_WEIGHT, --N50_weight N50_WEIGHT
                            weight of log(genome N50) (default: 0.5)
      -sizeW SIZE_WEIGHT, --size_weight SIZE_WEIGHT
                            weight of log(genome size) (default: 0)
      -extraW EXTRA_WEIGHT_TABLE, --extra_weight_table EXTRA_WEIGHT_TABLE
                            Path to a tab-separated file with two-columns, no
                            headers, listing genome and extra score to apply to
                            that genome (optional)
    ]]></token>
    <token name="@TAXONOMY_HELP@"><![CDATA[
    TAXONOMY:
      --run_tax             generate taxonomy information (Tdb) (default: False)
      --tax_method {percent,max}
                            Method of determining taxonomy
                            percent = The most descriptive taxonomic level with at least (per) hits
                            max = The centrifuge taxonomic level with the most overall hits
                            (default: percent)
      -per PERCENT, --percent PERCENT
                            minimum percent for percent method (default: 50)
      --cent_index CENT_INDEX
                            path to centrifuge index (for example,
                            /home/mattolm/download/centrifuge/indices/b+h+v
                            (default: None)
    ]]></token>
    <token name="@WARNINGS_HELP@"><![CDATA[
    WARNINGS:
      --warn_dist WARN_DIST
                            How far from the threshold to throw cluster warnings
                            (default: 0.25)
      --warn_sim WARN_SIM   Similarity threshold for warnings between dereplicated
                            genomes (default: 0.98)
      --warn_aln WARN_ALN   Minimum aligned fraction for warnings between
                            dereplicated genomes (ANIn) (default: 0.25)
    ]]></token>
</macros>