Mercurial > repos > iuc > picrust2_metagenome_pipeline

<?xml version="1.0"?>
<macros>
    <!-- Version tokens. @TOOL_VERSION@ is reused as the version of the
         picrust2 package requirement in the "requirements" macro.
         @VERSION_SUFFIX@ and @PROFILE@ are not referenced in the visible part
         of this file; presumably the tool wrappers use them for the wrapper
         revision and the Galaxy profile (confirm against the wrappers). -->
    <token name="@TOOL_VERSION@">2.5.3</token>
    <token name="@VERSION_SUFFIX@">0</token>
    <token name="@PROFILE@">22.01</token>
    <!-- Cross-reference of type bio.tools with the identifier picrust2. -->
    <xml name="bio_tool">
        <xrefs>
            <xref type="bio.tools">picrust2</xref>
        </xrefs>
    </xml>
    <!-- Package requirement on picrust2 pinned to @TOOL_VERSION@. The unnamed
         yield lets the expanding tool add further child elements inside
         the requirements element. -->
    <xml name="requirements">
        <requirements>
            <requirement type="package" version="@TOOL_VERSION@">picrust2</requirement>
            <yield/>
        </requirements>
    </xml>
    <!-- Shared introduction for help texts: a short description of PICRUSt2
         and a link to its wiki. The underlined heading suggests that the text
         is rendered as reStructuredText (confirm in the wrappers). -->
    <token name="@HELP_HEADER@">
What it does
============

PICRUSt2 (Phylogenetic Investigation of Communities by Reconstruction of
Unobserved States) is a tool for predicting functional abundances based only on
marker gene sequences.

Read more about the tool: https://github.com/picrust/picrust2/wiki
    </token>
    <!-- Citation block containing a single DOI citation. -->
    <xml name="citations">
        <citations>
            <citation type="doi">10.1038/s41587-020-0548-6</citation>
        </citations>
    </xml>


    <!-- Defines the Cheetah helper getVarCond(sec_cond, var). It returns the
         value of "var" if a variable of that name exists, otherwise the value
         of "sec_cond.var" if that exists, otherwise nothing (bare return).
         The other command tokens in this file call $getVarCond, so a command
         template presumably has to expand this token before them (confirm
         against the tool wrappers). -->
    <token name="@VAR_ACCESS_FOO@"><![CDATA[
    ## in picrust2_pipeline the parameters are within a section or a
    ## conditional. in the separate sections they are not.
    ## this function allows unified access
    #def getVarCond($sec_cond, $var)
        #if $varExists($var)
            #return $getVar($var)
        #else if $varExists($sec_cond + "." + $var)
            #return $getVar($sec_cond + "." + $var)
        #else
            #return
        #end if
    #end def
    ]]></token>

    <!-- macros for place_seqs -->

    <!-- Shell preamble for sequence placement. Sets PROJECT_DIR from the
         installed picrust2 package and REF_DIR_BASE to its default_files/
         subdirectory. If the reference data selector is "custom", the user
         supplied files are symlinked into custom/ under fixed names:
         custom.fna, custom.hmm, custom.tre and the model file as custom.model
         for epa-ng or as custom.raxml_info for any other non-empty placement
         tool value. Every emitted command ends with "&&" so further commands
         can follow. -->
    <token name="@PLACE_SEQS_PREPROCESSING@"><![CDATA[
        ## determine project dir which is something like /lib/python3.8/site-packages/picrust2/default_files/
        PROJECT_DIR=\$(python -c 'from picrust2 import default; print(default.project_dir)') &&
        REF_DIR_BASE=\$PROJECT_DIR"/default_files/" &&
        #if $getVarCond("place_seqs_section", "ref_dir.selector") == "custom"
            mkdir -p custom/ &&
            ln -s '$getVarCond("place_seqs_section", "ref_dir.custom_fna")' custom/custom.fna &&
            ln -s '$getVarCond("place_seqs_section", "ref_dir.custom_hmm")' custom/custom.hmm &&
            #if $getVarCond("place_seqs_section", "placement_tool") == "epa-ng"
                ln -s '$getVarCond("place_seqs_section", "ref_dir.custom_model")' custom/custom.model &&
            #else if $getVarCond("place_seqs_section", "placement_tool")
                ln -s '$getVarCond("place_seqs_section", "ref_dir.custom_model")' custom/custom.raxml_info &&
            #end if
            ln -s '$getVarCond("place_seqs_section", "ref_dir.custom_tre")' custom/custom.tre &&
        #end if
    ]]></token>
    <!-- Command line arguments for sequence placement: study_fasta,
         placement_tool and min_align are always emitted. ref_dir is emitted
         as custom/ for custom reference data, as a path below REF_DIR_BASE
         for a built-in selection other than prokaryotic/pro_ref/, and is
         omitted for the prokaryotic default. REF_DIR_BASE and custom/ are
         prepared by @PLACE_SEQS_PREPROCESSING@. -->
    <token name="@PLACE_SEQS_PARAMS@"><![CDATA[
        --study_fasta '$getVarCond("place_seqs_section", "study_fasta")'
        --placement_tool '$getVarCond("place_seqs_section", "placement_tool")'
        ## set refdir (default is prokaryotic), even if the default will
        ## be treated internally as `"\$REF_DIR_BASE"$ref_dir.selector`
        ## picrust2 will complain about non-default reference files
        ## specified with default pathway mapfile
        #if $getVarCond("place_seqs_section", "ref_dir.selector") == "custom"
            --ref_dir custom/
        #else if $getVarCond("place_seqs_section", "ref_dir.selector") != "prokaryotic/pro_ref/"
            --ref_dir "\$REF_DIR_BASE"$getVarCond("place_seqs_section", "ref_dir.selector")
        #end if
        --min_align $getVarCond("place_seqs_section", "min_align")
    ]]></token>
    <!-- Input parameters for sequence placement: the study FASTA, the
         placement tool, the reference data conditional "ref_dir" (three
         built-in reference sets or four custom files) and the minimum
         alignment proportion. The parameter names are the ones read by
         @PLACE_SEQS_PREPROCESSING@ and @PLACE_SEQS_PARAMS@. The values of the
         built-in reference options are relative paths that
         @PLACE_SEQS_PARAMS@ appends to REF_DIR_BASE. -->
    <xml name="place_seqs_params">
        <param argument="--study_fasta" type="data" format="fasta" label="Study sequences" help="Sequences of the representative OTUs and/or ASVs. Sequences need to be on the positive strand and the headerline should be only one field, i.e. no additional whitespace-delimited fields"/>
        <param argument="--placement_tool" type="select" label="Placement tool" help="Used for placing sequences into reference tree">
            <option value="epa-ng" selected="true">EPA-ng -  Fast, parallel, highly accurate Maximum Likelihood Phylogenetic Placement, by the team behind RAxML(-ng)</option>
            <option value="sepp">SEPP - SATe-enabled Phylogenetic Placement</option>
        </param>
        <conditional name="ref_dir">
            <param name="selector" type="select" label="Reference data" help="Used for sequence placement">
                <option value="prokaryotic/pro_ref/" selected="true">Prokaryotic 16S rRNA gene</option>
                <!-- TODO https://github.com/picrust/picrust2/issues/276 -->
                <option value="fungi/fungi_ITS/">Fungal ITS (only for epa-ng)</option>
                <option value="fungi/fungi_18S/">Fungal 18S (only for epa-ng)</option>
                <option value="custom">Custom reference sequence files</option>
            </param>
            <when value="prokaryotic/pro_ref/"/>
            <when value="fungi/fungi_ITS/"/>
            <when value="fungi/fungi_18S/"/>
            <when value="custom">
                <param name="custom_fna" type="data" format="fasta" label="Multiple-sequence alignment of reference sequences"/>
                <param name="custom_hmm" type="data" format="hmm2,hmm3" label="Hidden-markov model of the multiple-sequence alignment" help="The HMM of the alignment can be created using hmmbuild"/>
                <param name="custom_tre" type="data" format="newick" label="Tree of the reference sequences"/>
                <param name="custom_model" type="data" format="txt" label="Modelfile" help="For epa-ng: output by RaXmL specifying the best parameters for the tree, for sepp see examples in PICRUSt2 repository"/>
            </when>
        </conditional>
        <param argument="--min_align" type="float" value="0.80" min="0.0" max="1.0" label="Minimum alignment length" help="Proportion of the total length of an input query sequence that must align with reference sequences. Sequences with lengths below this value will be excluded from the placement and all subsequent steps"/>
    </xml>
    <!-- Outputs of sequence placement, located below @FROM_WORK_DIR@ (token
         without default): the tree out.tre and a list collection discovered
         from intermediate/place_seqs/. @LABEL_SUFFIX@ is appended to both
         labels and defaults to the empty string. The unnamed yield lets the
         caller add child elements to the collection. -->
    <xml name="place_seqs_output" tokens="from_work_dir" token_label_suffix="">
        <data name="out_tree" format="newick" from_work_dir="@FROM_WORK_DIR@/out.tre" label="${tool.name} on ${on_string}: Tree of reference and study 16S sequences @LABEL_SUFFIX@"/>
        <collection name="place_seqs_intermediate_output" type="list" label="${tool.name} on ${on_string}: Intermediate files @LABEL_SUFFIX@" >
            <discover_datasets pattern="__name_and_ext__" directory="@FROM_WORK_DIR@/intermediate/place_seqs/"/>
            <yield/>
        </collection>
    </xml>

    <!-- parameters of hsp -->
    <!-- Command line arguments for hidden-state prediction, shared by the hsp
         wrapper (parameters at top level) and picrust2_pipeline (parameters
         inside the section "hsp_section"). Uses $getVarCond, which is defined
         by @VAR_ACCESS_FOO@. A template exception is raised if the wrapper
         defines neither variant of the trait input parameters or of the NSTI
         parameter. -->
    <token name="@HSP_PARAMS@"><![CDATA[
    ## hsp and picrust2_pipeline
    #if $getVarCond("hsp_section", "trait_input.selector") == "default"
        #if $varExists('trait_input.in_trait')
            --in_trait '$trait_input.in_trait'
        #else if $varExists('hsp_section.trait_input.in_traits')
            --in_traits '$hsp_section.trait_input.in_traits'
        #else
            #raise Exception("wrapper must define in_trait / in_traits")
        #end if
    #else if $getVarCond("hsp_section", "trait_input.selector") == "custom"
        #if $varExists('trait_input.observed_trait_table')
            --observed_trait_table '$trait_input.observed_trait_table'
        #else if $varExists('hsp_section.trait_input.custom_trait_tables')
            --custom_trait_tables '$hsp_section.trait_input.custom_trait_tables'
            --marker_gene_table '$hsp_section.trait_input.marker_gene_table'
        #else
            #raise Exception("wrapper must define observed_trait_table / (custom_trait_tables + marker_gene_table)")
        #end if
    #end if


    --hsp_method '$getVarCond("hsp_section", "hsp_method_options.hsp_method")'
    #if $getVarCond("hsp_section", "hsp_method_options.hsp_method") == "mp"
        --edge_exponent $getVarCond("hsp_section", "hsp_method_options.edge_exponent")
    #else if $getVarCond("hsp_section", "hsp_method_options.hsp_method") == "emp_prob"
        ## special treatment of seed (option absent in picrust2_pipeline):
        ## only the top level name is checked because the seed parameter is
        ## only added by wrappers that fill the unnamed yield of hsp_params
        #if $varExists('hsp_method_options.seed')
            --seed $hsp_method_options.seed
        #end if
    #end if
    ## hsp and picrust2_pipeline use different CLI params to toggle NSTI computation
    #if $varExists('calculate_NSTI')
        $calculate_NSTI
    #else if $varExists('hsp_section.skip_nsti')
        $hsp_section.skip_nsti
    #else
        #raise Exception("wrapper must define calculate_NSTI / skip_nsti")
    #end if
    ]]></token>
    <!-- Parameters for hidden-state prediction, shared by hsp and
         picrust2_pipeline.

         Tokens:
         - exactly one of nsti_truevalue / nsti_falsevalue must be given,
           because the CLI parameter that toggles the NSTI computation
           differs between hsp and picrust2_pipeline; the other one keeps
           its empty default
         - nsti_checked must be set accordingly to true or false
         - in_trait_arg, in_trait_multiple and in_trait_label_suff configure
           the select of the pre-calculated trait tables; in_traits_help is
           its help text and defaults to the empty string

         Furthermore three yields can be used (2 named and 1 unnamed):
         - the unnamed yield is used to add the seed param for hsp
           (for \-\-hsp_method emp_prob)
         - the named yield `add_default_traits` is used to add two default
           trait tables for hsp
         - the named yield `custom_traits` is used for the different
           parameters to specify custom trait tables in hsp
           (observed_trait_table) and picrust2_pipeline
           (custom_trait_tables, marker_gene_table)
        -->
    <xml name="hsp_params" tokens="nsti_checked,in_trait_arg,in_trait_multiple,in_trait_label_suff" token_nsti_truevalue="" token_nsti_falsevalue="" token_in_traits_help="">
        <conditional name="trait_input">
            <param name="selector" type="select" label="Trait table@IN_TRAIT_LABEL_SUFF@" help="i.e. which gene families to predict">
                <option value="default" selected="true">Default trait table@IN_TRAIT_LABEL_SUFF@</option>
                <option value="custom">Customized trait table@IN_TRAIT_LABEL_SUFF@</option>
            </param>
            <when value="default">
                <param argument="@IN_TRAIT_ARG@" type="select" multiple="@IN_TRAIT_MULTIPLE@" optional="false" label="Pre-calculated trait table@IN_TRAIT_LABEL_SUFF@" help="@IN_TRAITS_HELP@">
                    <option value="COG">Clusters of Orthologous Genes database (COG)</option>
                    <option value="EC" selected="true">Enzyme Commission number database (EC number)</option>
                    <option value="KO" selected="true">KEGG Orthology database (KO)</option>
                    <option value="PFAM">Pfam database</option>
                    <option value="TIGRFAM">TIGRFAM database</option>
                    <yield name="add_default_traits"/>
                </param>
            </when>
            <when value="custom">
                <yield name="custom_traits"/>
            </when>
        </conditional>
        <conditional name="hsp_method_options">
            <param argument="--hsp_method" type="select" label="Hidden-state prediction method">
                <option value="mp" selected="true">Predict discrete traits by: Maximum parsimony (mp)</option>
                <option value="emp_prob">Predict discrete traits by: Empirical state probabilities across tips (emp_prob)</option>
                <option value="subtree_average">Predict continuous traits by: Subtree averaging (subtree_average)</option>
                <option value="pic">Predict continuous traits by: phylogenetic independent contrast (pic)</option>
                <option value="scp">Reconstruct continuous traits by: squared-change parsimony (scp)</option>
            </param>
            <when value="mp">
                <param argument="--edge_exponent" type="float" value="0.5" min="0.0" label="Transition cost weight" help="Specifies weighting transition costs by the inverse length of edge lengths. If 0, then edge lengths do not influence predictions"/>
            </when>
            <when value="emp_prob">
                <yield/>
            </when>
            <when value="subtree_average"/>
            <when value="pic"/>
            <when value="scp"/>
        </conditional>
        <param argument="@NSTI_TRUEVALUE@@NSTI_FALSEVALUE@" type="boolean" truevalue="@NSTI_TRUEVALUE@" falsevalue="@NSTI_FALSEVALUE@" checked="@NSTI_CHECKED@" label="Calculate NSTI and add to output file" help="And add to output file"/>
    </xml>

    <!-- parameters of the metagenome_pipeline -->

    <!-- Shell preamble for the metagenome pipeline: resolves the input
         dataset via $getVarCond (defined by @VAR_ACCESS_FOO@), maps its
         Galaxy extension to a file suffix (msf, tsv or biom) and symlinks the
         dataset to input.<suffix> in the working directory. The Cheetah
         variable $ext set here is reused by @METAGENOME_PIPELINE_PARAMS@. For
         any other extension the job prints a message to stderr and exits
         with status 1. -->
    <token name="@PREPARE_METAGENOME_PIPELINE_PARAMS@"><![CDATA[
#set $_input=$getVarCond("metagenome_pipeline_section", "input")
#if $_input.ext == "mothur.shared"
    #set ext="msf"
#else if $_input.ext == "tabular"
    #set ext="tsv"
#else if $_input.ext.startswith('biom')
    #set ext="biom"
#else
    ## ext must be defined here as well, otherwise rendering 'input.$ext'
    ## below fails before the error message can be emitted
    #set ext="unknown"
    echo "unknown extension $_input.ext" >&2;
    exit 1;
#end if
## link the resolved dataset: a plain $input does not exist if the parameter
## is located in a section
ln -s '$_input' 'input.$ext' &&
    ]]></token>
    <!-- Command line arguments for the metagenome pipeline. The input is
         always passed as input.$ext; $ext is set (and the symlink created) by
         @PREPARE_METAGENOME_PIPELINE_PARAMS@. min_reads and min_samples are
         only emitted for ASV tables, and the wide_table value only if the
         stratified output selector is not empty. The values of the
         stratified output selector and of skip_norm are emitted verbatim. -->
    <token name="@METAGENOME_PIPELINE_PARAMS@"><![CDATA[
--input 'input.$ext'
#if $getVarCond("metagenome_pipeline_section", "input_options.selector") == "ASV"
    --min_reads $getVarCond("metagenome_pipeline_section", "input_options.min_reads")
    --min_samples $getVarCond("metagenome_pipeline_section", "input_options.min_samples")
#end if
$getVarCond("metagenome_pipeline_section", "stratified_output.selector")
#if $getVarCond("metagenome_pipeline_section", "stratified_output.selector") != ''
    $getVarCond("metagenome_pipeline_section", "stratified_output.wide_table")
#end if
$getVarCond("metagenome_pipeline_section", "skip_norm")
--max_nsti $getVarCond("metagenome_pipeline_section", "max_nsti")
    ]]></token>
    <!-- Input parameters of the metagenome pipeline: the sequence abundance
         table, the table type conditional (the ASV branch adds min_reads and
         min_samples), max_nsti, the stratified output conditional and
         skip_norm. The token stratified_arg is the CLI flag used as value of
         the "Yes" option of the stratified output selector; it has no
         default. The unnamed yield lets the caller insert further parameters
         after the table type conditional. -->
    <xml name="metagenome_pipeline_params" tokens="stratified_arg">
        <param argument="--input" type="data" format="tabular,biom1,biom2,mothur.shared" label="Sequence abundance table (OTUs or ASVs)" help="The sequence abundances should be in read counts and not relative abundances. The tool will normalize the input sequence abundance table by the predicted number of marker genes"/>
        <conditional name="input_options">
            <param name="selector" type="select" label="Sequence abundance table type">
                <option value="OTU" selected="true">Operational Taxonomic Units (OTU)</option>
                <option value="ASV">Amplicon Sequence Variants (ASV)</option>
            </param>
            <when value="OTU">
            </when>
            <when value="ASV">
                <param argument="--min_reads" type="integer" min="1" value="1" label="Minimum number of reads across all samples for each input ASV" help="ASVs below this cut-off will be counted as part of the RARE category in the stratified output"/>
                <param argument="--min_samples" type="integer" min="1" value="1" label="Minimum number of samples that an ASV needs to be identified within" help="ASVs below this cut-off will be counted as part of the RARE category in the stratified output"/>
            </when>
        </conditional>
        <yield/>
        <param argument="--max_nsti" type="float" min="0" value="2.0" label="Maximum Nearest-sequenced taxon index (NSTI)" help="Sequences with larger values will be excluded"/>
        <conditional name="stratified_output">
            <param argument="@STRATIFIED_ARG@" name="selector" type="select" label="Generate an output table stratified by sequences">
                <option value="" selected="true">No</option>
                <option value="@STRATIFIED_ARG@">Yes [will increase run-time]</option>
            </param>
            <when value=""/>
            <when value="@STRATIFIED_ARG@">
                <param argument="--wide_table" type="boolean" truevalue="--wide_table" falsevalue="" checked="false" label="Output wide-format stratified table of metagenome predictions" help="This is the deprecated method of generating stratified tables since it is extremely memory intensive"/>
            </when>
        </conditional>
        <param argument="--skip_norm" type="boolean" truevalue="--skip_norm" falsevalue="" checked="false" label="Skip normalizing sequence abundances by predicted marker gene copy numbers"/>
    </xml>

    <!-- pathway_pipeline macros-->
    <!-- Command line arguments for pathway prediction, shared by
         pathway_pipeline (parameters at top level) and picrust2_pipeline
         (parameters inside "predict_pathways"). Uses $getVarCond, which is
         defined by @VAR_ACCESS_FOO@. The values of flag-like parameters
         (skip_minpath, no_gap_fill, no_regroup, per_sequence_contrib,
         wide_table, coverage) are emitted verbatim. -->
    <token name="@PATHWAY_PIPELINE_PARAMS@"><![CDATA[
## in pathway_pipeline its --map while in picrust2_pipeline its --pathway_map
#if $varExists('map') and $map
    --map '$map'
#else if $varExists('predict_pathways.pathway_map') and $predict_pathways.pathway_map
    --pathway_map '$predict_pathways.pathway_map'
#end if
$getVarCond("predict_pathways", "skip_minpath")
$getVarCond("predict_pathways", "no_gap_fill")
$getVarCond("predict_pathways", "regrouping.no_regroup")
## the optional regroup map is a member of the conditional "regrouping", so
## it has to be read with the same qualified name that is tested here
#if $getVarCond("predict_pathways", "regrouping.no_regroup") == '' and $getVarCond("predict_pathways", "regrouping.regroup_map")
    --regroup_map '$getVarCond("predict_pathways", "regrouping.regroup_map")'
#end if
$getVarCond("predict_pathways", "strat_output.per_sequence_contrib")
#if $getVarCond("predict_pathways", "strat_output.per_sequence_contrib") != ""
    --per_sequence_function '$getVarCond("predict_pathways", "strat_output.per_sequence_function")'
    --per_sequence_abun '$getVarCond("predict_pathways", "strat_output.per_sequence_abun")'
    $getVarCond("predict_pathways", "strat_output.wide_table")
#end if
$getVarCond("predict_pathways", "coverage")
    ]]></token>
    <!-- Input parameters for pathway prediction. The token mapargument is
         the CLI argument of the optional custom pathway map; according to
         @PATHWAY_PIPELINE_PARAMS@ the resulting parameter is named map in
         pathway_pipeline and pathway_map in picrust2_pipeline. skip_minpath
         and no_gap_fill are inverted booleans: the CLI flag is the falsevalue,
         so checking the box runs the step. The conditionals "regrouping" and
         "strat_output" use the CLI flag itself as the select value. -->
    <xml name="pathway_pipeline_params" tokens="mapargument">
        <param argument="@MAPARGUMENT@" type="data" format="txt,tabular" optional="true" label="Customized table mapping of pathways to reactions" help="The default mapping file maps MetaCyc reactions to prokaryotic MetaCyc pathways"/>
        <param argument="--skip_minpath" type="boolean" truevalue="" falsevalue="--skip_minpath" checked="true" label="Run MinPath to identify which pathways are present as a first pass"/>
        <param argument="--no_gap_fill" type="boolean" truevalue="" falsevalue="--no_gap_fill" checked="true" label="Perform gap filling before predicting pathway abundances"/>
        <conditional name="regrouping">
            <param argument="--no_regroup" type="select" label="Regroup input gene families to reactions">
                <option value="">Yes</option>
                <option value="--no_regroup">No</option>
            </param>
            <when value="">
                <param argument="--regroup_map" type="data" format="tabular" optional="true" label="Mapfile of ids to regroup gene families to before running MinPath" help="Keep empty to use the default mapping file (ec_level4_to_metacyc_rxn.tsv contained in PICRUSt2)"/>
            </when>
            <when value="--no_regroup"/>
        </conditional>
        <conditional name="strat_output">
            <param argument="--per_sequence_contrib" type="select" label="Calculate pathway abundances for each individual predicted genome" help="The output will be the predicted pathway abundance contributed by each individual sequence. This is in contrast to the default stratified output, which is the contribution to the community-wide pathway abundances. Note this will greatly increase the runtime. Experimental pathway coverage stratified by contributing sequence will also be output when --coverage is set">
                <option value="--per_sequence_contrib">Yes</option>
                <option value="" selected="true">No</option>
            </param>
            <when value="--per_sequence_contrib">
                <param argument="--per_sequence_abun" type="data" format="tabular" label="Table of sequence abundances across samples normalized by marker copy number" help="Typically the normalized sequence abundance table output at the metagenome pipeline step. This input is required when the per sequence contrib option is set"/>
                <param argument="--per_sequence_function" type="data" format="tabular" label="Table of function abundances per sequence, which was outputted at the hidden-state prediction step" help="This input is required when the per sequence contrib option is set. Note that this file should be the same input table as used for the metagenome pipeline step"/>
                <!-- TODO maybe deprecate .. because complicated anyway as its used in metagenome_pipeline as well and help says deprecated as well -->
                <!-- help kept on one line: a line break inside an attribute value would add the indentation whitespace to the text -->
                <param argument="--wide_table" type="boolean" truevalue="--wide_table" falsevalue="" checked="false" label="Output wide-format stratified table (DEPRECATED)" help="Instead of the metagenome contribution table. This is the deprecated method of generating stratified tables since it is extremely memory intensive"/>
            </when>
            <when value=""/>
        </conditional>
        <param argument="--coverage" type="boolean" truevalue="--coverage" falsevalue="" checked="false" label="Calculate pathway coverages as well as abundances" help="Experimental and only useful for advanced users"/>
    </xml>
    <!-- Outputs of the pathway step, all located below @FROM_WORK_DIR@ (token
         without default): the unstratified pathway abundances, a list
         collection discovered from intermediate/pathways/ and four further
         tabular files from pathways_out/. @LABEL_SUFFIX@ defaults to the
         empty string; it is appended to every label except the one of the
         unstratified pathway abundances. Every data element expands the
         unnamed yield. The collection and the four further data elements
         additionally expand a named yield (intermediate_filter,
         coverage_filter or per_sequence_filter), presumably for filter
         elements supplied by the caller (confirm against the wrappers). -->
    <xml name="pathways_output" tokens="from_work_dir" token_label_suffix="">
        <data name="pathways_output" format="tabular" from_work_dir="@FROM_WORK_DIR@/pathways_out/path_abun_unstrat.tabular" label="${tool.name} on ${on_string}: Pathway abundances">
            <yield/>
        </data>
        <collection name="pathways_intermediate_output" type="list" label="${tool.name} on ${on_string}: Intermediate files @LABEL_SUFFIX@" >
            <discover_datasets pattern="__name_and_ext__" directory="@FROM_WORK_DIR@/intermediate/pathways/" format="tabular"/>
            <yield name="intermediate_filter"/>
        </collection>
        <data format="tabular" name="path_cov_unstrat" from_work_dir="@FROM_WORK_DIR@/pathways_out/path_cov_unstrat.tabular" label="${tool.name} on ${on_string}: Pathway coverage @LABEL_SUFFIX@" >
            <yield/>
            <yield name="coverage_filter"/>
        </data>
        <data format="tabular" name="path_abun_unstrat_per_seq" from_work_dir="@FROM_WORK_DIR@/pathways_out/path_abun_unstrat_per_seq.tabular" label="${tool.name} on ${on_string}: Pathway abundance unstratified per sequence @LABEL_SUFFIX@" >
            <yield/>
            <yield name="per_sequence_filter"/>
        </data>
        <data format="tabular" name="path_abun_predictions" from_work_dir="@FROM_WORK_DIR@/pathways_out/path_abun_predictions.tabular" label="${tool.name} on ${on_string}: Pathway abundance predictions @LABEL_SUFFIX@" >
            <yield/>
            <yield name="per_sequence_filter"/>
        </data>
        <data format="tabular" name="path_abun_contrib" from_work_dir="@FROM_WORK_DIR@/pathways_out/path_abun_contrib.tabular" label="${tool.name} on ${on_string}: Pathway abundance contributed @LABEL_SUFFIX@" >
            <yield/>
            <yield name="per_sequence_filter"/>
        </data>
    </xml>
</macros>
author	iuc
date	Tue, 13 Aug 2024 12:10:55 +0000
parents	415cb5d91168
children