view funannotate_predict.xml @ 0:40b87aef5241 draft

"planemo upload commit 9613152729099079c7465c3d5d42005ef22ca91e"
author iuc
date Thu, 26 Aug 2021 06:55:33 +0000
parents
children 1a59958c1f76
line wrap: on
line source

<tool id="funannotate_predict" name="Funannotate predict annotation" profile="20.01" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@">
    <description></description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <requirements>
        <expand macro="requirements" />
    </requirements>
    <version_command><![CDATA[funannotate check --show-versions]]></version_command>
    <command><![CDATA[
## The script has four stages: optional GeneMark set-up, an optional test-only database
## copy, the funannotate predict call itself, and finally the renaming of the result files
## to the fixed names that the outputs collect through from_work_dir.

## GeneMark is only enabled when the user supplies a license file.
#if $genemark.genemark_license:
    if [ -z "\$GENEMARK_PATH" ] ; then echo "GeneMark is not installed on this Galaxy server." >&2 ; exit 1 ; fi &&
    if [ ! -f "\$GENEMARK_PATH/gmes_petap.pl" ] ; then echo "GeneMark is not installed properly on this Galaxy server." >&2 ; exit 1 ; fi &&
    ## GeneMark only searches for its license in ~/.gm_key
    cp '${genemark.genemark_license}' ~/.gm_key &&
#end if

#if $uglyTestingHack == "true":
    ## funannotate_db contains some hard coded path, need to rewrite one for tests (not in real life when using data manager)
    ## Need to copy too as the test_data is read only on CI
    cp -r '${database.fields.path}' './hacked_database' &&
    sed -i.bak 's|/tmp/prout|'`pwd`'/hacked_database|' './hacked_database/trained_species/fly/info.json' &&
#end if

funannotate predict
--input '${input}'
--out output

## Use the rewritten copy of the database when the test hack is active.
#if $uglyTestingHack == "true":
    --database `pwd`'/hacked_database'
#else
    --database '${database.fields.path}'
#end if

--species '${organism.species}'
--isolate '${organism.isolate}'
--strain '${organism.strain}'
--organism '${organism.organism}'
--ploidy ${organism.ploidy}
--SeqCenter '${organism.SeqCenter}'
--SeqAccession '${organism.SeqAccession}'
--name '${organism.name}'
--numbering ${organism.numbering}

#if $parameters:
    --parameters '${parameters}'
#end if

## Dataset paths are always quoted, like every other path on this command line.
#if $evidences.rna_bam:
    --rna_bam '${evidences.rna_bam}'
#end if

## Build the quoted list of transcript files first so that the option is only emitted
## when at least one dataset was really selected.
#set est_list = ""
#if len($evidences.transcript_evidence) > 0:
    #for $estev in $evidences.transcript_evidence:
        #if $estev:
        #set est_list += " '" + str($estev) + "'"
        #end if
    #end for
#end if
#if $est_list:
    --transcript_evidence $est_list
#end if

## The selector and the datasets both live inside the prot_evidence conditional,
## so they have to be addressed through it.
#if $evidences.prot_evidence.prot_evidence_source == 'custom':
    --protein_evidence
    #for $protev in $evidences.prot_evidence.protein_evidence:
        '${protev}'
    #end for
#end if
--p2g_pident ${evidences.p2g_pident}
--p2g_prefilter ${evidences.p2g_prefilter}

## 'none' is the placeholder option meaning that Augustus is trained from scratch.
#if $augustus.augustus_species != 'none':
    --augustus_species '${augustus.augustus_species}'
#end if
--min_training_models ${augustus.min_training_models}
${augustus.optimize_augustus}

#if $genemark.genemark_license:
    --genemark_mode '${genemark.genemark_mode}'
    #if $genemark.genemark_mod:
        --genemark_mod '${genemark.genemark_mod}'
    #end if
    --soft_mask ${genemark.soft_mask}
#end if

--busco_seed_species '${busco.busco_seed_species}'
--busco_db '${busco.busco_db}'

$evm.repeats2evm
#if $evm.evm_partitioning.evm_partition == "yes":
    --evm-partition-interval ${evm.evm_partitioning.evm_partition_interval}
#else
    --no-evm-partitions
#end if
#if $evm.weights:
    ## Left unquoted on purpose so that each predictor:weight pair reaches funannotate as
    ## its own argument; the regex validator on this parameter only lets word characters,
    ## colons and spaces through.
    ## NOTE(review): assumes funannotate reads --weights as a multi-value option - confirm.
    --weights ${evm.weights}
#end if

## The weight is appended to the file path with a colon for pasa_gff and other_gff.
#if $other_predictors.stringtie:
    --stringtie '${other_predictors.stringtie}'
#end if
#if $other_predictors.maker_gff:
    --maker_gff '${other_predictors.maker_gff}'
#end if
#if $other_predictors.pasa_gff:
    --pasa_gff '${other_predictors.pasa_gff}:${other_predictors.pasa_gff_weight}'
#end if
#if $other_predictors.other_gff:
    --other_gff '${other_predictors.other_gff}:${other_predictors.other_gff_weight}'
#end if

--min_intronlen ${filtering.min_intronlen}
--max_intronlen ${filtering.max_intronlen}
--min_protlen ${filtering.min_protlen}
${filtering.keep_no_stops}
## Unquoted on purpose: the "overlap blast" choice has to expand to two separate words.
--repeat_filter ${filtering.repeat_filter}

--cpus \${GALAXY_SLOTS:-2}

&&

## Result files are prefixed by funannotate; give them stable names for from_work_dir.
mv output/predict_results/*.gbk out.gbk &&
mv output/predict_results/*.tbl out.tbl &&
mv output/predict_results/*.gff3 out.gff3 &&
mv output/predict_results/*.proteins.fa out.proteins.fa &&
mv output/predict_results/*.mrna-transcripts.fa out.mrna-transcripts.fa &&
mv output/predict_results/*.cds-transcripts.fa out.cds-transcripts.fa &&
mv output/predict_results/*.discrepency.report.txt out.discrepency.report.txt &&
mv output/predict_results/*.error.summary.txt out.error.summary.txt &&
mv output/predict_results/*.validation.txt out.validation.txt &&
mv output/predict_results/*.stats.json out.stats.json
    ]]></command>
    <inputs>
        <!-- Genome assembly handed to funannotate through the input option. -->
        <param argument="--input"
               type="data"
               format="fasta"
               label="Assembly to annotate"
               help="The assembly should be soft-masked (with RepeatMasker for example)" />

        <!-- Funannotate database chosen from the "funannotate" data table. The command reads
             the "path" column of the selected row. The static_value filter only keeps rows
             whose column 2 is "1.0" (presumably a data table format version: to be confirmed). -->
        <param name="database" type="select" label="Funannotate database">
            <options from_data_table="funannotate">
                <column name="value" index="0" />
                <column name="name" index="1" />
                <column name="path" index="3" />
                <filter type="sort_by" column="0" />
                <filter type="static_value" column="2" value="1.0" />
            </options>
        </param>

        <!-- Organism metadata and naming options; each parameter is forwarded to the
             funannotate option of the same name. Only the species name is mandatory. -->
        <section name="organism" title="Organism" expanded="true">
            <param argument="--species"
                   type="text"
                   optional="false"
                   label="Name of the species to annotate"
                   help="e.g. Genus species">
                <validator type="empty_field" />
            </param>
            <param argument="--isolate"
                   type="text"
                   label="Isolate name"
                   help="If relevant (e.g. Af293)" />
            <param argument="--strain"
                   type="text"
                   label="Strain name"
                   help="If relevant (e.g. FGSCA4)" />
            <!-- The checkbox is translated into the literal values "fungus" or "other". -->
            <param argument="--organism"
                   type="boolean"
                   checked="false"
                   truevalue="fungus"
                   falsevalue="other"
                   label="Is it a fungus species?" />
            <param argument="--ploidy"
                   type="integer"
                   value="1"
                   label="Ploidy of assembly" />
            <param argument="--SeqCenter"
                   type="text"
                   value="CFMR"
                   label="Sequencing facility for NCBI tbl file" />
            <param argument="--SeqAccession"
                   type="text"
                   value="12345"
                   label="Sequence accession number for NCBI tbl file" />
            <param argument="--name"
                   type="text"
                   value="FUN_"
                   label="Locus tag prefix"
                   help="Will prefix all the gene names" />
            <param argument="--numbering"
                   type="integer"
                   value="1"
                   label="Specify where gene numbering starts" />
        </section>

        <!-- Evidence used to guide the prediction: RNA-seq alignments, transcript sequences
             and protein sequences. -->
        <section name="evidences" expanded="true" title="Evidences">
            <param argument="--rna_bam" type="data" format="bam" optional="true" label="RNA-seq mapped to genome to train Augustus/GeneMark-ET" />
            <param argument="--transcript_evidence" type="data" format="fasta" multiple="true" optional="true" label="mRNA/ESTs to align to genome" />
            <!-- Protein evidence: the protein_evidence option is only meant to be passed for
                 the "custom" choice. -->
            <conditional name="prot_evidence">
                <param name="prot_evidence_source" type="select" label="Select protein evidences">
                    <option value="uniprot" selected="True">Use UniProtKb/SwissProt (from selected Funannotate database)</option>
                    <option value="custom">Custom protein sequences</option>
                </param>
                <when value="uniprot"/>
                <when value="custom">
                    <param argument="--protein_evidence" type="data" format="fasta" multiple="true" label="Proteins to map to genome" />
                </when>
            </conditional>
            <param argument="--p2g_pident" type="integer" value="80" label="Exonerate percent identity (for proteins)" />
            <param argument="--p2g_prefilter" type="select" label="Prefilter hits with (for proteins)">
                <option value="diamond" selected="True">Diamond</option>
                <option value="tblastn">tblastn (slower)</option>
            </param>
        </section>

        <!-- Optional JSON of ab-initio training parameters from a previous run. -->
        <param argument="--parameters"
               type="data"
               format="json"
               optional="true"
               label="Ab-initio training parameters from a previous run"
               help="If specified, will over-rule any other training presets based on species selection." />

        <!-- Optional annotations produced by other tools. For the PASA and pass-through GFF
             files the command appends the matching weight to the file path after a colon. -->
        <section name="other_predictors" title="Other annotations" expanded="false">
            <param argument="--stringtie"
                   type="data"
                   format="gtf"
                   optional="true"
                   label="StringTie GTF result" />
            <param argument="--maker_gff"
                   type="data"
                   format="gff3"
                   optional="true"
                   label="MAKER2 GFF file"
                   help="Parse results directly to EVM" />
            <param argument="--pasa_gff"
                   type="data"
                   format="gff3"
                   optional="true"
                   label="PASA generated gene models" />
            <param name="pasa_gff_weight"
                   type="integer"
                   value="1"
                   label="Weight for PASA generated gene models" />
            <param argument="--other_gff"
                   type="data"
                   format="gff3"
                   optional="true"
                   label="Annotation pass-through to EVM" />
            <param name="other_gff_weight"
                   type="integer"
                   value="1"
                   label="Weight for annotation pass-through to EVM" />
        </section>

        <!-- Augustus options. The "none" choice is a placeholder: the command omits the
             augustus_species option when it is selected. -->
        <section name="augustus" title="Augustus settings" expanded="true">
            <param argument="--augustus_species"
                   type="select"
                   label="Augustus species training set"
                   help="Select a species from the list">
                <option value="none" selected="True">No corresponding species, train from scratch</option>
                <expand macro="augustus_species"/>
            </param>
            <param argument="--min_training_models"
                   type="integer"
                   value="200"
                   label="Minimum number of models to train Augustus" />
            <param argument="--optimize_augustus"
                   type="boolean"
                   checked="false"
                   truevalue="--optimize_augustus"
                   falsevalue=""
                   label="Run 'optimize_augustus.pl' to refine training (long runtime)" />
        </section>

        <!-- GeneMark options. They only reach the command line when a license file is given;
             the license is then copied to ~/.gm_key before funannotate starts. -->
        <section name="genemark" title="GeneMark settings" expanded="false">
            <param name="genemark_license"
                   type="data"
                   format="txt"
                   optional="true"
                   label="GeneMark license file"
                   help="GeneMark is not a free software, to use it download and unzip a license from http://topaz.gatech.edu/GeneMark/license_download.cgi (ES/ET/EP version). GeneMark needs to be installed manually by Galaxy administrators, it might not be available on this server." />
            <param argument="--genemark_mode"
                   type="select"
                   label="GeneMark mode">
                <option value="ES" selected="True">ES</option>
                <option value="ET">ET</option>
            </param>
            <param argument="--genemark_mod"
                   type="data"
                   format="txt"
                   optional="true"
                   label="Use pre-existing Genemark training file (e.g. gmhmm.mod)" />
            <param argument="--soft_mask"
                   type="integer"
                   value="2000"
                   label="Softmasked length threshold for GeneMark"
                   help="GeneMark will skip prediction on repeat regions shorter than this value" />
        </section>

        <!-- BUSCO options; the choices of both selects come from macros. -->
        <section name="busco" title="BUSCO settings" expanded="true">
            <param argument="--busco_seed_species"
                   type="select"
                   label="Initial Augustus species training set for BUSCO alignment"
                   help="Select the closest species. BUSCO will only be used if no RNASeq (bam) data is given as evidence.">
                <expand macro="augustus_species"/>
            </param>
            <param argument="--busco_db"
                   type="select"
                   label="BUSCO models to align"
                   help="BUSCO will only be used if no RNASeq (bam) data is given as evidence.">
                <expand macro="busco_species"/>
            </param>
        </section>

        <!-- EVidenceModeler (EVM) options: repeat handling, contig partitioning and custom
             predictor weights. -->
        <section name="evm" expanded="false" title="EVM settings">
            <param argument="--repeats2evm" type="boolean" checked="false" truevalue="--repeats2evm" falsevalue="" label="Use repeats in EVM consensus model building" help="Not recommended for fungal genomes that have high gene density. You might want to turn this option on for larger genomes or those that have a high repeat content." />
            <!-- "yes" exposes the partition interval; "no" makes the command pass the
                 no-evm-partitions flag instead. -->
            <conditional name="evm_partitioning">
                <param name="evm_partition" type="select" label="Split contigs into partitions for EVM processing?" help="Splits big contigs in smaller overlapping chunks to reduce memory usage and parallelize">
                    <option value="yes" selected="True">Yes</option>
                    <option value="no">No</option>
                </param>
                <when value="yes">
                    <param argument="--evm-partition-interval" type="integer" value="1500" label="Min length between genes to make a partition" />
                </when>
                <when value="no"/>
            </conditional>
            <!-- The regex only accepts word characters, colons and spaces, i.e. a list of
                 predictor:weight pairs. -->
            <param argument="--weights" type="text" optional="true" label="Custom ab-initio predictor and EVM weight" help="e.g. augustus:2 pasa:10">
                <validator type="regex" message="Weights must be space-separated predictor:weight pairs made of letters, digits, underscores, colons and spaces only (e.g. augustus:2 pasa:10)">^[\w: ]+$</validator>
            </param>
        </section>

        <!-- Filters applied to the gene models. The repeat_filter value is placed unquoted on
             the command line, so the "overlap blast" choice is passed as two words. -->
        <section name="filtering" title="Filtering" expanded="true">
            <param argument="--min_intronlen"
                   type="integer"
                   value="10"
                   label="Minimum intron length" />
            <param argument="--max_intronlen"
                   type="integer"
                   value="3000"
                   label="Maximum intron length" />
            <param argument="--min_protlen"
                   type="integer"
                   value="50"
                   label="Minimum protein length" />
            <param argument="--keep_no_stops"
                   type="boolean"
                   checked="false"
                   truevalue="--keep_no_stops"
                   falsevalue=""
                   label="Keep gene models without valid stops" />
            <param argument="--repeat_filter"
                   type="select"
                   label="Repetitive gene model filtering"
                   help="'overlap' drops gene models that are more than 90% contained within a repeat region; 'blast' compares the amino acid sequences to a small database of known transposons">
                <option value="overlap blast" selected="True">overlap + blast</option>
                <option value="overlap">overlap</option>
                <option value="blast">blast</option>
                <option value="none">none</option>
            </param>
        </section>

        <!-- Test-only switch: when a test sets it to "true" the command copies the database
             and rewrites a hard coded path in it. Empty (inactive) by default. -->
        <param name="uglyTestingHack" type="hidden" value="" />
    </inputs>
    <!-- Every output is collected from the fixed file name that the command gives to the
         matching funannotate result file. -->
    <outputs>
        <data name="annot_gbk"
              format="genbank"
              label="${tool.name} on ${on_string}: annotation (genbank)"
              from_work_dir="out.gbk" />
        <data name="annot_tbl"
              format="txt"
              label="${tool.name} on ${on_string}: NCBI tbl annotation file"
              from_work_dir="out.tbl" />
        <data name="annot_gff3"
              format="gff3"
              label="${tool.name} on ${on_string}: annotation (GFF3)"
              from_work_dir="out.gff3" />
        <data name="fasta_proteins"
              format="fasta"
              label="${tool.name} on ${on_string}: protein sequences"
              from_work_dir="out.proteins.fa" />
        <data name="fasta_transcripts_mrna"
              format="fasta"
              label="${tool.name} on ${on_string}: transcript mRNA sequences"
              from_work_dir="out.mrna-transcripts.fa" />
        <data name="fasta_transcripts_cds"
              format="fasta"
              label="${tool.name} on ${on_string}: transcript CDS sequences"
              from_work_dir="out.cds-transcripts.fa" />
        <data name="tbl2asn_report"
              format="txt"
              label="${tool.name} on ${on_string}: tbl2asn summary report of annotated genome"
              from_work_dir="out.discrepency.report.txt" />
        <data name="tbl2asn_error"
              format="txt"
              label="${tool.name} on ${on_string}: tbl2asn error summary report"
              from_work_dir="out.error.summary.txt" />
        <data name="tbl2asn_validation"
              format="txt"
              label="${tool.name} on ${on_string}: tbl2asn genome validation report"
              from_work_dir="out.validation.txt" />
        <data name="stats"
              format="json"
              label="${tool.name} on ${on_string}: stats"
              from_work_dir="out.stats.json" />
        <!-- TODO some day: provide trained models as output, reusable as input to other funannotate runs
            (parameters.json file references files with absolute paths, would probably need to create an archive + edit paths in parameters.json) -->
        <!--data name='abinitio' format='json' label="${tool.name} on ${on_string}: ab-initio training parameters" from_work_dir="output/predict_results/*.parameters.json" /-->
    </outputs>
    <tests>
        <!-- Training from scratch: no Augustus species is chosen (the default "none" option
             stays selected) and the minimum number of training models is lowered to 3.
             No RNA-seq, transcript or custom protein evidence is supplied. -->
        <test>
            <param name="input" value="genome_masked.fa" />
            <param name="database" value="2021-07-20-120000" />
            <section name="organism">
                <param name="species" value="Genus species" />
            </section>
            <section name="augustus">
                <param name="min_training_models" value="3" />
            </section>
            <section name="busco">
                <param name="busco_seed_species" value="fly" />
                <param name="busco_db" value="insecta" />
            </section>
            <!-- non deterministic results, so can't be more precise here -->
            <output name="annot_gbk">
                <assert_contents>
                    <has_text text="  TITLE     Direct Submission" />
                    <has_text text="/locus_tag=&quot;FUN_000001&quot;" />
                </assert_contents>
            </output>
            <output name="annot_tbl">
                <assert_contents>
                    <has_text text=">Feature sample" />
                    <has_text text="gnl|ncbi|FUN_000001-T1_mrna" />
                </assert_contents>
            </output>
            <output name="annot_gff3">
                <assert_contents>
                    <has_text text="##gff-version 3" />
                    <has_text text="ID=FUN_000001-T1;Parent=FUN_000001;product=hypothetical protein;" />
                </assert_contents>
            </output>
            <output name="fasta_proteins">
                <assert_contents>
                    <has_text text=">FUN_000001-T1 FUN_000001" />
                </assert_contents>
            </output>
            <output name="fasta_transcripts_mrna">
                <assert_contents>
                    <has_text text=">FUN_000001-T1 FUN_000001" />
                </assert_contents>
            </output>
            <output name="fasta_transcripts_cds">
                <assert_contents>
                    <has_text text=">FUN_000001-T1 FUN_000001" />
                </assert_contents>
            </output>
            <!--output name="abinitio" file="predict_scratch/fly.parameters.json" compare="sim_size" /-->
            <!-- The reports are only compared by size against the reference files. -->
            <output name="tbl2asn_report" file="predict_scratch/Genus_species.discrepency.report.txt" compare="sim_size" />
            <output name="tbl2asn_error" file="predict_scratch/Genus_species.error.summary.txt" compare="sim_size" delta="500" />
            <output name="tbl2asn_validation" file="predict_scratch/Genus_species.validation.txt" compare="sim_size" delta="500" />
            <output name="stats" file="predict_scratch/Genus_species.stats.json" compare="sim_size" />
            <!-- The log must report BUSCO as the training source of every predictor and must
                 not mention any transcript or RNA-seq evidence step. -->
            <assert_stderr>
                <has_text text="augustus     busco"/>
                <has_text text="glimmerhmm   busco"/>
                <has_text text="snap         busco"/>
                <has_text text="Running BUSCO to find conserved gene models for training ab-initio predictors"/>
                <has_text text="Skipping CodingQuarry as no --rna_bam passed"/>
                <has_text text="Running Augustus gene prediction using genus_species parameters"/>
                <not_has_text text="Aligning transcript evidence to genome with minimap2"/>
                <not_has_text text="Found 16 alignments, wrote GFF3 and Augustus hints to file"/>
                <not_has_text text="Extracting hints from RNA-seq BAM file using bam2hints"/>
                <has_text text="Mapping 13 proteins to genome using diamond and exonerate"/>
                <has_text text="Found 4 preliminary alignments --> aligning with exonerate"/>
            </assert_stderr>
        </test>

        <!-- Pre-trained Augustus: the "fly" species is selected. uglyTestingHack is set to
             "true" so that the command works on a copy of the test database in which the
             hard coded path of trained_species/fly/info.json has been rewritten. -->
        <test>
            <param name="input" value="genome_masked.fa" />
            <param name="database" value="2021-07-20-120000" />
            <section name="organism">
                <param name="species" value="Genus species" />
            </section>
            <section name="augustus">
                <param name="augustus_species" value="fly" />
            </section>
            <section name="busco">
                <param name="busco_seed_species" value="fly" />
                <param name="busco_db" value="insecta" />
            </section>
            <param name="uglyTestingHack" value="true" />
            <!-- non deterministic results, so can't be more precise here -->
            <output name="annot_gbk">
                <assert_contents>
                    <has_text text="  TITLE     Direct Submission" />
                    <has_text text="/locus_tag=&quot;FUN_000001&quot;" />
                </assert_contents>
            </output>
            <output name="annot_tbl">
                <assert_contents>
                    <has_text text=">Feature sample" />
                    <has_text text="gnl|ncbi|FUN_000001-T1_mrna" />
                </assert_contents>
            </output>
            <output name="annot_gff3">
                <assert_contents>
                    <has_text text="##gff-version 3" />
                    <has_text text="ID=FUN_000001-T1;Parent=FUN_000001;product=hypothetical protein;" />
                </assert_contents>
            </output>
            <output name="fasta_proteins">
                <assert_contents>
                    <has_text text=">FUN_000001-T1 FUN_000001" />
                </assert_contents>
            </output>
            <output name="fasta_transcripts_mrna">
                <assert_contents>
                    <has_text text=">FUN_000001-T1 FUN_000001" />
                </assert_contents>
            </output>
            <output name="fasta_transcripts_cds">
                <assert_contents>
                    <has_text text=">FUN_000001-T1 FUN_000001" />
                </assert_contents>
            </output>
            <!-- Augustus must be reported as pretrained and run with the fly parameters, while
                 the other predictors are still trained from BUSCO. -->
            <assert_stderr>
                <has_text text="augustus     pretrained"/>
                <has_text text="glimmerhmm   busco"/>
                <has_text text="snap         busco"/>
                <has_text text="Running BUSCO to find conserved gene models for training ab-initio predictors"/>
                <has_text text="Skipping CodingQuarry as no --rna_bam passed"/>
                <has_text text="Running Augustus gene prediction using fly parameters"/>
                <not_has_text text="Aligning transcript evidence to genome with minimap2"/>
                <not_has_text text="Found 16 alignments, wrote GFF3 and Augustus hints to file"/>
                <not_has_text text="Extracting hints from RNA-seq BAM file using bam2hints"/>
                <has_text text="Mapping 13 proteins to genome using diamond and exonerate"/>
                <has_text text="Found 4 preliminary alignments --> aligning with exonerate"/>
            </assert_stderr>
        </test>

        <!-- Evidence run: an RNA-seq BAM file, a transcript FASTA and custom protein sequences
             are supplied, and Augustus is trained from scratch with at least 3 models. -->
        <test>
            <param name="input" value="genome_masked.fa" />
            <param name="database" value="2021-07-20-120000" />
            <section name="organism">
                <param name="species" value="Genus species" />
            </section>
            <section name="evidences">
                <param name="rna_bam" value="SRR7458692.bam" />
                <param name="transcript_evidence" value="predict_scratch/Genus_species.mrna-transcripts.fa" />
                <conditional name="prot_evidence">
                    <param name="prot_evidence_source" value="custom" />
                    <param name="protein_evidence" value="predict_scratch/Genus_species.proteins.fa" />
                </conditional>
            </section>
            <section name="augustus">
                <param name="min_training_models" value="3" />
            </section>
            <section name="busco">
                <param name="busco_seed_species" value="fly" />
                <param name="busco_db" value="insecta" />
            </section>
            <!-- non deterministic results, so can't be more precise here -->
            <output name="annot_gbk">
                <assert_contents>
                    <has_text text="  TITLE     Direct Submission" />
                    <has_text text="/locus_tag=&quot;FUN_000001&quot;" />
                </assert_contents>
            </output>
            <output name="annot_tbl">
                <assert_contents>
                    <has_text text=">Feature sample" />
                    <has_text text="gnl|ncbi|FUN_000001-T1_mrna" />
                </assert_contents>
            </output>
            <output name="annot_gff3">
                <assert_contents>
                    <has_text text="##gff-version 3" />
                    <has_text text="ID=FUN_000001-T1;Parent=FUN_000001;product=hypothetical protein;" />
                </assert_contents>
            </output>
            <output name="fasta_proteins">
                <assert_contents>
                    <has_text text=">FUN_000001-T1 FUN_000001" />
                </assert_contents>
            </output>
            <output name="fasta_transcripts_mrna">
                <assert_contents>
                    <has_text text=">FUN_000001-T1 FUN_000001" />
                </assert_contents>
            </output>
            <output name="fasta_transcripts_cds">
                <assert_contents>
                    <has_text text=">FUN_000001-T1 FUN_000001" />
                </assert_contents>
            </output>
            <!-- Unlike the other tests, the transcript alignment and RNA-seq hint steps must
                 appear in the log and CodingQuarry must not be reported as skipped.
                 NOTE(review): the protein count asserted below (13) is the same as in the tests
                 that use the default UniProt evidence; confirm that it really reflects the
                 custom protein file selected above. -->
            <assert_stderr>
                <has_text text="augustus     busco"/>
                <has_text text="glimmerhmm   busco"/>
                <has_text text="snap         busco"/>
                <has_text text="Running BUSCO to find conserved gene models for training ab-initio predictors"/>
                <not_has_text text="Skipping CodingQuarry as no --rna_bam passed"/>
                <has_text text="Running Augustus gene prediction using genus_species parameters"/>
                <has_text text="Training Augustus using BUSCO gene models"/>
                <has_text text="Aligning transcript evidence to genome with minimap2"/>
                <has_text text="Found 16 alignments, wrote GFF3 and Augustus hints to file"/>
                <has_text text="Extracting hints from RNA-seq BAM file using bam2hints"/>
                <has_text text="Mapping 13 proteins to genome using diamond and exonerate"/>
                <has_text text="Found 4 preliminary alignments --> aligning with exonerate"/>
            </assert_stderr>
        </test>
    </tests>
    <help><![CDATA[
Funannotate_ predict
--------------------

Funannotate_ is a pipeline for genome annotation (built specifically for fungi, but will also work with higher eukaryotes).

This tool takes a genome multi-FASTA file and a variety of optional evidence inputs to perform
a comprehensive whole-genome gene prediction. It uses AUGUSTUS, GeneMark, SNAP, GlimmerHMM,
BUSCO, EVidenceModeler, tbl2asn, tRNAscan-SE, Exonerate and minimap2.

.. _Funannotate: http://funannotate.readthedocs.io
    ]]></help>
    <expand macro="citations" />
</tool>