diff funannotate_predict.xml @ 1:1a59958c1f76 draft

"planemo upload commit 87560553f1dbbd3e0ab7d7157fa5a7f32f61dca1"
author iuc
date Mon, 04 Oct 2021 19:37:44 +0000
parents 40b87aef5241
children 33092577d65d
line wrap: on
line diff
--- a/funannotate_predict.xml	Thu Aug 26 06:55:33 2021 +0000
+++ b/funannotate_predict.xml	Mon Oct 04 19:37:44 2021 +0000
@@ -32,6 +32,8 @@
     --database '$database.fields.path'
 #end if
 
+$force
+
 --species '${organism.species}'
 --isolate '${organism.isolate}'
 --strain '${organism.strain}'
@@ -46,7 +48,7 @@
     --parameters '${parameters}'
 #end if
 
-#if $evidences.rna_bam:
+#if $evidences.rna_bam
     --rna_bam ${evidences.rna_bam}
 #end if
 
@@ -71,6 +73,9 @@
 --p2g_pident ${evidences.p2g_pident}
 --p2g_prefilter ${evidences.p2g_prefilter}
 
+--busco_seed_species '${busco.busco_seed_species}'
+--busco_db '${busco.busco_db}'
+
 #if $augustus.augustus_species != 'none':
     --augustus_species '${augustus.augustus_species}'
 #end if
@@ -85,9 +90,6 @@
     --soft_mask ${genemark.soft_mask}
 #end if
 
---busco_seed_species '${busco.busco_seed_species}'
---busco_db '${busco.busco_db}'
-
 $evm.repeats2evm
 #if $evm.evm_partitioning.evm_partition == "yes":
 --evm-partition-interval ${evm.evm_partitioning.evm_partition_interval}
@@ -145,6 +147,8 @@
             </options>
         </param>
 
+        <param argument="--force" type="boolean" checked="true" truevalue="" falsevalue="--force" label="Check the genome sequence" help="Disable at your own risk if you want to ignore problems in the genome sequence reported by Funannotate" /> <!-- Note the inversion: when checked the value is empty; when unchecked the value is the force flag, which $force inserts into the command line -->
+
         <section name="organism" expanded="true" title="Organism">
             <param argument="--species" type="text" optional="false" label="Name of the species to annotate" help="e.g. Genus species">
                 <validator type="empty_field" />
@@ -179,18 +183,31 @@
             </param>
         </section>
 
+        <section name="busco" expanded="true" title="Busco"> <!-- Both selects are filled from macros; their values reach the command line as the busco_db and busco_seed_species options -->
+            <param argument="--busco_db" type="select" label="BUSCO models to align" help="BUSCO will be used to perform initial training of ab initio predictors (e.g. Augustus).">
+                <expand macro="busco_species"/>
+            </param>
+            <param argument="--busco_seed_species" type="select" label="Initial Augustus species training set for BUSCO alignment" help="Used when BUSCO runs Augustus internally.">
+                <expand macro="augustus_species"/>
+            </param>
+        </section>
+
+        <section name="filtering" expanded="true" title="Filtering"> <!-- Gene model filtering options; the command-line wiring of these params is not visible in this diff -->
+            <param argument="--min_intronlen" type="integer" value="10" label="Minimum intron length" />
+            <param argument="--max_intronlen" type="integer" value="3000" label="Maximum intron length" />
+            <param argument="--min_protlen" type="integer" value="50" label="Minimum protein length" />
+            <param argument="--keep_no_stops" type="boolean" checked="false" truevalue="--keep_no_stops" falsevalue="" label="Keep gene models without valid stops" />
+            <param argument="--repeat_filter" type="select" label="Repetitive gene model filtering" help="'overlap' drops gene models that are more than 90% contained within a repeat region; 'blast' compares the amino acid sequences to a small database of known transposons">
+                <option value="overlap blast" selected="True">overlap + blast</option> <!-- NOTE(review): this value contains a space; confirm the command template passes it as two words as intended -->
+                <option value="overlap">overlap</option>
+                <option value="blast">blast</option>
+                <option value="none">none</option>
+            </param>
+        </section>
+
         <param argument="--parameters" type="data" format="json" optional="true" label="Ab-initio training parameters from a previous run" help="If specified, will overrule any other training presets based on species selection." /> <!-- Optional JSON dataset; the command template only passes it when one is supplied -->
 
-        <section name="other_predictors" expanded="false" title="Other annotations">
-            <param argument="--stringtie" type="data" format="gtf" optional="true" label="StringTie GTF result" />
-            <param argument="--maker_gff" type="data" format="gff3" optional="true" label="MAKER2 GFF file" help="Parse results directly to EVM" />
-            <param argument="--pasa_gff" type="data" format="gff3" optional="true" label="PASA generated gene models" />
-            <param name="pasa_gff_weight" type="integer" value="1" label="Weight for PASA generated gene models" />
-            <param argument="--other_gff" type="data" format="gff3" optional="true" label="Annotation pass-through to EVM" />
-            <param name="other_gff_weight" type="integer" value="1" label="Weight for annotation pass-through to EVM" />
-        </section>
-
-        <section name="augustus" expanded="true" title="Augustus settings">
+        <section name="augustus" expanded="false" title="Augustus settings (advanced)">
             <param argument="--augustus_species" type="select" label="Augustus species training set" help="Select a species from the list">
                 <option value="none" selected="True">No corresponding species, train from scratch</option>
                 <expand macro="augustus_species"/>
@@ -199,7 +216,7 @@
             <param argument="--optimize_augustus" type="boolean" checked="false" truevalue="--optimize_augustus" falsevalue="" label="Run 'optimize_augustus.pl' to refine training (long runtime)" />
         </section>
 
-        <section name="genemark" expanded="false" title="GeneMark settings">
+        <section name="genemark" expanded="false" title="GeneMark settings (advanced)">
             <param name="genemark_license" type="data" format="txt" optional="true" label="GeneMark license file" help="GeneMark is not a free software, to use it download and unzip a license from http://topaz.gatech.edu/GeneMark/license_download.cgi (ES/ET/EP version). GeneMark needs to be installed manually by Galaxy administrators, it might not be available on this server." />
             <param argument="--genemark_mode" type="select" label="GeneMark mode">
                 <option value="ES" selected="True">ES</option>
@@ -209,16 +226,16 @@
             <param argument="--soft_mask" type="integer" value="2000" label="Softmasked length threshold for GeneMark" help="GeneMark will skip prediction on repeat regions shorter than this value" />
         </section>
 
-        <section name="busco" expanded="true" title="BUSCO settings">
-            <param argument="--busco_seed_species" type="select" label="Initial Augustus species training set for BUSCO alignment" help="Select the closest species. BUSCO will only be used if no RNASeq (bam) data is given as evidence.">
-                <expand macro="augustus_species"/>
-            </param>
-            <param argument="--busco_db" type="select" label="BUSCO models to align" help="BUSCO will only be used if no RNASeq (bam) data is given as evidence.">
-                <expand macro="busco_species"/>
-            </param>
+        <section name="other_predictors" expanded="false" title="Other annotations (advanced)">
+            <param argument="--stringtie" type="data" format="gtf" optional="true" label="StringTie GTF result" />
+            <param argument="--maker_gff" type="data" format="gff3" optional="true" label="MAKER2 GFF file" help="Parse results directly to EVM" />
+            <param argument="--pasa_gff" type="data" format="gff3" optional="true" label="PASA generated gene models" />
+            <param name="pasa_gff_weight" type="integer" value="1" label="Weight for PASA generated gene models" />
+            <param argument="--other_gff" type="data" format="gff3" optional="true" label="Annotation pass-through to EVM" />
+            <param name="other_gff_weight" type="integer" value="1" label="Weight for annotation pass-through to EVM" />
         </section>
 
-        <section name="evm" expanded="false" title="EVM settings">
+        <section name="evm" expanded="false" title="EVM settings (advanced)">
             <param argument="--repeats2evm" type="boolean" checked="false" truevalue="--repeats2evm" falsevalue="" label="Use repeats in EVM consensus model building" help="Not recommended for fungal genomes that have high gene density. You might want to turn this option on for larger genomes or those that have a high repeat content." />
             <conditional name="evm_partitioning">
                 <param name="evm_partition" type="select" label="Split contigs into partitions for EVM processing?" help="Splits big contigs in smaller overlapping chunks to reduce memory usage and parallelize">
@@ -235,33 +252,53 @@
             </param>
         </section>
 
-        <section name="filtering" expanded="true" title="Filtering">
-            <param argument="--min_intronlen" type="integer" value="10" label="Minimum intron length" />
-            <param argument="--max_intronlen" type="integer" value="3000" label="Maximum intron length" />
-            <param argument="--min_protlen" type="integer" value="50" label="Minimum protein length" />
-            <param argument="--keep_no_stops" type="boolean" checked="false" truevalue="--keep_no_stops" falsevalue="" label="Keep gene models without valid stops" />
-            <param argument="--repeat_filter" type="select" label="Repetitive gene model filtering" help="'overlap' drops gene models that are more than 90% contained within a repeat region; 'blast' compares the amino acid sequences to a small database of known transposons">
-                <option value="overlap blast" selected="True">overlap + blast</option>
-                <option value="overlap">overlap</option>
-                <option value="blast">blast</option>
-                <option value="none">none</option>
-            </param>
-        </section>
+        <param name="outputs" type="select" optional="true" multiple="true" label="Which outputs should be generated"> <!-- Each option value is tested by the <filter> of the matching <data> output; only gbk is selected by default, and the selection may be empty because the param is optional -->
+            <option value="gbk" selected="true">Annotated genome (genbank)</option>
+            <option value="tbl">NCBI tbl annotation file (use for NCBI submission)</option>
+            <option value="gff3">Annotation in GFF3 format</option>
+            <option value="proteins_fa">Multi-fasta file of protein coding genes</option>
+            <option value="mrna_transcripts_fa">Multi-fasta file of transcripts (mRNA)</option>
+            <option value="cds_transcripts_fa">Multi-fasta file of transcripts (CDS)</option>
+            <option value="tbl2asn_report">tbl2asn summary report of annotated genome</option>
+            <option value="tbl2asn_error">tbl2asn error summary report</option>
+            <option value="tbl2asn_validation">tbl2asn genome validation report</option>
+            <option value="stats">statistics</option>
+        </param>
 
         <!-- Need this to change path in the test funannotate_db -->
         <param type="hidden" name="uglyTestingHack" value="" />
     </inputs>
     <outputs>
-        <data name='annot_gbk' format='genbank' label="${tool.name} on ${on_string}: annotation (genbank)" from_work_dir="out.gbk" />
-        <data name='annot_tbl' format='txt' label="${tool.name} on ${on_string}: NCBI tbl annotation file" from_work_dir="out.tbl" />
-        <data name='annot_gff3' format='gff3' label="${tool.name} on ${on_string}: annotation (GFF3)" from_work_dir="out.gff3" />
-        <data name='fasta_proteins' format='fasta' label="${tool.name} on ${on_string}: protein sequences" from_work_dir="out.proteins.fa" />
-        <data name='fasta_transcripts_mrna' format='fasta' label="${tool.name} on ${on_string}: transcript mRNA sequences" from_work_dir="out.mrna-transcripts.fa" />
-        <data name='fasta_transcripts_cds' format='fasta' label="${tool.name} on ${on_string}: transcript CDS sequences" from_work_dir="out.cds-transcripts.fa" />
-        <data name='tbl2asn_report' format='txt' label="${tool.name} on ${on_string}: tbl2asn summary report of annotated genome" from_work_dir="out.discrepency.report.txt" />
-        <data name='tbl2asn_error' format='txt' label="${tool.name} on ${on_string}: tbl2asn error summary report" from_work_dir="out.error.summary.txt" />
-        <data name='tbl2asn_validation' format='txt' label="${tool.name} on ${on_string}: tbl2asn genome validation report" from_work_dir="out.validation.txt" />
-        <data name='stats' format='json' label="${tool.name} on ${on_string}: stats" from_work_dir="out.stats.json" />
+        <data name='annot_gbk' format='genbank' label="${tool.name} on ${on_string}: annotation (genbank)" from_work_dir="out.gbk"> <!-- Each output below is created only when its key is selected in the outputs param; the leading "outputs and" guards against an empty selection -->
+            <filter>outputs and 'gbk' in outputs</filter>
+        </data>
+        <data name='annot_tbl' format='txt' label="${tool.name} on ${on_string}: NCBI tbl annotation file" from_work_dir="out.tbl">
+            <filter>outputs and 'tbl' in outputs</filter> <!-- NOTE(review): 'tbl' is also a substring of the tbl2asn_* values; this test is exact only if outputs is a list of values, to be confirmed -->
+        </data>
+        <data name='annot_gff3' format='gff3' label="${tool.name} on ${on_string}: annotation (GFF3)" from_work_dir="out.gff3">
+            <filter>outputs and 'gff3' in outputs</filter>
+        </data>
+        <data name='fasta_proteins' format='fasta' label="${tool.name} on ${on_string}: protein sequences" from_work_dir="out.proteins.fa">
+            <filter>outputs and 'proteins_fa' in outputs</filter>
+        </data>
+        <data name='fasta_transcripts_mrna' format='fasta' label="${tool.name} on ${on_string}: transcript mRNA sequences" from_work_dir="out.mrna-transcripts.fa">
+            <filter>outputs and 'mrna_transcripts_fa' in outputs</filter>
+        </data>
+        <data name='fasta_transcripts_cds' format='fasta' label="${tool.name} on ${on_string}: transcript CDS sequences" from_work_dir="out.cds-transcripts.fa">
+            <filter>outputs and 'cds_transcripts_fa' in outputs</filter>
+        </data>
+        <data name='tbl2asn_report' format='txt' label="${tool.name} on ${on_string}: tbl2asn summary report of annotated genome" from_work_dir="out.discrepency.report.txt">
+            <filter>outputs and 'tbl2asn_report' in outputs</filter>
+        </data>
+        <data name='tbl2asn_error' format='txt' label="${tool.name} on ${on_string}: tbl2asn error summary report" from_work_dir="out.error.summary.txt">
+            <filter>outputs and 'tbl2asn_error' in outputs</filter>
+        </data>
+        <data name='tbl2asn_validation' format='txt' label="${tool.name} on ${on_string}: tbl2asn genome validation report" from_work_dir="out.validation.txt">
+            <filter>outputs and 'tbl2asn_validation' in outputs</filter>
+        </data>
+        <data name='stats' format='json' label="${tool.name} on ${on_string}: stats" from_work_dir="out.stats.json">
+            <filter>outputs and 'stats' in outputs</filter>
+        </data>
         <!-- TODO some day: provide trained models as output, reusable as input to other funannotate runs
             (parameters.json file references files with absolute paths, would probably need to create an archive + edit paths in parameters.json) -->
         <!--data name='abinitio' format='json' label="${tool.name} on ${on_string}: ab-initio training parameters" from_work_dir="output/predict_results/*.parameters.json" /-->
@@ -271,16 +308,17 @@
         <test>
             <param name="input" value="genome_masked.fa" />
             <param name="database" value="2021-07-20-120000" />
+            <section name="busco">
+                <param name="busco_seed_species" value="fly" />
+                <param name="busco_db" value="insecta" />
+            </section>
             <section name="organism">
                 <param name="species" value="Genus species" />
             </section>
             <section name="augustus">
                 <param name="min_training_models" value="3" />
             </section>
-            <section name="busco">
-                <param name="busco_seed_species" value="fly" />
-                <param name="busco_db" value="insecta" />
-            </section>
+            <param name="outputs" value="gbk,tbl,gff3,proteins_fa,mrna_transcripts_fa,cds_transcripts_fa,tbl2asn_error,tbl2asn_report,tbl2asn_validation,stats" />
             <!-- non deterministic results, so can't be more precise here -->
             <output name="annot_gbk">
                 <assert_contents>
@@ -350,6 +388,7 @@
                 <param name="busco_db" value="insecta" />
             </section>
             <param name="uglyTestingHack" value="true" />
+            <param name="outputs" value="gbk,tbl,gff3,proteins_fa,mrna_transcripts_fa,cds_transcripts_fa,tbl2asn_error,tbl2asn_report,tbl2asn_validation,stats" />
             <!-- non deterministic results, so can't be more precise here -->
             <output name="annot_gbk">
                 <assert_contents>
@@ -399,7 +438,7 @@
             </assert_stderr>
         </test>
 
-        <!-- bam -->
+        <!-- bam and transcripts and proteins -->
         <test>
             <param name="input" value="genome_masked.fa" />
             <param name="database" value="2021-07-20-120000" />
@@ -414,13 +453,14 @@
                     <param name="protein_evidence" value="predict_scratch/Genus_species.proteins.fa" />
                 </conditional>
             </section>
-            <section name="augustus">
-                <param name="min_training_models" value="3" />
-            </section>
             <section name="busco">
                 <param name="busco_seed_species" value="fly" />
                 <param name="busco_db" value="insecta" />
             </section>
+            <section name="augustus">
+                <param name="min_training_models" value="3" />
+            </section>
+            <param name="outputs" value="gbk,tbl,gff3,proteins_fa,mrna_transcripts_fa,cds_transcripts_fa,tbl2asn_error,tbl2asn_report,tbl2asn_validation,stats" />
             <!-- non deterministic results, so can't be more precise here -->
             <output name="annot_gbk">
                 <assert_contents>
@@ -470,6 +510,77 @@
                 <has_text text="Found 4 preliminary alignments --> aligning with exonerate"/>
             </assert_stderr>
         </test>
+
+        <!-- protein evidence only (no RNA-seq BAM, no transcripts) -->
+        <test>
+            <param name="input" value="genome_masked.fa" />
+            <param name="database" value="2021-07-20-120000" />
+            <section name="organism">
+                <param name="species" value="Genus species" />
+            </section>
+            <section name="evidences">
+                <conditional name="prot_evidence">
+                    <param name="prot_evidence_source" value="custom" />
+                    <param name="protein_evidence" value="predict_scratch/Genus_species.proteins.fa" />
+                </conditional>
+            </section>
+            <section name="busco">
+                <param name="busco_seed_species" value="fly" />
+                <param name="busco_db" value="insecta" />
+            </section>
+            <section name="augustus">
+                <param name="min_training_models" value="3" />
+            </section>
+            <param name="outputs" value="gbk,tbl,gff3,proteins_fa,mrna_transcripts_fa,cds_transcripts_fa,tbl2asn_error,tbl2asn_report,tbl2asn_validation,stats" />
+            <!-- non deterministic results, so can't be more precise here -->
+            <output name="annot_gbk">
+                <assert_contents>
+                    <has_text text="  TITLE     Direct Submission" />
+                    <has_text text="/locus_tag=&quot;FUN_000001&quot;" />
+                </assert_contents>
+            </output>
+            <output name="annot_tbl">
+                <assert_contents>
+                    <has_text text=">Feature sample" />
+                    <has_text text="gnl|ncbi|FUN_000001-T1_mrna" />
+                </assert_contents>
+            </output>
+            <output name="annot_gff3">
+                <assert_contents>
+                    <has_text text="##gff-version 3" />
+                    <has_text text="ID=FUN_000001-T1;Parent=FUN_000001;product=hypothetical protein;" />
+                </assert_contents>
+            </output>
+            <output name="fasta_proteins">
+                <assert_contents>
+                    <has_text text=">FUN_000001-T1 FUN_000001" />
+                </assert_contents>
+            </output>
+            <output name="fasta_transcripts_mrna">
+                <assert_contents>
+                    <has_text text=">FUN_000001-T1 FUN_000001" />
+                </assert_contents>
+            </output>
+            <output name="fasta_transcripts_cds">
+                <assert_contents>
+                    <has_text text=">FUN_000001-T1 FUN_000001" />
+                </assert_contents>
+            </output>
+            <assert_stderr>
+                <has_text text="augustus     busco"/>
+                <has_text text="glimmerhmm   busco"/>
+                <has_text text="snap         busco"/>
+                <has_text text="Running BUSCO to find conserved gene models for training ab-initio predictors"/>
+                <has_text text="Skipping CodingQuarry as no --rna_bam passed"/>
+                <has_text text="Running Augustus gene prediction using genus_species parameters"/>
+                <has_text text="Training Augustus using BUSCO gene models"/>
+                <not_has_text text="Aligning transcript evidence to genome with minimap2"/>
+                <not_has_text text="Found 16 alignments, wrote GFF3 and Augustus hints to file"/>
+                <not_has_text text="Extracting hints from RNA-seq BAM file using bam2hints"/>
+                <has_text text="Mapping 13 proteins to genome using diamond and exonerate"/>
+                <has_text text="Found 4 preliminary alignments --> aligning with exonerate"/>
+            </assert_stderr>
+        </test>
     </tests>
     <help><![CDATA[
 Funannotate_ predict