funannotate_predict: funannotate_predict.xml comparison

comparison funannotate_predict.xml @ 1:1a59958c1f76 draft

"planemo upload commit 87560553f1dbbd3e0ab7d7157fa5a7f32f61dca1"

author	iuc
date	Mon, 04 Oct 2021 19:37:44 +0000
parents	40b87aef5241
children	33092577d65d

comparison

equal deleted inserted replaced

-:40b87aef5241
+:1a59958c1f76
 --database `pwd`'/hacked_database'
 #else
 --database '$database.fields.path'
 #end if
+$force
 --species '${organism.species}'
 --isolate '${organism.isolate}'
 --strain '${organism.strain}'
 --organism '${organism.organism}'
 --ploidy ${organism.ploidy}
 #if $parameters:
 --parameters '${parameters}'
 #end if
-#if $evidences.rna_bam:
+#if $evidences.rna_bam
 --rna_bam ${evidences.rna_bam}
 #end if
 #set est_list = ""
 #if len($evidences.transcript_evidence) > 0:
 #end for
 #end if
 --p2g_pident ${evidences.p2g_pident}
 --p2g_prefilter ${evidences.p2g_prefilter}
+--busco_seed_species '${busco.busco_seed_species}'
+--busco_db '${busco.busco_db}'
 #if $augustus.augustus_species != 'none':
 --augustus_species '${augustus.augustus_species}'
 #end if
 --min_training_models ${augustus.min_training_models}
 ${augustus.optimize_augustus}
 #if $genemark.genemark_mod:
 --genemark_mod '${genemark.genemark_mod}'
 #end if
 --soft_mask ${genemark.soft_mask}
 #end if
---busco_seed_species '${busco.busco_seed_species}'
---busco_db '${busco.busco_db}'
 $evm.repeats2evm
 #if $evm.evm_partitioning.evm_partition == "yes":
 --evm-partition-interval ${evm.evm_partitioning.evm_partition_interval}
 #else:
 <column name="path" index="3" />
 <filter type="sort_by" column="0" />
 <filter type="static_value" column="2" value="1.0" />
 </options>
 </param>
+<param argument="--force" type="boolean" checked="true" truevalue="" falsevalue="--force" label="Check the genome sequence" help="Disable at your own risk if you want to ignore problems in the genome sequence reported by Funannotate" />
 <section name="organism" expanded="true" title="Organism">
 <param argument="--species" type="text" optional="false" label="Name of the species to annotate" help="e.g. Genus species">
 <validator type="empty_field" />
 </param>
 <option value="diamond" selected="True">Diamond</option>
 <option value="tblastn">tblastn (slower)</option>
 </param>
 </section>
+<section name="busco" expanded="true" title="Busco">
+<param argument="--busco_db" type="select" label="BUSCO models to align" help="BUSCO will be used to perform initial training of ab initio predictors (e.g. Augustus).">
+<expand macro="busco_species"/>
+</param>
+<param argument="--busco_seed_species" type="select" label="Initial Augustus species training set for BUSCO alignment" help="Used when BUSCO runs Augustus internally.">
+<expand macro="augustus_species"/>
+</param>
+</section>
+<section name="filtering" expanded="true" title="Filtering">
+<param argument="--min_intronlen" type="integer" value="10" label="Minimum intron length" />
+<param argument="--max_intronlen" type="integer" value="3000" label="Maximum intron length" />
+<param argument="--min_protlen" type="integer" value="50" label="Minimum protein length" />
+<param argument="--keep_no_stops" type="boolean" checked="false" truevalue="--keep_no_stops" falsevalue="" label="Keep gene models without valid stops" />
+<param argument="--repeat_filter" type="select" label="Repetitive gene model filtering" help="'overlap' drops gene models that are more than 90% contained within a repeat region; 'blast' compares the amino acid sequences to a small database of known transposons">
+<option value="overlap blast" selected="True">overlap + blast</option>
+<option value="overlap">overlap</option>
+<option value="blast">blast</option>
+<option value="none">none</option>
+</param>
+</section>
 <param argument="--parameters" type="data" format="json" optional="true" label="Ab-initio training parameters from a previous run" help="If specified, will overrule any other training presets based on species selection." />
-<section name="other_predictors" expanded="false" title="Other annotations">
+<section name="augustus" expanded="false" title="Augustus settings (advanced)">
+<param argument="--augustus_species" type="select" label="Augustus species training set" help="Select a species from the list">
+<option value="none" selected="True">No corresponding species, train from scratch</option>
+<expand macro="augustus_species"/>
+</param>
+<param argument="--min_training_models" type="integer" value="200" label="Minimum number of models to train Augustus" />
+<param argument="--optimize_augustus" type="boolean" checked="false" truevalue="--optimize_augustus" falsevalue="" label="Run 'optimize_augustus.pl' to refine training (long runtime)" />
+</section>
+<section name="genemark" expanded="false" title="GeneMark settings (advanced)">
+<param name="genemark_license" type="data" format="txt" optional="true" label="GeneMark license file" help="GeneMark is not free software; to use it, download and unzip a license from http://topaz.gatech.edu/GeneMark/license_download.cgi (ES/ET/EP version). GeneMark needs to be installed manually by Galaxy administrators, so it might not be available on this server." />
+<param argument="--genemark_mode" type="select" label="GeneMark mode">
+<option value="ES" selected="True">ES</option>
+<option value="ET">ET</option>
+</param>
+<param argument="--genemark_mod" type="data" format="txt" optional="true" label="Use pre-existing Genemark training file (e.g. gmhmm.mod)" />
+<param argument="--soft_mask" type="integer" value="2000" label="Softmasked length threshold for GeneMark" help="GeneMark will skip prediction on repeat regions shorter than this value" />
+</section>
+<section name="other_predictors" expanded="false" title="Other annotations (advanced)">
 <param argument="--stringtie" type="data" format="gtf" optional="true" label="StringTie GTF result" />
 <param argument="--maker_gff" type="data" format="gff3" optional="true" label="MAKER2 GFF file" help="Parse results directly to EVM" />
 <param argument="--pasa_gff" type="data" format="gff3" optional="true" label="PASA generated gene models" />
 <param name="pasa_gff_weight" type="integer" value="1" label="Weight for PASA generated gene models" />
 <param argument="--other_gff" type="data" format="gff3" optional="true" label="Annotation pass-through to EVM" />
 <param name="other_gff_weight" type="integer" value="1" label="Weight for annotation pass-through to EVM" />
 </section>
-<section name="augustus" expanded="true" title="Augustus settings">
+<section name="evm" expanded="false" title="EVM settings (advanced)">
-<param argument="--augustus_species" type="select" label="Augustus species training set" help="Select a species from the list">
-<option value="none" selected="True">No corresponding species, train from scratch</option>
-<expand macro="augustus_species"/>
-</param>
-<param argument="--min_training_models" type="integer" value="200" label="Minimum number of models to train Augustus" />
-<param argument="--optimize_augustus" type="boolean" checked="false" truevalue="--optimize_augustus" falsevalue="" label="Run 'optimize_augustus.pl' to refine training (long runtime)" />
-</section>
-<section name="genemark" expanded="false" title="GeneMark settings">
-<param name="genemark_license" type="data" format="txt" optional="true" label="GeneMark license file" help="GeneMark is not a free software, to use it download and unzip a license from http://topaz.gatech.edu/GeneMark/license_download.cgi (ES/ET/EP version). GeneMark needs to be installed manually by Galaxy administrators, it might not be available on this server." />
-<param argument="--genemark_mode" type="select" label="GeneMark mode">
-<option value="ES" selected="True">ES</option>
-<option value="ET">ET</option>
-</param>
-<param argument="--genemark_mod" type="data" format="txt" optional="true" label="Use pre-existing Genemark training file (e.g. gmhmm.mod)" />
-<param argument="--soft_mask" type="integer" value="2000" label="Softmasked length threshold for GeneMark" help="GeneMark will skip prediction on repeat regions shorter than this value" />
-</section>
-<section name="busco" expanded="true" title="BUSCO settings">
-<param argument="--busco_seed_species" type="select" label="Initial Augustus species training set for BUSCO alignment" help="Select the closest species. BUSCO will only be used if no RNASeq (bam) data is given as evidence.">
-<expand macro="augustus_species"/>
-</param>
-<param argument="--busco_db" type="select" label="BUSCO models to align" help="BUSCO will only be used if no RNASeq (bam) data is given as evidence.">
-<expand macro="busco_species"/>
-</param>
-</section>
-<section name="evm" expanded="false" title="EVM settings">
 <param argument="--repeats2evm" type="boolean" checked="false" truevalue="--repeats2evm" falsevalue="" label="Use repeats in EVM consensus model building" help="Not recommended for fungal genomes that have high gene density. You might want to turn this option on for larger genomes or those that have a high repeat content." />
 <conditional name="evm_partitioning">
 <param name="evm_partition" type="select" label="Split contigs into partitions for EVM processing?" help="Splits big contigs into smaller overlapping chunks to reduce memory usage and parallelize processing">
 <option value="yes" selected="True">Yes</option>
 <option value="no">No</option>
 <param argument="--weights" type="text" optional="true" label="Custom ab-initio predictor and EVM weight" help="e.g. augustus:2 pasa:10">
 <validator type="regex" message="Weights must consist of alphanumeric characters, underscores, colons and spaces only (e.g. augustus:2 pasa:10)">^[\w: ]+$</validator>
 </param>
 </section>
-<section name="filtering" expanded="true" title="Filtering">
+<param name="outputs" type="select" optional="true" multiple="true" label="Which outputs should be generated">
-<param argument="--min_intronlen" type="integer" value="10" label="Minimum intron length" />
+<option value="gbk" selected="true">Annotated genome (genbank)</option>
-<param argument="--max_intronlen" type="integer" value="3000" label="Maximum intron length" />
+<option value="tbl">NCBI tbl annotation file (use for NCBI submission)</option>
-<param argument="--min_protlen" type="integer" value="50" label="Minimum protein length" />
+<option value="gff3">Annotation in GFF3 format</option>
-<param argument="--keep_no_stops" type="boolean" checked="false" truevalue="--keep_no_stops" falsevalue="" label="Keep gene models without valid stops" />
+<option value="proteins_fa">Multi-fasta file of protein coding genes</option>
-<param argument="--repeat_filter" type="select" label="Repetitive gene model filtering" help="'overlap' drops gene models that are more than 90% contained within a repeat region; 'blast' compares the amino acid sequences to a small database of known transposons">
+<option value="mrna_transcripts_fa">Multi-fasta file of transcripts (mRNA)</option>
-<option value="overlap blast" selected="True">overlap + blast</option>
+<option value="cds_transcripts_fa">Multi-fasta file of transcripts (CDS)</option>
-<option value="overlap">overlap</option>
+<option value="tbl2asn_report">tbl2asn summary report of annotated genome</option>
-<option value="blast">blast</option>
+<option value="tbl2asn_error">tbl2asn error summary report</option>
-<option value="none">none</option>
+<option value="tbl2asn_validation">tbl2asn genome validation report</option>
-</param>
+<option value="stats">statistics</option>
-</section>
+</param>
 <!-- Need this to change path in the test funannotate_db -->
 <param type="hidden" name="uglyTestingHack" value="" />
 </inputs>
 <outputs>
-<data name='annot_gbk' format='genbank' label="${tool.name} on ${on_string}: annotation (genbank)" from_work_dir="out.gbk" />
+<data name='annot_gbk' format='genbank' label="${tool.name} on ${on_string}: annotation (genbank)" from_work_dir="out.gbk">
-<data name='annot_tbl' format='txt' label="${tool.name} on ${on_string}: NCBI tbl annotation file" from_work_dir="out.tbl" />
+<filter>outputs and 'gbk' in outputs</filter>
-<data name='annot_gff3' format='gff3' label="${tool.name} on ${on_string}: annotation (GFF3)" from_work_dir="out.gff3" />
+</data>
-<data name='fasta_proteins' format='fasta' label="${tool.name} on ${on_string}: protein sequences" from_work_dir="out.proteins.fa" />
+<data name='annot_tbl' format='txt' label="${tool.name} on ${on_string}: NCBI tbl annotation file" from_work_dir="out.tbl">
-<data name='fasta_transcripts_mrna' format='fasta' label="${tool.name} on ${on_string}: transcript mRNA sequences" from_work_dir="out.mrna-transcripts.fa" />
+<filter>outputs and 'tbl' in outputs</filter>
-<data name='fasta_transcripts_cds' format='fasta' label="${tool.name} on ${on_string}: transcript CDS sequences" from_work_dir="out.cds-transcripts.fa" />
+</data>
-<data name='tbl2asn_report' format='txt' label="${tool.name} on ${on_string}: tbl2asn summary report of annotated genome" from_work_dir="out.discrepency.report.txt" />
+<data name='annot_gff3' format='gff3' label="${tool.name} on ${on_string}: annotation (GFF3)" from_work_dir="out.gff3">
-<data name='tbl2asn_error' format='txt' label="${tool.name} on ${on_string}: tbl2asn error summary report" from_work_dir="out.error.summary.txt" />
+<filter>outputs and 'gff3' in outputs</filter>
-<data name='tbl2asn_validation' format='txt' label="${tool.name} on ${on_string}: tbl2asn genome validation report" from_work_dir="out.validation.txt" />
+</data>
-<data name='stats' format='json' label="${tool.name} on ${on_string}: stats" from_work_dir="out.stats.json" />
+<data name='fasta_proteins' format='fasta' label="${tool.name} on ${on_string}: protein sequences" from_work_dir="out.proteins.fa">
+<filter>outputs and 'proteins_fa' in outputs</filter>
+</data>
+<data name='fasta_transcripts_mrna' format='fasta' label="${tool.name} on ${on_string}: transcript mRNA sequences" from_work_dir="out.mrna-transcripts.fa">
+<filter>outputs and 'mrna_transcripts_fa' in outputs</filter>
+</data>
+<data name='fasta_transcripts_cds' format='fasta' label="${tool.name} on ${on_string}: transcript CDS sequences" from_work_dir="out.cds-transcripts.fa">
+<filter>outputs and 'cds_transcripts_fa' in outputs</filter>
+</data>
+<data name='tbl2asn_report' format='txt' label="${tool.name} on ${on_string}: tbl2asn summary report of annotated genome" from_work_dir="out.discrepency.report.txt">
+<filter>outputs and 'tbl2asn_report' in outputs</filter>
+</data>
+<data name='tbl2asn_error' format='txt' label="${tool.name} on ${on_string}: tbl2asn error summary report" from_work_dir="out.error.summary.txt">
+<filter>outputs and 'tbl2asn_error' in outputs</filter>
+</data>
+<data name='tbl2asn_validation' format='txt' label="${tool.name} on ${on_string}: tbl2asn genome validation report" from_work_dir="out.validation.txt">
+<filter>outputs and 'tbl2asn_validation' in outputs</filter>
+</data>
+<data name='stats' format='json' label="${tool.name} on ${on_string}: stats" from_work_dir="out.stats.json">
+<filter>outputs and 'stats' in outputs</filter>
+</data>
 <!-- TODO some day: provide trained models as output, reusable as input to other funannotate runs
 (parameters.json file references files with absolute paths, would probably need to create an archive + edit paths in parameters.json) -->
 <!--data name='abinitio' format='json' label="${tool.name} on ${on_string}: ab-initio training parameters" from_work_dir="output/predict_results/*.parameters.json" /-->
 </outputs>
 <tests>
 <!-- training from scratch -->
 <test>
 <param name="input" value="genome_masked.fa" />
 <param name="database" value="2021-07-20-120000" />
-<section name="organism">
-<param name="species" value="Genus species" />
-</section>
-<section name="augustus">
-<param name="min_training_models" value="3" />
-</section>
 <section name="busco">
 <param name="busco_seed_species" value="fly" />
 <param name="busco_db" value="insecta" />
 </section>
+<section name="organism">
+<param name="species" value="Genus species" />
+</section>
+<section name="augustus">
+<param name="min_training_models" value="3" />
+</section>
+<param name="outputs" value="gbk,tbl,gff3,proteins_fa,mrna_transcripts_fa,cds_transcripts_fa,tbl2asn_error,tbl2asn_report,tbl2asn_validation,stats" />
 <!-- non deterministic results, so can't be more precise here -->
 <output name="annot_gbk">
 <assert_contents>
 <has_text text="  TITLE     Direct Submission" />
 <has_text text="/locus_tag=&quot;FUN_000001&quot;" />
 <section name="busco">
 <param name="busco_seed_species" value="fly" />
 <param name="busco_db" value="insecta" />
 </section>
 <param name="uglyTestingHack" value="true" />
+<param name="outputs" value="gbk,tbl,gff3,proteins_fa,mrna_transcripts_fa,cds_transcripts_fa,tbl2asn_error,tbl2asn_report,tbl2asn_validation,stats" />
 <!-- non deterministic results, so can't be more precise here -->
 <output name="annot_gbk">
 <assert_contents>
 <has_text text="  TITLE     Direct Submission" />
 <has_text text="/locus_tag=&quot;FUN_000001&quot;" />
 <has_text text="Mapping 13 proteins to genome using diamond and exonerate"/>
 <has_text text="Found 4 preliminary alignments --> aligning with exonerate"/>
 </assert_stderr>
 </test>
-<!-- bam -->
+<!-- bam and transcripts and proteins -->
 <test>
 <param name="input" value="genome_masked.fa" />
 <param name="database" value="2021-07-20-120000" />
 <section name="organism">
 <param name="species" value="Genus species" />
 <conditional name="prot_evidence">
 <param name="prot_evidence_source" value="custom" />
 <param name="protein_evidence" value="predict_scratch/Genus_species.proteins.fa" />
 </conditional>
 </section>
-<section name="augustus">
-<param name="min_training_models" value="3" />
-</section>
 <section name="busco">
 <param name="busco_seed_species" value="fly" />
 <param name="busco_db" value="insecta" />
 </section>
+<section name="augustus">
+<param name="min_training_models" value="3" />
+</section>
+<param name="outputs" value="gbk,tbl,gff3,proteins_fa,mrna_transcripts_fa,cds_transcripts_fa,tbl2asn_error,tbl2asn_report,tbl2asn_validation,stats" />
 <!-- non deterministic results, so can't be more precise here -->
 <output name="annot_gbk">
 <assert_contents>
 <has_text text="  TITLE     Direct Submission" />
 <has_text text="/locus_tag=&quot;FUN_000001&quot;" />
 <has_text text="Extracting hints from RNA-seq BAM file using bam2hints"/>
 <has_text text="Mapping 13 proteins to genome using diamond and exonerate"/>
 <has_text text="Found 4 preliminary alignments --> aligning with exonerate"/>
 </assert_stderr>
 </test>
+<!-- proteins -->
+<test>
+<param name="input" value="genome_masked.fa" />
+<param name="database" value="2021-07-20-120000" />
+<section name="organism">
+<param name="species" value="Genus species" />
+</section>
+<section name="evidences">
+<conditional name="prot_evidence">
+<param name="prot_evidence_source" value="custom" />
+<param name="protein_evidence" value="predict_scratch/Genus_species.proteins.fa" />
+</conditional>
+</section>
+<section name="busco">
+<param name="busco_seed_species" value="fly" />
+<param name="busco_db" value="insecta" />
+</section>
+<section name="augustus">
+<param name="min_training_models" value="3" />
+</section>
+<param name="outputs" value="gbk,tbl,gff3,proteins_fa,mrna_transcripts_fa,cds_transcripts_fa,tbl2asn_error,tbl2asn_report,tbl2asn_validation,stats" />
+<!-- non deterministic results, so can't be more precise here -->
+<output name="annot_gbk">
+<assert_contents>
+<has_text text="  TITLE     Direct Submission" />
+<has_text text="/locus_tag=&quot;FUN_000001&quot;" />
+</assert_contents>
+</output>
+<output name="annot_tbl">
+<assert_contents>
+<has_text text=">Feature sample" />
+<has_text text="gnl|ncbi|FUN_000001-T1_mrna" />
+</assert_contents>
+</output>
+<output name="annot_gff3">
+<assert_contents>
+<has_text text="##gff-version 3" />
+<has_text text="ID=FUN_000001-T1;Parent=FUN_000001;product=hypothetical protein;" />
+</assert_contents>
+</output>
+<output name="fasta_proteins">
+<assert_contents>
+<has_text text=">FUN_000001-T1 FUN_000001" />
+</assert_contents>
+</output>
+<output name="fasta_transcripts_mrna">
+<assert_contents>
+<has_text text=">FUN_000001-T1 FUN_000001" />
+</assert_contents>
+</output>
+<output name="fasta_transcripts_cds">
+<assert_contents>
+<has_text text=">FUN_000001-T1 FUN_000001" />
+</assert_contents>
+</output>
+<assert_stderr>
+<has_text text="augustus     busco"/>
+<has_text text="glimmerhmm   busco"/>
+<has_text text="snap         busco"/>
+<has_text text="Running BUSCO to find conserved gene models for training ab-initio predictors"/>
+<has_text text="Skipping CodingQuarry as no --rna_bam passed"/>
+<has_text text="Running Augustus gene prediction using genus_species parameters"/>
+<has_text text="Training Augustus using BUSCO gene models"/>
+<not_has_text text="Aligning transcript evidence to genome with minimap2"/>
+<not_has_text text="Found 16 alignments, wrote GFF3 and Augustus hints to file"/>
+<not_has_text text="Extracting hints from RNA-seq BAM file using bam2hints"/>
+<has_text text="Mapping 13 proteins to genome using diamond and exonerate"/>
+<has_text text="Found 4 preliminary alignments --> aligning with exonerate"/>
+</assert_stderr>
+</test>
 </tests>
 <help><![CDATA[
 Funannotate_ predict
 --------------------

Mercurial > repos > iuc > funannotate_predict

comparison funannotate_predict.xml @ 1:1a59958c1f76 draft