Mercurial > repos > iuc > funannotate_predict
diff funannotate_predict.xml @ 1:1a59958c1f76 draft
"planemo upload commit 87560553f1dbbd3e0ab7d7157fa5a7f32f61dca1"
author | iuc |
---|---|
date | Mon, 04 Oct 2021 19:37:44 +0000 |
parents | 40b87aef5241 |
children | 33092577d65d |
line wrap: on
line diff
--- a/funannotate_predict.xml Thu Aug 26 06:55:33 2021 +0000 +++ b/funannotate_predict.xml Mon Oct 04 19:37:44 2021 +0000 @@ -32,6 +32,8 @@ --database '$database.fields.path' #end if +$force + --species '${organism.species}' --isolate '${organism.isolate}' --strain '${organism.strain}' @@ -46,7 +48,7 @@ --parameters '${parameters}' #end if -#if $evidences.rna_bam: +#if $evidences.rna_bam --rna_bam ${evidences.rna_bam} #end if @@ -71,6 +73,9 @@ --p2g_pident ${evidences.p2g_pident} --p2g_prefilter ${evidences.p2g_prefilter} +--busco_seed_species '${busco.busco_seed_species}' +--busco_db '${busco.busco_db}' + #if $augustus.augustus_species != 'none': --augustus_species '${augustus.augustus_species}' #end if @@ -85,9 +90,6 @@ --soft_mask ${genemark.soft_mask} #end if ---busco_seed_species '${busco.busco_seed_species}' ---busco_db '${busco.busco_db}' - $evm.repeats2evm #if $evm.evm_partitioning.evm_partition == "yes": --evm-partition-interval ${evm.evm_partitioning.evm_partition_interval} @@ -145,6 +147,8 @@ </options> </param> + <param argument="--force" type="boolean" checked="true" truevalue="" falsevalue="--force" label="Check the genome sequence" help="Disable at your own risk if you want to ignore problems in the genome sequence reported by Funannotate" /> + <section name="organism" expanded="true" title="Organism"> <param argument="--species" type="text" optional="false" label="Name of the species to annotate" help="e.g. Genus species"> <validator type="empty_field" /> @@ -179,18 +183,31 @@ </param> </section> + <section name="busco" expanded="true" title="Busco"> + <param argument="--busco_db" type="select" label="BUSCO models to align" help="BUSCO will be used to perform initial training of ab initio predictors (e.g. 
Augustus)."> + <expand macro="busco_species"/> + </param> + <param argument="--busco_seed_species" type="select" label="Initial Augustus species training set for BUSCO alignment" help="Used when BUSCO runs Augustus internally."> + <expand macro="augustus_species"/> + </param> + </section> + + <section name="filtering" expanded="true" title="Filtering"> + <param argument="--min_intronlen" type="integer" value="10" label="Minimum intron length" /> + <param argument="--max_intronlen" type="integer" value="3000" label="Maximum intron length" /> + <param argument="--min_protlen" type="integer" value="50" label="Minimum protein length" /> + <param argument="--keep_no_stops" type="boolean" checked="false" truevalue="--keep_no_stops" falsevalue="" label="Keep gene models without valid stops" /> + <param argument="--repeat_filter" type="select" label="Repetitive gene model filtering" help="'overlap' drops gene models that are more than 90% contained within a repeat region; 'blast' compares the amino acid sequences to a small database of known transposons"> + <option value="overlap blast" selected="True">overlap + blast</option> + <option value="overlap">overlap</option> + <option value="blast">blast</option> + <option value="none">none</option> + </param> + </section> + <param argument="--parameters" type="data" format="json" optional="true" label="Ab-initio training parameters from a previous run" help="If specified, will over-rule any other training presets based on species selection." 
/> - <section name="other_predictors" expanded="false" title="Other annotations"> - <param argument="--stringtie" type="data" format="gtf" optional="true" label="StringTie GTF result" /> - <param argument="--maker_gff" type="data" format="gff3" optional="true" label="MAKER2 GFF file" help="Parse results directly to EVM" /> - <param argument="--pasa_gff" type="data" format="gff3" optional="true" label="PASA generated gene models" /> - <param name="pasa_gff_weight" type="integer" value="1" label="Weight for PASA generated gene models" /> - <param argument="--other_gff" type="data" format="gff3" optional="true" label="Annotation pass-through to EVM" /> - <param name="other_gff_weight" type="integer" value="1" label="Weight for annotation pass-through to EVM" /> - </section> - - <section name="augustus" expanded="true" title="Augustus settings"> + <section name="augustus" expanded="false" title="Augustus settings (advanced)"> <param argument="--augustus_species" type="select" label="Augustus species training set" help="Select a species from the list"> <option value="none" selected="True">No corresponding species, train from scratch</option> <expand macro="augustus_species"/> @@ -199,7 +216,7 @@ <param argument="--optimize_augustus" type="boolean" checked="false" truevalue="--optimize_augustus" falsevalue="" label="Run 'optimize_augustus.pl' to refine training (long runtime)" /> </section> - <section name="genemark" expanded="false" title="GeneMark settings"> + <section name="genemark" expanded="false" title="GeneMark settings (advanced)"> <param name="genemark_license" type="data" format="txt" optional="true" label="GeneMark license file" help="GeneMark is not a free software, to use it download and unzip a license from http://topaz.gatech.edu/GeneMark/license_download.cgi (ES/ET/EP version). GeneMark needs to be installed manually by Galaxy administrators, it might not be available on this server." 
/> <param argument="--genemark_mode" type="select" label="GeneMark mode"> <option value="ES" selected="True">ES</option> @@ -209,16 +226,16 @@ <param argument="--soft_mask" type="integer" value="2000" label="Softmasked length threshold for GeneMark" help="GeneMark will skip prediction on repeat regions shorter than this value" /> </section> - <section name="busco" expanded="true" title="BUSCO settings"> - <param argument="--busco_seed_species" type="select" label="Initial Augustus species training set for BUSCO alignment" help="Select the closest species. BUSCO will only be used if no RNASeq (bam) data is given as evidence."> - <expand macro="augustus_species"/> - </param> - <param argument="--busco_db" type="select" label="BUSCO models to align" help="BUSCO will only be used if no RNASeq (bam) data is given as evidence."> - <expand macro="busco_species"/> - </param> + <section name="other_predictors" expanded="false" title="Other annotations (advanced)"> + <param argument="--stringtie" type="data" format="gtf" optional="true" label="StringTie GTF result" /> + <param argument="--maker_gff" type="data" format="gff3" optional="true" label="MAKER2 GFF file" help="Parse results directly to EVM" /> + <param argument="--pasa_gff" type="data" format="gff3" optional="true" label="PASA generated gene models" /> + <param name="pasa_gff_weight" type="integer" value="1" label="Weight for PASA generated gene models" /> + <param argument="--other_gff" type="data" format="gff3" optional="true" label="Annotation pass-through to EVM" /> + <param name="other_gff_weight" type="integer" value="1" label="Weight for annotation pass-through to EVM" /> </section> - <section name="evm" expanded="false" title="EVM settings"> + <section name="evm" expanded="false" title="EVM settings (advanced)"> <param argument="--repeats2evm" type="boolean" checked="false" truevalue="--repeats2evm" falsevalue="" label="Use repeats in EVM consensus model building" help="Not recommended for fungal genomes 
that have high gene density. You might want to turn this option on for larger genomes or those that have a high repeat content." /> <conditional name="evm_partitioning"> <param name="evm_partition" type="select" label="Split contigs into partitions for EVM processing?" help="Splits big contigs into smaller overlapping chunks to reduce memory usage and parallelize"> @@ -235,33 +252,53 @@ </param> </section> - <section name="filtering" expanded="true" title="Filtering"> - <param argument="--min_intronlen" type="integer" value="10" label="Minimum intron length" /> - <param argument="--max_intronlen" type="integer" value="3000" label="Maximum intron length" /> - <param argument="--min_protlen" type="integer" value="50" label="Minimum protein length" /> - <param argument="--keep_no_stops" type="boolean" checked="false" truevalue="--keep_no_stops" falsevalue="" label="Keep gene models without valid stops" /> - <param argument="--repeat_filter" type="select" label="Repetitive gene model filtering" help="'overlap' drops gene models that are more than 90% contained within a repeat region; 'blast' compares the amino acid sequences to a small database of known transposons"> - <option value="overlap blast" selected="True">overlap + blast</option> - <option value="overlap">overlap</option> - <option value="blast">blast</option> - <option value="none">none</option> - </param> - </section> + <param name="outputs" type="select" optional="true" multiple="true" label="Which outputs should be generated"> + <option value="gbk" selected="true">Annotated genome (genbank)</option> + <option value="tbl">NCBI tbl annotation file (use for NCBI submission)</option> + <option value="gff3">Annotation in GFF3 format</option> + <option value="proteins_fa">Multi-fasta file of protein coding genes</option> + <option value="mrna_transcripts_fa">Multi-fasta file of transcripts (mRNA)</option> + <option value="cds_transcripts_fa">Multi-fasta file of transcripts (CDS)</option> + <option 
value="tbl2asn_report">tbl2asn summary report of annotated genome</option> + <option value="tbl2asn_error">tbl2asn error summary report</option> + <option value="tbl2asn_validation">tbl2asn genome validation report</option> + <option value="stats">statistics</option> + </param> <!-- Need this to change path in the test funannotate_db --> <param type="hidden" name="uglyTestingHack" value="" /> </inputs> <outputs> - <data name='annot_gbk' format='genbank' label="${tool.name} on ${on_string}: annotation (genbank)" from_work_dir="out.gbk" /> - <data name='annot_tbl' format='txt' label="${tool.name} on ${on_string}: NCBI tbl annotation file" from_work_dir="out.tbl" /> - <data name='annot_gff3' format='gff3' label="${tool.name} on ${on_string}: annotation (GFF3)" from_work_dir="out.gff3" /> - <data name='fasta_proteins' format='fasta' label="${tool.name} on ${on_string}: protein sequences" from_work_dir="out.proteins.fa" /> - <data name='fasta_transcripts_mrna' format='fasta' label="${tool.name} on ${on_string}: transcript mRNA sequences" from_work_dir="out.mrna-transcripts.fa" /> - <data name='fasta_transcripts_cds' format='fasta' label="${tool.name} on ${on_string}: transcript CDS sequences" from_work_dir="out.cds-transcripts.fa" /> - <data name='tbl2asn_report' format='txt' label="${tool.name} on ${on_string}: tbl2asn summary report of annotated genome" from_work_dir="out.discrepency.report.txt" /> - <data name='tbl2asn_error' format='txt' label="${tool.name} on ${on_string}: tbl2asn error summary report" from_work_dir="out.error.summary.txt" /> - <data name='tbl2asn_validation' format='txt' label="${tool.name} on ${on_string}: tbl2asn genome validation report" from_work_dir="out.validation.txt" /> - <data name='stats' format='json' label="${tool.name} on ${on_string}: stats" from_work_dir="out.stats.json" /> + <data name='annot_gbk' format='genbank' label="${tool.name} on ${on_string}: annotation (genbank)" from_work_dir="out.gbk"> + <filter>outputs and 'gbk' in 
outputs</filter> + </data> + <data name='annot_tbl' format='txt' label="${tool.name} on ${on_string}: NCBI tbl annotation file" from_work_dir="out.tbl"> + <filter>outputs and 'tbl' in outputs</filter> + </data> + <data name='annot_gff3' format='gff3' label="${tool.name} on ${on_string}: annotation (GFF3)" from_work_dir="out.gff3"> + <filter>outputs and 'gff3' in outputs</filter> + </data> + <data name='fasta_proteins' format='fasta' label="${tool.name} on ${on_string}: protein sequences" from_work_dir="out.proteins.fa"> + <filter>outputs and 'proteins_fa' in outputs</filter> + </data> + <data name='fasta_transcripts_mrna' format='fasta' label="${tool.name} on ${on_string}: transcript mRNA sequences" from_work_dir="out.mrna-transcripts.fa"> + <filter>outputs and 'mrna_transcripts_fa' in outputs</filter> + </data> + <data name='fasta_transcripts_cds' format='fasta' label="${tool.name} on ${on_string}: transcript CDS sequences" from_work_dir="out.cds-transcripts.fa"> + <filter>outputs and 'cds_transcripts_fa' in outputs</filter> + </data> + <data name='tbl2asn_report' format='txt' label="${tool.name} on ${on_string}: tbl2asn summary report of annotated genome" from_work_dir="out.discrepency.report.txt"> + <filter>outputs and 'tbl2asn_report' in outputs</filter> + </data> + <data name='tbl2asn_error' format='txt' label="${tool.name} on ${on_string}: tbl2asn error summary report" from_work_dir="out.error.summary.txt"> + <filter>outputs and 'tbl2asn_error' in outputs</filter> + </data> + <data name='tbl2asn_validation' format='txt' label="${tool.name} on ${on_string}: tbl2asn genome validation report" from_work_dir="out.validation.txt"> + <filter>outputs and 'tbl2asn_validation' in outputs</filter> + </data> + <data name='stats' format='json' label="${tool.name} on ${on_string}: stats" from_work_dir="out.stats.json"> + <filter>outputs and 'stats' in outputs</filter> + </data> <!-- TODO some day: provide trained models as output, reusable as input to other funannotate 
runs (parameters.json file references files with absolute paths, would probably need to create an archive + edit paths in parameters.json) --> <!--data name='abinitio' format='json' label="${tool.name} on ${on_string}: ab-initio training parameters" from_work_dir="output/predict_results/*.parameters.json" /--> @@ -271,16 +308,17 @@ <test> <param name="input" value="genome_masked.fa" /> <param name="database" value="2021-07-20-120000" /> + <section name="busco"> + <param name="busco_seed_species" value="fly" /> + <param name="busco_db" value="insecta" /> + </section> <section name="organism"> <param name="species" value="Genus species" /> </section> <section name="augustus"> <param name="min_training_models" value="3" /> </section> - <section name="busco"> - <param name="busco_seed_species" value="fly" /> - <param name="busco_db" value="insecta" /> - </section> + <param name="outputs" value="gbk,tbl,gff3,proteins_fa,mrna_transcripts_fa,cds_transcripts_fa,tbl2asn_error,tbl2asn_report,tbl2asn_validation,stats" /> <!-- non deterministic results, so can't be more precise here --> <output name="annot_gbk"> <assert_contents> @@ -350,6 +388,7 @@ <param name="busco_db" value="insecta" /> </section> <param name="uglyTestingHack" value="true" /> + <param name="outputs" value="gbk,tbl,gff3,proteins_fa,mrna_transcripts_fa,cds_transcripts_fa,tbl2asn_error,tbl2asn_report,tbl2asn_validation,stats" /> <!-- non deterministic results, so can't be more precise here --> <output name="annot_gbk"> <assert_contents> @@ -399,7 +438,7 @@ </assert_stderr> </test> - <!-- bam --> + <!-- bam and transcripts and proteins --> <test> <param name="input" value="genome_masked.fa" /> <param name="database" value="2021-07-20-120000" /> @@ -414,13 +453,14 @@ <param name="protein_evidence" value="predict_scratch/Genus_species.proteins.fa" /> </conditional> </section> - <section name="augustus"> - <param name="min_training_models" value="3" /> - </section> <section name="busco"> <param 
name="busco_seed_species" value="fly" /> <param name="busco_db" value="insecta" /> </section> + <section name="augustus"> + <param name="min_training_models" value="3" /> + </section> + <param name="outputs" value="gbk,tbl,gff3,proteins_fa,mrna_transcripts_fa,cds_transcripts_fa,tbl2asn_error,tbl2asn_report,tbl2asn_validation,stats" /> <!-- non deterministic results, so can't be more precise here --> <output name="annot_gbk"> <assert_contents> @@ -470,6 +510,77 @@ <has_text text="Found 4 preliminary alignments --> aligning with exonerate"/> </assert_stderr> </test> + + <!-- proteins --> + <test> + <param name="input" value="genome_masked.fa" /> + <param name="database" value="2021-07-20-120000" /> + <section name="organism"> + <param name="species" value="Genus species" /> + </section> + <section name="evidences"> + <conditional name="prot_evidence"> + <param name="prot_evidence_source" value="custom" /> + <param name="protein_evidence" value="predict_scratch/Genus_species.proteins.fa" /> + </conditional> + </section> + <section name="busco"> + <param name="busco_seed_species" value="fly" /> + <param name="busco_db" value="insecta" /> + </section> + <section name="augustus"> + <param name="min_training_models" value="3" /> + </section> + <param name="outputs" value="gbk,tbl,gff3,proteins_fa,mrna_transcripts_fa,cds_transcripts_fa,tbl2asn_error,tbl2asn_report,tbl2asn_validation,stats" /> + <!-- non deterministic results, so can't be more precise here --> + <output name="annot_gbk"> + <assert_contents> + <has_text text=" TITLE Direct Submission" /> + <has_text text="/locus_tag="FUN_000001"" /> + </assert_contents> + </output> + <output name="annot_tbl"> + <assert_contents> + <has_text text=">Feature sample" /> + <has_text text="gnl|ncbi|FUN_000001-T1_mrna" /> + </assert_contents> + </output> + <output name="annot_gff3"> + <assert_contents> + <has_text text="##gff-version 3" /> + <has_text text="ID=FUN_000001-T1;Parent=FUN_000001;product=hypothetical protein;" /> + 
</assert_contents> + </output> + <output name="fasta_proteins"> + <assert_contents> + <has_text text=">FUN_000001-T1 FUN_000001" /> + </assert_contents> + </output> + <output name="fasta_transcripts_mrna"> + <assert_contents> + <has_text text=">FUN_000001-T1 FUN_000001" /> + </assert_contents> + </output> + <output name="fasta_transcripts_cds"> + <assert_contents> + <has_text text=">FUN_000001-T1 FUN_000001" /> + </assert_contents> + </output> + <assert_stderr> + <has_text text="augustus busco"/> + <has_text text="glimmerhmm busco"/> + <has_text text="snap busco"/> + <has_text text="Running BUSCO to find conserved gene models for training ab-initio predictors"/> + <has_text text="Skipping CodingQuarry as no --rna_bam passed"/> + <has_text text="Running Augustus gene prediction using genus_species parameters"/> + <has_text text="Training Augustus using BUSCO gene models"/> + <not_has_text text="Aligning transcript evidence to genome with minimap2"/> + <not_has_text text="Found 16 alignments, wrote GFF3 and Augustus hints to file"/> + <not_has_text text="Extracting hints from RNA-seq BAM file using bam2hints"/> + <has_text text="Mapping 13 proteins to genome using diamond and exonerate"/> + <has_text text="Found 4 preliminary alignments --> aligning with exonerate"/> + </assert_stderr> + </test> </tests> <help><![CDATA[ Funannotate_ predict