Mercurial > repos > iuc > funannotate_predict
view funannotate_predict.xml @ 0:40b87aef5241 draft
"planemo upload commit 9613152729099079c7465c3d5d42005ef22ca91e"
author | iuc |
---|---|
date | Thu, 26 Aug 2021 06:55:33 +0000 |
parents | |
children | 1a59958c1f76 |
line wrap: on
line source
<!--
    Galaxy wrapper for "funannotate predict".
    Builds one funannotate command line from the form sections declared in <inputs>, then renames
    the files produced under output/predict_results/ to fixed names so that each <data> output can
    collect them through from_work_dir.
    The version tokens and the requirements, augustus_species, busco_species and citations macros
    are imported from macros.xml.
-->
<tool id="funannotate_predict" name="Funannotate predict annotation" profile="20.01" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@">
    <description></description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <requirements>
        <expand macro="requirements" />
    </requirements>
    <version_command><![CDATA[funannotate check --show-versions]]></version_command>
    <command><![CDATA[

## GeneMark is only enabled when the user supplies a license file.
## Fail early with a readable message when GENEMARK_PATH is unset or does not contain gmes_petap.pl.
#if $genemark.genemark_license:
    if [ -z "\$GENEMARK_PATH" ] ; then echo "GeneMark is not installed on this Galaxy server." >&2 ; exit 1 ; fi &&
    if [ ! -f "\$GENEMARK_PATH/gmes_petap.pl" ] ; then echo "GeneMark is not installed properly on this Galaxy server." >&2 ; exit 1 ; fi &&

    ## GeneMark only search for license in ~/.gm_key
    cp '${genemark.genemark_license}' ~/.gm_key &&
#end if

#if $uglyTestingHack == "true":
    ## funannotate_db contains some hard coded path, need to rewrite one for tests (not in real life when using data manager)
    ## Need to copy too as the test_data is read only on CI
    cp -r '${database.fields.path}' './hacked_database' &&
    sed -i.bak 's|/tmp/prout|'`pwd`'/hacked_database|' './hacked_database/trained_species/fly/info.json' &&
#end if

funannotate predict

--input '${input}'
--out output

## Tests run against the rewritten copy of the database made above.
#if $uglyTestingHack == "true":
    --database `pwd`'/hacked_database'
#else
    --database '$database.fields.path'
#end if

--species '${organism.species}'
--isolate '${organism.isolate}'
--strain '${organism.strain}'
--organism '${organism.organism}'
--ploidy ${organism.ploidy}
--SeqCenter '${organism.SeqCenter}'
--SeqAccession '${organism.SeqAccession}'
--name '${organism.name}'
--numbering ${organism.numbering}

#if $parameters:
    --parameters '${parameters}'
#end if

#if $evidences.rna_bam:
    --rna_bam '${evidences.rna_bam}'
#end if

## Collect the selected transcript datasets into one space separated, individually quoted list.
## The option is only emitted when at least one dataset is set.
#set est_list = ""
#if len($evidences.transcript_evidence) > 0:
    #for $estev in $evidences.transcript_evidence:
        #if $estev:
            #set est_list += " '" + str($estev) + "'"
        #end if
    #end for
#end if
#if $est_list:
    --transcript_evidence $est_list
#end if

## prot_evidence is a conditional: its selector is prot_evidence_source and the
## datasets of the custom branch are nested inside the conditional.
#if $evidences.prot_evidence.prot_evidence_source == 'custom':
    --protein_evidence
    #for $protev in $evidences.prot_evidence.protein_evidence:
        '${protev}'
    #end for
#end if

--p2g_pident ${evidences.p2g_pident}
--p2g_prefilter ${evidences.p2g_prefilter}

## The value none means that no species option is passed at all.
#if $augustus.augustus_species != 'none':
    --augustus_species '${augustus.augustus_species}'
#end if

--min_training_models ${augustus.min_training_models}
${augustus.optimize_augustus}

## GeneMark options are only passed together with a license file.
#if $genemark.genemark_license:
    --genemark_mode '${genemark.genemark_mode}'
    #if $genemark.genemark_mod:
        --genemark_mod '${genemark.genemark_mod}'
    #end if
    --soft_mask ${genemark.soft_mask}
#end if

--busco_seed_species '${busco.busco_seed_species}'
--busco_db '${busco.busco_db}'

$evm.repeats2evm
#if $evm.evm_partitioning.evm_partition == "yes":
    --evm-partition-interval ${evm.evm_partitioning.evm_partition_interval}
#else:
    --no-evm-partitions
#end if
## NOTE(review): the whole weights string is passed as one quoted argument, while the help text
## shows several space separated pairs. Confirm how funannotate parses this option.
#if $evm.weights:
    --weights '${evm.weights}'
#end if

#if $other_predictors.stringtie:
    --stringtie '${other_predictors.stringtie}'
#end if
#if $other_predictors.maker_gff:
    --maker_gff '${other_predictors.maker_gff}'
#end if
## PASA and pass-through annotations are given as path:weight.
#if $other_predictors.pasa_gff:
    --pasa_gff '${other_predictors.pasa_gff}:${other_predictors.pasa_gff_weight}'
#end if
#if $other_predictors.other_gff:
    --other_gff '${other_predictors.other_gff}:${other_predictors.other_gff_weight}'
#end if

--min_intronlen ${filtering.min_intronlen}
--max_intronlen ${filtering.max_intronlen}
--min_protlen ${filtering.min_protlen}
${filtering.keep_no_stops}
## Left unquoted on purpose: the default select value holds two words (overlap blast).
--repeat_filter ${filtering.repeat_filter}

--cpus \${GALAXY_SLOTS:-2}

## Rename the results to the fixed names referenced by from_work_dir in the outputs.
&& mv output/predict_results/*.gbk out.gbk
&& mv output/predict_results/*.tbl out.tbl
&& mv output/predict_results/*.gff3 out.gff3
&& mv output/predict_results/*.proteins.fa out.proteins.fa
&& mv output/predict_results/*.mrna-transcripts.fa out.mrna-transcripts.fa
&& mv output/predict_results/*.cds-transcripts.fa out.cds-transcripts.fa
&& mv output/predict_results/*.discrepency.report.txt out.discrepency.report.txt
&& mv output/predict_results/*.error.summary.txt out.error.summary.txt
&& mv output/predict_results/*.validation.txt out.validation.txt
&& mv output/predict_results/*.stats.json out.stats.json

    ]]></command>
    <inputs>
        <param argument="--input" type="data" format="fasta" label="Assembly to annotate" help="The assembly should be soft-masked (with RepeatMasker for example)" />

        <!-- Entries come from the funannotate data table, restricted to rows whose column 2 equals 1.0 -->
        <param name="database" label="Funannotate database" type="select">
            <options from_data_table="funannotate">
                <column name="value" index="0" />
                <column name="name" index="1" />
                <column name="path" index="3" />
                <filter type="sort_by" column="0" />
                <filter type="static_value" column="2" value="1.0" />
            </options>
        </param>

        <section name="organism" expanded="true" title="Organism">
            <param argument="--species" type="text" optional="false" label="Name of the species to annotate" help="e.g. Genus species">
                <validator type="empty_field" />
            </param>
            <param argument="--isolate" type="text" label="Isolate name" help="If relevant (e.g. Af293)" />
            <param argument="--strain" type="text" label="Strain name" help="If relevant (e.g. FGSCA4)" />
            <param argument="--organism" type="boolean" checked="false" truevalue="fungus" falsevalue="other" label="Is it a fungus species?" />
            <param argument="--ploidy" type="integer" value="1" label="Ploidy of assembly" />
            <param argument="--SeqCenter" type="text" value="CFMR" label="Sequencing facility for NCBI tbl file" />
            <param argument="--SeqAccession" type="text" value="12345" label="Sequence accession number for NCBI tbl file" />
            <param argument="--name" type="text" value="FUN_" label="Locus tag prefix" help="Will prefix all the gene names" />
            <param argument="--numbering" type="integer" value="1" label="Specify where gene numbering starts" />
        </section>

        <section name="evidences" expanded="true" title="Evidences">
            <param argument="--rna_bam" type="data" format="bam" optional="true" label="RNA-seq mapped to genome to train Augustus/GeneMark-ET" />
            <param argument="--transcript_evidence" type="data" format="fasta" multiple="true" optional="true" label="mRNA/ESTs to align to genome" />
            <!-- The command template reads the selector as evidences.prot_evidence.prot_evidence_source -->
            <conditional name="prot_evidence">
                <param name="prot_evidence_source" type="select" label="Select protein evidences">
                    <option value="uniprot" selected="True">Use UniProtKb/SwissProt (from selected Funannotate database)</option>
                    <option value="custom">Custom protein sequences</option>
                </param>
                <when value="uniprot"/>
                <when value="custom">
                    <param argument="--protein_evidence" type="data" format="fasta" multiple="true" label="Proteins to map to genome" />
                </when>
            </conditional>
            <param argument="--p2g_pident" type="integer" value="80" label="Exonerate percent identity (for proteins)" />
            <param argument="--p2g_prefilter" type="select" label="Prefilter hits with (for proteins)">
                <option value="diamond" selected="True">Diamond</option>
                <option value="tblastn">tblastn (slower)</option>
            </param>
        </section>

        <param argument="--parameters" type="data" format="json" optional="true" label="Ab-initio training parameters from a previous run" help="If specified, will over-rule any other training presets based on species selection." />

        <section name="other_predictors" expanded="false" title="Other annotations">
            <param argument="--stringtie" type="data" format="gtf" optional="true" label="StringTie GTF result" />
            <param argument="--maker_gff" type="data" format="gff3" optional="true" label="MAKER2 GFF file" help="Parse results directly to EVM" />
            <param argument="--pasa_gff" type="data" format="gff3" optional="true" label="PASA generated gene models" />
            <param name="pasa_gff_weight" type="integer" value="1" label="Weight for PASA generated gene models" />
            <param argument="--other_gff" type="data" format="gff3" optional="true" label="Annotation pass-through to EVM" />
            <param name="other_gff_weight" type="integer" value="1" label="Weight for annotation pass-through to EVM" />
        </section>

        <section name="augustus" expanded="true" title="Augustus settings">
            <param argument="--augustus_species" type="select" label="Augustus species training set" help="Select a species from the list">
                <option value="none" selected="True">No corresponding species, train from scratch</option>
                <expand macro="augustus_species"/>
            </param>
            <param argument="--min_training_models" type="integer" value="200" label="Minimum number of models to train Augustus" />
            <param argument="--optimize_augustus" type="boolean" checked="false" truevalue="--optimize_augustus" falsevalue="" label="Run 'optimize_augustus.pl' to refine training (long runtime)" />
        </section>

        <section name="genemark" expanded="false" title="GeneMark settings">
            <!-- Supplying this file is what switches the GeneMark checks and options on in the command template -->
            <param name="genemark_license" type="data" format="txt" optional="true" label="GeneMark license file" help="GeneMark is not a free software, to use it download and unzip a license from http://topaz.gatech.edu/GeneMark/license_download.cgi (ES/ET/EP version). GeneMark needs to be installed manually by Galaxy administrators, it might not be available on this server." />
            <param argument="--genemark_mode" type="select" label="GeneMark mode">
                <option value="ES" selected="True">ES</option>
                <option value="ET">ET</option>
            </param>
            <param argument="--genemark_mod" type="data" format="txt" optional="true" label="Use pre-existing Genemark training file (e.g. gmhmm.mod)" />
            <param argument="--soft_mask" type="integer" value="2000" label="Softmasked length threshold for GeneMark" help="GeneMark will skip prediction on repeat regions shorter than this value" />
        </section>

        <section name="busco" expanded="true" title="BUSCO settings">
            <param argument="--busco_seed_species" type="select" label="Initial Augustus species training set for BUSCO alignment" help="Select the closest species. BUSCO will only be used if no RNASeq (bam) data is given as evidence.">
                <expand macro="augustus_species"/>
            </param>
            <param argument="--busco_db" type="select" label="BUSCO models to align" help="BUSCO will only be used if no RNASeq (bam) data is given as evidence.">
                <expand macro="busco_species"/>
            </param>
        </section>

        <section name="evm" expanded="false" title="EVM settings">
            <param argument="--repeats2evm" type="boolean" checked="false" truevalue="--repeats2evm" falsevalue="" label="Use repeats in EVM consensus model building" help="Not recommended for fungal genomes that have high gene density. You might want to turn this option on for larger genomes or those that have a high repeat content." />
            <conditional name="evm_partitioning">
                <param name="evm_partition" type="select" label="Split contigs into partitions for EVM processing?" help="Splits big contigs in smaller overlapping chunks to reduce memory usage and parallelize">
                    <option value="yes" selected="True">Yes</option>
                    <option value="no">No</option>
                </param>
                <when value="yes">
                    <param argument="--evm-partition-interval" type="integer" value="1500" label="Min length between genes to make a partition" />
                </when>
                <when value="no"/>
            </conditional>
            <!-- The pattern only admits word characters, colons and spaces, which keeps the value shell safe -->
            <param argument="--weights" type="text" optional="true" label="Custom ab-initio predictor and EVM weight" help="e.g. augustus:2 pasa:10">
                <validator type="regex" message="Weights may only contain alphanumeric characters, underscores, colons (':') and spaces">^[\w: ]+$</validator>
            </param>
        </section>

        <section name="filtering" expanded="true" title="Filtering">
            <param argument="--min_intronlen" type="integer" value="10" label="Minimum intron length" />
            <param argument="--max_intronlen" type="integer" value="3000" label="Maximum intron length" />
            <param argument="--min_protlen" type="integer" value="50" label="Minimum protein length" />
            <param argument="--keep_no_stops" type="boolean" checked="false" truevalue="--keep_no_stops" falsevalue="" label="Keep gene models without valid stops" />
            <param argument="--repeat_filter" type="select" label="Repetitive gene model filtering" help="'overlap' drops gene models that are more than 90% contained within a repeat region; 'blast' compares the amino acid sequences to a small database of known transposons">
                <option value="overlap blast" selected="True">overlap + blast</option>
                <option value="overlap">overlap</option>
                <option value="blast">blast</option>
                <option value="none">none</option>
            </param>
        </section>

        <!-- Need this to change path in the test funannotate_db -->
        <param type="hidden" name="uglyTestingHack" value="" />
    </inputs>
    <outputs>
        <!-- Each output picks up one of the files renamed at the end of the command -->
        <data name="annot_gbk" format="genbank" label="${tool.name} on ${on_string}: annotation (genbank)" from_work_dir="out.gbk" />
        <data name="annot_tbl" format="txt" label="${tool.name} on ${on_string}: NCBI tbl annotation file" from_work_dir="out.tbl" />
        <data name="annot_gff3" format="gff3" label="${tool.name} on ${on_string}: annotation (GFF3)" from_work_dir="out.gff3" />
        <data name="fasta_proteins" format="fasta" label="${tool.name} on ${on_string}: protein sequences" from_work_dir="out.proteins.fa" />
        <data name="fasta_transcripts_mrna" format="fasta" label="${tool.name} on ${on_string}: transcript mRNA sequences" from_work_dir="out.mrna-transcripts.fa" />
        <data name="fasta_transcripts_cds" format="fasta" label="${tool.name} on ${on_string}: transcript CDS sequences" from_work_dir="out.cds-transcripts.fa" />
        <data name="tbl2asn_report" format="txt" label="${tool.name} on ${on_string}: tbl2asn summary report of annotated genome" from_work_dir="out.discrepency.report.txt" />
        <data name="tbl2asn_error" format="txt" label="${tool.name} on ${on_string}: tbl2asn error summary report" from_work_dir="out.error.summary.txt" />
        <data name="tbl2asn_validation" format="txt" label="${tool.name} on ${on_string}: tbl2asn genome validation report" from_work_dir="out.validation.txt" />
        <data name="stats" format="json" label="${tool.name} on ${on_string}: stats" from_work_dir="out.stats.json" />

        <!-- TODO some day: provide trained models as output, reusable as input to other funannotate runs (parameters.json file references files with absolute paths, would probably need to create an archive + edit paths in parameters.json) -->
        <!--data name='abinitio' format='json' label="${tool.name} on ${on_string}: ab-initio training parameters" from_work_dir="output/predict_results/*.parameters.json" /-->
    </outputs>
    <tests>
        <!-- training from scratch -->
        <test>
            <param name="input" value="genome_masked.fa" />
            <param name="database" value="2021-07-20-120000" />
            <section name="organism">
                <param name="species" value="Genus species" />
            </section>
            <section name="augustus">
                <param name="min_training_models" value="3" />
            </section>
            <section name="busco">
                <param name="busco_seed_species" value="fly" />
                <param name="busco_db" value="insecta" />
            </section>
            <!-- non deterministic results, so can't be more precise here -->
            <output name="annot_gbk">
                <assert_contents>
                    <has_text text=" TITLE Direct Submission" />
                    <has_text text="/locus_tag=&quot;FUN_000001&quot;" />
                </assert_contents>
            </output>
            <output name="annot_tbl">
                <assert_contents>
                    <has_text text="&gt;Feature sample" />
                    <has_text text="gnl|ncbi|FUN_000001-T1_mrna" />
                </assert_contents>
            </output>
            <output name="annot_gff3">
                <assert_contents>
                    <has_text text="##gff-version 3" />
                    <has_text text="ID=FUN_000001-T1;Parent=FUN_000001;product=hypothetical protein;" />
                </assert_contents>
            </output>
            <output name="fasta_proteins">
                <assert_contents>
                    <has_text text="&gt;FUN_000001-T1 FUN_000001" />
                </assert_contents>
            </output>
            <output name="fasta_transcripts_mrna">
                <assert_contents>
                    <has_text text="&gt;FUN_000001-T1 FUN_000001" />
                </assert_contents>
            </output>
            <output name="fasta_transcripts_cds">
                <assert_contents>
                    <has_text text="&gt;FUN_000001-T1 FUN_000001" />
                </assert_contents>
            </output>
            <!--output name="abinitio" file="predict_scratch/fly.parameters.json" compare="sim_size" /-->
            <output name="tbl2asn_report" file="predict_scratch/Genus_species.discrepency.report.txt" compare="sim_size" />
            <output name="tbl2asn_error" file="predict_scratch/Genus_species.error.summary.txt" compare="sim_size" delta="500" />
            <output name="tbl2asn_validation" file="predict_scratch/Genus_species.validation.txt" compare="sim_size" delta="500" />
            <output name="stats" file="predict_scratch/Genus_species.stats.json" compare="sim_size" />
            <assert_stderr>
                <has_text text="augustus busco"/>
                <has_text text="glimmerhmm busco"/>
                <has_text text="snap busco"/>
                <has_text text="Running BUSCO to find conserved gene models for training ab-initio predictors"/>
                <has_text text="Skipping CodingQuarry as no --rna_bam passed"/>
                <has_text text="Running Augustus gene prediction using genus_species parameters"/>
                <not_has_text text="Aligning transcript evidence to genome with minimap2"/>
                <not_has_text text="Found 16 alignments, wrote GFF3 and Augustus hints to file"/>
                <not_has_text text="Extracting hints from RNA-seq BAM file using bam2hints"/>
                <has_text text="Mapping 13 proteins to genome using diamond and exonerate"/>
                <has_text text="Found 4 preliminary alignments --&gt; aligning with exonerate"/>
            </assert_stderr>
        </test>
        <!-- pre-trained augustus -->
        <test>
            <param name="input" value="genome_masked.fa" />
            <param name="database" value="2021-07-20-120000" />
            <section name="organism">
                <param name="species" value="Genus species" />
            </section>
            <section name="augustus">
                <param name="augustus_species" value="fly" />
            </section>
            <section name="busco">
                <param name="busco_seed_species" value="fly" />
                <param name="busco_db" value="insecta" />
            </section>
            <param name="uglyTestingHack" value="true" />
            <!-- non deterministic results, so can't be more precise here -->
            <output name="annot_gbk">
                <assert_contents>
                    <has_text text=" TITLE Direct Submission" />
                    <has_text text="/locus_tag=&quot;FUN_000001&quot;" />
                </assert_contents>
            </output>
            <output name="annot_tbl">
                <assert_contents>
                    <has_text text="&gt;Feature sample" />
                    <has_text text="gnl|ncbi|FUN_000001-T1_mrna" />
                </assert_contents>
            </output>
            <output name="annot_gff3">
                <assert_contents>
                    <has_text text="##gff-version 3" />
                    <has_text text="ID=FUN_000001-T1;Parent=FUN_000001;product=hypothetical protein;" />
                </assert_contents>
            </output>
            <output name="fasta_proteins">
                <assert_contents>
                    <has_text text="&gt;FUN_000001-T1 FUN_000001" />
                </assert_contents>
            </output>
            <output name="fasta_transcripts_mrna">
                <assert_contents>
                    <has_text text="&gt;FUN_000001-T1 FUN_000001" />
                </assert_contents>
            </output>
            <output name="fasta_transcripts_cds">
                <assert_contents>
                    <has_text text="&gt;FUN_000001-T1 FUN_000001" />
                </assert_contents>
            </output>
            <assert_stderr>
                <has_text text="augustus pretrained"/>
                <has_text text="glimmerhmm busco"/>
                <has_text text="snap busco"/>
                <has_text text="Running BUSCO to find conserved gene models for training ab-initio predictors"/>
                <has_text text="Skipping CodingQuarry as no --rna_bam passed"/>
                <has_text text="Running Augustus gene prediction using fly parameters"/>
                <not_has_text text="Aligning transcript evidence to genome with minimap2"/>
                <not_has_text text="Found 16 alignments, wrote GFF3 and Augustus hints to file"/>
                <not_has_text text="Extracting hints from RNA-seq BAM file using bam2hints"/>
                <has_text text="Mapping 13 proteins to genome using diamond and exonerate"/>
                <has_text text="Found 4 preliminary alignments --&gt; aligning with exonerate"/>
            </assert_stderr>
        </test>
        <!-- bam -->
        <test>
            <param name="input" value="genome_masked.fa" />
            <param name="database" value="2021-07-20-120000" />
            <section name="organism">
                <param name="species" value="Genus species" />
            </section>
            <section name="evidences">
                <param name="rna_bam" value="SRR7458692.bam" />
                <param name="transcript_evidence" value="predict_scratch/Genus_species.mrna-transcripts.fa" />
                <conditional name="prot_evidence">
                    <param name="prot_evidence_source" value="custom" />
                    <param name="protein_evidence" value="predict_scratch/Genus_species.proteins.fa" />
                </conditional>
            </section>
            <section name="augustus">
                <param name="min_training_models" value="3" />
            </section>
            <section name="busco">
                <param name="busco_seed_species" value="fly" />
                <param name="busco_db" value="insecta" />
            </section>
            <!-- non deterministic results, so can't be more precise here -->
            <output name="annot_gbk">
                <assert_contents>
                    <has_text text=" TITLE Direct Submission" />
                    <has_text text="/locus_tag=&quot;FUN_000001&quot;" />
                </assert_contents>
            </output>
            <output name="annot_tbl">
                <assert_contents>
                    <has_text text="&gt;Feature sample" />
                    <has_text text="gnl|ncbi|FUN_000001-T1_mrna" />
                </assert_contents>
            </output>
            <output name="annot_gff3">
                <assert_contents>
                    <has_text text="##gff-version 3" />
                    <has_text text="ID=FUN_000001-T1;Parent=FUN_000001;product=hypothetical protein;" />
                </assert_contents>
            </output>
            <output name="fasta_proteins">
                <assert_contents>
                    <has_text text="&gt;FUN_000001-T1 FUN_000001" />
                </assert_contents>
            </output>
            <output name="fasta_transcripts_mrna">
                <assert_contents>
                    <has_text text="&gt;FUN_000001-T1 FUN_000001" />
                </assert_contents>
            </output>
            <output name="fasta_transcripts_cds">
                <assert_contents>
                    <has_text text="&gt;FUN_000001-T1 FUN_000001" />
                </assert_contents>
            </output>
            <!-- NOTE(review): this test selects custom protein evidence; re-check the protein and alignment counts asserted below against a run that maps the custom protein file (TODO confirm) -->
            <assert_stderr>
                <has_text text="augustus busco"/>
                <has_text text="glimmerhmm busco"/>
                <has_text text="snap busco"/>
                <has_text text="Running BUSCO to find conserved gene models for training ab-initio predictors"/>
                <not_has_text text="Skipping CodingQuarry as no --rna_bam passed"/>
                <has_text text="Running Augustus gene prediction using genus_species parameters"/>
                <has_text text="Training Augustus using BUSCO gene models"/>
                <has_text text="Aligning transcript evidence to genome with minimap2"/>
                <has_text text="Found 16 alignments, wrote GFF3 and Augustus hints to file"/>
                <has_text text="Extracting hints from RNA-seq BAM file using bam2hints"/>
                <has_text text="Mapping 13 proteins to genome using diamond and exonerate"/>
                <has_text text="Found 4 preliminary alignments --&gt; aligning with exonerate"/>
            </assert_stderr>
        </test>
    </tests>
    <help><![CDATA[
Funannotate_ predict
--------------------

Funannotate_ is a pipeline for genome annotation (built specifically for fungi, but will also work with higher eukaryotes).

Script takes genome multi-fasta file and a variety of inputs to do a comprehensive whole genome gene prediction. Uses AUGUSTUS, GeneMark, Snap, GlimmerHMM, BUSCO, EVidence Modeler, tbl2asn, tRNAScan-SE, Exonerate, minimap2.

.. _Funannotate: http://funannotate.readthedocs.io
    ]]></help>
    <expand macro="citations" />
</tool>