Mercurial > repos > genouest > helixer
changeset 0:1b08e39cc52d draft
planemo upload for repository https://github.com/genouest/galaxy-tools/tree/master/tools/helixer commit 8f0b5d30f8f5daea0f6c03293c8593ac24d9e1b7
author | genouest |
---|---|
date | Wed, 28 Jun 2023 08:39:38 +0000 |
parents | |
children | 7bc75dd0f782 |
files | helixer.xml macros.xml test-data/ouput_species.gff3 test-data/output.gff3 test-data/sequence.fasta |
diffstat | 5 files changed, 208 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/helixer.xml Wed Jun 28 08:39:38 2023 +0000
@@ -0,0 +1,111 @@
+<?xml version="1.0"?>
+<tool id="helixer" name="Helixer" version="@TOOL_VERSION@" profile="21.05">
+    <description>gene calling</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+
+    <requirements>
+        <expand macro="requirements" />
+    </requirements>
+    <!-- Runs fetch_helixer_models.py first, then Helixer.py with the options chosen in the form -->
+    <command detect_errors="exit_code"><![CDATA[
+        ## Not in $PATH in the docker image
+        ## Manage models with a data manager?
+        /usr/local/bin/fetch_helixer_models.py
+
+        &&
+
+        Helixer.py
+        --fasta-path '$input'
+        --species '$species'
+        --lineage $lineage.lineages
+        --gff-output-path '$output'
+        ## Temporary files are kept in the current directory
+        --temporary-dir ./
+        ## Settings nested under the selected lineage (parameters come from the subseq macro)
+        --subsequence-length $lineage.subsequence_length
+        #if str($lineage.option_overlap.use_overlap) == "true":
+            --overlap-offset $lineage.option_overlap.overlap_offset
+            --overlap-core-length $lineage.option_overlap.overlap_core_length
+        #else:
+            --no-overlap
+        #end if
+        ## Values from the post_processing section of the form
+        --window-size $post_processing.window_size
+        --min-coding-length $post_processing.min_coding_length
+        --edge-threshold $post_processing.edge_threshold
+        --peak-threshold $post_processing.peak_threshold
+    ]]></command>
+    <!-- Inputs: genome FASTA, lineage with its nested settings, optional species name, post-processing section -->
+    <inputs>
+        <param argument="--fasta-path" name="input" type="data" format="fasta,fasta.gz" label="Genomic sequence"></param>
+        <conditional name="lineage">
+            <param argument="--lineage" name="lineages" type="select" label="Available lineages" help="Choose the model to use for the annotation">
+                <option value="land_plant">land plant</option>
+                <option value="vertebrate">vertebrate</option>
+                <option value="invertebrate">invertebrate</option>
+                <option value="fungi">fungi</option>
+            </param>
+            <when value="land_plant">
+                <expand macro="subseq" length="106920" offset="53460" offsetlen="80190" />
+            </when>
+            <when value="vertebrate">
+                <expand macro="subseq" length="213840" offset="106920" offsetlen="160380" />
+            </when>
+            <when value="invertebrate">
+                <expand macro="subseq" length="213840" offset="106920" offsetlen="160380" />
+            </when>
+            <when value="fungi">
+                <expand macro="subseq" length="21384" offset="10692" offsetlen="16038" />
+            </when>
+        </conditional>
+        <param argument="--species" type="text" optional="true" label="Species name">
+            <sanitizer invalid_char="">
+                <valid initial="string.letters,string.digits">
+                    <add value="_" />
+                </valid>
+            </sanitizer>
+            <validator type="regex" message="Species name may only contain letters, digits and underscores">^[0-9a-zA-Z_]+$</validator>
+        </param>
+        <!-- Each value below is passed to the Helixer.py option of the same name -->
+        <section name="post_processing" title="Post-processing">
+            <param argument="--window-size" type="integer" min="0" value="100" label="Window size" help="This determines the number of bases averaged during the sliding window approach"/>
+            <param argument="--edge-threshold" type="float" min="0" max="1" value="0.1" label="Edge threshold" help="This threshold specifies the genic score which defines the start / end boundaries of each candidate region"/>
+            <param argument="--peak-threshold" type="float" min="0" max="1" value="0.8" label="Peak threshold" help="This threshold specifies the minimum peak genic score required to accept the candidate region"/>
+            <param argument="--min-coding-length" type="integer" min="0" value="100" label="Minimum coding length"/>
+        </section>
+    </inputs>
+    <!-- Single GFF3 dataset; its path is handed to Helixer.py as the gff-output-path option -->
+    <outputs>
+        <data name="output" format="gff3">
+        </data>
+    </outputs>
+    <tests>
+        <!-- Test for land_plant -->
+        <test expect_num_outputs="1">
+            <param name="input" value="sequence.fasta"/>
+            <conditional name="lineage">
+                <param name="lineages" value="land_plant"/>
+            </conditional>
+            <output name="output" value="output.gff3" ftype="gff3" lines_diff="2" />
+        </test>
+        <test expect_num_outputs="1">
+            <!-- Test for species -->
+            <param name="input" value="sequence.fasta"/>
+            <param name="species" value="Arabidopsis"/>
+            <conditional name="lineage">
+                <param name="lineages" value="land_plant"/>
+            </conditional>
+            <output name="output" value="ouput_species.gff3" ftype="gff3" lines_diff="2" />
+        </test>
+    </tests>
+
+
+    <help><![CDATA[
+        Helixer_: Gene calling with Deep Neural Networks.
+
+        .. _Helixer: https://github.com/weberlab-hhu/Helixer
+    ]]></help>
+    <expand macro="citation"></expand>
+</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml Wed Jun 28 08:39:38 2023 +0000
@@ -0,0 +1,31 @@
+<macros>
+    <token name="@TOOL_VERSION@">0.3.1</token>
+    <!-- DOIs cited by the tool -->
+    <xml name="citation">
+        <citations>
+            <citation type="doi">10.1101/2023.02.06.527280</citation>
+            <citation type="doi">10.1093/bioinformatics/btaa1044</citation>
+        </citations>
+    </xml>
+    <!-- Docker image used to run the tool; its tag embeds the tool version token -->
+    <xml name="requirements">
+        <container type="docker">gglyptodon/helixer-docker:helixer_v@TOOL_VERSION@_cuda_11.2.0-cudnn8</container>
+    </xml>
+    <!-- Subsequence and overlap parameters; the three tokens supply their default values -->
+    <xml name="subseq" tokens="length,offset,offsetlen">
+        <param argument="--subsequence-length" type="integer" min="0" max="213840" value="@LENGTH@" label="Subsequence length: how much of the genome the Neural Network can see at once" help="Should ideally be comfortably longer than the typical gene. For genomes with large genes (>20kbp) it is recommended to increase this parameter."></param>
+        <conditional name="option_overlap">
+            <param name="use_overlap" type="select" label="Enable overlapping step after predictions" help="This step combines predictions made on each subsequence, to improve quality near start and end of subsequences">
+                <option value="true" selected="true">Yes</option>
+                <option value="false">No</option>
+            </param>
+            <when value="true">
+                <param argument="--overlap-offset" type="integer" min="1" value="@OFFSET@" label="Overlap offset" help="Smaller values may lead to better predictions but will take longer. The subsequence length should be evenly divisible by this value."/>
+                <param argument="--overlap-core-length" type="integer" min="0" value="@OFFSETLEN@" label="Overlap core length" help="Predicted subsequences will be cut to this length to increase prediction quality. Smaller values may lead to better predictions but will take longer. Has to be smaller than subsequence_length."/>
+            </when>
+            <when value="false"/>
+        </conditional>
+    </xml>
+
+</macros>
+
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/ouput_species.gff3 Wed Jun 28 08:39:38 2023 +0000 @@ -0,0 +1,17 @@ +##gff-version 3.2.1 +##species Arabidopsis +# f0e00efcbea83c66b69258d11119a691 /tmp/tmpg9ws2pgr/job_working_directory/000/4/home/.local/share/Helixer/models/land_plant/land_plant_v0.3_a_0080.h5 +##sequence-region NC_034365.1:c72045-70009 1 2037 +NC_034365.1:c72045-70009 Helixer gene 1035 1269 . + . ID=Arabidopsis_NC_034365.1:c72045-70009_000001 +NC_034365.1:c72045-70009 Helixer mRNA 1035 1269 . + . ID=Arabidopsis_NC_034365.1:c72045-70009_000001.1;Parent=Arabidopsis_NC_034365.1:c72045-70009_000001 +NC_034365.1:c72045-70009 Helixer exon 1035 1269 . + . ID=Arabidopsis_NC_034365.1:c72045-70009_000001.1.exon.1;Parent=Arabidopsis_NC_034365.1:c72045-70009_000001.1 +NC_034365.1:c72045-70009 Helixer five_prime_UTR 1035 1035 . + . ID=Arabidopsis_NC_034365.1:c72045-70009_000001.1.five_prime_UTR.1;Parent=Arabidopsis_NC_034365.1:c72045-70009_000001.1 +NC_034365.1:c72045-70009 Helixer CDS 1036 1260 . + 0 ID=Arabidopsis_NC_034365.1:c72045-70009_000001.1.CDS.1;Parent=Arabidopsis_NC_034365.1:c72045-70009_000001.1 +NC_034365.1:c72045-70009 Helixer three_prime_UTR 1261 1269 . + . ID=Arabidopsis_NC_034365.1:c72045-70009_000001.1.three_prime_UTR.1;Parent=Arabidopsis_NC_034365.1:c72045-70009_000001.1 +NC_034365.1:c72045-70009 Helixer gene 1603 2037 . + . ID=Arabidopsis_NC_034365.1:c72045-70009_000002 +NC_034365.1:c72045-70009 Helixer mRNA 1603 2037 . + . ID=Arabidopsis_NC_034365.1:c72045-70009_000002.1;Parent=Arabidopsis_NC_034365.1:c72045-70009_000002 +NC_034365.1:c72045-70009 Helixer exon 1603 1725 . + . ID=Arabidopsis_NC_034365.1:c72045-70009_000002.1.exon.1;Parent=Arabidopsis_NC_034365.1:c72045-70009_000002.1 +NC_034365.1:c72045-70009 Helixer five_prime_UTR 1603 1606 . + . ID=Arabidopsis_NC_034365.1:c72045-70009_000002.1.five_prime_UTR.1;Parent=Arabidopsis_NC_034365.1:c72045-70009_000002.1 +NC_034365.1:c72045-70009 Helixer CDS 1607 1725 . 
+ 0 ID=Arabidopsis_NC_034365.1:c72045-70009_000002.1.CDS.1;Parent=Arabidopsis_NC_034365.1:c72045-70009_000002.1 +NC_034365.1:c72045-70009 Helixer exon 1812 2037 . + . ID=Arabidopsis_NC_034365.1:c72045-70009_000002.1.exon.2;Parent=Arabidopsis_NC_034365.1:c72045-70009_000002.1 +NC_034365.1:c72045-70009 Helixer CDS 1812 2037 . + 1 ID=Arabidopsis_NC_034365.1:c72045-70009_000002.1.CDS.2;Parent=Arabidopsis_NC_034365.1:c72045-70009_000002.1
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/output.gff3 Wed Jun 28 08:39:38 2023 +0000 @@ -0,0 +1,17 @@ +##gff-version 3.2.1 +##species +# f0e00efcbea83c66b69258d11119a691 /tmp/tmpyabgcmro/job_working_directory/000/2/home/.local/share/Helixer/models/land_plant/land_plant_v0.3_a_0080.h5 +##sequence-region NC_034365.1:c72045-70009 1 2037 +NC_034365.1:c72045-70009 Helixer gene 1035 1269 . + . ID=_NC_034365.1:c72045-70009_000001 +NC_034365.1:c72045-70009 Helixer mRNA 1035 1269 . + . ID=_NC_034365.1:c72045-70009_000001.1;Parent=_NC_034365.1:c72045-70009_000001 +NC_034365.1:c72045-70009 Helixer exon 1035 1269 . + . ID=_NC_034365.1:c72045-70009_000001.1.exon.1;Parent=_NC_034365.1:c72045-70009_000001.1 +NC_034365.1:c72045-70009 Helixer five_prime_UTR 1035 1035 . + . ID=_NC_034365.1:c72045-70009_000001.1.five_prime_UTR.1;Parent=_NC_034365.1:c72045-70009_000001.1 +NC_034365.1:c72045-70009 Helixer CDS 1036 1260 . + 0 ID=_NC_034365.1:c72045-70009_000001.1.CDS.1;Parent=_NC_034365.1:c72045-70009_000001.1 +NC_034365.1:c72045-70009 Helixer three_prime_UTR 1261 1269 . + . ID=_NC_034365.1:c72045-70009_000001.1.three_prime_UTR.1;Parent=_NC_034365.1:c72045-70009_000001.1 +NC_034365.1:c72045-70009 Helixer gene 1603 2037 . + . ID=_NC_034365.1:c72045-70009_000002 +NC_034365.1:c72045-70009 Helixer mRNA 1603 2037 . + . ID=_NC_034365.1:c72045-70009_000002.1;Parent=_NC_034365.1:c72045-70009_000002 +NC_034365.1:c72045-70009 Helixer exon 1603 1725 . + . ID=_NC_034365.1:c72045-70009_000002.1.exon.1;Parent=_NC_034365.1:c72045-70009_000002.1 +NC_034365.1:c72045-70009 Helixer five_prime_UTR 1603 1606 . + . ID=_NC_034365.1:c72045-70009_000002.1.five_prime_UTR.1;Parent=_NC_034365.1:c72045-70009_000002.1 +NC_034365.1:c72045-70009 Helixer CDS 1607 1725 . + 0 ID=_NC_034365.1:c72045-70009_000002.1.CDS.1;Parent=_NC_034365.1:c72045-70009_000002.1 +NC_034365.1:c72045-70009 Helixer exon 1812 2037 . + . 
ID=_NC_034365.1:c72045-70009_000002.1.exon.2;Parent=_NC_034365.1:c72045-70009_000002.1 +NC_034365.1:c72045-70009 Helixer CDS 1812 2037 . + 1 ID=_NC_034365.1:c72045-70009_000002.1.CDS.2;Parent=_NC_034365.1:c72045-70009_000002.1
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/sequence.fasta Wed Jun 28 08:39:38 2023 +0000 @@ -0,0 +1,32 @@ +>NC_034365.1:c72045-70009 Arabidopsis lyrata chloroplast, complete genome +ATGCCTATTGGCGTTCCAAAAGTACCTTTTCGAAGTCCTGGAGAAGGAGATACATCTTGGGTTGACATAT +AGTGCGACTTGTCAGATATATCGGGCTATATGGGATTTCCCCATTTTCTCCCTCGATCGAGATATCCTCT +GTTTCGCTCAAAAAGATTGATTGAATTTTCAAAAATTTGTAACGCGAAGCGCAAAAAATAAAGTGGAATT +AAATGCAGGGAGTATTTAAATTGATATTAGATTATCAAAATTTTTTTATGGTTTACGTGATCGGACTAAT +AACATGAAGTATCCAGGCTCCGTTTAGCAAAAACCCAATTGATAATTAATATATAATATTTTTATAATTA +CTACTTATATGATATGCAAAATTATGATAAAAATTCTATAAAAAAAATTGAAACAGGGCGATCTAAAACT +GTTGCTCAAATCAAATAATATAATTCAAAATTTGTTGACTATTTGACAGAAGACATGTGAAAGAAATGAA +TTGAAAAAAAAAAGAAGGAGTAGTAAAAAAAGACGCGGTATTTGGTTCATTTGTCCTATATGTGCAAATC +AAAATCGGGCAAATTTTTCCTTTTACTCGGGGTAGAGCATAAACCTAAAAAATGGAATAAAAAAAGGAAG +AAGCCCGTTCAGGAACAAGAAAAAACCATCGCCATTTCAACTGAATCTCGATGAAAAAAAAATATCAATG +AATCAAATGAGTCTGACAGGGTTAATCAATCGTTTTATTAGAGGATTATAATATCTAATACAAGGTACAA +GGCACCAAAACTAAAATTTTCTTTTACTTTTGGGAGCAAGCCCTTCCGTTATAAGTTAGAAAGAAAAACC +CTCTATGAAAAAAGGGGAGGTTGGAATCGACACATTGATTTTTTCACAATTTTTGTTGAACCGTATGCAC +CAAAAGGTGCCTGTACGGCTCCTAAGGAATAAAAATTTATCCTAATCAACCGACTTTATCGAGAAAGATT +ATTTTTTTTAGGCCAAGAGGTTGATACCGAAATCTCGAATCAACTTATTAGTCTTATGATATATCTCAGT +ATAGAAAAGGATACCAAAGATCTTTATTTGTTTATAAACTCTCCTGGTGGATGGGTAATATCTGGAATGG +CTATTTATGATACTATGCAATTTGTGCGACCCGATGTACAGACAATATGCATGGGATTGGCCGCTTCAAT +AGCATCCTTTATCCTGGTCGGAGGAGCAATTACCAAACGTATAGCATTCCCTCACGCTTGGCGCCAATGA +GTTTTTTTTTCGAGAAAAAAATACTATGCCTTCGCCATCGTAAATATGAATTAGTTAAGTAATAATAGCA +TGGCACTTCGAATTCAATATGACATTTTTTAGATTAAAAAAAAAATTCGATTATATATTGAAAGAGTAGT +ATGAGATAAGGAAGAGGTTTTTCAAATGATATCTTACCTATTCGGGCACATTTCAGCGTCACAAACTTTG +TTTTCACACCGTAAAAAAAAAAAAAAACACTTTGGGATTGCTGAATCATAGACGAATCAAAAAAATGATA +TATAAAGCAACGGAACCATCATAGTATTTTTTTAACTCCTACAAAAAAAAGAAGGATGGTAATTGGATGA +TTTCTGGATCTGCGTATAATACAACCTATATTTTGTTTTCTTTCGCCAAAAGGAAAGGTCAAAAAAGTCA +ATTCCATTGTGGAGCCGTATGCAATGCACAAAAAAAGCCTGTACGGTTATTCAATTTTATCTGTTTTTTT 
+TTGTTTTGGTTATCCCGTCTCATTCTGCGAAATAGAAAAACCTTTTCTATTATATCATCAGGGTAATGAT +CCATCAACCCGCTAGCTCGTTTTATGAGGCACAAACGGGAGAATTTATCTTGGAAGCGGAAGAATTACTA +AAACTTCGCGAAACCATCACAAGGGTTTATGTACAAAGAACGGGCAAACCTATATGGGTTGTATCCGAAG +ACATGGAAAGGGATGTTTTTATGTCAGCAACAGAAGCCCAAGCTCATGGAATTGTTGATCTTGTAGCGGT +TCAATAA +