changeset 0:1b08e39cc52d draft

planemo upload for repository https://github.com/genouest/galaxy-tools/tree/master/tools/helixer commit 8f0b5d30f8f5daea0f6c03293c8593ac24d9e1b7
author genouest
date Wed, 28 Jun 2023 08:39:38 +0000
parents
children 7bc75dd0f782
files helixer.xml macros.xml test-data/ouput_species.gff3 test-data/output.gff3 test-data/sequence.fasta
diffstat 5 files changed, 208 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/helixer.xml	Wed Jun 28 08:39:38 2023 +0000
@@ -0,0 +1,111 @@
+<?xml version="1.0"?>
+<tool id="helixer" name="Helixer" version="@TOOL_VERSION@" profile="21.05">
+    <description>gene calling</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <!-- macros.xml supplies the tool-version token and the requirements, subseq and citation macros; the requirements macro expands to a Docker container entry. -->
+    <requirements>
+        <expand macro="requirements" />
+    </requirements>
+    <!-- Job command: runs the model fetch script by absolute path, then (and-chained) Helixer.py on the input FASTA with the selected lineage, writing GFF3 to the output dataset and using the current directory for temporary files. Overlap offset and core length are passed only when the overlap select is "true"; otherwise the no-overlap flag is passed. The species value is always passed, so an unset species reaches Helixer empty (see the empty species header and underscore-leading IDs in test-data/output.gff3). -->
+    <command detect_errors="exit_code"><![CDATA[
+        ## Not in $PATH in the docker image
+        ## Manage models with a data manager?
+        /usr/local/bin/fetch_helixer_models.py
+
+        &&
+
+        Helixer.py
+        --fasta-path '$input'
+        --species '$species'
+        --lineage $lineage.lineages
+        --gff-output-path '$output'
+
+        --temporary-dir ./
+
+        --subsequence-length $lineage.subsequence_length
+        #if str($lineage.option_overlap.use_overlap) == "true":
+            --overlap-offset $lineage.option_overlap.overlap_offset
+            --overlap-core-length $lineage.option_overlap.overlap_core_length
+        #else:
+            --no-overlap
+        #end if
+
+        --window-size $post_processing.window_size
+        --min-coding-length $post_processing.min_coding_length
+        --edge-threshold $post_processing.edge_threshold
+        --peak-threshold $post_processing.peak_threshold
+    ]]></command>
+
+    <inputs>
+        <!-- Genome FASTA, lineage model (each lineage expands the "subseq" macro with its own default lengths), optional species name, and post-processing thresholds. -->
+        <param argument="--fasta-path" name="input" type="data" format="fasta,fasta.gz" label="Genomic sequence"/>
+        <conditional name="lineage">
+            <param argument="--lineage" name="lineages" type="select" label="Available lineages" help="Choose the model to use for the annotation">
+                <option value="land_plant">land plant</option>
+                <option value="vertebrate">vertebrate</option>
+                <option value="invertebrate">invertebrate</option>
+                <option value="fungi">fungi</option>
+            </param>
+            <when value="land_plant">
+                <expand macro="subseq" length="106920" offset="53460" offsetlen="80190" />
+            </when>
+            <when value="vertebrate">
+                <expand macro="subseq" length="213840" offset="106920" offsetlen="160380" />
+            </when>
+            <when value="invertebrate">
+                <expand macro="subseq" length="213840" offset="106920" offsetlen="160380" />
+            </when>
+            <when value="fungi">
+                <expand macro="subseq" length="21384" offset="10692" offsetlen="16038" />
+            </when>
+        </conditional>
+        <param argument="--species" type="text" optional="true" label="Species name" help="Optional. Only letters, digits and underscores are accepted. The name is written to the species header of the GFF3 output and prefixes the predicted gene IDs.">
+            <sanitizer invalid_char="">
+                <valid initial="string.letters,string.digits"><add value="_" /></valid>
+            </sanitizer>
+            <!-- Regex validators only anchor at the start of the value; the explicit ^ and $ make the whole value match, so invalid input is rejected with a message instead of being silently stripped by the sanitizer. -->
+            <validator type="regex" message="Only letters, digits and underscores are allowed">^[0-9a-zA-Z_]+$</validator>
+        </param>
+
+        <section name="post_processing" title="Post-processing">
+            <param argument="--window-size" type="integer" min="0" value="100" label="Window size" help="This determines the number of bases averaged during the sliding window approach"/>
+            <param argument="--edge-threshold" type="float" min="0" max="1" value="0.1" label="Edge threshold" help="This threshold specifies the genic score which defines the start / end boundaries of each candidate region"/>
+            <param argument="--peak-threshold" type="float" min="0" max="1" value="0.8" label="Peak threshold" help="This threshold specifies the minimum peak genic score required to accept the candidate region"/>
+            <param argument="--min-coding-length" type="integer" min="0" value="100" label="Minimum coding length"/>
+        </section>
+    </inputs>
+
+    <outputs>
+        <!-- GFF3 file that the command passes to Helixer as its gff-output-path -->
+        <data name="output" format="gff3"/>
+    </outputs>
+    <tests>
+        <!-- land_plant model, species left unset; lines_diff presumably absorbs the header line holding a per-run temporary model path -->
+        <test expect_num_outputs="1">
+            <conditional name="lineage">
+                <param name="lineages" value="land_plant"/>
+            </conditional>
+            <param name="input" value="sequence.fasta"/>
+            <output name="output" ftype="gff3" value="output.gff3" lines_diff="2"/>
+        </test>
+        <!-- land_plant model with an explicit species name -->
+        <test expect_num_outputs="1">
+            <conditional name="lineage">
+                <param name="lineages" value="land_plant"/>
+            </conditional>
+            <param name="input" value="sequence.fasta"/>
+            <param name="species" value="Arabidopsis"/>
+            <output name="output" ftype="gff3" value="ouput_species.gff3" lines_diff="2"/>
+        </test>
+    </tests>
+
+
+    <help><![CDATA[
+        Helixer_: Gene calling with Deep Neural Networks.
+
+        .. _Helixer: https://github.com/weberlab-hhu/Helixer
+    ]]></help>
+    <expand macro="citation"></expand>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml	Wed Jun 28 08:39:38 2023 +0000
@@ -0,0 +1,31 @@
+<macros>
+    <token name="@TOOL_VERSION@">0.3.1</token>
+    <!-- Citations by DOI; helixer.xml expands this macro at the end of the tool definition. -->
+    <xml name="citation">
+        <citations>
+            <citation type="doi">10.1101/2023.02.06.527280</citation>
+            <citation type="doi">10.1093/bioinformatics/btaa1044</citation>
+        </citations>
+    </xml>
+    <!-- Docker container for the tool; the image tag embeds the tool-version token, so changing the token also changes the image that is requested. -->
+    <xml name="requirements">
+        <container type="docker">gglyptodon/helixer-docker:helixer_v@TOOL_VERSION@_cuda_11.2.0-cudnn8</container>
+    </xml>
+    <!-- Subsequence and overlap settings, expanded once per lineage. Tokens: length = default subsequence length, offset = default overlap offset, offsetlen = default overlap core length. NOTE(review): max (213840) equals the vertebrate/invertebrate default, so those lineages cannot be raised although the help suggests increasing it; confirm whether this cap is a Helixer limit. -->
+    <xml name="subseq" tokens="length,offset,offsetlen">
+        <param argument="--subsequence-length" type="integer" min="0" max="213840" value="@LENGTH@" label="Subsequence length: how much of the genome the Neural Network can see at once" help="Should ideally be comfortably longer than the typical gene. For genomes with large genes (>20 kbp) it is recommended to increase this parameter."/>
+        <conditional name="option_overlap">
+            <param name="use_overlap" type="select" label="Enable overlapping step after predictions" help="This step combines predictions made on each subsequence, to improve quality near start and end of subsequences">
+                <option value="true" selected="true">Yes</option>
+                <option value="false">No</option>
+            </param>
+            <when value="true">
+                <param argument="--overlap-offset" type="integer" min="0" value="@OFFSET@" label="Overlap offset" help="Smaller values may lead to better predictions but will take longer. The subsequence length should be evenly divisible by this value."/>
+                <param argument="--overlap-core-length" type="integer" min="0" value="@OFFSETLEN@" label="Overlap core length" help="Predicted subsequences will be cut to this length to increase prediction quality. Smaller values may lead to better predictions but will take longer. Has to be smaller than subsequence_length."/>
+            </when>
+            <when value="false"/>
+        </conditional>
+    </xml>
+
+</macros>
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/ouput_species.gff3	Wed Jun 28 08:39:38 2023 +0000
@@ -0,0 +1,17 @@
+##gff-version 3.2.1
+##species Arabidopsis
+# f0e00efcbea83c66b69258d11119a691  /tmp/tmpg9ws2pgr/job_working_directory/000/4/home/.local/share/Helixer/models/land_plant/land_plant_v0.3_a_0080.h5
+##sequence-region NC_034365.1:c72045-70009 1 2037
+NC_034365.1:c72045-70009	Helixer	gene	1035	1269	.	+	.	ID=Arabidopsis_NC_034365.1:c72045-70009_000001
+NC_034365.1:c72045-70009	Helixer	mRNA	1035	1269	.	+	.	ID=Arabidopsis_NC_034365.1:c72045-70009_000001.1;Parent=Arabidopsis_NC_034365.1:c72045-70009_000001
+NC_034365.1:c72045-70009	Helixer	exon	1035	1269	.	+	.	ID=Arabidopsis_NC_034365.1:c72045-70009_000001.1.exon.1;Parent=Arabidopsis_NC_034365.1:c72045-70009_000001.1
+NC_034365.1:c72045-70009	Helixer	five_prime_UTR	1035	1035	.	+	.	ID=Arabidopsis_NC_034365.1:c72045-70009_000001.1.five_prime_UTR.1;Parent=Arabidopsis_NC_034365.1:c72045-70009_000001.1
+NC_034365.1:c72045-70009	Helixer	CDS	1036	1260	.	+	0	ID=Arabidopsis_NC_034365.1:c72045-70009_000001.1.CDS.1;Parent=Arabidopsis_NC_034365.1:c72045-70009_000001.1
+NC_034365.1:c72045-70009	Helixer	three_prime_UTR	1261	1269	.	+	.	ID=Arabidopsis_NC_034365.1:c72045-70009_000001.1.three_prime_UTR.1;Parent=Arabidopsis_NC_034365.1:c72045-70009_000001.1
+NC_034365.1:c72045-70009	Helixer	gene	1603	2037	.	+	.	ID=Arabidopsis_NC_034365.1:c72045-70009_000002
+NC_034365.1:c72045-70009	Helixer	mRNA	1603	2037	.	+	.	ID=Arabidopsis_NC_034365.1:c72045-70009_000002.1;Parent=Arabidopsis_NC_034365.1:c72045-70009_000002
+NC_034365.1:c72045-70009	Helixer	exon	1603	1725	.	+	.	ID=Arabidopsis_NC_034365.1:c72045-70009_000002.1.exon.1;Parent=Arabidopsis_NC_034365.1:c72045-70009_000002.1
+NC_034365.1:c72045-70009	Helixer	five_prime_UTR	1603	1606	.	+	.	ID=Arabidopsis_NC_034365.1:c72045-70009_000002.1.five_prime_UTR.1;Parent=Arabidopsis_NC_034365.1:c72045-70009_000002.1
+NC_034365.1:c72045-70009	Helixer	CDS	1607	1725	.	+	0	ID=Arabidopsis_NC_034365.1:c72045-70009_000002.1.CDS.1;Parent=Arabidopsis_NC_034365.1:c72045-70009_000002.1
+NC_034365.1:c72045-70009	Helixer	exon	1812	2037	.	+	.	ID=Arabidopsis_NC_034365.1:c72045-70009_000002.1.exon.2;Parent=Arabidopsis_NC_034365.1:c72045-70009_000002.1
+NC_034365.1:c72045-70009	Helixer	CDS	1812	2037	.	+	1	ID=Arabidopsis_NC_034365.1:c72045-70009_000002.1.CDS.2;Parent=Arabidopsis_NC_034365.1:c72045-70009_000002.1
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/output.gff3	Wed Jun 28 08:39:38 2023 +0000
@@ -0,0 +1,17 @@
+##gff-version 3.2.1
+##species 
+# f0e00efcbea83c66b69258d11119a691  /tmp/tmpyabgcmro/job_working_directory/000/2/home/.local/share/Helixer/models/land_plant/land_plant_v0.3_a_0080.h5
+##sequence-region NC_034365.1:c72045-70009 1 2037
+NC_034365.1:c72045-70009	Helixer	gene	1035	1269	.	+	.	ID=_NC_034365.1:c72045-70009_000001
+NC_034365.1:c72045-70009	Helixer	mRNA	1035	1269	.	+	.	ID=_NC_034365.1:c72045-70009_000001.1;Parent=_NC_034365.1:c72045-70009_000001
+NC_034365.1:c72045-70009	Helixer	exon	1035	1269	.	+	.	ID=_NC_034365.1:c72045-70009_000001.1.exon.1;Parent=_NC_034365.1:c72045-70009_000001.1
+NC_034365.1:c72045-70009	Helixer	five_prime_UTR	1035	1035	.	+	.	ID=_NC_034365.1:c72045-70009_000001.1.five_prime_UTR.1;Parent=_NC_034365.1:c72045-70009_000001.1
+NC_034365.1:c72045-70009	Helixer	CDS	1036	1260	.	+	0	ID=_NC_034365.1:c72045-70009_000001.1.CDS.1;Parent=_NC_034365.1:c72045-70009_000001.1
+NC_034365.1:c72045-70009	Helixer	three_prime_UTR	1261	1269	.	+	.	ID=_NC_034365.1:c72045-70009_000001.1.three_prime_UTR.1;Parent=_NC_034365.1:c72045-70009_000001.1
+NC_034365.1:c72045-70009	Helixer	gene	1603	2037	.	+	.	ID=_NC_034365.1:c72045-70009_000002
+NC_034365.1:c72045-70009	Helixer	mRNA	1603	2037	.	+	.	ID=_NC_034365.1:c72045-70009_000002.1;Parent=_NC_034365.1:c72045-70009_000002
+NC_034365.1:c72045-70009	Helixer	exon	1603	1725	.	+	.	ID=_NC_034365.1:c72045-70009_000002.1.exon.1;Parent=_NC_034365.1:c72045-70009_000002.1
+NC_034365.1:c72045-70009	Helixer	five_prime_UTR	1603	1606	.	+	.	ID=_NC_034365.1:c72045-70009_000002.1.five_prime_UTR.1;Parent=_NC_034365.1:c72045-70009_000002.1
+NC_034365.1:c72045-70009	Helixer	CDS	1607	1725	.	+	0	ID=_NC_034365.1:c72045-70009_000002.1.CDS.1;Parent=_NC_034365.1:c72045-70009_000002.1
+NC_034365.1:c72045-70009	Helixer	exon	1812	2037	.	+	.	ID=_NC_034365.1:c72045-70009_000002.1.exon.2;Parent=_NC_034365.1:c72045-70009_000002.1
+NC_034365.1:c72045-70009	Helixer	CDS	1812	2037	.	+	1	ID=_NC_034365.1:c72045-70009_000002.1.CDS.2;Parent=_NC_034365.1:c72045-70009_000002.1
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sequence.fasta	Wed Jun 28 08:39:38 2023 +0000
@@ -0,0 +1,32 @@
+>NC_034365.1:c72045-70009 Arabidopsis lyrata chloroplast, complete genome
+ATGCCTATTGGCGTTCCAAAAGTACCTTTTCGAAGTCCTGGAGAAGGAGATACATCTTGGGTTGACATAT
+AGTGCGACTTGTCAGATATATCGGGCTATATGGGATTTCCCCATTTTCTCCCTCGATCGAGATATCCTCT
+GTTTCGCTCAAAAAGATTGATTGAATTTTCAAAAATTTGTAACGCGAAGCGCAAAAAATAAAGTGGAATT
+AAATGCAGGGAGTATTTAAATTGATATTAGATTATCAAAATTTTTTTATGGTTTACGTGATCGGACTAAT
+AACATGAAGTATCCAGGCTCCGTTTAGCAAAAACCCAATTGATAATTAATATATAATATTTTTATAATTA
+CTACTTATATGATATGCAAAATTATGATAAAAATTCTATAAAAAAAATTGAAACAGGGCGATCTAAAACT
+GTTGCTCAAATCAAATAATATAATTCAAAATTTGTTGACTATTTGACAGAAGACATGTGAAAGAAATGAA
+TTGAAAAAAAAAAGAAGGAGTAGTAAAAAAAGACGCGGTATTTGGTTCATTTGTCCTATATGTGCAAATC
+AAAATCGGGCAAATTTTTCCTTTTACTCGGGGTAGAGCATAAACCTAAAAAATGGAATAAAAAAAGGAAG
+AAGCCCGTTCAGGAACAAGAAAAAACCATCGCCATTTCAACTGAATCTCGATGAAAAAAAAATATCAATG
+AATCAAATGAGTCTGACAGGGTTAATCAATCGTTTTATTAGAGGATTATAATATCTAATACAAGGTACAA
+GGCACCAAAACTAAAATTTTCTTTTACTTTTGGGAGCAAGCCCTTCCGTTATAAGTTAGAAAGAAAAACC
+CTCTATGAAAAAAGGGGAGGTTGGAATCGACACATTGATTTTTTCACAATTTTTGTTGAACCGTATGCAC
+CAAAAGGTGCCTGTACGGCTCCTAAGGAATAAAAATTTATCCTAATCAACCGACTTTATCGAGAAAGATT
+ATTTTTTTTAGGCCAAGAGGTTGATACCGAAATCTCGAATCAACTTATTAGTCTTATGATATATCTCAGT
+ATAGAAAAGGATACCAAAGATCTTTATTTGTTTATAAACTCTCCTGGTGGATGGGTAATATCTGGAATGG
+CTATTTATGATACTATGCAATTTGTGCGACCCGATGTACAGACAATATGCATGGGATTGGCCGCTTCAAT
+AGCATCCTTTATCCTGGTCGGAGGAGCAATTACCAAACGTATAGCATTCCCTCACGCTTGGCGCCAATGA
+GTTTTTTTTTCGAGAAAAAAATACTATGCCTTCGCCATCGTAAATATGAATTAGTTAAGTAATAATAGCA
+TGGCACTTCGAATTCAATATGACATTTTTTAGATTAAAAAAAAAATTCGATTATATATTGAAAGAGTAGT
+ATGAGATAAGGAAGAGGTTTTTCAAATGATATCTTACCTATTCGGGCACATTTCAGCGTCACAAACTTTG
+TTTTCACACCGTAAAAAAAAAAAAAAACACTTTGGGATTGCTGAATCATAGACGAATCAAAAAAATGATA
+TATAAAGCAACGGAACCATCATAGTATTTTTTTAACTCCTACAAAAAAAAGAAGGATGGTAATTGGATGA
+TTTCTGGATCTGCGTATAATACAACCTATATTTTGTTTTCTTTCGCCAAAAGGAAAGGTCAAAAAAGTCA
+ATTCCATTGTGGAGCCGTATGCAATGCACAAAAAAAGCCTGTACGGTTATTCAATTTTATCTGTTTTTTT
+TTGTTTTGGTTATCCCGTCTCATTCTGCGAAATAGAAAAACCTTTTCTATTATATCATCAGGGTAATGAT
+CCATCAACCCGCTAGCTCGTTTTATGAGGCACAAACGGGAGAATTTATCTTGGAAGCGGAAGAATTACTA
+AAACTTCGCGAAACCATCACAAGGGTTTATGTACAAAGAACGGGCAAACCTATATGGGTTGTATCCGAAG
+ACATGGAAAGGGATGTTTTTATGTCAGCAACAGAAGCCCAAGCTCATGGAATTGTTGATCTTGTAGCGGT
+TCAATAA
+