diff gstf_preparation.xml @ 4:284f64ad9d43 draft

planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit cda3ecab1a34376cc7d4d392a34dc810847cbf0b-dirty
author earlhaminst
date Fri, 08 Dec 2017 05:32:12 -0500
parents 19644996bc2a
children 56bbdbfe3eaa
line wrap: on
line diff
--- a/gstf_preparation.xml	Fri Nov 24 12:32:39 2017 -0500
+++ b/gstf_preparation.xml	Fri Dec 08 05:32:12 2017 -0500
@@ -1,4 +1,4 @@
-<tool id="gstf_preparation" name="GeneSeqToFamily preparation" version="0.3.0">
+<tool id="gstf_preparation" name="GeneSeqToFamily preparation" version="0.4.0">
     <description>converts data for the workflow</description>
     <command detect_errors="exit_code">
 <![CDATA[
@@ -14,6 +14,12 @@
 #for $fasta_input in $fasta_inputs
     --fasta '${fasta_input}'
 #end for
+#if $headers
+    --headers
+#end if
+#if $longestCDS
+    -l
+#end if
 -o '$output_db'
 --of '$output_fasta'
 ]]>
@@ -28,6 +34,8 @@
         </repeat>
         <param name="json" type="data" format="json" multiple="true" optional="true" label="Gene features in JSON format generated by 'Get features by Ensembl ID' tool" />
         <param name="fasta_inputs" type="data" format="fasta" multiple="true" label="Corresponding FASTA datasets" help="Each FASTA header line should start with a transcript id" />
+        <param name="longestCDS" type="boolean" checked="false" label="Keep only the longest CDS per gene" />
+        <param name="headers" type="boolean" checked="true" label="Change the header line of the FASTA sequences to the &gt;TranscriptId_species format" help="As required by TreeBest, part of the GeneSeqToFamily workflow" />
     </inputs>
 
     <outputs>
@@ -40,12 +48,37 @@
             <param name="fasta_inputs" ftype="fasta" value="Caenorhabditis_elegans.WBcel235.cds.all.shortened.fa" />
             <param name="gff3_input" ftype="gff3" value="Caenorhabditis_elegans.WBcel235.87.chromosome.I.shortened.gff3" />
             <param name="genome" value="caenorhabditis_elegans" />
+            <param name="longestCDS" value="false" />
+            <param name="headers" value="true" />
+
             <output name="output_db" file="test1.sqlite" compare="sim_size" />
             <output name="output_fasta" file="test1.fasta" />
         </test>
         <test>
+            <param name="fasta_inputs" ftype="fasta" value="Caenorhabditis_elegans.WBcel235.cds.all.shortened.fa" />
+            <param name="gff3_input" ftype="gff3" value="Caenorhabditis_elegans.WBcel235.87.chromosome.I.shortened.gff3" />
+            <param name="genome" value="caenorhabditis_elegans" />
+            <param name="longestCDS" value="true" />
+            <param name="headers" value="true" />
+
+            <output name="output_db" file="test1.sqlite" compare="sim_size" />
+            <output name="output_fasta" file="test1_longest.fasta" />
+        </test>
+        <test>
+            <param name="fasta_inputs" ftype="fasta" value="Caenorhabditis_elegans.WBcel235.cds.all.shortened.fa" />
+            <param name="gff3_input" ftype="gff3" value="Caenorhabditis_elegans.WBcel235.87.chromosome.I.shortened.gff3" />
+            <param name="genome" value="caenorhabditis_elegans" />
+            <param name="longestCDS" value="false" />
+            <param name="headers" value="false" />
+
+            <output name="output_db" file="test1.sqlite" compare="sim_size" />
+            <output name="output_fasta" file="Caenorhabditis_elegans.WBcel235.cds.all.shortened.fa" />
+        </test>
+        <test>
             <param name="fasta_inputs" ftype="fasta" value="CDS.fasta" />
             <param name="json" ftype="json" value="gene.json" />
+            <param name="longestCDS" value="false" />
+            <param name="headers" value="true" />
 
             <output name="output_db" file="test2.sqlite" compare="sim_size" />
             <output name="output_fasta" file="test2.fasta" />
@@ -55,7 +88,9 @@
 <![CDATA[
 **What it does**
 
-This tool converts a set of GFF3 and/or JSON gene feature information datasets into SQLite format and modify the header lines of a corresponding CDS FASTA to be used with the GeneSeqToFamily workflow.
+This tool converts a set of GFF3 and/or JSON gene feature information datasets into SQLite format.
+
+It also filters a CDS FASTA dataset to keep only the transcripts present in the gene feature information. Optionally, it can also keep only the longest CDS per gene and/or change the header line of the FASTA sequences to the >TranscriptId_species format (as required by TreeBest, part of the GeneSeqToFamily workflow).
 
 Example GFF3 file::