diff gecco.xml @ 3:359232b58f6a draft

"Update Galaxy tool wrapper to follow the IUC best practices"
author althonos
date Sun, 21 Nov 2021 19:47:22 +0000
parents 1625927fc16f
children 88dc16b4f583
line wrap: on
line diff
--- a/gecco.xml	Sun Nov 21 17:40:58 2021 +0000
+++ b/gecco.xml	Sun Nov 21 19:47:22 2021 +0000
@@ -1,8 +1,8 @@
 <?xml version='1.0' encoding='utf-8'?>
-<tool id="gecco" name="GECCO" version="0.8.4" python_template_version="3.5">
-    <description>GECCO (Gene Cluster prediction with Conditional Random Fields) is a fast and scalable method for identifying putative novel Biosynthetic Gene Clusters (BGCs) in genomic and metagenomic data using Conditional Random Fields (CRFs).</description>
+<tool id="gecco" name="GECCO" version="0.8.5" python_template_version="3.5">
+    <description>is a fast and scalable method for identifying putative novel Biosynthetic Gene Clusters (BGCs) in genomic and metagenomic data using Conditional Random Fields (CRFs).</description>
     <requirements>
-        <requirement type="package" version="0.8.4">gecco</requirement>
+        <requirement type="package" version="0.8.5">gecco</requirement>
     </requirements>
     <version_command>gecco --version</version_command>
     <command detect_errors="aggressive"><![CDATA[
@@ -14,13 +14,37 @@
         #end if
         ln -s '$input' input_tempfile.$file_extension &&
 
-        gecco -vv run -g input_tempfile.$file_extension &&
-        mv input_tempfile.features.tsv $features &&
-        mv input_tempfile.clusters.tsv $clusters
+        gecco -vv run
+        --format '$input.ext'
+        --genome input_tempfile.$file_extension
+        --postproc '$postproc'
+        --force-clusters-tsv
+        ## optional numbers are compared as strings so an explicit 0 is still forwarded
+        #if str($cds) != '':
+            --cds $cds
+        #end if
+        #if str($threshold) != '':
+            --threshold $threshold
+        #end if
+        #if $antismash_sideload:
+            --antismash-sideload
+        #end if
+        && mv input_tempfile.features.tsv '$features'
+        && mv input_tempfile.clusters.tsv '$clusters'
+        #if $antismash_sideload:
+        && mv input_tempfile.sideload.json '$sideload'
+        #end if
 
     ]]></command>
     <inputs>
-        <param name="input" type="data" format="genbank,fasta" label="Sequence file in GenBank or FASTA format"/>
+        <param name="input" type="data" format="genbank,fasta,embl" label="Sequence file in GenBank, EMBL or FASTA format"/>
+        <param argument="--cds" type="integer" min="0" value="" optional="true" label="Minimum number of genes required for a cluster" help="Leave empty to use the GECCO default."/>
+        <param argument="--threshold" type="float" min="0" max="1" value="" optional="true" label="Probability threshold for cluster detection" help="Leave empty to use the GECCO default."/>
+        <param argument="--postproc" type="select" label="Post-processing method for gene cluster validation">
+            <option value="antismash">antiSMASH</option>
+            <option value="gecco" selected="true">GECCO</option>
+        </param>
+        <param argument="--antismash-sideload" type="boolean" checked="false" label="Generate an antiSMASH v6 sideload JSON file"/>
     </inputs>
     <outputs>
         <collection name="records" type="list" label="${tool.name} detected Biosynthetic Gene Clusters on ${on_string} (GenBank)">
@@ -28,6 +52,9 @@
         </collection>
         <data name="features" format="tabular" label="${tool.name} summary of detected features on ${on_string} (TSV)"/>
         <data name="clusters" format="tabular" label="${tool.name} summary of detected BGCs on ${on_string} (TSV)"/>
+        <data name="sideload" format="json" label="antiSMASH v6 sideload file with ${tool.name} detected BGCs on ${on_string} (JSON)">
+            <filter>antismash_sideload</filter> <!-- output is only created when the antismash_sideload input is enabled -->
+        </data>
     </outputs>
     <tests>
         <test>
@@ -38,49 +65,48 @@
                 <element name="BGC0001866.1_cluster_1" file="BGC0001866.1_cluster_1.gbk" ftype="genbank" lines_diff="2"/>
             </output_collection>
         </test>
+        <test> <!-- exercises the optional antiSMASH sideload JSON output -->
+            <param name="input" value="BGC0001866.fna" ftype="fasta"/>
+            <param name="antismash_sideload" value="True"/>
+            <output name="features" file="features.tsv"/>
+            <output name="clusters" file="clusters.tsv"/>
+            <output name="sideload" file="sideload.json"/>
+            <output_collection name="records" type="list">
+                <element name="BGC0001866.1_cluster_1" file="BGC0001866.1_cluster_1.gbk" ftype="genbank" lines_diff="2"/>
+            </output_collection>
+        </test>
     </tests>
-    <help>
-<![CDATA[
+    <help><![CDATA[
 
-**Overview**
+Overview
+--------
 
-GECCO is a fast and scalable method for identifying putative novel Biosynthetic Gene Clusters (BGCs) in genomic and metagenomic data using Conditional Random Fields (CRFs).
+GECCO (Gene Cluster prediction with Conditional Random Fields) is a fast and scalable method for identifying putative novel Biosynthetic Gene Clusters (BGCs) in genomic and metagenomic data using Conditional Random Fields (CRFs).
 It is developed in the Zeller group and is part of the suite of computational microbiome analysis tools hosted at EMBL.
 
-**Input**
+Input
+-----
 
 GECCO works with DNA sequences, and loads them using Biopython, allowing it to support a large variety of formats, including the common FASTA and GenBank files.
 
-**Output**
+Output
+------
 
 GECCO will create the following files once done (using the same prefix as the input file):
 
-- features.tsv: The features file, containing the identified proteins and domains in the input sequences.
-- clusters.tsv: If any were found, a clusters file, containing the coordinates of the predicted clusters, along their putative biosynthetic type.
-- {sequence}_cluster_{N}.gbk: If any BGCs were found, a GenBank file per cluster, containing the cluster sequence annotated with its member proteins and domains.
-
-**Contact**
+- ``features.tsv``: The features file, containing the identified proteins and domains in the input sequences.
+- ``clusters.tsv``: If any were found, a clusters file, containing the coordinates of the predicted clusters, along with their putative biosynthetic type.
+- ``{sequence}_cluster_{N}.gbk``: If any BGCs were found, a GenBank file per cluster, containing the cluster sequence annotated with its member proteins and domains.
 
-If you have any question about GECCO, if you run into any issue, or if you would like to make a feature request, please create an issue in the GitHub repository. 
-You can also directly contact Martin Larralde via email. If you want to contribute to GECCO, please have a look at the contribution guide first, and feel free to 
-open a pull request on the GitHub repository.
+Contact
+-------
 
-]]>
-    </help>
+If you have any questions about GECCO, if you run into any issue, or if you would like to make a feature request, please create an issue in the
+`GitHub repository <https://github.com/zellerlab/gecco>`_. You can also directly contact `Martin Larralde via email <mailto:martin.larralde@embl.de>`_.
+If you want to contribute to GECCO, please have a look at the contribution guide first, and feel free to open a pull request on the GitHub repository.
+
+    ]]></help>
     <citations>
-        <citation type="bibtex">
-@article {Carroll2021.05.03.442509,
-	author = {Carroll, Laura M. and Larralde, Martin and Fleck, Jonas Simon and Ponnudurai, Ruby and Milanese, Alessio and Cappio, Elisa and Zeller, Georg},
-	title = {Accurate de novo identification of biosynthetic gene clusters with GECCO},
-	elocation-id = {2021.05.03.442509},
-	year = {2021},
-	doi = {10.1101/2021.05.03.442509},
-	publisher = {Cold Spring Harbor Laboratory},
-	abstract = {Biosynthetic gene clusters (BGCs) are enticing targets for (meta)genomic mining efforts, as they may encode novel, specialized metabolites with potential uses in medicine and biotechnology. Here, we describe GECCO (GEne Cluster prediction with COnditional random fields; https://gecco.embl.de), a high-precision, scalable method for identifying novel BGCs in (meta)genomic data using conditional random fields (CRFs). Based on an extensive evaluation of de novo BGC prediction, we found GECCO to be more accurate and over 3x faster than a state-of-the-art deep learning approach. When applied to over 12,000 genomes, GECCO identified nearly twice as many BGCs compared to a rule-based approach, while achieving higher accuracy than other machine learning approaches. Introspection of the GECCO CRF revealed that its predictions rely on protein domains with both known and novel associations to secondary metabolism. The method developed here represents a scalable, interpretable machine learning approach, which can identify BGCs de novo with high precision.Competing Interest StatementThe authors have declared no competing interest.},
-	URL = {https://www.biorxiv.org/content/early/2021/05/04/2021.05.03.442509},
-	eprint = {https://www.biorxiv.org/content/early/2021/05/04/2021.05.03.442509.full.pdf},
-	journal = {bioRxiv}
-}
-        </citation>
+        <citation type="doi">10.1101/2021.05.03.442509</citation> <!-- Carroll et al. (2021), "Accurate de novo identification of biosynthetic gene clusters with GECCO", bioRxiv -->
     </citations>
 </tool>