annotate gecco.xml @ 1:0699939e6dd6 draft

"Release v0.8.4"
author althonos
date Sun, 21 Nov 2021 17:00:40 +0000
parents 1625927fc16f
children 359232b58f6a
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
1 <?xml version='1.0' encoding='utf-8'?>
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
2 <tool id="gecco" name="GECCO" version="0.8.4" python_template_version="3.5">
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
3 <description>GECCO (Gene Cluster prediction with Conditional Random Fields) is a fast and scalable method for identifying putative novel Biosynthetic Gene Clusters (BGCs) in genomic and metagenomic data using Conditional Random Fields (CRFs).</description>
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
4 <requirements>
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
5 <requirement type="package" version="0.8.4">gecco</requirement>
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
6 </requirements>
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
7 <version_command>gecco --version</version_command>
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
8 <command detect_errors="aggressive"><![CDATA[
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
9
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
10 #if str($input.ext) == 'genbank':
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
11 #set $file_extension = 'gbk'
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
12 #else:
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
13 #set $file_extension = $input.ext
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
14 #end if
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
15 ln -s '$input' input_tempfile.$file_extension &&
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
16
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
17 gecco -vv run -g input_tempfile.$file_extension &&
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
18 mv input_tempfile.features.tsv '$features' &&
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
19 mv input_tempfile.clusters.tsv '$clusters'
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
20
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
21 ]]></command>
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
22 <inputs>
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
23 <param name="input" type="data" format="genbank,fasta" label="Sequence file in GenBank or FASTA format"/>
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
24 </inputs>
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
25 <outputs>
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
26 <collection name="records" type="list" label="${tool.name} detected Biosynthetic Gene Clusters on ${on_string} (GenBank)">
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
27 <discover_datasets pattern="(?P&lt;designation&gt;.+_cluster_\d+)\.gbk" ext="genbank" visible="false" />
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
28 </collection>
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
29 <data name="features" format="tabular" label="${tool.name} summary of detected features on ${on_string} (TSV)"/>
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
30 <data name="clusters" format="tabular" label="${tool.name} summary of detected BGCs on ${on_string} (TSV)"/>
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
31 </outputs>
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
32 <tests>
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
33 <test>
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
34 <param name="input" value="BGC0001866.fna"/>
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
35 <output name="features" file="features.tsv"/>
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
36 <output name="clusters" file="clusters.tsv"/>
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
37 <output_collection name="records" type="list">
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
38 <element name="BGC0001866.1_cluster_1" file="BGC0001866.1_cluster_1.gbk" ftype="genbank" lines_diff="2"/>
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
39 </output_collection>
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
40 </test>
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
41 </tests>
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
42 <help>
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
43 <![CDATA[
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
44
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
45 **Overview**
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
46
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
47 GECCO is a fast and scalable method for identifying putative novel Biosynthetic Gene Clusters (BGCs) in genomic and metagenomic data using Conditional Random Fields (CRFs).
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
48 It is developed in the Zeller group and is part of the suite of computational microbiome analysis tools hosted at EMBL.
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
49
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
50 **Input**
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
51
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
52 GECCO works with DNA sequences, and loads them using Biopython, allowing it to support a large variety of formats, including the common FASTA and GenBank files.
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
53
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
54 **Output**
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
55
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
56 GECCO will create the following files once done (using the same prefix as the input file):
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
57
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
58 - features.tsv: The features file, containing the identified proteins and domains in the input sequences.
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
59 - clusters.tsv: If any were found, a clusters file, containing the coordinates of the predicted clusters, along with their putative biosynthetic type.
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
60 - {sequence}_cluster_{N}.gbk: If any BGCs were found, a GenBank file per cluster, containing the cluster sequence annotated with its member proteins and domains.
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
61
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
62 **Contact**
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
63
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
64 If you have any questions about GECCO, if you run into any issue, or if you would like to make a feature request, please create an issue in the GitHub repository.
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
65 You can also directly contact Martin Larralde via email. If you want to contribute to GECCO, please have a look at the contribution guide first, and feel free to
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
66 open a pull request on the GitHub repository.
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
67
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
68 ]]>
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
69 </help>
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
70 <citations>
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
71 <citation type="bibtex">
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
72 @article {Carroll2021.05.03.442509,
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
73 author = {Carroll, Laura M. and Larralde, Martin and Fleck, Jonas Simon and Ponnudurai, Ruby and Milanese, Alessio and Cappio, Elisa and Zeller, Georg},
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
74 title = {Accurate de novo identification of biosynthetic gene clusters with GECCO},
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
75 elocation-id = {2021.05.03.442509},
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
76 year = {2021},
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
77 doi = {10.1101/2021.05.03.442509},
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
78 publisher = {Cold Spring Harbor Laboratory},
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
79 abstract = {Biosynthetic gene clusters (BGCs) are enticing targets for (meta)genomic mining efforts, as they may encode novel, specialized metabolites with potential uses in medicine and biotechnology. Here, we describe GECCO (GEne Cluster prediction with COnditional random fields; https://gecco.embl.de), a high-precision, scalable method for identifying novel BGCs in (meta)genomic data using conditional random fields (CRFs). Based on an extensive evaluation of de novo BGC prediction, we found GECCO to be more accurate and over 3x faster than a state-of-the-art deep learning approach. When applied to over 12,000 genomes, GECCO identified nearly twice as many BGCs compared to a rule-based approach, while achieving higher accuracy than other machine learning approaches. Introspection of the GECCO CRF revealed that its predictions rely on protein domains with both known and novel associations to secondary metabolism. The method developed here represents a scalable, interpretable machine learning approach, which can identify BGCs de novo with high precision. Competing Interest Statement: The authors have declared no competing interest.},
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
80 URL = {https://www.biorxiv.org/content/early/2021/05/04/2021.05.03.442509},
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
81 eprint = {https://www.biorxiv.org/content/early/2021/05/04/2021.05.03.442509.full.pdf},
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
82 journal = {bioRxiv}
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
83 }
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
84 </citation>
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
85 </citations>
1625927fc16f "Release v0.8.4"
althonos
parents:
diff changeset
86 </tool>