comparison bigscape.xml @ 0:a9e5d237d7d4 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/bigscape/ commit 1c7a35c3aabb33682b263cb3a8dbeaf605469c23
author iuc
date Sun, 25 Feb 2024 10:51:27 +0000
parents
children 353b2de0eabf
comparison
equal deleted inserted replaced
-1:000000000000 0:a9e5d237d7d4
1 <tool id="bigscape" name="BiG-SCAPE" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="21.05">
2 <description>Construct sequence similarity networks of BGCs and group them into GCF</description>
3 <macros>
4 <token name="@TOOL_VERSION@">1.1.9</token>
5 <token name="@VERSION_SUFFIX@">0</token>
6 </macros>
7 <requirements>
8 <requirement type="package" version="@TOOL_VERSION@">bigscape</requirement>
9 </requirements>
10 <command detect_errors="exit_code">
11 <![CDATA[
12 ## Stage the inputs, run BiG-SCAPE, then copy the HTML report and its assets into the HTML output.
13 #set $path_to_html = $html.files_path
14 mkdir -p '$path_to_html' result input &&
15 #for $files in $inputdir:
16 #set $filename = "region." + $files.element_identifier
17 ln -s '$files' './input/$filename' &&
18 #end for
19 ## hmmpress below expects './pfam/Pfam-A.hmm', so link the dataset under that fixed name whatever it is called in the history.
20 mkdir pfam &&
21 ln -s '$pfam_dir' './pfam/Pfam-A.hmm' &&
22 hmmpress './pfam/Pfam-A.hmm' &&
23 ## Parameters nested in a conditional are addressed through the name of that conditional.
24 #if $anchor.is_select == "yes":
25 ln -s '$anchor.anchorfile' '$anchor.anchorfile.element_identifier' &&
26 #end if
27 #if $list.is_select == "yes":
28 cat '$__tool_directory__/domain_includelist.txt' > save.txt &&
29 cat '$list.domain_includelist' > '$__tool_directory__/domain_includelist.txt' &&
30 #end if
31 ## NOTE(review): the block above overwrites a file in the tool directory and it is only restored if every later step succeeds; failed or concurrent jobs can leave it modified.
32 bigscape
33 --inputdir input
34 #if $mibig.is_select == "yes"
35 $mibig.mibig.value
36 #end if
37 --outputdir result
38 #if $use_label.is_select == "yes":
39 --label '${use_label.label}'
40 #end if
41 --pfam_dir pfam
42 --cores \${GALAXY_SLOTS:-8}
43 ${verbose}
44 ${log}
45 ${include_singletons}
46 --domain_overlap_cutoff ${domain_overlap_cutoff}
47 --min_bgc_size ${min_big_size}
48 ${mix}
49 ${no_classify}
50 #if $banned_classes.value:
51 --banned_classes
52 #for $banned in str($banned_classes).split( "," ):
53 '$banned'
54 #end for
55 #end if
56 --cutoffs #for $c in $cutoff# ${c.cutoffs} #end for#
57 ${clans_off}
58 #if $clan_cutoff.is_select == "yes":
59 --clan_cutoff $clan_cutoff.clan_cutoff_val1 $clan_cutoff.clan_cutoff_val2
60 #end if
61 ${hybrids_off}
62 --mode ${mode.value}
63 #if $anchor.is_select == "yes":
64 --anchorfile '${anchor.anchorfile.element_identifier}'
65 #end if
66 ${force_hmmscan}
67 #if $list.is_select == "yes":
68 --domain_includelist
69 #end if
70 &&
71
72 cp './result/index.html' '$html' &&
73 cp -r './result/html_content' '$path_to_html'
74
75 #if $list.is_select == "yes":
76 && cat save.txt > '$__tool_directory__/domain_includelist.txt'
77 #end if
78 ## log.txt only exists when the log option redirected the BiG-SCAPE output into it.
79 #if $log:
80 && cp log.txt '$logfile'
81 #end if
82 ]]>
83 </command>
84 <inputs>
85 <param argument="--inputdir" format="genbank" multiple="true" type="data"
86 label="Data files to include in the clustering"
87 help="Add your .gbk files here. Note: the prefix 'region.' is added to each file name to ensure that every file is included." />
88 <conditional name="mibig">
89 <param name="is_select" type="select" label="Include BGCs from MIBiG database?"
90 help="Select yes and select which version of the database you want to use" >
91 <option value="yes" selected="False">Yes</option>
92 <option value="no" selected="True">No</option>
93 </param>
94 <when value="yes">
95 <param name="mibig" type="select" optional="false" label="Version from the MIBiG database"
96 help="Select which version of the MIBiG database you want to use." >
97 <option value="--mibig">3.1</option>
98 <option value="--mibig21">2.1</option>
99 <option value="--mibig14">1.4</option>
100 <option value="--mibig13">1.3</option>
101 </param>
102 </when>
103 <when value="no" />
104 </conditional>
105 <conditional name="use_label" >
106 <param name="is_select" type="select" label="Adding extra string to BiG-SCAPE runs?"
107 help="Select yes if you want to add an extra string to the outputs">
108 <option value="yes" selected="False">Yes</option>
109 <option value="no" selected="True">No</option>
110 </param>
111 <when value="yes">
112 <param argument="--label" type="text" value="" optional="false" label="Adding extra string to BiG-SCAPE runs name"
113 help="By default the BiG-SCAPE runs are named as (YYYY-MM-DD_HH-MM-SS_[extra]) where extra means the mode and if activated (hybrids).">
114 <validator type="empty_field" />
115 </param>
116 </when>
117 <when value="no" />
118 </conditional>
119 <param argument="--pfam_dir" format="hmm3" type="data"
120 label="Data file Pfam-A.hmm"
121 help="Add the uncompressed Pfam-A.hmm file here. Look at the help section to see where you can download it." />
122 <param argument="--verbose" type="boolean" truevalue="--verbose" falsevalue="" checked="False"
123 label="Print more detailed information about each step"
124 help="Getting more information about the analysis if toggled is true." />
125 <param name="log" type="boolean" truevalue="> log.txt" falsevalue="" checked="False"
126 label="Extra log file as output"
127 help="When enabled you will receive a log file with the printed output of the tool. This can be useful together with the option above." />
128 <param argument="--include_singletons" type="boolean" truevalue="--include_singletons" falsevalue="" checked="False"
129 label="Include BGCs with lower cutoff distance"
130 help="With this option you can include BGCs that do not have a distance lower than the cutoff distance specified" />
131 <param argument="--domain_overlap_cutoff" type="float" value="0.1" min="0.0" max="1.0"
132 label="Specify when domains are considered to overlap"
133 help="When using this option you can specify at which percentage domains are considered to overlap. The domain with the best score is kept. The default value is 0.1" />
134 <param name="min_big_size" argument="--min_bgc_size" type="integer" value="0"
135 label="Minimum size of a BGC (bp)"
136 help="Minimum size of a BGC to be included in the analysis. The default value is 0 base pairs. This also includes the sum of all loci in a multi-record GenBank file." /> <!-- name stays min_big_size because the command template reads it; argument now shows the option that is actually passed -->
137 <param argument="--mix" type="boolean" truevalue="--mix" falsevalue="" checked="False"
138 label="mix all classes in the analysis"
139 help="BiG-SCAPE separates the analysis according to the BGC product by default. If used BiG-SCAPE will mix all classes and analyse them." />
140 <param argument="--no_classify" type="boolean" truevalue="--no_classify" falsevalue="" checked="False"
141 label="No classified output based on the BGC product"
142 help="By default, BiG-SCAPE classifies the output based on the BGC product. If toggled it will deactivate it. Note: when (--mix) is not activated, BiG-SCAPE will not create any network file!" />
143 <param argument="--banned_classes" type="select" optional="true" multiple="true" display="checkboxes"
144 label="Excluded classes from classification in BiG-SCAPE"
145 help="You can exclude any of these classes to not be classified. Multiple banned classes are possible." >
146 <option value="PKSI" selected="False">PKSI</option>
147 <option value="PKSother" selected="False">PKSother</option>
148 <option value="NRPS" selected="False">NRPS</option>
149 <option value="RIPPs" selected="False">RIPPs</option>
150 <option value="Saccharides" selected="False">Saccharides</option>
151 <option value="Terpene" selected="False">Terpene</option>
152 <option value="PKS-NRP_Hybrids" selected="False">PKS-NRP_Hybrids</option>
153 <option value="Others" selected="False">Others</option>
154 </param>
155 <repeat name="cutoff" title="Cutoffs" default="1"
156 help="Generate networks using multiple raw distance cutoff values. The default value here is 0.3.">
157 <param argument="--cutoffs" type="float" value="0.3" min="0.1" max="1.0"
158 label="GCF cutoff value"/>
159 </repeat>
160 <param argument="--clans-off" type="boolean" truevalue="--clans-off" falsevalue="" checked="False"
161 label="Turn off cluster GCFs into GCCs"
162 help="By default, BiG-SCAPE will perform a second layer of clustering to group GCFs into GCCs. Toggle to deactivate this." />
163 <conditional name="clan_cutoff" >
164 <param name="is_select" type="select" label="Change cutoff values for cluster GCF into GCC?"
165 help="Select yes if you want to change cutoffs for cluster families into clans.">
166 <option value="yes" selected="False">Yes</option>
167 <option value="no" selected="True">No</option>
168 </param>
169 <when value="yes">
170 <param name="clan_cutoff_val1" type="float" value="0.3" min="0.1" max="1.0" label="GCF cutoff value"
171 help="This value is for finding GCFs which will be used for clan calling. The default value is 0.3." />
172 <param name="clan_cutoff_val2" type="float" value="0.7" min="0.1" max="1.0" label="GCC cutoff value"
173 help="This value is for clustering families into clans. The default value is 0.7. Every pair of GCFs linked with a distance of this value or less will be taken into account!" />
174 </when>
175 <when value="no" />
176 </conditional>
177 <param argument="--hybrids-off" type="boolean" truevalue="--hybrids-off" falsevalue="" checked="False"
178 label="Exclude hybrid predicted products"
179 help="By default, BGCs with hybrid predicted products from the PKS/NRPS Hybrids and Others classes will be included in each subclass. Since the same cluster can appear in different classes you can turn this off here." />
180 <param argument="--mode" type="select"
181 label="Alignment Mode"
182 help="Here you can choose between three alignment modes used when comparing each pair of gene clusters. For more information look into the help section!" >
183 <option value="glocal">glocal</option>
184 <option value="global">global</option>
185 <option value="auto">auto</option>
186 </param>
187 <conditional name="anchor">
188 <param name="is_select" type="select" label="Change Anchorfile?"
189 help="Select yes if you want to use an Anchorfile. BiG-SCAPE has a default file which is always used and only these domains are counted in the result. Look in the help section to see what are the default domains!" >
190 <option value="yes" selected="False">Yes</option>
191 <option value="no" selected="True">No</option>
192 </param>
193 <when value="yes">
194 <param argument="--anchorfile" type="data" format="txt" optional="false"
195 label="Using a different Anchorfile instead of default file"
196 help="Use a custom Anchorfile (in .txt format) to give certain domains a special weight in the DSS index. This file is important because otherwise certain domains (given from Pfam IDs) will not be listed in the results!" />
197 </when>
198 <when value="no" />
199 </conditional>
200 <param argument="--force_hmmscan" type="boolean" truevalue="--force_hmmscan" falsevalue="" checked="False"
201 label="Use hmmscan for the domain prediction"
202 help="Even if BiG-SCAPE finds processed domtables files we can force the domain prediction with hmmscan. Toggle to force hmmscan!" />
203 <conditional name="list">
204 <param name="is_select" type="select" label="Use a domain list to include domains?"
205 help="Select yes if you want to use a list to filter the input to certain domains. Look in the help section to see what this list should look like">
206 <option value="yes" selected="False">Yes</option>
207 <option value="no" selected="True">No</option>
208 </param>
209 <when value="yes">
210 <param argument="--domain_includelist" type="data" format="txt" optional="false"
211 label="Use a .txt file to include domains"
212 help="Upload a text file where the inputs are filtered based on the Pfam IDs/domains given in this file!" />
213 </when>
214 <when value="no" />
215 </conditional>
216 </inputs>
217 <outputs> <!-- One HTML report, four collections discovered under result/network_files, and an optional log file. -->
218 <data name="html" format="html" label="${tool.name}: HTML"/>
219 <collection name="tsv_collection_1" type="list" format="tabular" label="${tool.name}: NETWORK ANNOTATIONS COLLECTION">
220 <discover_datasets pattern="Network_Annotations_(?P&lt;designation&gt;.+)\.tsv" directory="result/network_files" recurse="true"/>
221 </collection>
222 <collection name="tsv_collection_2" type="list" format="tabular" label="${tool.name}: CLAN TABULAR FILES COLLECTION">
223 <discover_datasets pattern="(?P&lt;identifier_0&gt;[^_]+)_clans_(?P&lt;designation&gt;.+)\.tsv" directory="result/network_files" recurse="true"/>
224 <filter> clans_off == False </filter> <!-- collection is only produced while the clans_off option is unchecked -->
225 </collection>
226 <collection name="tsv_collection_3" type="list:list" format="tabular" label="${tool.name}: CLUSTERING TABULAR FILES COLLECTION">
227 <discover_datasets pattern="(?P&lt;identifier_0&gt;[^_]+)_clustering_(?P&lt;identifier_1&gt;.+)\.tsv" directory="result/network_files" recurse="true"/>
228 </collection>
229 <collection name="newtwork_collection" type="list" format="network" label="${tool.name}: NETWORK FILES COLLECTION"> <!-- NOTE(review): the name looks like a typo of "network"; left as is because the tests reference it -->
230 <discover_datasets pattern="(?P&lt;designation&gt;.+)\.network" directory="result/network_files" recurse="true"/>
231 </collection>
232 <data name="logfile" format="txt" label="${tool.name}: LOG FILE">
233 <filter>log == True</filter> <!-- only produced when the log option is checked -->
234 </data>
235 </outputs>
236 <tests>
237 <test expect_num_outputs="5"> <!-- defaults: HTML report plus the four collections, no log file -->
238 <param name="pfam_dir" value="Pfam-A.hmm" ftype="hmm3"/>
239 <param name="inputdir" value="NC_010530.1.region005.gbk,NC_012963.1.region001.gbk,NW_009799099.1.region003.gbk,NW_021940918.1.region003.gbk,NW_009799102.1.region001.gbk,NW_022985561.1.region002.gbk,NW_022985549.1.region005.gbk,NW_022985575.1.region001.gbk" ftype="genbank" />
240 </test>
241 <test expect_num_outputs="6"> <!-- log enabled: the log file becomes the sixth output; two cutoffs (default 0.3 plus 1.0) -->
242 <param name="pfam_dir" value="Pfam-A.hmm" ftype="hmm3"/>
243 <param name="inputdir" value="NC_010530.1.region005.gbk,NC_012963.1.region001.gbk,NW_009799099.1.region003.gbk,NW_021940918.1.region003.gbk,NW_009799102.1.region001.gbk,NW_022985561.1.region002.gbk,NW_022985549.1.region005.gbk,NW_022985575.1.region001.gbk" ftype="genbank" />
244 <param name="log" value="true" />
245 <repeat name="cutoff">
246 <param name="cutoffs" value="1.0"/>
247 </repeat>
248 <output name="html" >
249 <assert_contents>
250 <has_line line="&lt;!DOCTYPE html&gt;" />
251 </assert_contents>
252 </output>
253 <output name="logfile">
254 <assert_contents>
255 <has_line line=" - - Processing input files - -" />
256 </assert_contents>
257 </output>
258 <output_collection name="tsv_collection_1" type="list">
259 <element name="Full" >
260 <assert_contents>
261 <has_text text="Accession ID" n="1"/>
262 </assert_contents>
263 </element>
264 </output_collection>
265 <output_collection name="tsv_collection_2" type="list">
266 <element name="NRPS">
267 <assert_contents>
268 <has_text text="Clan Number" n="1"/>
269 </assert_contents>
270 </element>
271 </output_collection>
272 <output_collection name="tsv_collection_3" type="list:list">
273 <element name="NRPS">
274 <element name="c0.30">
275 <assert_contents>
276 <has_text text="Family Number" n="1" />
277 </assert_contents>
278 </element>
279 <element name="c1.00">
280 <assert_contents>
281 <has_text text="Family Number" n="1" />
282 </assert_contents>
283 </element>
284 </element>
285 </output_collection>
286 <output_collection name="newtwork_collection" type="list">
287 <element name="NRPS_c0.30">
288 <assert_contents>
289 <has_text text="Raw distance" n="1"/>
290 </assert_contents>
291 </element>
292 </output_collection>
293 </test>
294 <test expect_num_outputs="5"> <!-- log on and clans_off on: the log file is added while the clan collection is filtered out -->
295 <param name="pfam_dir" value="Pfam-A.hmm" ftype="hmm3"/>
296 <param name="inputdir" value="NC_010530.1.region005.gbk,NC_012963.1.region001.gbk,NW_009799099.1.region003.gbk,NW_021940918.1.region003.gbk,NW_009799102.1.region001.gbk,NW_022985561.1.region002.gbk,NW_022985549.1.region005.gbk,NW_022985575.1.region001.gbk" ftype="genbank" />
297 <param name="log" value="true" />
298 <param name="clans_off" value="true" />
299 </test>
300 </tests>
301 <help>
302 <![CDATA[
303 .. class:: infomark
304
305 **What is BiG-SCAPE**
306
307 BiG-SCAPE (Biosynthetic Gene Similarity Clustering and Prospecting Engine) is a software package, written in Python, that constructs sequence similarity networks of Biosynthetic Gene Clusters (BGCs) and groups them into Gene Cluster Families (GCFs).
308
309 .. class:: infomark
310
311 **What it does**
312
313 BiG-SCAPE does this by rapidly calculating a distance matrix between gene clusters based on a comparison of their protein domain content, order, copy number and sequence identity.
314
315 In principle, BiG-SCAPE can also be used on any other gene clusters, such as pathogenicity islands, secretion system-encoding gene clusters, or even whole viral genomes.
316
317 Here is a graphic showing how BiG-SCAPE works:
318
319 .. image:: bigscape_corason.png
320 :alt: BiG-SCAPE + CORASON workflow
321
322 For more information you can visit `BiG-SCAPE on GitHub <https://github.com/medema-group/BiG-SCAPE>`_ or the `combined BiG-SCAPE/CORASON website <https://bigscape-corason.secondarymetabolites.org/index.html>`_.
323
324 **Input**
325
326 BiG-SCAPE uses two kinds of input:
327
328 - The genbank files from antiSMASH
329
330 .. class:: infomark
331
332 Note: By default, BiG-SCAPE only includes GenBank files whose file name contains either 'region' or 'cluster'. To ensure that every file is included, 'region.' is prepended to each file name!
333
334 - The Pfam-A.hmm file
335
336 .. class:: infomark
337
338 Note: You can download `Pfam-A.hmm.gz <https://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.hmm.gz>`_ here and then unzip it or you can use the command: *$ wget https://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.hmm.gz && gunzip Pfam-A.hmm.gz* in e.g. VSC.
339
340 There are two additional inputs which can be used:
341
342 - An anchor_domains.txt file
343
344 .. class:: infomark
345
346 Example (default file which will be used):
347
348 ::
349
350 PF00668 Condensation domain [NRPS]
351 PF00501 AMP-binding enzyme [NRPS]
352 PF00109 Beta-ketoacyl synthase N-terminal [PKS]
353 PF02801 Beta-ketoacyl synthase C-terminal [PKS]
354 PF01397 Terpene synthase, N-terminal domain (Terpene_synth) [Terpene]
355 PF03936 Terpene synthase family, metal binding domain (Terpene_synth_C) [Terpene]
356 PF00195 Chalcone and stilbene synthases, N-terminal domain (Cahl_sti_synt_N)
357 PF02797 Chalcone and stilbene synthases, C-terminal domain (Chal_sti_synt_C)
358 PF05147 Lanthionine synthetase C-like protein (LANC_like) [lantipeptide/RiPP]
359 PF00494 Squalene/phytoene synthase (SQS_PSY) [Terpene]
360 PF00432 Prenyltransferase and squalene oxidase repeat (Prenyltrans)
361 PF02624 YcaO cyclodehydratase, ATP-ad MG2+-binding (YcaO) [RiPP]
362
363 The first column contains the Pfam model ID while the second column is optionally for writing a comment. The columns are tab-separated!
364
365 - A domain_includelist.txt
366
367 .. class:: infomark
368
369 Example:
370
371 ::
372
373 PF00067 Cytochrome P450
374 PF01451 Any Comment
375
376 The first column contains the Pfam model ID while the second column is optionally for writing a comment. The columns are tab-separated and any line that starts with a # will be ignored!
377
378
379 **Output**
380
381 BiG-SCAPE will produce one HTML output together with collections of tabular and network files, depending on the input. When the log file option is set, it will create another output in which everything printed by the tool is stored.
382
383
384 **Additional information on the alignment mode**
385
386 - glocal: This is the default mode. Here the subset of the domains used to calculate distance is redefined by finding the longest slice of common domain content per gene in both BGCs, and then expanding each slice.
387
388 - global: The whole list of domains of each BGC are compared.
389
390 - auto: Use glocal mode when at least one of the BGCs in each pair has the contig_edge annotation from antiSMASH. Otherwise global will be used.
391 ]]>
392 </help>
393 <citations>
394 <citation type="doi">10.1038/s41589-019-0400-9</citation>
395 </citations>
396 </tool>