comparison quickmerge.xml @ 0:436c1d3c990a draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/quickmerge commit c171c7bde4063ec25fff4a284f5d0330d46ca290
author iuc
date Fri, 08 Jul 2022 10:35:37 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:436c1d3c990a
1 <tool
2 id="quickmerge"
3 name="QuickMerge"
4 version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@"
5 profile="21.05"
6 >
7 <description>merge long-read and hybrid genome assemblies to increase contiguity</description>
8 <!-- @TOOL_VERSION@ and @VERSION_SUFFIX@ are substituted into the tool's version attribute; @TOOL_VERSION@ also pins the quickmerge requirement below. -->
9 <macros>
10 <token name="@TOOL_VERSION@">0.3</token>
11 <token name="@VERSION_SUFFIX@">0</token>
12 </macros>
13 <!-- Cross-reference to the tool's bio.tools registry identifier. -->
14 <xrefs>
15 <xref type="bio.tools">quickmerge</xref>
16 </xrefs>
17 <!-- fastkit is used by the header-stripping step of the command. NOTE(review): nucmer and delta-filter are not listed here; presumably they are pulled in by the quickmerge package - confirm. -->
18 <requirements>
19 <requirement type="package" version="@TOOL_VERSION@">quickmerge</requirement>
20 <requirement type="package" version="0.1.4">fastkit</requirement>
21 </requirements>
22
23 <command detect_errors="exit_code"><![CDATA[
24 
25 ## Strip whitespace from the FASTA headers of both assemblies, writing working copies (input.ref.fasta / input.query.fasta) that every later step reads
26 fastkit format --strip-header-space '$input_ref' > input.ref.fasta
27 && fastkit format --strip-header-space '$input_query' > input.query.fasta
28 
29 ## Nucmer step
30 ## ----------------------------------------------------------------------------
31 ## No output prefix is passed to nucmer here; the delta-filter step below reads its result from out.delta
32 && nucmer
33 $advanced.nucmer_match_protocol
34 -l $advanced.nucmer_min_len
35 input.ref.fasta
36 input.query.fasta
37 
38 
39 ## Delta-filter step
40 ## ----------------------------------------------------------------------------
41 ## The -r and -q flags come from the delta_map_ref / delta_map_query booleans (both checked by default); the filtered alignment is written to out.rq.delta
42 && delta-filter
43 $advanced.delta_map_ref
44 $advanced.delta_map_query
45 -l $advanced.delta_min_len
46 
47 out.delta > out.rq.delta
48 
49 
50 ## Quickmerge step
51 ## ----------------------------------------------------------------------------
52 ## "-p out" is the output prefix; the outputs section collects merged_out.fasta, aln_summary_out.tsv, anchor_summary_out.txt and param_summary_out.txt from the working directory
53 && quickmerge
54 -d out.rq.delta
55 -q input.query.fasta
56 -r input.ref.fasta
57 -p out
58 -hco $advanced.hco
59 -c $advanced.c
60 -l $advanced.l
61 -ml $advanced.ml
62 
63 ]]></command>
64
65 <inputs>
66 <param
67 name="input_query"
68 type="data"
69 format="fasta,fasta.gz"
70 label="Query (hybrid) assembly"
71 help="A hybrid or long-read assembly."
72 /> <!-- After header stripping this is the second sequence given to nucmer and the -q input of quickmerge. -->
73 <param
74 name="input_ref"
75 type="data"
76 format="fasta,fasta.gz"
77 label="Reference assembly"
78 help="The self assembly - this can also be a long-read assembly."
79 /> <!-- After header stripping this is the first sequence given to nucmer and the -r input of quickmerge. -->
80
81 <section name="advanced" expanded="false" title="Advanced parameters">
82 <param
83 name="nucmer_match_protocol"
84 display="radio"
85 type="select"
86 label="Anchor matching protocol"
87 help="By default, nucmer will include anchor matches that are unique
88 in the reference but not necessarily unique in the query. You can
89 change the protocol to include all matches, or only those unique to
90 both the reference and query sequence."
91 >
92 <option selected="true" value="">
93 Default (unique in reference only)
94 </option>
95 <option value="--mum">
96 Include matches that are unique to both reference and query
97 </option>
98 <option value="--maxmatch">
99 Include all matches
100 </option>
101 </param>
102 <!-- Minimum match length handed to nucmer as its -l option. -->
103 <param
104 name="nucmer_min_len"
105 type="integer"
106 value="20"
107 min="0"
108 label="Nucmer -l"
109 help="Minimum length for a single match (nt)"
110 />
111 <!-- Options for the delta-filter step: minimum alignment length and the two best-hit mapping switches. -->
112 <param
113 name="delta_min_len"
114 type="integer"
115 value="0"
116 min="0"
117 label="Delta-filter -l"
118 help="Minimum alignment length (nt)"
119 />
120 
121 <param
122 name="delta_map_ref"
123 type="boolean"
124 checked="true"
125 truevalue="-r"
126 falsevalue=""
127 label="Delta-filter -r"
128 help="Maps each position of each reference to its best hit in the
129 query, allowing for query overlaps."
130 />
131 
132 <param
133 name="delta_map_query"
134 type="boolean"
135 checked="true"
136 truevalue="-q"
137 falsevalue=""
138 label="Delta-filter -q"
139 help="Maps each position of each query to its best hit in the
140 reference, allowing for reference overlaps."
141 />
142 <!-- Options for the quickmerge step. These params carry only an argument attribute; the command reads them as $advanced.l, $advanced.ml, $advanced.hco and $advanced.c. -->
143 <param
144 argument="-l"
145 type="integer"
146 value="0"
147 min="0"
148 help="Minimum seed contig length to be merged (nt). A good rule of
149 thumb is to start with the N50 of the self assembly."
150 />
151 
152 <param
153 argument="-ml"
154 type="integer"
155 value="5000"
156 min="0"
157 help="Set the merging length cutoff (nt). This is especially helpful
158 for repeat-rich genomes, where a higher minimum length can reduce
159 errors arising from long repeats."
160 />
161 
162 <param
163 argument="-hco"
164 type="float"
165 value="5.0"
166 min="0.0"
167 help="Controls the overlap cutoff used in selection of anchor contigs.
168 Increasing this will raise stringency, resulting in fewer joins but
169 higher confidence."
170 />
171 
172 <param
173 argument="-c"
174 type="float"
175 value="1.5"
176 min="0.0"
177 help="Controls the overlap cutoff for contigs used for extension of
178 the anchor contig. Increasing this will raise stringency,
179 resulting in fewer joins but higher confidence."
180 />
181 </section>
182
183 <section name="output" expanded="true" title="Optional outputs">
184 <param
185 name="check_align_summary"
186 label="Alignment summary table"
187 type="boolean"
188 />
189 <param
190 name="check_anchor_summary"
191 label="Anchor summary table"
192 type="boolean"
193 />
194 <param
195 name="check_param_summary"
196 label="Parameter summary table"
197 type="boolean"
198 />
199 </section>
200 </inputs>
201
202 <outputs>
203 <data
204 name="merged_fasta"
205 label="${tool.name} on ${on_string}: Merged assembly"
206 format="fasta"
207 from_work_dir="merged_out.fasta"
208 />
209 <!-- Alignment summary: kept only when the "check_align_summary" box in the output section is ticked. -->
210 <data
211 name="align_summary"
212 label="${tool.name} on ${on_string}: Alignment summary"
213 format="tabular"
214 from_work_dir="aln_summary_out.tsv"
215 >
216 <filter>output['check_align_summary']</filter>
217 </data>
218 <!-- Anchor summary: kept only when the "check_anchor_summary" box is ticked. -->
219 <data
220 name="anchor_summary"
221 label="${tool.name} on ${on_string}: Anchor summary"
222 format="tabular"
223 from_work_dir="anchor_summary_out.txt"
224 >
225 <filter>output['check_anchor_summary']</filter>
226 </data>
227 <!-- Parameter summary: kept only when the "check_param_summary" box is ticked. -->
228 <data
229 name="param_summary"
230 label="${tool.name} on ${on_string}: Parameter summary"
231 format="tabular"
232 from_work_dir="param_summary_out.txt"
233 >
234 <filter>output['check_param_summary']</filter>
235 </data>
236 </outputs>
237
238 <tests>
239 <!-- Standard run: defaults only, so just the merged assembly is produced -->
240 <test expect_num_outputs="1">
241 <param name="input_query" value="ecoli_nanopore.fasta" ftype="fasta" />
242 <param name="input_ref" value="ecoli_illumina.fasta" ftype="fasta" />
243 <output name="merged_fasta" ftype="fasta" file="merged.fasta" />
244 </test>
245 
246 <!-- Standard run with all three optional summary outputs switched on (params live in the "output" section) -->
247 <test expect_num_outputs="4">
248 <param name="input_query" value="ecoli_nanopore.fasta" ftype="fasta" />
249 <param name="input_ref" value="ecoli_illumina.fasta" ftype="fasta" />
250 <param name="output|check_align_summary" value="true" />
251 <param name="output|check_anchor_summary" value="true" />
252 <param name="output|check_param_summary" value="true" />
253 <output name="merged_fasta" ftype="fasta" file="merged.fasta" />
254 <output name="align_summary" ftype="tabular" file="2_aln_summary_out.tsv" />
255 <output name="anchor_summary" ftype="tabular" file="2_anchor_summary_out.txt" />
256 <output name="param_summary" ftype="tabular" file="2_param_summary_out.txt" />
257 </test>
258 
259 <!-- Advanced params passed explicitly, each set to its declared default (integers for l/ml, floats for hco/c), so merged.fasta is still the expected result -->
260 <test expect_num_outputs="1">
261 <param name="input_query" value="ecoli_nanopore.fasta" ftype="fasta" />
262 <param name="input_ref" value="ecoli_illumina.fasta" ftype="fasta" />
263 <param name="advanced|nucmer_min_len" value="20" />
264 <param name="advanced|delta_min_len" value="0" />
265 <param name="advanced|hco" value="5.0" />
266 <param name="advanced|c" value="1.5" />
267 <param name="advanced|l" value="0" />
268 <param name="advanced|ml" value="5000" />
269 <output name="merged_fasta" ftype="fasta" file="merged.fasta" />
270 </test>
271 </tests>
272
273 <help><![CDATA[
274
275 .. class:: infomark
276
277 **What it does**
278
279 The program uses complementary information from genomes assembled with long reads in order to improve contiguity, and works with assemblies derived from either Pacific Biosciences or Oxford Nanopore reads. Quickmerge will even work with hybrid assemblies made by combining long reads and Illumina short reads. This can be useful when long read coverage is limiting. For more details, please see the paper that describes it (`10.1093/nar/gkw654 <https://doi.org/10.1093/nar/gkw654>`_).
280
281 Although this program was written to merge a hybrid assembly (e.g. as generated by DBG2OLC) and a PacBio or ONP only assembly, it can also be used to merge two different long molecule only assemblies (e.g. one generated with PBcR or canu and another generated with FALCON).
282
283 Please consult the `QuickMerge Wiki <https://github.com/mahulchak/quickmerge/wiki>`_ for extended usage information.
284
285 **Why use quickmerge?**
286
287 - More bang for your buck by using complementary information from different assemblers.
288 - It is fast. Takes less than a minute to run on most genomes. You run nucmer once (nucmer is the most time consuming step) and then you can run quickmerge over a large number of parameters in a very short time.
289 - Requires only FASTA files and does not depend on any special data or computational resources.
290 - Saves money. When long read coverage is limiting, there are some hybrid approaches that lead to good results (e.g. DBG2OLC). So quickmerge allows you to cut your long molecule requirement by as much as half by replacing the same with Illumina short reads. E.g. if you think you would get a N50 of 8Mb from 75X long reads (ONP or PacBio), try sequencing 45X long and 70X Illumina reads instead of 75X long reads. You may not need that extra 35X long reads.
291 - Allows dramatic improvements with reanalysis of legacy data collected when long reads were prohibitively expensive. See an excellent example from `this paper by Thomas Mathers <https://academic.oup.com/g3journal/article/10/3/899/6026189>`_ on the Soybean Aphid genome.
292
293
294 **The process**
295
296 - **Nucmer** aligns the two assemblies so that the merger can find the correct splice sites
297 - **Delta-filter** filters out alignments with repeats and duplicates
298 - **Quickmerge** merges the aligned assemblies
299
300 **Inputs**
301
302 - **Query assembly**: a hybrid or long-read assembly (see `QuickMerge Wiki <https://github.com/mahulchak/quickmerge/wiki>`_ for more details)
303
304 - **Reference assembly**: "Self" assembly. Can also be a hybrid assembly
305
306 - **Nucmer -l**: the minimum length of a single match
307
308 - **Delta-filter -l**: the minimum length of a single alignment
309
310 - **hco**: controls the overlap cutoff used in selection of anchor contigs (default 5.0)
311
312 - **c**: controls the overlap cutoff for contigs used for extension of the anchor contig (default 1.5)
313
314 - **l**: controls the length cutoff for anchor contigs. A good rule of thumb is to start with the N50 of the self assembly. E.g. if the N50 of your self assembly is 2Mb then use 2000000 as your cutoff. Lowering this value may lead to more merging but may increase the probability of mis-joins.
315
316 - **ml**: controls the minimum alignment length to be considered for merging. This is especially helpful for repeat-rich genomes (default 5000; higher recommended).
317
318 For both ``hco`` and ``c``, the bigger the number, the more stringent the criteria for contig selection (which will lead to fewer contigs being merged). If they are too small (<1), the chance of spurious merging increases. It is better to be conservative when merging contigs!
319
320
321 **Outputs**
322
323 - **Merged assembly**: your merged assembly in FASTA format
324
325 - **Alignment summary**: tabular summary of alignments used for merging
326
327 - **Anchor summary**: tabular summary of anchor matches used for merging
328
329 - **Parameter summary**: summary of merge parameters
330
331
332 **Helpful tips**
333
334 - For optimal merging results, identify the major misassemblies (especially translocations and inversions) in the component assemblies and break the contigs at such misassembly boundaries. Alignment of the component assemblies to the merged assembly may help to identify such assembly errors because a specific error typically occurs in only one of the assemblies.
335
336 - The FASTA sequence headers *should not have white spaces* in them. In case they do, as might happen for assemblies obtained from FALCON assembler, the white space needs to be removed before running.
337
338 - You can run Ka-kit's `finisherSC <https://github.com/kakitone/finishingTool>`_ after running quickmerge to improve the contiguity even further.
339
340 - Assembly polishing with Quiver and pilon before and after assembly merging is strongly recommended. However, if you are running finisherSC, you may perform the quiver polishing after the finisher step.
341
342 - Check the merged assembly by aligning the hybrid and/or non-hybrid assembly to the merged assembly. You can use ``nucmer`` for alignment (with default anchoring) and ``mummerplot`` for dot plot visualization.
343
344
345 Quickmerge was wrapped by the Galaxy Australia team.
346
347 ]]></help>
348 <citations> <!-- A doi-type citation holds the bare DOI, not a resolver URL -->
349 <citation type="doi">10.1093/nar/gkw654</citation>
350 </citations>
351 </tool>