comparison progressivemauve.xml @ 0:74093fb62bdf draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/progressivemauve commit 2645abbd04dd68266f995b8259e991c31388cda8
author iuc
date Wed, 17 Aug 2016 14:46:55 -0400
parents
children bca52822843e
comparison
equal deleted inserted replaced
-1:000000000000 0:74093fb62bdf
1 <?xml version="1.0"?>
2 <tool id="progressivemauve" name="progressiveMauve" version="@WRAPPER_VERSION@.0">
3 <description>constructs multiple genome alignments</description>
4 <macros>
5 <import>macros.xml</import>
6 </macros>
7 <expand macro="requirements"/>
8 <expand macro="stdio"/>
9 <version_command>progressiveMauve --version</version_command>
<command><![CDATA[
## Link every input dataset into the job working directory under its own
## basename; progressiveMauve is handed those local names by the "Sequences"
## loop at the end of this template.
##   -f  : selecting the same dataset twice does not make the second link fail
##   &&  : a link that cannot be created stops the job instead of being ignored
#for $file in $sequences:
    ln -sf '$file' `basename '$file'` &&
#end for

progressiveMauve
    ## ---- Input options ----
    ## Optional parameters are wrapped in "#if" so the flag is left off the
    ## command line when the field is unset. For the optional integers an
    ## entry of 0 is falsy as well, so 0 also omits the flag.

    #if $apply_backbone:
        --apply-backbone='$apply_backbone'
    #end if
    --island-gap-size=$island_gap_size
    ## Boolean parameters expand to their flag text or to an empty string.
    $mums

    #if $seed_weight:
        --seed-weight=$seed_weight
    #end if

    #if $max_gapped_aligner_length:
        --max-gapped-aligner-length=$max_gapped_aligner_length
    #end if

    #if $match_input:
        --match-input='$match_input'
    #end if

    ## The guide tree input is declared in <inputs>; forward it when supplied.
    #if $input_guide_tree:
        --input-guide-tree='$input_guide_tree'
    #end if

    $collinear
    --scoring-scheme=$scoring_scheme
    $no_weight_scaling

    --max-breakpoint-distance-scale=$max_breakpoint_distance_scale
    --conservation-distance-scale=$conservation_distance_scale
    $skip_refinement
    $skip_gapped_alignment

    #if $bp_dist_estimate_min_score:
        --bp-dist-estimate-min-score=$bp_dist_estimate_min_score
    #end if

    #if $gap_open:
        --gap-open=$gap_open
    #end if

    ## The repeat penalty select is declared in <inputs>. It is only sent when
    ## it differs from the form default ("negative"), so runs that keep the
    ## default get the same option list as before this flag was wired in.
    #if str($repeat_penalty) != 'negative':
        --repeat-penalty=$repeat_penalty
    #end if

    #if $gap_extend:
        --gap-extend=$gap_extend
    #end if

    #if $weight:
        --weight=$weight
    #end if

    #if $min_scaled_penalty:
        --min-scaled-penalty=$min_scaled_penalty
    #end if

    --hmm-p-go-homologous=$hmm_p_go_homologous
    --hmm-p-go-unrelated=$hmm_p_go_unrelated
    --hmm-identity=$hmm_identity

    $seed_family
    $solid_seeds
    $coding_seeds
    $no_recursion
    $disable_backbone

    ## ---- Outputs ----
    --output='$output'
    #if $output_guide_tree:
        --output-guide-tree='$output_guide_tree_file'
    #end if

    #if $output_backbone:
        --backbone-output='$output_backbone_file'
    #end if

    ## ---- Sequences ----
    ## Positional arguments: the symlink names created at the top.
    #for $file in $sequences:
        `basename '$file'`
    #end for

]]></command>
<inputs>
    <!-- Sequence data and optional pre-computed alignment -->
    <param name="sequences" type="data" format="fasta" multiple="True"
           label="Select sequences to align"
           help="in fasta format" />
    <param name="apply_backbone" type="data" format="xmfa" optional="True"
           label="Apply Backbone"
           help="Read an existing sequence alignment in XMFA format and apply backbone statistics to it (--apply-backbone)" />

    <param name="island_gap_size" type="integer" value="20"
           label="Island gap size"
           help="Alignment gaps above this size in nucleotides are considered to be islands (--island-gap-size)" />

    <param name="disable_backbone" type="boolean" truevalue="--disable-backbone" falsevalue=""
           label="Disable backbone"
           help="Disable backbone detection (--disable-backbone)" />

    <!-- Switches that request additional output datasets -->
    <param name="output_guide_tree" type="boolean" truevalue="True" falsevalue=""
           label="Output Guide Tree"
           help="Write out the guide tree used for alignment to a file (--output-guide-tree)" />

    <param name="output_backbone" type="boolean" truevalue="True" falsevalue=""
           label="Output Backbone"
           help="Write out the backbone to a file (--backbone-output)" />

    <!-- Match finding and anchoring -->
    <param name="mums" type="boolean" truevalue="--mums" falsevalue=""
           label="MUMs"
           help="Find MUMs only, do not attempt to determine locally collinear blocks (LCBs) (--mums)" />

    <param name="seed_weight" type="integer" value="0" optional="True"
           label="Seed weight"
           help="Use the specified seed weight for calculating initial anchors (--seed-weight)" />

    <param name="match_input" type="data" format="tabular" optional="True"
           label="Match Input"
           help="Use specified match file instead of searching for matches (--match-input)" />

    <!-- Disabled option, kept for reference: -->
    <!--<param type="file" label="input-id-matrix" help="An identity matrix describing similarity among all pairs of input sequences/alignments (- -input-id-matrix)" />-->
    <param name="max_gapped_aligner_length" type="integer" value="0" optional="True"
           label="Max gapped aligner length"
           help="Maximum number of base pairs to attempt aligning with the gapped aligner (--max-gapped-aligner-length)" />

    <param name="input_guide_tree" type="data" format="nhx" optional="True"
           label="input-guide-tree"
           help="A phylogenetic guide tree in Newick format that describes the order in which sequences will be aligned (--input-guide-tree)" />

    <param name="collinear" type="boolean" truevalue="--collinear" falsevalue=""
           label="Collinear inputs"
           help="Assume that input sequences are collinear--they have no rearrangements (--collinear)" />

    <!-- Scoring and weight scaling -->
    <param name="scoring_scheme" type="select"
           label="Scoring scheme"
           help="Selects the anchoring score function. (--scoring-scheme)">
        <option value="sp" selected="True">Extant sum-of-pairs (sp)</option>
        <option value="ancestral_sp">Sum-of-pairs + Ancestral (ancestral_sp)</option>
        <option value="ancestral">Ancestral (ancestral)</option>
    </param>

    <param name="no_weight_scaling" type="boolean" truevalue="--no-weight-scaling" falsevalue=""
           label="No weight scaling"
           help="Don't scale LCB weights by conservation distance and breakpoint distance (--no-weight-scaling)" />

    <param name="max_breakpoint_distance_scale" type="float" min="0" max="1" value="0.5"
           label="max-breakpoint-distance-scale"
           help="Set the maximum weight scaling by breakpoint distance. (--max-breakpoint-distance-scale)" />

    <param name="conservation_distance_scale" type="float" min="0" max="1" value="0.5"
           label="conservation-distance-scale"
           help="Scale conservation distances by this amount. (--conservation-distance-scale)" />

    <param name="skip_refinement" type="boolean" truevalue="--skip-refinement" falsevalue=""
           label="Skip refinement"
           help="Do not perform iterative refinement (--skip-refinement)" />
    <param name="skip_gapped_alignment" type="boolean" truevalue="--skip-gapped-alignment" falsevalue=""
           label="Skip gapped alignment"
           help="Do not perform gapped alignment (--skip-gapped-alignment)" />
    <param name="bp_dist_estimate_min_score" type="integer" value="0" optional="True"
           label="BP dist estimate min score"
           help="Minimum LCB score for estimating pairwise breakpoint distance (--bp-dist-estimate-min-score)" />

    <!-- Gap and repeat penalties -->
    <param name="gap_open" type="integer" value="0" optional="True"
           label="Gap open"
           help="Gap open penalty (--gap-open)" />

    <param name="repeat_penalty" type="select"
           label="Repeat penalty"
           help="Sets whether the repeat scores go negative or go to zero for highly repetitive sequences. (--repeat-penalty)">
        <option value="negative" selected="True">Negative</option>
        <option value="zero">Zero</option>
    </param>

    <param name="gap_extend" type="integer" value="0" optional="True"
           label="Gap extend"
           help="Gap extend penalty (--gap-extend)" />

    <!-- Disabled option, kept for reference: -->
    <!--<param type="data" label="Substitution matrix" help="Nucleotide substitution matrix in NCBI format (- -substitution-matrix)" />-->

    <param name="weight" type="integer" value="0" optional="True"
           label="Weight"
           help="Minimum pairwise LCB score (--weight)" />
    <param name="min_scaled_penalty" type="integer" value="0" optional="True"
           label="Min scaled penalty"
           help="Minimum breakpoint penalty after scaling the penalty by expected divergence (--min-scaled-penalty)" />

    <!-- Homology HMM parameters -->
    <param name="hmm_p_go_homologous" type="float" min="0" max="1" value="0.00001"
           label="HMM p go homologous"
           help="Probability of transitioning from the unrelated to the homologous state (--hmm-p-go-homologous)" />
    <param name="hmm_p_go_unrelated" type="float" min="0" max="1" value="0.000000001"
           label="HMM p go unrelated"
           help="Probability of transitioning from the homologous to the unrelated state (--hmm-p-go-unrelated)" />
    <param name="hmm_identity" type="float" min="0" max="1" value="0.7"
           label="HMM identity"
           help="Expected level of sequence identity among pairs of sequences(--hmm-identity)" />

    <!-- Seed pattern switches -->
    <param name="seed_family" type="boolean" truevalue="--seed-family" falsevalue=""
           label="Seed family"
           help="Use a family of spaced seeds to improve sensitivity (--seed-family)" />
    <param name="solid_seeds" type="boolean" truevalue="--solid-seeds" falsevalue=""
           label="Solid seeds"
           help="Use solid seeds. Do not permit substitutions in anchor matches. (--solid-seeds)" />
    <param name="coding_seeds" type="boolean" truevalue="--coding-seeds" falsevalue=""
           label="Coding seeds"
           help="Use coding pattern seeds. Useful to generate matches coding regions with 3rd codon position degeneracy. (--coding-seeds)" />
    <param name="no_recursion" type="boolean" truevalue="--no-recursion" falsevalue=""
           label="No recursion"
           help="Disable recursive anchor search (--no-recursion)" />
</inputs>
<outputs>
    <!-- Primary result. Declared as XMFA; switched to tabular when the
         "mums" checkbox is set (its truevalue is the flag text matched below). -->
    <data format="xmfa" name="output" label="${tool.name} alignment of ${on_string}">
        <change_format>
            <when input="mums" value="--mums" format="tabular" />
        </change_format>
    </data>
    <!-- Optional results. The condition has to be a <filter> element: its text
         is evaluated against the input parameters, and the dataset is only
         created when it is true. A bare <when> child of <data> is not a
         condition, so the dataset would be created on every run. -->
    <data format="nhx" name="output_guide_tree_file" label="${tool.name} alignment of ${on_string}: Guide tree">
        <filter>output_guide_tree</filter>
    </data>
    <data format="tabular" name="output_backbone_file" label="${tool.name} alignment of ${on_string}: Backbone">
        <filter>output_backbone</filter>
    </data>
</outputs>
<tests>
    <!-- Two separate FASTA inputs, all options at their defaults. -->
    <test>
        <param name="sequences" value="phagey.fa,karma.fa" />
        <output name="output" file="1.xmfa" lines_diff="20" />
    </test>

    <!-- One FASTA input, checked against the same reference alignment. -->
    <test>
        <param name="sequences" value="merged.fa" />
        <output name="output" file="1.xmfa" lines_diff="20" />
    </test>

    <!-- Guide tree requested: the tree dataset is compared as well. -->
    <test>
        <param name="sequences" value="merged.fa" />
        <param name="output_guide_tree" value="True" />
        <output name="output" file="1.xmfa" lines_diff="20" />
        <output name="output_guide_tree_file" file="1.nhx" />
    </test>

    <!-- MUMs-only mode: the result is compared by size (sim_size, delta 1000). -->
    <test>
        <param name="sequences" value="merged.fa" />
        <param name="mums" value="True" />
        <output name="output" file="1.mums" compare="sim_size" delta="1000" />
    </test>

    <!-- Alignment seeded from an existing match file. -->
    <test>
        <param name="sequences" value="merged.fa" />
        <param name="match_input" value="1.mums" />
        <output name="output" file="1.xmfa" lines_diff="24" />
    </test>
</tests>
226 <help><![CDATA[
227 What it does
228 ============
229
230 Mauve is a system for efficiently constructing multiple genome alignments in
231 the presence of large-scale evolutionary events such as rearrangement and
232 inversion. Multiple genome alignment provides a basis for research into
233 comparative genomics and the study of evolutionary dynamics. Aligning whole
234 genomes is a fundamentally different problem than aligning short sequences.
235
236 Mauve has been developed with the idea that a multiple genome aligner should
237 require only modest computational resources. It employs algorithmic techniques
238 that scale well in the amount of sequence being aligned. For example, a pair of
239 Y. pestis genomes can be aligned in under a minute, while a group of 9
240 divergent Enterobacterial genomes can be aligned in a few hours.
241
242 progressiveMauve XMFA alignment visualized with the Mauve tool:
243
244 .. image:: $PATH_TO_IMAGES/hemolysin.jpg
245
246 Example Usage
247 =============
248
+-----------------------------------+-------------+
| Usage                             | Notes       |
+===================================+=============+
| Align genomes                     |Simply       |
|                                   |select as    |
|                                   |many fasta   |
|                                   |files with   |
|                                   |one or more  |
|                                   |sequences as |
|                                   |necessary    |
+-----------------------------------+-------------+
| Align genomes but also save       |Use the      |
| the guide tree and produce a      |**Output     |
| backbone file                     |Guide Tree** |
|                                   |and **Output |
|                                   |Backbone**   |
|                                   |options      |
+-----------------------------------+-------------+
| Align genomes, but do not         |Use the      |
| detect forced alignment of        |**Disable    |
| unrelated sequences               |backbone**   |
|                                   |option       |
+-----------------------------------+-------------+
| Detect forced alignment of        |Use the      |
| unrelated sequence in the         |**Apply      |
| alignment produced                |Backbone**   |
| in previous example, use          |option and   |
| custom Homology HMM transition    |specify the  |
| parameters.                       |XMFA file    |
|                                   |produced     |
|                                   |in the       |
|                                   |previous     |
|                                   |example      |
+-----------------------------------+-------------+
| Compute ungapped                  |Use the      |
| local-multiple alignments among   |**MUMs**     |
| the input sequences               |option       |
+-----------------------------------+-------------+
| Compute an alignment of the       |Set the      |
| same genomes, using previously    |**Match      |
| computed local-multiple           |Input** to   |
| alignments                        |the tabular  |
|                                   |MUMs file    |
|                                   |produced in  |
|                                   |the previous |
|                                   |example      |
+-----------------------------------+-------------+
| Set a minimum scaled              |Use the      |
| breakpoint penalty to cope with   |**Min Scaled |
| the case where most genomes       |Penalty** and|
| are aligned correctly, but manual |set to a     |
| inspection reveals that           |value like   |
| a divergent genome has too        |5000         |
| many predicted rearrangements.    |             |
+-----------------------------------+-------------+
| Globally align a set of           |Use the      |
| collinear virus                   |**Collinear**|
| genomes, using seed families      |and **Seed   |
| to improve anchoring sensitivity  |Family**     |
| in regions below 70% sequence     |options      |
| identity.                         |             |
+-----------------------------------+-------------+
311
312
313 The progressiveMauve algorithm: addressing limitations of the original algorithm
314 ================================================================================
315
316 Comparative genomics has revealed that closely-related bacteria often have
317 highly divergent gene content. While the original Mauve algorithm could align
318 regions conserved among all organisms, the portion of the genome conserved
319 among all taxa (the core genome) shrinks as more taxa are added to the
320 analysis. As such, the original Mauve algorithm did not scale well to large
321 numbers of taxa because it could not align regions conserved among subsets of
322 the genomes under study. progressiveMauve employs a different algorithmic
323 approach to scoring alignments that allows alignment of segments conserved
324 among subsets of taxa. The progressiveMauve algorithm has been described in
325 Aaron Darling's Ph.D. Thesis, and is also the subject of a manuscript published
326 in PLoS ONE. A brief overview is given here.
327
328 Finding initial local multiple alignments
329 -----------------------------------------
330
331 progressiveMauve elaborates on the original algorithm for finding local
332 multiple alignments. Instead of using a single seed pattern for match
333 filtration, progressiveMauve uses a combination of three seed patterns for
334 improved sensitivity. The palindromic seed patterns have been described in
335 Darling et al. 2006 "Procrastination leads to efficient filtration for local
336 multiple alignment"
337
338 Seed matches which represent a unique subsequence shared by two or more input
339 genomes are subjected to ungapped extension until the seed pattern no longer
340 matches. The result is an ungapped local multiple alignment with at most one
341 component from each of the input genome sequences.
342
343 Computing a pairwise genome content distance matrix and guide tree
344 ------------------------------------------------------------------
345
346 progressiveMauve builds up genome alignments progressively according to a guide
347 tree. The guide tree is computed based on an estimate of the shared gene
content among each pair of input genomes. For a pair of input genomes, gx and
gy, shared gene content is estimated by counting the number of nucleotides in
350 gx and gy aligned to each other in the initial set of local multiple
351 alignments. The count is normalized to a similarity value between 0 and 1 by
352 dividing by the average size of gx and gy. The similarity value is subtracted
353 from 1 to arrive at a distance estimate. Neighbor joining is then applied to
354 the matrix of distance estimates to yield a guide tree topology. Note that the
355 guide tree is not intended to be a phylogeny indicative of the genealogy of
356 input genomes. It is merely a computational crutch for progressive genome
357 alignment. Also note that alignments are later refined independently of a
single guide tree topology to avoid biasing later phylogenetic inference.
359
360 Computing a pairwise breakpoint distance matrix
361 -----------------------------------------------
362
363 Prior to alignment, progressiveMauve attempts to compute a conservative
364 estimate of the number of rearrangement breakpoints among any pair of genomes.
365 For each pair of genomes, pairwise alignments are created from the
366 local-multiple alignments and the pairwise alignments are subjected to greedy
367 breakpoint elimination. The breakpoint penalty used for greedy breakpoint
368 elimination is set high for closely related genomes and scaled downward
369 according to the estimate of genomic content distance. Because the breakpoint
370 penalty is high, the resulting set of locally collinear blocks represent
371 robustly supported segmental homology, and a conservative estimate of the
372 breakpoint distance can be made on this basis. The conservative estimate of
373 breakpoint distance is used later during progressive alignment to scale
374 breakpoint penalties.
375
376 Progressive genome alignment
377 ----------------------------
378
379 A genome alignment is progressively built up according to the guide tree. At
380 each step of the progressive genome alignment, alignment anchors are selected
381 from the initial set of local multiple alignments. Anchors are selected so that
382 they maximize a Sum-of-pairs scoring scheme which applies a penalty for
383 predicting breakpoints among any pair of genomes. Because rates of genomic
384 rearrangement are highly variable, especially in some bacterial pathogens, some
385 genomes may be expected to exhibit greater rearrangement than others. As such,
386 a single choice of scoring penalty is unlikely to yield accurate alignments for
387 all genomes. To cope with this phenomenon, progressiveMauve scales the
388 breakpoint penalty according to the expected level of sequence divergence and
389 the number of well-supported genomic rearrangements among the pair of input
390 genomes. These scaling values are taken from the distance matrices computed
391 earlier in the algorithm.
392
393 Anchored alignment
394 ------------------
395
396 Once anchors have been computed at a node in the guide tree, a global alignment
397 is computed on the basis of the anchors. Given a set of anchors among two
398 genomes, a genome and an alignment, or a pair of alignments, a modified MUSCLE
399 global alignment algorithm is applied to compute an anchored profile-profile
400 alignment. MUSCLE is then used to perform tree-independent iterative refinement
401 on the global genome alignment.
402
403 Rejecting alignment of unrelated sequence
404 -----------------------------------------
405
406 Although we compute a global alignment among sequences, genomes often contain
407 lineage-specific sequence and are thus not globally related. The global
408 alignment will often contain forced alignment of unrelated sequence. A simple
409 hidden Markov model structure is used to detect forced alignment of unrelated
sequence, which is then removed from the alignment.
411
412 Strengths of the progressiveMauve algorithm
413 -------------------------------------------
414
415 - It can be applied to a much larger number of genomes than the original Mauve
416 algorithm
417 - It can align more divergent genomes than the original algorithm. Genomes
418 with as little as 50% nucleotide identity can be alignable
419 - Manual adjustment of the alignment scoring parameters is usually not
420 necessary
421 - It aligns the pan-genome, e.g. regions conserved among subsets of the input
422 genomes
423 - It is more accurate than the previous Mauve algorithm
424
425 Notes on Reproducibility
426 ------------------------
427
428 The command line programme progressiveMauve seems to behave differently when::
429
430 --max-breakpoint-distance-scale=0.5 --conservation-distance-scale=0.5
431
432 are passed to the tool, compared to when those options are not passed. This
433 means that if you wish to precisely replicate the results you see in Galaxy at
434 the command line, you'll need to pass these flags with their "default" values.
435
436 @ATTRIBUTION@
437 ]]></help>
438 <expand macro="citation" />
439 </tool>