diff progressivemauve.xml @ 0:74093fb62bdf draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/progressivemauve commit 2645abbd04dd68266f995b8259e991c31388cda8
author iuc
date Wed, 17 Aug 2016 14:46:55 -0400
parents
children bca52822843e
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/progressivemauve.xml	Wed Aug 17 14:46:55 2016 -0400
@@ -0,0 +1,439 @@
+<?xml version="1.0"?>
+<tool id="progressivemauve" name="progressiveMauve" version="@WRAPPER_VERSION@.0">
+  <description>constructs multiple genome alignments</description>
+  <macros>
+    <import>macros.xml</import>
+  </macros>
+  <expand macro="requirements"/>
+  <expand macro="stdio"/>
+  <version_command>progressiveMauve --version</version_command>
+  <command><![CDATA[
+## Symlink every input dataset into the current directory under its own base
+## name; the same base names are handed to progressiveMauve at the end.
+#for $file in $sequences:
+    ln -s "${file}" `basename "${file}"`;
+#end for
+
+progressiveMauve
+## Input Options
+
+## Optional parameters are wrapped in a Cheetah if-block so the flag is only
+## emitted when the parameter evaluates as true.
+#if $apply_backbone:
+    --apply-backbone=$apply_backbone
+#end if
+--island-gap-size=$island_gap_size
+## Boolean parameters expand to their flag (truevalue) or to an empty string.
+$mums
+
+#if $seed_weight:
+    --seed-weight=$seed_weight
+#end if
+
+#if $max_gapped_aligner_length:
+    --max-gapped-aligner-length=$max_gapped_aligner_length
+#end if
+
+## The guide tree input is declared in the form, so pass it through when set.
+#if $input_guide_tree:
+    --input-guide-tree=$input_guide_tree
+#end if
+
+#if $match_input:
+    --match-input=$match_input
+#end if
+
+$collinear
+--scoring-scheme=$scoring_scheme
+$no_weight_scaling
+
+--max-breakpoint-distance-scale=$max_breakpoint_distance_scale
+--conservation-distance-scale=$conservation_distance_scale
+$skip_refinement
+$skip_gapped_alignment
+
+#if $bp_dist_estimate_min_score:
+    --bp-dist-estimate-min-score=$bp_dist_estimate_min_score
+#end if
+
+#if $gap_open:
+    --gap-open=$gap_open
+#end if
+
+## Only a non-default repeat penalty is passed: "negative" is the option
+## selected by default in the form, and leaving it off keeps the command line
+## for default runs unchanged.
+## NOTE(review): confirm "negative" is also progressiveMauve's built-in default.
+#if str($repeat_penalty) != "negative":
+    --repeat-penalty=$repeat_penalty
+#end if
+
+#if $gap_extend:
+    --gap-extend=$gap_extend
+#end if
+
+#if $weight:
+    --weight=$weight
+#end if
+
+#if $min_scaled_penalty:
+    --min-scaled-penalty=$min_scaled_penalty
+#end if
+
+--hmm-p-go-homologous=$hmm_p_go_homologous
+--hmm-p-go-unrelated=$hmm_p_go_unrelated
+--hmm-identity=$hmm_identity
+
+$seed_family
+$solid_seeds
+$coding_seeds
+$no_recursion
+$disable_backbone
+
+## Outputs
+--output=$output
+## The extra output files are only requested when the matching checkbox is set.
+#if $output_guide_tree:
+    --output-guide-tree=$output_guide_tree_file
+#end if
+
+#if $output_backbone:
+    --backbone-output=$output_backbone_file
+#end if
+
+## Sequences: positional arguments, referring to the symlinks created above.
+#for $file in $sequences:
+    `basename "${file}"`
+#end for
+
+]]></command>
+  <inputs>
+    <!-- Sequences to align, plus an optional existing XMFA alignment. -->
+    <param type="data" format="fasta" name="sequences" multiple="True"
+        label="Select sequences to align" help="in fasta format" />
+    <param type="data" format="xmfa" label="Apply Backbone" name="apply_backbone" optional="True"
+        help="Read an existing sequence alignment in XMFA format and apply backbone statistics to it (--apply-backbone)" />
+
+    <param type="integer" label="Island gap size" value="20" name="island_gap_size"
+        help="Alignment gaps above this size in nucleotides are considered to be islands (--island-gap-size)"/>
+
+    <!-- Boolean flags: truevalue is the literal command-line flag, falsevalue is empty. -->
+    <param type="boolean" truevalue="--disable-backbone" falsevalue="" name="disable_backbone"
+        label="Disable backbone" help="Disable backbone detection (--disable-backbone)" />
+
+    <!-- These two booleans switch the optional output datasets on; they do not
+         expand to a flag themselves. -->
+    <param type="boolean" truevalue="True" falsevalue="" name="output_guide_tree"
+        label="Output Guide Tree" help="Write out the guide tree used for alignment to a file (--output-guide-tree)" />
+
+    <param type="boolean" truevalue="True" falsevalue="" name="output_backbone"
+        label="Output Backbone" help="Write out the backbone to a file (--backbone-output)" />
+
+    <param type="boolean" truevalue="--mums" falsevalue="" label="MUMs" name="mums"
+        help="Find MUMs only, do not attempt to determine locally collinear blocks (LCBs) (--mums)" />
+
+    <param type="integer" label="Seed weight" name="seed_weight" value="0" optional="True"
+        help="Use the specified seed weight for calculating initial anchors (--seed-weight)" />
+
+    <param type="data" format="tabular" label="Match Input" name="match_input" optional="True"
+        help="Use specified match file instead of searching for matches (--match-input)" />
+
+    <!--<param type="file" label="input-id-matrix" help="An identity matrix describing similarity among all pairs of input sequences/alignments (- -input-id-matrix)" />-->
+    <param type="integer" label="Max gapped aligner length" value="0" optional="True" name="max_gapped_aligner_length"
+        help="Maximum number of base pairs to attempt aligning with the gapped aligner (--max-gapped-aligner-length)" />
+
+    <param type="data" format="nhx" label="input-guide-tree" optional="True" name="input_guide_tree"
+        help="A phylogenetic guide tree in Newick format that describes the order in which sequences will be aligned (--input-guide-tree)" />
+
+    <param type="boolean" truevalue="--collinear" falsevalue="" label="Collinear inputs" name="collinear"
+        help="Assume that input sequences are collinear--they have no rearrangements (--collinear)" />
+
+    <param type="select" label="Scoring scheme" name="scoring_scheme" help="Selects the anchoring score function. (--scoring-scheme)" >
+        <option value="sp" selected="True">Extant sum-of-pairs (sp)</option>
+        <option value="ancestral_sp">Sum-of-pairs + Ancestral (ancestral_sp)</option>
+        <option value="ancestral">Ancestral (ancestral)</option>
+    </param>
+
+    <param type="boolean" truevalue="--no-weight-scaling" falsevalue="" label="No weight scaling" name="no_weight_scaling"
+        help="Don't scale LCB weights by conservation distance and breakpoint distance (--no-weight-scaling)" />
+
+    <param type="float" min="0" max="1" label="max-breakpoint-distance-scale" value="0.5" name="max_breakpoint_distance_scale"
+        help="Set the maximum weight scaling by breakpoint distance. (--max-breakpoint-distance-scale)" />
+
+    <param type="float" min="0" max="1" label="conservation-distance-scale" value="0.5" name="conservation_distance_scale"
+        help="Scale conservation distances by this amount. (--conservation-distance-scale)" />
+
+    <param type="boolean" truevalue="--skip-refinement" falsevalue="" label="Skip refinement" name="skip_refinement"
+        help="Do not perform iterative refinement (--skip-refinement)" />
+    <param type="boolean" truevalue="--skip-gapped-alignment" falsevalue="" label="Skip gapped alignment" name="skip_gapped_alignment"
+        help="Do not perform gapped alignment (--skip-gapped-alignment)" />
+    <param type="integer" label="BP dist estimate min score" name="bp_dist_estimate_min_score" value="0" optional="True"
+        help="Minimum LCB score for estimating pairwise breakpoint distance (--bp-dist-estimate-min-score)" />
+
+    <!-- Gap and repeat scoring -->
+    <param type="integer" label="Gap open" name="gap_open" value="0" optional="True"
+        help="Gap open penalty (--gap-open)" />
+
+    <param type="select" label="Repeat penalty" name="repeat_penalty"
+        help="Sets whether the repeat scores go negative or go to zero for highly repetitive sequences. (--repeat-penalty)">
+        <option value="negative" selected="True">Negative</option>
+        <option value="zero">Zero</option>
+    </param>
+
+    <param type="integer" label="Gap extend" name="gap_extend" value="0" optional="True"
+        help="Gap extend penalty (--gap-extend)" />
+
+    <!--<param type="data" label="Substitution matrix" help="Nucleotide substitution matrix in NCBI format (- -substitution-matrix)" />-->
+
+    <param type="integer" label="Weight" name="weight" value="0" optional="True"
+        help="Minimum pairwise LCB score (--weight)" />
+    <param type="integer" label="Min scaled penalty" name="min_scaled_penalty" value="0" optional="True"
+        help="Minimum breakpoint penalty after scaling the penalty by expected divergence (--min-scaled-penalty)" />
+
+    <!-- Homology HMM parameters, each constrained to the range 0 to 1 -->
+    <param type="float" label="HMM p go homologous" name="hmm_p_go_homologous" min="0" max="1" value="0.00001"
+        help="Probability of transitioning from the unrelated to the homologous state (--hmm-p-go-homologous)" />
+    <param type="float" label="HMM p go unrelated" name="hmm_p_go_unrelated" min="0" max="1" value="0.000000001"
+        help="Probability of transitioning from the homologous to the unrelated state (--hmm-p-go-unrelated)" />
+    <param type="float" label="HMM identity" name="hmm_identity" min="0" max="1" value="0.7"
+        help="Expected level of sequence identity among pairs of sequences (--hmm-identity)" />
+
+    <!-- Seed and anchoring switches -->
+    <param type="boolean" truevalue="--seed-family" falsevalue="" label="Seed family" name="seed_family"
+        help="Use a family of spaced seeds to improve sensitivity (--seed-family)" />
+    <param type="boolean" truevalue="--solid-seeds" falsevalue="" label="Solid seeds" name="solid_seeds"
+        help="Use solid seeds. Do not permit substitutions in anchor matches. (--solid-seeds)" />
+    <param type="boolean" truevalue="--coding-seeds" falsevalue="" label="Coding seeds" name="coding_seeds"
+        help="Use coding pattern seeds. Useful to generate matches in coding regions with 3rd codon position degeneracy. (--coding-seeds)" />
+    <param type="boolean" truevalue="--no-recursion" falsevalue="" label="No recursion" name="no_recursion"
+        help="Disable recursive anchor search (--no-recursion)" />
+  </inputs>
+  <outputs>
+    <!-- Main result: XMFA by default, switched to tabular when the MUMs option is enabled. -->
+    <data format="xmfa" name="output" label="${tool.name} alignment of ${on_string}">
+       <change_format>
+           <when input="mums" value="--mums" format="tabular" />
+       </change_format>
+    </data>
+    <!-- Optional outputs: a filter element makes each dataset conditional on
+         the boolean input named in it. A bare when element is only meaningful
+         inside change_format. -->
+    <data format="nhx" name="output_guide_tree_file" label="${tool.name} alignment of ${on_string}: Guide tree">
+        <filter>output_guide_tree</filter>
+    </data>
+    <data format="tabular" name="output_backbone_file" label="${tool.name} alignment of ${on_string}: Backbone">
+        <filter>output_backbone</filter>
+    </data>
+  </outputs>
+  <tests>
+    <!-- Two separate FASTA inputs aligned with default settings -->
+    <test>
+      <param name="sequences" value="phagey.fa,karma.fa"/>
+      <output name="output" file="1.xmfa" lines_diff="20"/>
+    </test>
+    <!-- A single merged FASTA input, compared against the same reference alignment -->
+    <test>
+      <param name="sequences" value="merged.fa"/>
+      <output name="output" file="1.xmfa" lines_diff="20"/>
+    </test>
+    <!-- Guide tree output enabled: checks both the alignment and the tree file -->
+    <test>
+      <param name="sequences" value="merged.fa"/>
+      <param name="output_guide_tree" value="True"/>
+      <output name="output" file="1.xmfa" lines_diff="20"/>
+      <output name="output_guide_tree_file" file="1.nhx"/>
+    </test>
+    <!-- MUMs-only mode: output is compared by approximate size -->
+    <test>
+      <param name="sequences" value="merged.fa"/>
+      <param name="mums" value="True"/>
+      <output name="output" file="1.mums" compare="sim_size" delta="1000"/>
+    </test>
+    <!-- Alignment seeded from a previously computed match file -->
+    <test>
+      <param name="sequences" value="merged.fa"/>
+      <param name="match_input" value="1.mums"/>
+      <output name="output" file="1.xmfa" lines_diff="24"/>
+    </test>
+  </tests>
+  <help><![CDATA[
+What it does
+============
+
+Mauve is a system for efficiently constructing multiple genome alignments in
+the presence of large-scale evolutionary events such as rearrangement and
+inversion. Multiple genome alignment provides a basis for research into
+comparative genomics and the study of evolutionary dynamics. Aligning whole
+genomes is a fundamentally different problem than aligning short sequences.
+
+Mauve has been developed with the idea that a multiple genome aligner should
+require only modest computational resources. It employs algorithmic techniques
+that scale well in the amount of sequence being aligned. For example, a pair of
+Y. pestis genomes can be aligned in under a minute, while a group of 9
+divergent Enterobacterial genomes can be aligned in a few hours.
+
+progressiveMauve XMFA alignment visualized with the Mauve tool:
+
+.. image:: $PATH_TO_IMAGES/hemolysin.jpg
+
+Example Usage
+=============
+
++-----------------------------------+-------------+
+| Usage                             | Notes       |
++===================================+=============+
+| Align genomes                     |Simply       |
+|                                   |select as    |
+|                                   |many fasta   |
+|                                   |files with   |
+|                                   |one or more  |
+|                                   |sequences as |
+|                                   |necessary    |
++-----------------------------------+-------------+
+| Align genomes but also save       |Use the      |
+| the guide tree and produce a      |**Output     |
+| backbone file                     |Guide Tree** |
+|                                   |and **Output |
+|                                   |Backbone**   |
+|                                   |options      |
++-----------------------------------+-------------+
+| Align genomes, but do not         |Use the      |
+| detect forced alignment of        |**Disable    |
+| unrelated sequences               |backbone**   |
+|                                   |option       |
++-----------------------------------+-------------+
+| Detect forced alignment of        |Use the      |
+| unrelated sequence in the         |**Apply      |
+| alignment produced                |Backbone**   |
+| in previous example, use          |option and   |
+| custom Homology HMM transition    |specify the  |
+| parameters.                       |XMFA file    |
+|                                   |produced     |
+|                                   |in the       |
+|                                   |previous     |
+|                                   |example      |
++-----------------------------------+-------------+
+| Compute ungapped                  |Use the      |
+| local-multiple alignments among   |**MUMs**     |
+| the input sequences               |option       |
++-----------------------------------+-------------+
+| Compute an alignment of the       |Set the      |
+| same genomes, using previously    |**Match      |
+| computed local-multiple           |Input** to   |
+| alignments                        |the tabular  |
+|                                   |MUMs file    |
+|                                   |produced in  |
+|                                   |the previous |
+|                                   |example      |
++-----------------------------------+-------------+
+| Set a minimum scaled              |Use the      |
+| breakpoint penalty to cope with   |**Min Scaled |
+| the case where most genomes       |Penalty** and|
+| are aligned correctly, but manual |set to a     |
+| inspection reveals that           |value like   |
+| a divergent genome has too        |5000         |
+| many predicted rearrangements.    |             |
++-----------------------------------+-------------+
+| Globally align a set of           |Use the      |
+| collinear virus                   |**Collinear  |
+| genomes, using seed families      |inputs** and |
+| to improve anchoring sensitivity  |**Seed       |
+| in regions below 70% sequence     |Family**     |
+| identity.                         |options      |
++-----------------------------------+-------------+
+
+
+The progressiveMauve algorithm: addressing limitations of the original algorithm
+================================================================================
+
+Comparative genomics has revealed that closely-related bacteria often have
+highly divergent gene content. While the original Mauve algorithm could align
+regions conserved among all organisms, the portion of the genome conserved
+among all taxa (the core genome) shrinks as more taxa are added to the
+analysis. As such, the original Mauve algorithm did not scale well to large
+numbers of taxa because it could not align regions conserved among subsets of
+the genomes under study. progressiveMauve employs a different algorithmic
+approach to scoring alignments that allows alignment of segments conserved
+among subsets of taxa. The progressiveMauve algorithm has been described in
+Aaron Darling's Ph.D. Thesis, and is also the subject of a manuscript published
+in PLoS ONE. A brief overview is given here.
+
+Finding initial local multiple alignments
+-----------------------------------------
+
+progressiveMauve elaborates on the original algorithm for finding local
+multiple alignments. Instead of using a single seed pattern for match
+filtration, progressiveMauve uses a combination of three seed patterns for
+improved sensitivity. The palindromic seed patterns have been described in
+Darling et al. 2006 "Procrastination leads to efficient filtration for local
+multiple alignment".
+
+Seed matches which represent a unique subsequence shared by two or more input
+genomes are subjected to ungapped extension until the seed pattern no longer
+matches. The result is an ungapped local multiple alignment with at most one
+component from each of the input genome sequences.
+
+Computing a pairwise genome content distance matrix and guide tree
+------------------------------------------------------------------
+
+progressiveMauve builds up genome alignments progressively according to a guide
+tree. The guide tree is computed based on an estimate of the shared gene
+content among each pair of input genomes. For a pair of input genomes, gx and
+gy, shared gene content is estimated by counting the number of nucleotides in
+gx and gy aligned to each other in the initial set of local multiple
+alignments. The count is normalized to a similarity value between 0 and 1 by
+dividing by the average size of gx and gy. The similarity value is subtracted
+from 1 to arrive at a distance estimate. Neighbor joining is then applied to
+the matrix of distance estimates to yield a guide tree topology. Note that the
+guide tree is not intended to be a phylogeny indicative of the genealogy of
+input genomes. It is merely a computational crutch for progressive genome
+alignment. Also note that alignments are later refined independently of a
+single guide tree topology to avoid biasing later phylogenetic inference.
+
+Computing a pairwise breakpoint distance matrix
+-----------------------------------------------
+
+Prior to alignment, progressiveMauve attempts to compute a conservative
+estimate of the number of rearrangement breakpoints among any pair of genomes.
+For each pair of genomes, pairwise alignments are created from the
+local-multiple alignments and the pairwise alignments are subjected to greedy
+breakpoint elimination. The breakpoint penalty used for greedy breakpoint
+elimination is set high for closely related genomes and scaled downward
+according to the estimate of genomic content distance. Because the breakpoint
+penalty is high, the resulting set of locally collinear blocks represent
+robustly supported segmental homology, and a conservative estimate of the
+breakpoint distance can be made on this basis. The conservative estimate of
+breakpoint distance is used later during progressive alignment to scale
+breakpoint penalties.
+
+Progressive genome alignment
+----------------------------
+
+A genome alignment is progressively built up according to the guide tree. At
+each step of the progressive genome alignment, alignment anchors are selected
+from the initial set of local multiple alignments. Anchors are selected so that
+they maximize a Sum-of-pairs scoring scheme which applies a penalty for
+predicting breakpoints among any pair of genomes. Because rates of genomic
+rearrangement are highly variable, especially in some bacterial pathogens, some
+genomes may be expected to exhibit greater rearrangement than others. As such,
+a single choice of scoring penalty is unlikely to yield accurate alignments for
+all genomes. To cope with this phenomenon, progressiveMauve scales the
+breakpoint penalty according to the expected level of sequence divergence and
+the number of well-supported genomic rearrangements among the pair of input
+genomes. These scaling values are taken from the distance matrices computed
+earlier in the algorithm.
+
+Anchored alignment
+------------------
+
+Once anchors have been computed at a node in the guide tree, a global alignment
+is computed on the basis of the anchors. Given a set of anchors among two
+genomes, a genome and an alignment, or a pair of alignments, a modified MUSCLE
+global alignment algorithm is applied to compute an anchored profile-profile
+alignment. MUSCLE is then used to perform tree-independent iterative refinement
+on the global genome alignment.
+
+Rejecting alignment of unrelated sequence
+-----------------------------------------
+
+Although we compute a global alignment among sequences, genomes often contain
+lineage-specific sequence and are thus not globally related. The global
+alignment will often contain forced alignment of unrelated sequence. A simple
+hidden Markov model structure is used to detect forced alignment of unrelated
+sequence, which are then removed from the alignment.
+
+Strengths of the progressiveMauve algorithm
+-------------------------------------------
+
+-  It can be applied to a much larger number of genomes than the original Mauve
+   algorithm
+-  It can align more divergent genomes than the original algorithm. Genomes
+   with as little as 50% nucleotide identity may be alignable
+-  Manual adjustment of the alignment scoring parameters is usually not
+   necessary
+-  It aligns the pan-genome, e.g. regions conserved among subsets of the input
+   genomes
+-  It is more accurate than the previous Mauve algorithm
+
+Notes on Reproducibility
+------------------------
+
+The command line programme progressiveMauve seems to behave differently when::
+
+    --max-breakpoint-distance-scale=0.5 --conservation-distance-scale=0.5
+
+are passed to the tool, compared to when those options are not passed. This
+means that if you wish to precisely replicate the results you see in Galaxy at
+the command line, you'll need to pass these flags with their "default" values.
+
+@ATTRIBUTION@
+]]></help>
+  <expand macro="citation" />
+</tool>