Mercurial > repos > devteam > lastz

--- a/lastz.xml	Thu May 17 04:29:13 2018 -0400
+++ b/lastz.xml	Fri May 18 16:58:38 2018 -0400
@@ -1,4 +1,4 @@
-<tool id="lastz_wrapper_2" name="LASTZ" version="1.3.1">
+<tool id="lastz_wrapper_2" name="LASTZ" version="1.3.2">
     <description>: align long sequences</description>
     <macros>
         <import>lastz_macros.xml</import>
@@ -238,6 +238,11 @@
             '--inner=${interpolation.inner}'
         #end if

+## HOUSEKEEPING ----------------------------------
+
+        --traceback=160M
+
+
 ## OUTPUT FORMATS --------------------------------

         #if str( $output_format.out.format ) == "bam":
@@ -252,13 +257,16 @@
             '--format=general-:${output_format.out.fields}'
         #end if
         --action:target=multiple
-        --rdotplot=plot.r
+        $output_format.rplot
         #if str( $output_format.out.format ) == "bam":
-            | samtools sort -@\${GALAXY_SLOTS:-2} -O bam -o '${output}' &&
+            | samtools sort -@\${GALAXY_SLOTS:-2} -O bam -o '${output}'
         #else:
-            > '${output}' &&
+            > '${output}'
         #end if
-        Rscript $r_plot > /dev/null 2>&1
+        #if $output_format.rplot:
+            &&
+            Rscript $r_plot > /dev/null 2>&1
+        #end if

         ]]>
     </command>
@@ -349,7 +357,7 @@
                 </when>
             </conditional>
             <param name="ambigN" type="boolean" truevalue="--ambiguous=n" checked="false" label="Treat each N in the input sequences as an ambiguous nucleotide" argument="--ambiguous=n" help="Substitutions with N are scored as zero, instead of using the fill_score value from the scoring file (which is -100 by default)."/>
-            <param name="ambigIUPAC" type="boolean" truevalue="--ambiguous=iupac" checked="false" label="Treat each of the IUPAC-IUB ambiguity codes (B, D, H, K, M, R, S, V, W, and Y, as well as N) in the input sequences as a completely ambiguous nucleotide." argument="--ambiguous=iupac" help="Substitutions with these characters are scored as zero, instead of using the fill_score value from the scoring file (which is -100 by default)."/>
+            <param name="ambigIUPAC" type="boolean" truevalue="--ambiguous=iupac" checked="true" label="Treat each of the IUPAC-IUB ambiguity codes (B, D, H, K, M, R, S, V, W, and Y, as well as N) in the input sequences as a completely ambiguous nucleotide." argument="--ambiguous=iupac" help="Substitutions with these characters are scored as zero, instead of using the fill_score value from the scoring file (which is -100 by default)."/>
         </section>
         <section name="seeding" expanded="false" title="Seeding">
             <conditional name="seed">
@@ -560,6 +568,7 @@
                     <!-- Do nothing -->
                 </when>
             </conditional>
+            <param name="rplot" type="boolean" truevalue="--rdotplot=plot.r" falsevalue="" checked="false" argument="--rdotplot" label="Create a dotplot representation of alignments?" help="The dotplot is only useful if query and target contain exactly one sequence each"/>
         </section>
     </inputs>
     <outputs>
@@ -569,7 +578,9 @@
                   <when input="output_format.out.format" value="maf" format="maf" />
             </change_format>
         </data>
-        <data format="png" name="out_plot" label="${tool.name} on ${on_string}: dot plot"/>
+        <data format="png" name="out_plot" label="${tool.name} on ${on_string}: dot plot">
+            <filter>output_format['rplot']</filter>
+        </data>
     </outputs>
     <tests>
         <test>
@@ -609,6 +620,8 @@
             <param name="ref_source" value="history" />
             <param name="target" ftype="fasta.gz" value="chrM_human.fa.gz" />
             <param name="query" ftype="fastq.bz2" value="chrM_mouse.fq.bz2" />
+            <param name="traceback" value="83886080" />
+            <param name="word" value="28" />
             <param name="strand" value="--strand=both" />
             <param name="format" value="blastn" />
             <output name="output" value="test5.out" />
@@ -633,84 +646,88 @@

     <help><![CDATA[

-        **What is does**
+**What it does**

-        LASTZ is designed to preprocess one sequence or set of sequences (which we collectively call the *TARGET*) and then align several *QUERY* sequences to it. It was developed by `Bob Harris <http://www.bx.psu.edu/~rsharris/>`_ in the lab of Webb Miller at Penn State.
+LASTZ is designed to preprocess one sequence or set of sequences (which we collectively call the *TARGET*) and then align several *QUERY* sequences to it. It was developed by `Bob Harris <http://www.bx.psu.edu/~rsharris/>`_ in the lab of Webb Miller at Penn State.

-        .. class:: warningmark
+.. class:: infomark

-        **Read documentation** before proceeding. LASTZ is a complex tool with many parameter options. Fortunately, there is a `great manual <https://lastz.github.io/lastz/>`_ maintained by its author. Default parameters may be sufficient to obtain the initial idea about how similar your sequences are, but to produce reliable alignments you may need to tweak the parameters. So RTFM!
+**Read documentation** before proceeding. LASTZ is a complex tool with many parameter options. Fortunately, there is a `great manual <https://lastz.github.io/lastz/>`_ maintained by its author. Default parameters may be sufficient to obtain the initial idea about how similar your sequences are, but to produce reliable alignments you may need to tweak the parameters. So RTFM!

-        **About LASTZ parameters**
+.. class:: warningmark

-        Galaxy's version of LASTZ has nine parameter sections (*Where to look*, *Scoring*, *Seeding*, *HSPs*, *Chaining*, *Gapped extension*, *Filtering*, *Interpolation*, and *Output*). These sections closely follow parameter description in the `manual <https://lastz.github.io/lastz/#syntax>`_.
+The Galaxy version of LASTZ sets **--ambiguous=iupac** as default (see **Scoring** section). This prevents LASTZ from erroring out if one of the DNA inputs contains "non-standard" nucleotides.

-        **Defaults**
+**About LASTZ parameters**

-        here are defaults for some of the most important parameters::
+Galaxy's version of LASTZ has nine parameter sections (*Where to look*, *Scoring*, *Seeding*, *HSPs*, *Chaining*, *Gapped extension*, *Filtering*, *Interpolation*, and *Output*). These sections closely follow parameter description in the `manual <https://lastz.github.io/lastz/#syntax>`_.

-            --seed=<pattern>       set seed pattern (12of19, 14of22, or general pattern)
-                                   (default is 1110100110010101111)
-                                   SEE "Seeding" SECTION -> "Select seed type"
+**Defaults**
+
+Here are defaults for some of the most important parameters::

-            --[no]transition       allow (or don't) one transition in a seed hit
-                                   (by default a transition is allowed)
-                                   SEE "Seeding" SECTION -> "Allow transitions"
+    --seed=<pattern>       set seed pattern (12of19, 14of22, or general pattern)
+                           (default is 1110100110010101111)
+                           SEE "Seeding" SECTION -> "Select seed type"

-            --[no]chain            perform chaining
-                                   (by default no chaining is performed)
-                                   SEE "Chaining" SECTION
+    --[no]transition       allow (or don't) one transition in a seed hit
+                           (by default a transition is allowed)
+                           SEE "Seeding" SECTION -> "Allow transitions"

-            --[no]gapped           perform gapped alignment (instead of gap-free)
-                                   (by default gapped alignment is performed)
-                                   SEE "Gapped extension" SECTION
+    --[no]chain            perform chaining
+                           (by default no chaining is performed)
+                           SEE "Chaining" SECTION
+
+    --[no]gapped           perform gapped alignment (instead of gap-free)
+                           (by default gapped alignment is performed)
+                           SEE "Gapped extension" SECTION

-            --strand=both          search both strands
-            --strand=plus          search + strand only (matching strand of query spec)
-                                   (by default both strands are searched)
-                                   SEE "Where to look" SECTION
+    --strand=both          search both strands
+    --strand=plus          search + strand only (matching strand of query spec)
+                           (by default both strands are searched)
+                           SEE "Where to look" SECTION

-            --scores=<file>        read substitution and gap scores from a file
-                                   SEE "Scoring" SECTION
+    --scores=<file>        read substitution and gap scores from a file
+                           SEE "Scoring" SECTION

-            --xdrop=<score>        set x-drop threshold (default is 10sub[A][A])
-                                   SEE "HSPs" SECTION
+    --xdrop=<score>        set x-drop threshold (default is 10sub[A][A])
+                           SEE "HSPs" SECTION

-            --ydrop=<score>        set y-drop threshold (default is open+300extend)
-                                   SEE "Gapped extension" SECTION
+    --ydrop=<score>        set y-drop threshold (default is open+300extend)
+                           SEE "Gapped extension" SECTION

-            --hspthresh=<score>    set threshold for high scoring pairs (default is 3000)
-                                   ungapped extensions scoring lower are discarded
-                                   <score> can also be a percentage or base count
-                                   SEE "HSPs" SECTION
+    --hspthresh=<score>    set threshold for high scoring pairs (default is 3000)
+                           ungapped extensions scoring lower are discarded
+                           <score> can also be a percentage or base count
+                           SEE "HSPs" SECTION

-            --gappedthresh=<score> set threshold for gapped alignments
-                                   gapped extensions scoring lower are discarded
-                                   <score> can also be a percentage or base count
-                                   (default is to use same value as --hspthresh)
-                                   SEE "Gapped extension" SECTION
+    --gappedthresh=<score> set threshold for gapped alignments
+                           gapped extensions scoring lower are discarded
+                           <score> can also be a percentage or base count
+                           (default is to use same value as --hspthresh)
+                           SEE "Gapped extension" SECTION


-        **Substitution matrix**
+**Substitution matrix**

-        By default the HOXD70 substitution scores are used (from `Chiaromonte et al. 2002 <https://www.ncbi.nlm.nih.gov/pubmed/11928468>`_)::
+By default the HOXD70 substitution scores are used (from `Chiaromonte et al. 2002 <https://www.ncbi.nlm.nih.gov/pubmed/11928468>`_)::

-            bad_score          = X:-1000  # used for sub['X'][*] and sub[*]['X']
-            fill_score         = -100     # used when sub[*][*] is not defined
-            gap_open_penalty   =  400
-            gap_extend_penalty =   30
+    bad_score          = X:-1000  # used for sub['X'][*] and sub[*]['X']
+    fill_score         = -100     # used when sub[*][*] is not defined
+    gap_open_penalty   =  400
+    gap_extend_penalty =   30

-                 A     C     G     T
-            A   91  -114   -31  -123
-            C -114   100  -125   -31
-            G  -31  -125   100  -114
-            T -123   -31  -114    91
+         A     C     G     T
+    A   91  -114   -31  -123
+    C -114   100  -125   -31
+    G  -31  -125   100  -114
+    T -123   -31  -114    91

-        Matrix can be supplied as an input to **Read the substitution scores** parameter in *Scoring* section. Substitution matrix can be inferred from your data using another LASTZ-based tool (LASTZ_D: Infer substitution scores).
+Matrix can be supplied as an input to **Read the substitution scores** parameter in *Scoring* section. Substitution matrix can be inferred from your data using another LASTZ-based tool (LASTZ_D: Infer substitution scores).

-        **Output**
+**Output**

-        This version of LASTZ produces two outputs by default: a BAM alignment file and a dot-plot in PNG format. Other formats can be configured in *Output* section. This incarnation of LASTZ produces outputs without comment line starting with '#'. To learn identity of each column, consult `formats section of LASTZ manual <https://lastz.github.io/lastz/#formats>`_.
+This version of LASTZ produces one output by default: a BAM alignment file. Other formats as well as a Dot Plot can be configured in *Output* section. This incarnation of LASTZ produces outputs without comment line starting with '#'. To learn identity of each column, consult `formats section of LASTZ manual <https://lastz.github.io/lastz/#formats>`_.

         ]]>
     </help>
--- a/lastz_d.xml	Thu May 17 04:29:13 2018 -0400
+++ b/lastz_d.xml	Fri May 18 16:58:38 2018 -0400
@@ -1,4 +1,4 @@
-<tool id="lastz_d_wrapper" name="LASTZ_D" version="1.3.1">
+<tool id="lastz_d_wrapper" name="LASTZ_D" version="1.3.2">
     <description>: estimate substitution scores matrix</description>
     <macros>
     	<import>lastz_macros.xml</import>
@@ -45,39 +45,39 @@

     <help><![CDATA[

-        **What is does**
+**What it does**

-        LASTZ_D is a non-integer (**D** stands for Double) version of LASTZ that can be used to estimate substitution matrix that will be used to score alignments. It was developed by `Bob Harris <http://www.bx.psu.edu/~rsharris/>`_ in the lab of Webb Miller at Penn State as a part of LASTZ. Matrix computed by this tool is to be used by LASTZ (see below).
+LASTZ_D is a non-integer (**D** stands for Double) version of LASTZ that can be used to estimate substitution matrix that will be used to score alignments. It was developed by `Bob Harris <http://www.bx.psu.edu/~rsharris/>`_ in the lab of Webb Miller at Penn State as a part of LASTZ. Matrix computed by this tool is to be used by LASTZ (see below).

-        .. class:: warningmark
+.. class:: warningmark

-        **Read documentation** before proceeding. LASTZ is a complex tool with many parameter options. Fortunately, there is a `great manual <https://lastz.github.io/lastz/>`_ maintained by its author. The two sections that are particularly relevant to the inference of substitution matrix are `Inferring Score Sets <http://www.bx.psu.edu/~rsharris/lastz/README.lastz-1.04.00.html#adv_inference>`_ and `Inference Control File <http://www.bx.psu.edu/~rsharris/lastz/README.lastz-1.04.00.html#fmt_inference>`_.
+**Read documentation** before proceeding. LASTZ is a complex tool with many parameter options. Fortunately, there is a `great manual <https://lastz.github.io/lastz/>`_ maintained by its author. The two sections that are particularly relevant to the inference of substitution matrix are `Inferring Score Sets <http://www.bx.psu.edu/~rsharris/lastz/README.lastz-1.04.00.html#adv_inference>`_ and `Inference Control File <http://www.bx.psu.edu/~rsharris/lastz/README.lastz-1.04.00.html#fmt_inference>`_.

-        **Notes on the inference**
+**Notes on the inference**

-        Inference is achieved by computing the probability of each of the 18 different alignment events (gap open, gap extend, and 16 substitutions). These probabilities are estimated from alignments of the sequences. Of course, at first we don't have alignments, so the process begins by using a generic scoring set to create alignments, infer scores from those, then realign, and so on, until the scores stabilize or "converge". Ungapped alignments are performed until the substitution scores converge, then gapped alignments are performed (holding the substitution scores constant) until the gap penalties converge. In the end you get a matrix like this::
+Inference is achieved by computing the probability of each of the 18 different alignment events (gap open, gap extend, and 16 substitutions). These probabilities are estimated from alignments of the sequences. Of course, at first we don't have alignments, so the process begins by using a generic scoring set to create alignments, infer scores from those, then realign, and so on, until the scores stabilize or "converge". Ungapped alignments are performed until the substitution scores converge, then gapped alignments are performed (holding the substitution scores constant) until the gap penalties converge. In the end you get a matrix like this::

-            # (a LASTZ scoring set, created by "LASTZ --infer")
+    # (a LASTZ scoring set, created by "LASTZ --infer")

-            bad_score          = X:-1781 # used for sub[X][*] and sub[*][X]
-            fill_score         = -178    # used when sub[*][*] not otherwise defined
-            gap_open_penalty   = 400
-            gap_extend_penalty = 30
+    bad_score          = X:-1781 # used for sub[X][*] and sub[*][X]
+    fill_score         = -178    # used when sub[*][*] not otherwise defined
+    gap_open_penalty   = 400
+    gap_extend_penalty = 30

-                  A     C     G     T
-            A    72   -79   -49   -97
-            C   -79   100  -178   -49
-            G   -49  -178   100   -79
-            T   -97   -49   -79    72
+          A     C     G     T
+    A    72   -79   -49   -97
+    C   -79   100  -178   -49
+    G   -49  -178   100   -79
+    T   -97   -49   -79    72

-        This dataset can then be used as an input to the **Read the substitution scores** parameter of LASTZ (Parameter section *Scoring*).
+This dataset can then be used as an input to the **Read the substitution scores** parameter of LASTZ (Parameter section *Scoring*).

-        The iterative process can fail if there's not a lot of sequence to align.  E.g. if after the 4th iteration there's nothing in the central 50% denominators go to zero and the process fails.
+The iterative process can fail if there's not a lot of sequence to align.  E.g. if after the 4th iteration there's nothing in the central 50%, denominators go to zero and the process fails.

-        If the sequences you are aligning have GC content different than the usual ACGT 30-20-20-30 split, scoring inference should discover this and give you better alignments.
+If the sequences you are aligning have GC content different than the usual ACGT 30-20-20-30 split, scoring inference should discover this and give you better alignments.


-        ]]>
+]]>
     </help>
     <expand macro="citations"/>
 </tool>
--- a/test-data/test4.out	Thu May 17 04:29:13 2018 -0400
+++ b/test-data/test4.out	Fri May 18 16:58:38 2018 -0400
@@ -1,1 +1,1 @@
-chrM	chrM	70.79	16186	4208	152	577	16569	1	15860	0	14129.5
+chrM	chrM	70.79	16186	4208	152	577	16569	1	15860	0	14131.6