Mercurial > repos > iuc > miniprot

--- a/macros.xml	Mon Sep 19 12:30:10 2022 +0000
+++ b/macros.xml	Fri Sep 23 22:35:23 2022 +0000
@@ -1,3 +1,3 @@
 <macros>
-    <token name="@TOOL_VERSION@">0.2</token>
+    <token name="@TOOL_VERSION@">0.3</token>
 </macros>
--- a/miniprot.xml	Mon Sep 19 12:30:10 2022 +0000
+++ b/miniprot.xml	Fri Sep 23 22:35:23 2022 +0000
@@ -26,6 +26,11 @@
             -C $adv.alignment.non_canonical_splice
             -F $adv.alignment.frameshift
             -B $adv.alignment.end_bonus
+        #if str($adv.output.prefix) != 'MP'
+            -P '$adv.output.prefix'
+        #end if
+            $adv.output.print_unmapped_proteins
+            --outn=$adv.output.outputs_per_query
         #end if
         #if str($db.dbtype) == 'fasta'
             '$db.genomic_fasta'
@@ -73,21 +78,35 @@
                     <param argument="-S" name="no_splicing" type="boolean" truevalue="-S" falsevalue="" checked="false" label="No splicing" help="No splicing (apply -G1000 -J1000 -e1000)" />
                     <param argument="-c" name="max_kmer" type="integer" min="1" value="50000" label="Max k-mer occurences" />
                     <param argument="-G" name="max_intron" type="integer" min="0" value="200000" label="Max intron size" />
+                    <!-- the -w option is mentioned in the help text but apparently not implemented: https://github.com/lh3/miniprot/issues/12 -->
+                    <!-- <param argument="-w" name="log_gap_penalty_weight" type="float" value="0.75" label="Log gap penalty weight" /> -->
                     <param argument="-n" name="min_syncmers" type="integer" min="1" value="5" label="Minimum number of syncmers in a chain" />
                     <param argument="-m" name="min_chain_score" type="integer" min="0" value="0" label="Minimum chaining score" />
                     <param argument="-l" name="second_round_kmer_size" type="integer" min="1" value="5" label="K-mer size for second round of chaining" />
                     <param argument="-e" name="max_extension" type="integer" min="0" value="10000" label="Max extension for second round of chaining" />
-                    <param argument="-p" name="score_ratio" type="float" min="0" max="1" value="0.5" label="Minimum secondary-to-primary score ratio" />
-                    <param argument="-N" name="max_secondary_alignments" type="integer" min="0" value="100" label="Max secondary alignments to consider" />
+                    <param argument="-p" name="score_ratio" type="float" min="0" max="1" value="0.7" label="Minimum secondary-to-primary score ratio" />
+                    <param argument="-N" name="max_secondary_alignments" type="integer" min="0" value="50" label="Max secondary alignments to consider" />
                 </section>
                 <section name="alignment" title="Alignment">
                     <param argument="-O" name="gap_open" type="integer" min="0" value="11" label="Gap open penalty" />
                     <param argument="-E" name="gap_extension" type="integer" min="0" value="1" label="Gap extension penalty" help="A k-long gap costs open_penalty+k*extension_penalty" />
                     <param argument="-J" name="intron_open" type="integer" min="0" value="31" label="Intron open penalty" />
                     <param argument="-C" name="non_canonical_splice" type="integer" min="0" value="11" label="Penalty for non-canonical splicing" />
-                    <param argument="-F" name="frameshift" type="integer" min="0" value="15" label="Frameshift penalty" />
+                    <param argument="-F" name="frameshift" type="integer" min="0" value="17" label="Frameshift penalty" />
                     <param argument="-B" name="end_bonus" type="integer" min="0" value="5" label="End bonus" />
                 </section>
+                <section name="output" title="Output">
+                    <param argument="-P" name="prefix" type="text" label="Prefix for IDs in GFF3 output" value="MP">
+                        <sanitizer invalid_char="">
+                            <valid initial="string.ascii_letters,string.digits">
+                                <add value="_" />
+                                <add value="-" />
+                            </valid>
+                        </sanitizer>
+                    </param>
+                    <param argument="-u" name="print_unmapped_proteins" type="boolean" truevalue="-u" falsevalue="" label="Print unmapped proteins" checked="false" />
+                    <param argument="--outn" name="outputs_per_query" type="integer" min="0" value="100" label="Outputs per query" help="The number of outputs will be the minimum of this and the max secondary alignments option" />
+                </section>
                 <param argument="-K" name="query_batch_size" type="integer" min="1" value="2000000" label="Query batch size" />
             </when>
             <when value="no">
@@ -111,7 +130,7 @@
             <output name="output_alignment" ftype="gff3">
                 <assert_contents>
                     <has_text text="ID=MP000001;Identity=1.0000;Positive=1.0000;Target=tr|O06302|O06302_MYCTU 1 126" />
-                    <has_text text="Parent=MP000372;Target=tr|V5QPR5|V5QPR5_MYCTU 1 53" />
+                    <has_text text="ID=MP000359;Identity=0.9811;Positive=1.0000;Target=tr|V5QPR5|V5QPR5_MYCTU 1 53" />
                 </assert_contents>
             </output>
         </test>
@@ -151,7 +170,25 @@
         miniprot_  rapidly aligns a protein sequence against a genome with affine gap penalty, splicing and frameshift.
         It is primarily intended for annotating protein-coding genes in a new species using known genes from other species.

-        **NOTE:** miniprot is in the early stages of development and should be considered experimental at this stage.
+        While an index of the genome to be mapped to can be built "on the fly", pre-indexing the genome with the Miniprot index tool
+        results in faster performance when the same genome index is reused multiple times.
+
+        For details of the algorithm and some insight into how parameters can be tuned, see this overview_.
+
         .. _miniprot: https://github.com/lh3/miniprot
+        .. _overview: https://github.com/lh3/miniprot#algorithm-overview
     ]]></help>
+    <citations>
+        <citation type="bibtex"><![CDATA[
+            @misc{Li2022,
+                author = {Li, Heng},
+                title = {miniprot},
+                year = {2022},
+                publisher = {GitHub},
+                journal = {GitHub repository},
+                howpublished = {\url{https://github.com/lh3/miniprot}},
+                commit = {b442b7a6b60dbd15f460ea9af75fa0b7293d4a8c}
+              }
+        ]]></citation>
+    </citations>
 </tool>