Mercurial > repos > iuc > semibin

--- a/macros.xml	Fri Nov 10 20:50:01 2023 +0000
+++ b/macros.xml	Tue Mar 25 15:55:28 2025 +0000
@@ -1,7 +1,7 @@
 <?xml version="1.0"?>
 <macros>
     <token name="@TOOL_VERSION@">2.0.2</token>
-    <token name="@VERSION_SUFFIX@">0</token>
+    <token name="@VERSION_SUFFIX@">1</token>
     <token name="@PROFILE@">21.01</token>
     <xml name="biotools">
         <xrefs>
@@ -140,9 +140,16 @@
     #end for
 #end if
     ]]></token>
+    <xml name="ref_select_cannot"><!-- "select" test param offering only the cached-database and taxonomy choices (no "ml" option) -->
+        <param name="select" type="select" label="Reference database">
+            <option value="cached" selected="true">Cached database</option>
+            <option value="taxonomy">Pre-computed taxonomy</option>
+        </param>
+    </xml>
     <xml name="ref_select">
         <param name="select" type="select" label="Reference database">
-            <option value="cached" selected="true">Cached database</option>
+            <option value="ml" selected="true">Use SemiBin ML function</option><!-- preselected default; the matching when value="ml" branches are empty -->
+            <option value="cached">Cached database</option>
             <option value="taxonomy">Pre-computed taxonomy</option>
         </param>
     </xml>
@@ -153,6 +160,28 @@
             </options>
         </param>
     </xml>
+    <xml name="ref-single-cannot"><!-- "ref" conditional driven by ref_select_cannot (cached / taxonomy only); the taxonomy branch takes one tabular dataset -->
+        <conditional name="ref">
+            <expand macro="ref_select_cannot"/>
+            <when value="cached">
+                <expand macro="cached_db"/>
+            </when>
+            <when value="taxonomy">
+                <param argument="--taxonomy-annotation-table" type="data" format="tabular" label="Pre-computed mmseqs2 format taxonomy TSV file"/>
+            </when>
+        </conditional>
+    </xml>
+    <xml name="ref-multi-cannot"><!-- "ref" conditional driven by ref_select_cannot (cached / taxonomy only); the taxonomy branch accepts several tabular datasets (multiple="true") -->
+        <conditional name="ref">
+            <expand macro="ref_select_cannot"/>
+            <when value="cached">
+                <expand macro="cached_db"/>
+            </when>
+            <when value="taxonomy">
+                <param argument="--taxonomy-annotation-table" type="data" format="tabular" multiple="true" label="Pre-computed mmseqs2 format taxonomy TSV file" help="One per bin file"/>
+            </when>
+        </conditional>
+    </xml>
     <xml name="ref-single">
         <conditional name="ref">
             <expand macro="ref_select"/>
@@ -162,6 +191,7 @@
             <when value="taxonomy">
                 <param argument="--taxonomy-annotation-table" type="data" format="tabular" label="Pre-computed mmseqs2 format taxonomy TSV file"/>
             </when>
+            <when value="ml"/>
         </conditional>
     </xml>
     <xml name="ref-multi">
@@ -173,6 +203,7 @@
             <when value="taxonomy">
                 <param argument="--taxonomy-annotation-table" type="data" format="tabular" multiple="true" label="Pre-computed mmseqs2 format taxonomy TSV file" help="One per bin file"/>
             </when>
+            <when value="ml"/>
         </conditional>
     </xml>
     <xml name="ref_single">
@@ -184,6 +215,7 @@
             <when value="taxonomy">
                 <param argument="--taxonomy-annotation-table" type="data" format="tabular" label="Pre-computed mmseqs2 format taxonomy TSV file"/>
             </when>
+            <when value="ml"/>
         </conditional>
     </xml>
     <xml name="min_len">
--- a/semibin.xml	Fri Nov 10 20:50:01 2023 +0000
+++ b/semibin.xml	Tue Mar 25 15:55:28 2025 +0000
@@ -18,17 +18,19 @@
     #if $mode.select == 'single' and str($mode.environment) != ''
     --environment '$mode.environment'
     #end if
-    #if $mode.ref.select == "cached"
+    #if $mode.ref.select == "cached":
     --reference-db-data-dir '$mode.ref.cached_db.fields.path'
-    #else
+    #end if
+    #if $mode.ref.select == "taxonomy"
     --taxonomy-annotation-table '$mode.ref.taxonomy_annotation_table'
     #end if
 #else
     multi_easy_bin
     --separator '$separator'
-    #if $mode.ref.select == "cached"
+    #if $mode.ref.select == "cached":
     --reference-db-data-dir '$mode.ref.cached_db.fields.path'
-    #else
+    #end if
+    #if $mode.ref.select == "taxonomy"
     --taxonomy-annotation-table
         #for $e in $mode.ref.taxonomy_annotation_table
             '$e'
@@ -43,7 +45,7 @@
     --orf-finder '$orf_finder'
     --random-seed $random_seed

-#if str($annot.ml_threshold) != ''
+#if $annot.ml_threshold:
     --ml-threshold $annot.ml_threshold
 #end if
     --epoches $training.epoches
@@ -51,7 +53,7 @@
     --max-node $bin.max_node
     --max-edges $bin.max_edges
     --minfasta-kbs $bin.minfasta_kbs
-#if ($mode.select == 'single' or $mode.select == 'co') and "pre_reclustering_bins" in $extra_output
+#if ($mode.select == 'single' or $mode.select == 'co') and $extra_output and "pre_reclustering_bins" in $extra_output
     --write-pre-reclustering-bins
 #end if
     --compression none
@@ -153,10 +155,58 @@
                 <param name="min_len" value="0" />
             </conditional>
             <param name="orf_finder" value="prodigal"/>
-            <param name="random-seed" value="0"/>
-            <section name="annot">
-                <param name="ml_threshold" value=""/>
+            <param name="random_seed" value="0"/>
+            <section name="training">
+                <param name="epoches" value="20"/>
+                <param name="batch_size" value="2048"/>
+            </section>
+            <section name="bin">
+                <param name="max_node" value="1"/>
+                <param name="max_edges" value="200"/>
+                <param name="minfasta_kbs" value="200"/>
             </section>
+            <param name="extra_output" value="data,coverage,contigs"/>
+            <output_collection name="output_bins" count="0"/>
+            <output name="single_data" ftype="csv">
+                <assert_contents>
+                    <has_text text="g1k_0"/>
+                    <has_text text="g4k_7"/>
+                </assert_contents>
+            </output>
+            <output name="single_data_split" ftype="csv">
+                <assert_contents>
+                    <has_text text="g1k_0_1"/>
+                    <has_text text="g1k_6_2"/>
+                </assert_contents>
+            </output>
+            <output name="single_cov" ftype="csv">
+                <assert_contents>
+                    <has_text text="g1k_0"/>
+                    <has_text text="0.027"/>
+                </assert_contents>
+            </output>
+            <output name="single_split_cov" ftype="csv">
+                <assert_contents>
+                    <has_size value="1" delta="1"/>
+                </assert_contents>
+            </output>
+        </test>
+        <test expect_num_outputs="5">
+            <conditional name="mode">
+                <param name="select" value="single"/>
+                <param name="input_fasta" ftype="fasta" value="input_single.fasta"/>
+                <param name="input_bam" ftype="bam" value="input_single.bam"/>
+                <conditional name="ref">
+                    <param name="select" value="ml"/>
+                </conditional>
+                <param name="environment" value="human_gut"/>
+            </conditional>
+            <conditional name="min_len">
+                <param name="method" value="min-len"/>
+                <param name="min_len" value="0" />
+            </conditional>
+            <param name="orf_finder" value="prodigal"/>
+            <param name="random_seed" value="0"/>
             <section name="training">
                 <param name="epoches" value="20"/>
                 <param name="batch_size" value="2048"/>
@@ -198,6 +248,73 @@
                 <param name="input_fasta" ftype="fasta" value="input_single.fasta"/>
                 <param name="input_bam" ftype="bam" value="input_coassembly_sorted1.bam,input_coassembly_sorted2.bam,input_coassembly_sorted3.bam,input_coassembly_sorted4.bam,input_coassembly_sorted5.bam"/>
                 <conditional name="ref">
+                    <param name="select" value="ml"/>
+                </conditional>
+            </conditional>
+            <conditional name="min_len">
+                <param name="method" value="ratio"/>
+                <param name="ratio" value="0.05"/>
+            </conditional>
+            <param name="orf_finder" value="fast-naive"/>
+            <param name="random_seed" value="0"/>
+            <section name="training">
+                <param name="epoches" value="20"/>
+                <param name="batch_size" value="2048"/>
+            </section>
+            <section name="bin">
+                <param name="max_node" value="1"/>
+                <param name="max_edges" value="200"/>
+                <param name="minfasta_kbs" value="200"/>
+            </section>
+            <param name="extra_output" value="coverage"/>
+            <output_collection name="output_bins" count="0"/>
+            <output_collection name="co_cov" count="5">
+                <element name="0" ftype="csv">
+                    <assert_contents>
+                        <has_text text="g1k_0"/>
+                        <has_text text="g2k_7"/>
+                    </assert_contents>
+                </element>
+                <element name="1" ftype="csv">
+                    <assert_contents>
+                        <has_text text="g1k_0"/>
+                        <has_text text="g2k_7"/>
+                    </assert_contents>
+                </element>
+                <element name="4" ftype="csv">
+                    <assert_contents>
+                        <has_text text="g1k_0"/>
+                        <has_text text="g2k_7"/>
+                    </assert_contents>
+                </element>
+            </output_collection>
+            <output_collection name="co_split_cov" count="5">
+                <element name="0" ftype="csv">
+                    <assert_contents>
+                        <has_text text="g1k_0_1"/>
+                        <has_text text="g2k_7_2"/>
+                    </assert_contents>
+                </element>
+                <element name="1" ftype="csv">
+                    <assert_contents>
+                        <has_text text="g1k_0_1"/>
+                        <has_text text="g2k_7_2"/>
+                    </assert_contents>
+                </element>
+                <element name="2" ftype="csv">
+                    <assert_contents>
+                        <has_text text="g1k_0_1"/>
+                        <has_text text="g2k_7_2"/>
+                    </assert_contents>
+                </element>
+            </output_collection>
+        </test>
+        <test expect_num_outputs="3">
+            <conditional name="mode">
+                <param name="select" value="co"/>
+                <param name="input_fasta" ftype="fasta" value="input_single.fasta"/>
+                <param name="input_bam" ftype="bam" value="input_coassembly_sorted1.bam,input_coassembly_sorted2.bam,input_coassembly_sorted3.bam,input_coassembly_sorted4.bam,input_coassembly_sorted5.bam"/>
+                <conditional name="ref">
                     <param name="select" value="taxonomy"/>
                     <param name="taxonomy_annotation_table" value="taxonomy.tsv"/>
                 </conditional>
@@ -207,10 +324,7 @@
                 <param name="ratio" value="0.05"/>
             </conditional>
             <param name="orf_finder" value="fast-naive"/>
-            <param name="random-seed" value="0"/>
-            <section name="annot">
-                <param name="ml_threshold" value=""/>
-            </section>
+            <param name="random_seed" value="0"/>
             <section name="training">
                 <param name="epoches" value="20"/>
                 <param name="batch_size" value="2048"/>
@@ -278,10 +392,7 @@
                 <param name="ratio" value="0.05"/>
             </conditional>
             <param name="orf_finder" value="fraggenescan"/>
-            <param name="random-seed" value="0"/>
-            <section name="annot">
-                <param name="ml_threshold" value=""/>
-            </section>
+            <param name="random_seed" value="0"/>
             <section name="training">
                 <param name="epoches" value="20"/>
                 <param name="batch_size" value="2048"/>
@@ -340,7 +451,7 @@
                 <param name="input_fasta" ftype="fasta" value="input_single.fasta"/>
                 <param name="input_bam" ftype="bam" value="input_single.bam"/>
                 <conditional name="ref">
-                    <param name="db_selector" value="cached"/>
+                    <param name="select" value="cached"/>
                     <param name="cached_db" value="test-db"/>
                 </conditional>
             </conditional>
@@ -349,10 +460,7 @@
                 <param name="ratio" value="0.05"/>
             </conditional>
             <param name="orf_finder" value="fraggenescan"/>
-            <param name="random-seed" value="0"/>
-            <section name="annot">
-                <param name="ml_threshold" value=""/>
-            </section>
+            <param name="random_seed" value="0"/>
             <section name="training">
                 <param name="epoches" value="20"/>
                 <param name="batch_size" value="2048"/>
@@ -362,7 +470,6 @@
                 <param name="max_edges" value="200"/>
                 <param name="minfasta_kbs" value="200"/>
             </section>
-            <param name="extra_output" value=""/>
             <output_collection name="output_bins" count="1">
                 <element name="SemiBin_30" ftype="fasta">
                     <assert_contents>
@@ -377,7 +484,7 @@
                 <param name="input_fasta" ftype="fasta" value="input_single.fasta"/>
                 <param name="input_bam" ftype="bam" value="input_single.bam"/>
                 <conditional name="ref">
-                    <param name="db_selector" value="cached"/>
+                    <param name="select" value="cached"/>
                     <param name="cached_db" value="test-db"/>
                 </conditional>
             </conditional>
@@ -386,10 +493,7 @@
                 <param name="ratio" value="0.05"/>
             </conditional>
             <param name="orf_finder" value="fraggenescan"/>
-            <param name="random-seed" value="0"/>
-            <section name="annot">
-                <param name="ml_threshold" value=""/>
-            </section>
+            <param name="random_seed" value="0"/>
             <section name="training">
                 <param name="epoches" value="20"/>
                 <param name="batch_size" value="2048"/>
@@ -444,9 +548,6 @@
             </conditional>
             <param name="orf_finder" value="fraggenescan"/>
             <param name="random_seed" value="0"/>
-            <section name="annot">
-                <param name="ml_threshold" value=""/>
-            </section>
             <section name="training">
                 <param name="epoches" value="20"/>
                 <param name="batch_size" value="2048"/>
@@ -520,4 +621,4 @@

     ]]></help>
     <expand macro="citations"/>
-</tool>
+</tool>
\ No newline at end of file