Mercurial > repos > onnodg > cdhit_analysis

Binary file __pycache__/__init__.cpython-313.pyc has changed
Binary file __pycache__/cdhit_analysis.cpython-313.pyc has changed
--- a/cdhit_analysis.py	Tue Oct 14 09:09:46 2025 +0000
+++ b/cdhit_analysis.py	Mon Oct 20 12:27:31 2025 +0000
@@ -1,5 +1,3 @@
-#!/usr/bin/env python3
-
 import argparse
 import os
 import re
--- a/cdhit_analysis.xml	Tue Oct 14 09:09:46 2025 +0000
+++ b/cdhit_analysis.xml	Mon Oct 20 12:27:31 2025 +0000
@@ -1,4 +1,4 @@
-<tool id="cdhit_cluster_analysis" name="CD-HIT Cluster Analysis" version="1.0.0">
+<tool id="cdhit_cluster_analysis" name="CD-HIT Cluster Analysis" version="1.0.1">
     <description>Analyze CD-HIT clustering results with taxonomic annotation</description>

     <requirements>
@@ -14,19 +14,19 @@
         --input_annotation '$input_annotation'

         #if $output_options.similarity_output:
-            --output_similarity_txt '$output_similarity_txt'
-            --output_similarity_plot '$output_similarity_plot'
+            --output_similarity_txt '$similarity_txt'
+            --output_similarity_plot '$similarity_plot'
         #end if
         #if $output_options.evalue_output:
-            --output_evalue_txt '$output_evalue_txt'
-            --output_evalue_plot '$output_evalue_plot'
+            --output_evalue_txt '$evalue_txt'
+            --output_evalue_plot '$evalue_plot'
         #end if
         #if $output_options.count_output:
-            --output_count '$output_count'
+            --output_count '$cluster_count'
         #end if
         #if $output_options.taxa_output:
-            --output_taxa_clusters '$output_taxa_clusters'
-            --output_taxa_processed '$output_taxa_processed'
+            --output_taxa_clusters '$cluster_taxa'
+            --output_taxa_processed '$processed_taxa'
         #end if

         --simi_plot_y_min '$plot_params.simi_plot_y_min'
@@ -48,24 +48,24 @@
     ]]></command>

     <inputs>
-        <param name="input_cluster" type="data" format="txt" label="CD-HIT cluster file (.clstr/.txt)"
+        <param name="input_cluster" type="data" format="txt" label="CD-HIT cluster file"
                help="Output cluster file from cd-hit-est" />
         <param name="input_annotation" type="data" format="xlsx"
-               label="Annotation file"
-               help="Excel workfile with sequence annotations (header, evalue, taxa)" />
+               label="Excel Annotations file"
+               help="Excel workfile with annotations per header" />

         <section name="output_options" title="Output Options" expanded="true">
             <param name="similarity_output" type="boolean" truevalue="true" falsevalue="false"
-                   checked="true" label="Create similarity output"
+                   checked="true" label="Create cluster similarity output"
                    help="Generate similarity analysis and plots" />
             <param name="evalue_output" type="boolean" truevalue="true" falsevalue="false"
-                   checked="true" label="Create E-value output"
+                   checked="true" label="Create cluster E-value output"
                    help="Generate E-value analysis and plots" />
             <param name="count_output" type="boolean" truevalue="true" falsevalue="false"
-                   checked="true" label="Create count output"
+                   checked="true" label="Create cluster count output"
                    help="Generate read count summaries" />
             <param name="taxa_output" type="boolean" truevalue="true" falsevalue="false"
-                   checked="true" label="Create taxa output"
+                   checked="true" label="Create taxa annotations output"
                    help="Generate taxonomic analysis" />
         </section>

@@ -104,31 +104,31 @@
     </inputs>

     <outputs>
-        <data name="output_similarity_txt" format="txt" label="Similarity data" >
+        <data name="similarity_txt" format="txt" label="Similarity data" >
             <filter>output_options['similarity_output']</filter>
         </data>

-        <data name="output_similarity_plot" format="png" label="Similarity plot" >
+        <data name="similarity_plot" format="png" label="Similarity plot" >
             <filter>output_options['similarity_output']</filter>
         </data>

-        <data name="output_evalue_txt" format="txt" label="E-value data" >
+        <data name="evalue_txt" format="txt" label="E-value data" >
             <filter>output_options['evalue_output']</filter>
         </data>

-        <data name="output_evalue_plot" format="png" label="E-value plot" >
+        <data name="evalue_plot" format="png" label="E-value plot" >
             <filter>output_options['evalue_output']</filter>
         </data>

-        <data name="output_count" format="txt" label="Count summary" >
+        <data name="cluster_count" format="txt" label="Count summary" >
             <filter>output_options['count_output']</filter>
         </data>

-        <data name="output_taxa_clusters" format="xlsx" label="Raw taxa per cluster" >
+        <data name="cluster_taxa" format="xlsx" label="Raw taxa per cluster" >
             <filter>output_options['taxa_output']</filter>
         </data>

-        <data name="output_taxa_processed" format="xlsx" label="Processed taxa" >
+        <data name="processed_taxa" format="xlsx" label="Processed taxa" >
             <filter>output_options['taxa_output']</filter>
         </data>
     </outputs>
@@ -143,13 +143,13 @@
                 <param name="count_output" value="true" />
                 <param name="taxa_output" value="true" />
             </section>
-            <output name="output_similarity_txt" file="sim_out.txt" />
-            <output name="output_similarity_plot" file="sim_out.png" compare="sim_size"/>
-            <output name="output_evalue_txt" file="evalue_out.txt" />
-            <output name="output_evalue_plot" file="evalue_out.png" compare="sim_size"/>
-            <output name="output_count" file="count_out.txt" />
-            <output name="output_taxa_clusters" file="taxa_out.xlsx" decompress="true"/>
-            <output name="output_taxa_processed" file="processed.xlsx" decompress="true"/>
+            <output name="similarity_txt" file="sim_out.txt" />
+            <output name="similarity_plot" file="sim_out.png" compare="sim_size"/>
+            <output name="evalue_txt" file="evalue_out.txt" />
+            <output name="evalue_plot" file="evalue_out.png" compare="sim_size"/>
+            <output name="cluster_count" file="count_out.txt" />
+            <output name="cluster_taxa" file="taxa_out.xlsx" decompress="true"/>
+            <output name="processed_taxa" file="processed.xlsx" decompress="true"/>
         </test>
         <test expect_num_outputs="7">
             <param name="input_cluster" value="input2_test.clstr.txt" />
@@ -160,13 +160,13 @@
                 <param name="count_output" value="true" />
                 <param name="taxa_output" value="true" />
             </section>
-            <output name="output_similarity_txt" file="test2_sim_out.txt" />
-            <output name="output_similarity_plot" file="test2_sim_out.png" compare="sim_size"/>
-            <output name="output_evalue_txt" file="test2_evalue_out.txt" />
-            <output name="output_evalue_plot" file="test2_evalue_out.png" compare="sim_size"/>
-            <output name="output_count" file="test_2count_out.txt" />
-            <output name="output_taxa_clusters" file="test_2taxa_out.xlsx" decompress="true"/>
-            <output name="output_taxa_processed" file="test_2processed.xlsx" decompress="true"/>
+            <output name="similarity_txt" file="test2_sim_out.txt" />
+            <output name="similarity_plot" file="test2_sim_out.png" compare="sim_size"/>
+            <output name="evalue_txt" file="test2_evalue_out.txt" />
+            <output name="evalue_plot" file="test2_evalue_out.png" compare="sim_size"/>
+            <output name="cluster_count" file="test_2count_out.txt" />
+            <output name="cluster_taxa" file="test_2taxa_out.xlsx" decompress="true"/>
+            <output name="processed_taxa" file="test_2processed.xlsx" decompress="true"/>
         </test>
         <test expect_num_outputs="5">
             <param name="input_cluster" value="input2_test.clstr.txt" />
@@ -178,7 +178,7 @@
                 <param name="evalue_output" value="false" />
             </section>
             <section name="processing_options">
-                <param name="show_unnanotated_clusters" value="true"/>
+                <param name="show_unannotated_clusters" value="true"/>
                 <param name="make_taxa_in_cluster_split" value="true"/>
                 <param name="print_empty_files" value="true"/>
             </section>
@@ -187,15 +187,15 @@
                 <param name="min_to_split" value="0.6"/>
                 <param name="min_count_to_split" value="6"/>
             </section>
-            <section name="plot_params" title="Plot Parameters" expanded="false">
+            <section name="plot_params">
                 <param name="simi_plot_y_min" value="0.4" />
                 <param name="simi_plot_y_max"  value="0.4"  />
             </section>
-            <output name="output_similarity_txt" file="test2_sim_extra_out.txt" />
-            <output name="output_similarity_plot" file="test2_sim_extra_out.png" compare="sim_size"/>
-            <output name="output_count" file="test_2count_extra_out.txt" />
-            <output name="output_taxa_clusters" file="test_2taxa_extra_out.xlsx" decompress="true"/>
-            <output name="output_taxa_processed" file="test_2processed_extra.xlsx" decompress="true"/>
+            <output name="similarity_txt" file="test2_sim_extra_out.txt" />
+            <output name="similarity_plot" file="test2_sim_extra_out.png" compare="sim_size"/>
+            <output name="cluster_count" file="test_2count_extra_out.txt" />
+            <output name="cluster_taxa" file="test_2taxa_extra_out.xlsx" decompress="true"/>
+            <output name="processed_taxa" file="test_2processed_extra.xlsx" decompress="true"/>
         </test>
     </tests>

@@ -212,10 +212,10 @@

 **Output Options:**

-- **Similarity output**: Creates similarity analysis with plots and text files showing intra-cluster similarity distributions
-- **E-value output**: Creates E-value analysis with plots and text files showing E-value distributions
-- **Count output**: Creates summary tables with annotated/unannotated read counts per cluster
-- **Taxa output**: Creates taxonomic analysis determining the most likely taxa for each cluster
+- **Cluster similarity output**: Creates similarity analysis with plots and text files showing intra-cluster similarity distributions
+- **Cluster E-value output**: Creates E-value analysis with plots and text files showing E-value distributions
+- **Cluster count output**: Creates summary tables with annotated/unannotated read counts per cluster
+- **Taxa annotations output**: Creates taxonomic analysis determining the most likely taxa for each cluster

 **Parameters:**

@@ -235,9 +235,23 @@

 **Note**: The tool expects that sequence counts are included in the cluster file headers in the format "header(count)".

+-------------
+
+.. class:: infomark
+
 **Credits**
-Authors = Onno de Gorter, 2025.
+
 Based on a script by Nick Kortleven, translated, modified and wrapped by Onno de Gorter,
-Developed for the New light on old remedies project, a PhD research by Anja Fischer
+Developed for the New light on old remedies project, the PhD research of Anja Fischer.
+
+Link to the project website:
+
+* https://ahm.uva.nl/funded-research-projects/new-lights-on-old-remedies/new-lights-on-old-remedies.html
+
     ]]></help>
+    <creator>
+        <organization name="Naturalis Biodiversity Center" url="https://www.naturalis.nl/en/science" />
+        <person givenName="Onno" familyName="de Gorter" url="https://github.com/Onnodg"/>
+        <person givenName="Nick" familyName="Kortleven" url="https://github.com/tombkingsts" />
+    </creator>
 </tool>
\ No newline at end of file
--- a/test-data/malformed_cluster.clstr	Tue Oct 14 09:09:46 2025 +0000
+++ b/test-data/malformed_cluster.clstr	Mon Oct 20 12:27:31 2025 +0000
@@ -1,4 +1,4 @@
->Cluster 0
-0	100nt, >read1:50..._CONS(50) *
-invalid_line_without_proper_format
-1	90nt, >read2:25..._CONS(25) at /+/95%
+>Cluster 0
+0	100nt, >read1:50..._CONS(50) *
+invalid_line_without_proper_format
+1	90nt, >read2:25..._CONS(25) at /+/95%
--- a/test-data/simple_cluster.clstr	Tue Oct 14 09:09:46 2025 +0000
+++ b/test-data/simple_cluster.clstr	Mon Oct 20 12:27:31 2025 +0000
@@ -1,2 +1,2 @@
->Cluster 0
-0	100nt, >read_no_anno:50... *
+>Cluster 0
+0	100nt, >read_no_anno:50... *
--- a/test-data/test_count.txt	Tue Oct 14 09:09:46 2025 +0000
+++ b/test-data/test_count.txt	Mon Oct 20 12:27:31 2025 +0000
@@ -1,26 +1,26 @@
-cluster	unannotated	annotated	total	perc_unannotated	perc_annotated
-0	2.0	408	410.0	0.49	99.51
-1	1.0	0	1.0	100.00	0.00
-2	0.0	1	1.0	0.00	100.00
-3	0.0	52	52.0	0.00	100.00
-4	1.0	0	1.0	100.00	0.00
-5	0.0	176	176.0	0.00	100.00
-6	1.0	0	1.0	100.00	0.00
-7	0.0	79	79.0	0.00	100.00
-8	1.0	0	1.0	100.00	0.00
-9	9.0	0	9.0	100.00	0.00
-10	3.0	0	3.0	100.00	0.00
-11	2.0	0	2.0	100.00	0.00
-12	1.0	0	1.0	100.00	0.00
-13	1.0	0	1.0	100.00	0.00
-14	1.0	0	1.0	100.00	0.00
-15	5.0	0	5.0	100.00	0.00
-16	21.0	0	21.0	100.00	0.00
-17	38.0	0	38.0	100.00	0.00
-18	5.0	0	5.0	100.00	0.00
-19	5.0	0	5.0	100.00	0.00
-20	1.0	0	1.0	100.00	0.00
-21	1.0	0	1.0	100.00	0.00
-22	4.0	0	4.0	100.00	0.00
-23	0.0	1	1.0	0.00	100.00
-TOTAL	103.0	717	820.0	12.56	87.44
+cluster	unannotated	annotated	total	perc_unannotated	perc_annotated
+0	2.0	408	410.0	0.49	99.51
+1	1.0	0	1.0	100.00	0.00
+2	0.0	1	1.0	0.00	100.00
+3	0.0	52	52.0	0.00	100.00
+4	1.0	0	1.0	100.00	0.00
+5	0.0	176	176.0	0.00	100.00
+6	1.0	0	1.0	100.00	0.00
+7	0.0	79	79.0	0.00	100.00
+8	1.0	0	1.0	100.00	0.00
+9	9.0	0	9.0	100.00	0.00
+10	3.0	0	3.0	100.00	0.00
+11	2.0	0	2.0	100.00	0.00
+12	1.0	0	1.0	100.00	0.00
+13	1.0	0	1.0	100.00	0.00
+14	1.0	0	1.0	100.00	0.00
+15	5.0	0	5.0	100.00	0.00
+16	21.0	0	21.0	100.00	0.00
+17	38.0	0	38.0	100.00	0.00
+18	5.0	0	5.0	100.00	0.00
+19	5.0	0	5.0	100.00	0.00
+20	1.0	0	1.0	100.00	0.00
+21	1.0	0	1.0	100.00	0.00
+22	4.0	0	4.0	100.00	0.00
+23	0.0	1	1.0	0.00	100.00
+TOTAL	103.0	717	820.0	12.56	87.44
--- a/test-data/test_evalue.txt	Tue Oct 14 09:09:46 2025 +0000
+++ b/test-data/test_evalue.txt	Mon Oct 20 12:27:31 2025 +0000
@@ -1,20 +1,20 @@
-evalue	count
-unannotated	103.0
-1.41e-39	414
-4.99e-39	166
-1.54e-33	72
-6.56e-38	25
-2.32e-37	16
-7.17e-32	6
-1.82e-38	4
-5.07e-39	3
-8.21e-37	2
-1.43e-39	1
-6.45e-38	1
-6.66e-38	1
-2.28e-37	1
-8.62e-37	1
-1.06e-35	1
-1.08e-35	1
-3.33e-30	1
-8.16e-12	1
+evalue	count
+unannotated	103.0
+1.41e-39	414
+4.99e-39	166
+1.54e-33	72
+6.56e-38	25
+2.32e-37	16
+7.17e-32	6
+1.82e-38	4
+5.07e-39	3
+8.21e-37	2
+1.43e-39	1
+6.45e-38	1
+6.66e-38	1
+2.28e-37	1
+8.62e-37	1
+1.06e-35	1
+1.08e-35	1
+3.33e-30	1
+8.16e-12	1
Binary file test-data/test_processed_taxa.xlsx has changed
--- a/test-data/test_similarity.txt	Tue Oct 14 09:09:46 2025 +0000
+++ b/test-data/test_similarity.txt	Mon Oct 20 12:27:31 2025 +0000
@@ -1,14 +1,14 @@
-# Average similarity: 99.35
-# Standard deviation: 0.65
-similarity	count
-100.0	383
-98.89	368
-98.88	18
-98.86	1
-98.73	7
-98.28	1
-98.21	8
-97.8	2
-97.78	29
-97.75	2
-97.73	1
+# Average similarity: 99.35
+# Standard deviation: 0.65
+similarity	count
+100.0	383
+98.89	368
+98.88	18
+98.86	1
+98.73	7
+98.28	1
+98.21	8
+97.8	2
+97.78	29
+97.75	2
+97.73	1
Binary file test-data/test_taxa_clusters.xlsx has changed
Binary file tests/__pycache__/test_cdhit_analysis.cpython-313-pytest-8.4.2.pyc has changed
--- a/tests/test_cdhit_analysis.py	Tue Oct 14 09:09:46 2025 +0000
+++ b/tests/test_cdhit_analysis.py	Mon Oct 20 12:27:31 2025 +0000
@@ -591,27 +591,27 @@
         assert "Processing complete" in captured.out


-    def test_16a_prepare_evalue_histogram_valid_data(self):
+    def test_18a_prepare_evalue_histogram_valid_data(self):
         """
-        Test 16a: prepare_evalue_histogram returns correct counts/bins.
+        Test 18a: prepare_evalue_histogram returns correct counts/bins.
         """
         from Stage_1_translated.NLOOR_scripts.process_clusters_tool import cdhit_analysis as ca
         counts, bins = ca.prepare_evalue_histogram([1e-5, 1e-3, 0.5], [])
         assert counts.sum() == 3  # 3 entries counted
         assert len(bins) == 51  # 50 bins => 51 edges

-    def test_16b_prepare_evalue_histogram_empty(self):
+    def test_18b_prepare_evalue_histogram_empty(self):
         """
-        Test 16b: prepare_evalue_histogram with empty/invalid data returns (None, None).
+        Test 18b: prepare_evalue_histogram with empty/invalid data returns (None, None).
         """
         from Stage_1_translated.NLOOR_scripts.process_clusters_tool import cdhit_analysis as ca
         counts, bins = ca.prepare_evalue_histogram([0, None, "bad"], [])
         assert counts is None
         assert bins is None

-    def test_16c_create_evalue_plot_creates_file_and_returns_data(self, tmp_path):
+    def test_18c_create_evalue_plot_creates_file_and_returns_data(self, tmp_path):
         """
-        Test 16c: create_evalue_plot saves a PNG and returns numeric data.
+        Test 18c: create_evalue_plot saves a PNG and returns numeric data.
         """
         from Stage_1_translated.NLOOR_scripts.process_clusters_tool import cdhit_analysis as ca
         out = tmp_path / "eval.png"