changeset 2:706b7acdb230 draft

planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c2020ecc91cea0c8cf7439180cf796743c838b4d-dirty
author onnodg
date Tue, 21 Oct 2025 07:54:21 +0000
parents ff68835adb2b
children c6981ea453ae
files README.md cdhit_analysis.py cdhit_analysis.sh cdhit_analysis.xml
diffstat 4 files changed, 43 insertions(+), 12 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/README.md	Tue Oct 21 07:54:21 2025 +0000
@@ -0,0 +1,14 @@
+This script processes cluster output files from cd-hit-est for use in Galaxy.
+It extracts cluster information, associates taxa and e-values from annotation files,
+performs statistical calculations, and generates text, plot, and Excel outputs
+summarizing similarity and taxonomic distributions.
+
+
+Main steps:
+1. Parse the cd-hit-est cluster file and the optional annotation file.
+2. Process each cluster to extract similarity, taxa, and e-value information.
+3. Aggregate results across clusters.
+4. Generate requested outputs: text summaries, plots, and Excel reports.
+
+
+Note: Uses a non-interactive matplotlib backend (Agg) for compatibility with Galaxy.
--- a/cdhit_analysis.py	Mon Oct 20 12:27:31 2025 +0000
+++ b/cdhit_analysis.py	Tue Oct 21 07:54:21 2025 +0000
@@ -1,14 +1,3 @@
-import argparse
-import os
-import re
-from collections import Counter, defaultdict
-from math import sqrt
-import pandas as pd
-import matplotlib
-
-matplotlib.use('Agg')  # Non-interactive backend for Galaxy
-import matplotlib.pyplot as plt
-
 """
 This script processes cluster output files from cd-hit-est for use in Galaxy.
 It extracts cluster information, associates taxa and e-values from annotation files,
@@ -26,6 +15,16 @@
 Note: Uses a non-interactive matplotlib backend (Agg) for compatibility with Galaxy.
 """
 
+import argparse
+from collections import Counter, defaultdict
+import os
+import re
+import matplotlib.pyplot as plt
+import pandas as pd
+from math import sqrt
+import openpyxl
+
+
 
 def parse_arguments(args_list=None):
     """Parse command-line arguments for the script."""
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cdhit_analysis.sh	Tue Oct 21 07:54:21 2025 +0000
@@ -0,0 +1,13 @@
+#!/bin/bash
+# Galaxy wrapper: runs cdhit_analysis.py with all given arguments, logs environment details, and exits with the analysis status.
+SCRIPTDIR=$(dirname "$(readlink -f "$0")")
+python "$SCRIPTDIR/cdhit_analysis.py" "$@"; status=$?
+# sanity check -- the status is saved above because these printf calls would otherwise make the wrapper always exit 0
+printf "Conda env: %s\n" "$CONDA_DEFAULT_ENV"
+printf "Python version: %s\n" "$(python --version | awk '{print $2}')"
+printf "Matplotlib version: %s\n" "$(python -c 'import matplotlib; print(matplotlib.__version__)')"
+printf "Pandas version: %s\n" "$(python -c 'import pandas; print(pandas.__version__)')"
+printf "Openpyxl version: %s\n" "$(python -c 'import openpyxl; print(openpyxl.__version__)')"
+printf "Bash version: %s\n" "${BASH_VERSION}"
+printf "SCRIPTDIR: %s\n\n" "$SCRIPTDIR"
+exit "$status"
\ No newline at end of file
--- a/cdhit_analysis.xml	Mon Oct 20 12:27:31 2025 +0000
+++ b/cdhit_analysis.xml	Tue Oct 21 07:54:21 2025 +0000
@@ -9,7 +9,7 @@
     </requirements>
 
     <command detect_errors="exit_code"><![CDATA[
-        python '$__tool_directory__/cdhit_analysis.py'
+bash '$__tool_directory__/cdhit_analysis.sh'
         --input_cluster '$input_cluster'
         --input_annotation '$input_annotation'
 
@@ -17,13 +17,16 @@
             --output_similarity_txt '$similarity_txt'
             --output_similarity_plot '$similarity_plot'
         #end if
+
         #if $output_options.evalue_output:
             --output_evalue_txt '$evalue_txt'
             --output_evalue_plot '$evalue_plot'
         #end if
+
         #if $output_options.count_output:
             --output_count '$cluster_count'
         #end if
+
         #if $output_options.taxa_output:
             --output_taxa_clusters '$cluster_taxa'
             --output_taxa_processed '$processed_taxa'
@@ -39,9 +42,11 @@
         #if $processing_options.show_unannotated_clusters:
             --show_unannotated_clusters
         #end if
+
         #if $processing_options.make_taxa_in_cluster_split:
             --make_taxa_in_cluster_split
         #end if
+
         #if $processing_options.print_empty_files:
             --print_empty_files
         #end if