Mercurial > repos > onnodg > cdhit_analysis
annotate cdhit_analysis.py @ 3:c6981ea453ae draft default tip
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit ef31054ae26e19eff2f1b1f6c7979e39c47c0d5b-dirty
| author | onnodg |
|---|---|
| date | Fri, 24 Oct 2025 09:38:24 +0000 |
| parents | 706b7acdb230 |
| children |
| rev | line source |
|---|---|
|
0
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
1 """ |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
2 This script processes cluster output files from cd-hit-est for use in Galaxy. |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
3 It extracts cluster information, associates taxa and e-values from annotation files, |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
4 performs statistical calculations, and generates text and plot outputs |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
5 summarizing similarity and taxonomic distributions. |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
6 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
7 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
8 Main steps: |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
9 1. Parse cd-hit-est cluster file and (optional) annotation file. |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
10 2. Process each cluster to extract similarity, taxa, and e-value information. |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
11 3. Aggregate results across clusters. |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
12 4. Generate requested outputs: text summaries, plots, and Excel reports. |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
13 """ |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
14 |
|
2
706b7acdb230
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c2020ecc91cea0c8cf7439180cf796743c838b4d-dirty
onnodg
parents:
1
diff
changeset
|
15 import argparse |
|
706b7acdb230
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c2020ecc91cea0c8cf7439180cf796743c838b4d-dirty
onnodg
parents:
1
diff
changeset
|
16 from collections import Counter, defaultdict |
|
706b7acdb230
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c2020ecc91cea0c8cf7439180cf796743c838b4d-dirty
onnodg
parents:
1
diff
changeset
|
17 import os |
|
706b7acdb230
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c2020ecc91cea0c8cf7439180cf796743c838b4d-dirty
onnodg
parents:
1
diff
changeset
|
18 import re |
|
706b7acdb230
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c2020ecc91cea0c8cf7439180cf796743c838b4d-dirty
onnodg
parents:
1
diff
changeset
|
19 import matplotlib.pyplot as plt |
|
706b7acdb230
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c2020ecc91cea0c8cf7439180cf796743c838b4d-dirty
onnodg
parents:
1
diff
changeset
|
20 import pandas as pd |
|
706b7acdb230
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c2020ecc91cea0c8cf7439180cf796743c838b4d-dirty
onnodg
parents:
1
diff
changeset
|
21 from math import sqrt |
|
706b7acdb230
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c2020ecc91cea0c8cf7439180cf796743c838b4d-dirty
onnodg
parents:
1
diff
changeset
|
22 import openpyxl |
|
706b7acdb230
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c2020ecc91cea0c8cf7439180cf796743c838b4d-dirty
onnodg
parents:
1
diff
changeset
|
23 |
|
706b7acdb230
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c2020ecc91cea0c8cf7439180cf796743c838b4d-dirty
onnodg
parents:
1
diff
changeset
|
24 |
|
0
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
25 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
26 def parse_arguments(args_list=None): |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
27 """Parse command-line arguments for the script.""" |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
28 parser = argparse.ArgumentParser( |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
29 description='Create taxa analysis from cd-hit cluster files') |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
30 parser.add_argument('--input_cluster', type=str, required=True, |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
31 help='Input cluster file (.clstr)') |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
32 parser.add_argument('--input_annotation', type=str, required=False, |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
33 help='Input annotation file (.out)') |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
34 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
35 # Galaxy output files |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
36 parser.add_argument('--output_similarity_txt', type=str, |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
37 help='Similarity text output file') |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
38 parser.add_argument('--output_similarity_plot', type=str, |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
39 help='Similarity plot output file') |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
40 parser.add_argument('--output_evalue_txt', type=str, |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
41 help='E-value text output file') |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
42 parser.add_argument('--output_evalue_plot', type=str, |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
43 help='E-value plot output file') |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
44 parser.add_argument('--output_count', type=str, |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
45 help='Count summary output file') |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
46 parser.add_argument('--output_taxa_clusters', type=str, |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
47 help='Taxa per cluster output file') |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
48 parser.add_argument('--output_taxa_processed', type=str, |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
49 help='Processed taxa output file') |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
50 # Plot parameters |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
51 parser.add_argument('--simi_plot_y_min', type=float, default=95.0, |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
52 help='Minimum value of the y-axis in the similarity plot') |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
53 parser.add_argument('--simi_plot_y_max', type=float, default=100.0, |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
54 help='Maximum value of the y-axis in the similarity plot') |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
55 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
56 # Uncertain taxa configuration |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
57 parser.add_argument('--uncertain_taxa_use_ratio', type=float, default=0.5, |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
58 help='Ratio at which uncertain taxa count toward the correct taxa') |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
59 parser.add_argument('--min_to_split', type=float, default=0.45, |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
60 help='Minimum percentage for taxonomic split') |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
61 parser.add_argument('--min_count_to_split', type=int, default=10, |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
62 help='Minimum count for taxonomic split') |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
63 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
64 # Processing options |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
65 parser.add_argument('--show_unannotated_clusters', action='store_true', default=False, |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
66 help='Show unannotated clusters in output') |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
67 parser.add_argument('--make_taxa_in_cluster_split', action='store_true', default=False, |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
68 help='Split clusters with multiple taxa') |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
69 parser.add_argument('--print_empty_files', action='store_true', default=False, |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
70 help='Print messages about empty annotation files') |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
71 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
72 return parser.parse_args(args_list) |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
73 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
74 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
75 # Color map for plots |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
76 COLORMAP = [ |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
77 # List of RGBA tuples for bar coloring in plots |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
78 (0.12156862745098039, 0.4666666666666667, 0.7058823529411765, 1.0), |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
79 (1.0, 0.4980392156862745, 0.054901960784313725, 1.0), |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
80 (0.17254901960784313, 0.6274509803921569, 0.17254901960784313, 1.0), |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
81 (0.8392156862745098, 0.15294117647058825, 0.1568627450980392, 1.0), |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
82 (0.5803921568627451, 0.403921568627451, 0.7411764705882353, 1.0), |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
83 (0.5490196078431373, 0.33725490196078434, 0.29411764705882354, 1.0), |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
84 (0.8901960784313725, 0.4666666666666667, 0.7607843137254902, 1.0), |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
85 (0.4980392156862745, 0.4980392156862745, 0.4980392156862745, 1.0), |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
86 (0.7372549019607844, 0.7411764705882353, 0.13333333333333333, 1.0), |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
87 (0.09019607843137255, 0.7450980392156863, 0.8117647058823529, 1.0) |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
88 ] |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
89 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
90 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
91 def parse_cluster_file(cluster_file, annotation_file=None, print_empty=False, raise_on_error=False): |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
92 """ |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
93 Parse the cd-hit-est cluster file (.clstr) and (optionally) an Excel annotation file. |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
94 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
95 It extracts cluster information (header, read count, similarity) and associates |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
96 taxonomic information and E-values from the annotation file based on the read header. |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
97 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
98 :param cluster_file: Path to cd-hit cluster file (.clstr). |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
99 :type cluster_file: str |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
100 :param annotation_file: Path to Excel annotation file with taxa and e-values. |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
101 :type annotation_file: str, optional |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
102 :param print_empty: Print a message if the annotation file is not found or empty. |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
103 :type print_empty: bool |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
104 :param raise_on_error: Raise parsing errors instead of printing warnings. |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
105 :type raise_on_error: bool |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
106 :return: List of clusters, where each cluster is a dict mapping read header to a dict of read info. |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
107 :rtype: list[dict[str, dict]] |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
108 :raises ValueError: If similarity cannot be parsed from a cluster line. |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
109 :raises UnboundLocalError: If an error occurs during annotation processing. |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
110 """ |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
111 clusters = [] |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
112 current_cluster = {} |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
113 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
114 # Load annotations if provided |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
115 annotations = {} |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
116 if annotation_file and os.path.exists(annotation_file): |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
117 # Lees het Excel-bestand |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
118 df = pd.read_excel(annotation_file, sheet_name='Individual_Reads', engine='openpyxl') |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
119 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
120 # Itereer over de rijen |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
121 for _, row in df.iterrows(): |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
122 header = row['header'] # kolomnaam zoals in Excel |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
123 evalue = row['e_value'] # of de kolomnaam die je wilt gebruiken |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
124 taxa = row['taxa'] # afhankelijk van hoe je taxa opslaat |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
125 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
126 annotations[header] = {'evalue': evalue, 'taxa': taxa} |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
127 elif annotation_file and print_empty: |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
128 print(f"Annotation file {annotation_file} not found or empty") |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
129 with open(cluster_file, 'r') as f: |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
130 for line in f: |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
131 line = line.strip() |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
132 if line.startswith('>Cluster'): |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
133 # Start of new cluster, save previous if exists |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
134 if current_cluster: |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
135 clusters.append(current_cluster) |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
136 current_cluster = {} |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
137 else: |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
138 # Parse sequence line |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
139 parts = line.split() |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
140 if len(parts) >= 2: |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
141 # Extract header and count |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
142 header_part = parts[2].strip('>.') |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
143 header_parts = header_part.split('(') |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
144 if len(header_parts) > 1: |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
145 last_part = header_parts[-1].strip(')') |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
146 header = header_parts[0] |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
147 if last_part: |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
148 count = int(last_part) |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
149 else: |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
150 print('no count') |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
151 count = 0 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
152 header = '' |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
153 # Extract similarity |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
154 similarity_part = parts[-1].strip() |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
155 if '*' in similarity_part: |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
156 similarity = 100.0 # Representative sequence |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
157 else: |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
158 matches = re.findall(r'[\d.]+', similarity_part) |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
159 if matches: |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
160 similarity = float(matches[-1]) |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
161 else: |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
162 raise ValueError(f"Could not parse similarity from: '{similarity_part}'") |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
163 # Get annotation info |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
164 try: |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
165 if header in annotations: |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
166 taxa = annotations[header]['taxa'] |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
167 evalue = annotations[header]['evalue'] |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
168 else: |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
169 taxa = 'Unannotated read' |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
170 evalue = 'Unannotated read' |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
171 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
172 current_cluster[header] = { |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
173 'count': count, |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
174 'similarity': similarity, |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
175 'taxa': taxa, |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
176 'evalue': evalue |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
177 } |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
178 except UnboundLocalError as e: |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
179 if raise_on_error: |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
180 raise UnboundLocalError(str(e)) |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
181 print(f"Error: {e}, No annotations found") |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
182 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
183 # Add the last cluster |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
184 if current_cluster: |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
185 clusters.append(current_cluster) |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
186 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
187 return clusters |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
188 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
189 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
190 def process_cluster_data(cluster): |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
191 """ |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
192 Process a single cluster to extract E-value, similarity, and taxa data for all reads. |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
193 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
194 Aggregates information from all reads in the cluster, storing read counts, |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
195 E-values, similarities, and taxa in lists and a dictionary. |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
196 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
197 :param cluster: Cluster data mapping read headers to read info. |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
198 :type cluster: dict |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
199 :return: A tuple containing: (list of E-values, list of similarity values, dict of taxa -> counts). |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
200 The first element of the E-value list ([0]) stores the count of unannotated reads. |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
201 :rtype: tuple[list[float | int], list[float], dict[str, int]] |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
202 """ |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
203 eval_list = [0.0] # First element for unannotated count |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
204 simi_list = [] |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
205 taxa_dict = {'Unannotated read': 0} |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
206 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
207 for header, info in cluster.items(): |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
208 count = info['count'] |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
209 similarity = info['similarity'] |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
210 taxa = info['taxa'] |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
211 evalue = info['evalue'] |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
212 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
213 if evalue == 'Unannotated read': |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
214 eval_list[0] += count |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
215 taxa_dict['Unannotated read'] += count |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
216 else: |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
217 try: |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
218 eval_val = float(evalue) |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
219 for _ in range(count): |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
220 eval_list.append(eval_val) |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
221 except ValueError: |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
222 eval_list[0] += count |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
223 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
224 if taxa not in taxa_dict: |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
225 taxa_dict[taxa] = 0 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
226 taxa_dict[taxa] += count |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
227 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
228 # Add similarity values |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
229 for _ in range(count): |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
230 simi_list.append(similarity) |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
231 return eval_list, simi_list, taxa_dict |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
232 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
233 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
234 def calculate_cluster_taxa(taxa_dict, args): |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
235 """ |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
236 Calculate the most likely taxa for a cluster based on read counts. |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
237 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
238 This function applies the 'uncertain taxa use ratio' for unannotated reads |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
239 and uses a recursive approach to potentially split a cluster into sub-clusters |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
240 if taxonomic dominance is not strong enough (based on ``min_to_split`` |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
241 and ``min_count_to_split``). |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
242 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
243 :param taxa_dict: Mapping of taxa (full string) -> read counts. |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
244 :type taxa_dict: dict[str, int] |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
245 :param args: Parsed script arguments, including parameters for taxa calculation. |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
246 :type args: argparse.Namespace |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
247 :return: A list of refined taxa assignments (dictionaries), where each dictionary |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
248 represents a potentially split sub-cluster. |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
249 :rtype: list[dict[str, float | int]] |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
250 """ |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
251 # Replace 'Unannotated read' with uncertain taxa format |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
252 processed_dict = {} |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
253 for taxa, count in taxa_dict.items(): |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
254 if taxa == 'Unannotated read': |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
255 uncertain_taxa = ' / '.join(['Uncertain taxa'] * 7) |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
256 processed_dict[uncertain_taxa] = count |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
257 else: |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
258 processed_dict[taxa] = count |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
259 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
260 return _recursive_taxa_calculation(processed_dict, args, 0) |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
261 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
262 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
263 def _recursive_taxa_calculation(taxa_dict, args, level): |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
264 """ |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
265 Recursive helper to calculate and potentially split taxa at each taxonomic level. |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
266 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
267 :param taxa_dict: Taxa counts at the current level (or sub-group). |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
268 :type taxa_dict: dict[str, float | int] |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
269 :param args: Parsed script arguments. |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
270 :type args: argparse.Namespace |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
271 :param level: Index of the current taxonomic level (0=kingdom, max 6=species). |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
272 :type level: int |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
273 :return: List of refined taxa dictionaries. |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
274 :rtype: list[dict[str, float | int]] |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
275 """ |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
276 if level >= 7: # Max 7 taxonomic levels |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
277 return [taxa_dict] |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
278 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
279 level_dict = defaultdict(float) |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
280 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
281 # Group by taxonomic level |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
282 for taxa, count in taxa_dict.items(): |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
283 taxa_parts = taxa.split(' / ') |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
284 if level < len(taxa_parts): |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
285 level_taxon = taxa_parts[level] |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
286 if level_taxon == 'Uncertain taxa': |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
287 level_dict[level_taxon] += count * args.uncertain_taxa_use_ratio |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
288 else: |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
289 level_dict[level_taxon] += count |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
290 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
291 if len(level_dict) <= 1: |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
292 # Only one taxon at this level, continue to next level |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
293 return _recursive_taxa_calculation(taxa_dict, args, level + 1) |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
294 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
295 # Sort by abundance |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
296 sorted_taxa = sorted(level_dict.items(), key=lambda x: x[1], reverse=True) |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
297 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
298 result = [] |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
299 dominant_taxon = sorted_taxa[0][0] |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
300 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
301 # Check if we should split |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
302 for i in range(1, len(sorted_taxa)): |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
303 secondary_taxon = sorted_taxa[i][0] |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
304 total_count = sorted_taxa[0][1] + sorted_taxa[i][1] |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
305 ratio = sorted_taxa[i][1] / total_count |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
306 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
307 if ratio >= args.min_to_split or total_count <= args.min_count_to_split: |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
308 # Split off this taxon |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
309 split_dict = {taxa: count for taxa, count in taxa_dict.items() |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
310 if taxa.split(' / ')[level] == secondary_taxon} |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
311 result.extend(_recursive_taxa_calculation(split_dict, args, level + 1)) |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
312 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
313 # Process the dominant group |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
314 dominant_dict = {taxa: count for taxa, count in taxa_dict.items() |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
315 if taxa.split(' / ')[level] == dominant_taxon} |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
316 result.extend(_recursive_taxa_calculation(dominant_dict, args, level + 1)) |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
317 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
318 return result |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
319 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
320 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
321 def write_similarity_output(all_simi_data, output_file): |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
322 """ |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
323 Write the similarity text output, including the mean and standard deviation, |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
324 and a count per similarity percentage. |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
325 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
326 :param all_simi_data: List of all similarity percentages from all reads. |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
327 :type all_simi_data: list[float] |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
328 :param output_file: Path to the output file. |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
329 :type output_file: str |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
330 :return: None |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
331 :rtype: None |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
332 """ |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
333 if not all_simi_data: |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
334 return |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
335 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
336 avg = sum(all_simi_data) / len(all_simi_data) |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
337 variance = sum((s - avg) ** 2 for s in all_simi_data) / len(all_simi_data) |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
338 st_dev = sqrt(variance) |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
339 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
340 simi_counter = Counter(all_simi_data) |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
341 simi_sorted = sorted(simi_counter.items(), key=lambda x: -x[0]) |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
342 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
343 with open(output_file, 'w') as f: |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
344 f.write(f"# Average similarity: {avg:.2f}\n") |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
345 f.write(f"# Standard deviation: {st_dev:.2f}\n") |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
346 f.write("similarity\tcount\n") |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
347 for similarity, count in simi_sorted: |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
348 f.write(f"{similarity}\t{count}\n") |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
349 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
350 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
351 def write_evalue_output(all_eval_data, output_file): |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
352 """ |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
353 Write the E-value text output, including the count of unannotated reads |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
354 and a count per E-value. |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
355 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
356 :param all_eval_data: List of E-values from all reads. The first element ([0]) is the count of unannotated reads. |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
357 :type all_eval_data: list[float | int] |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
358 :param output_file: Path to the output file. |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
359 :type output_file: str |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
360 :return: None |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
361 :rtype: None |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
362 """ |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
363 unanno_count = all_eval_data[0] |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
364 eval_counter = Counter(all_eval_data[1:]) |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
365 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
366 with open(output_file, 'w') as f: |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
367 f.write("evalue\tcount\n") |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
368 if unanno_count > 0: |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
369 f.write(f"unannotated\t{unanno_count}\n") |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
370 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
371 eval_sorted = sorted(eval_counter.items(), |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
372 key=lambda x: (-x[1], float(x[0]) if isinstance(x[0], (int, float)) else float('inf'))) |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
373 for value, count in eval_sorted: |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
374 f.write(f"{value}\t{count}\n") |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
375 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
376 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
377 def write_count_output(all_eval_data, cluster_data_list, output_file): |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
378 """ |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
379 Write a summary of annotated and unannotated read counts per cluster |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
380 and for the total sample. |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
381 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
382 :param all_eval_data: List of E-values from all reads for the total count summary. |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
383 :type all_eval_data: list[float | int] |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
384 :param cluster_data_list: List of tuples (eval_list, simi_list, taxa_dict) per cluster. |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
385 :type cluster_data_list: list[tuple] |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
386 :param output_file: Path to the output file. |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
387 :type output_file: str |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
388 :return: None |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
389 :rtype: None |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
390 """ |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
391 with open(output_file, 'w') as f: |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
392 f.write("cluster\tunannotated\tannotated\ttotal\tperc_unannotated\tperc_annotated\n") |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
393 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
394 for cluster_num, (eval_list, _, _) in enumerate(cluster_data_list): |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
395 unannotated = eval_list[0] |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
396 annotated = len(eval_list[1:]) |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
397 total = unannotated + annotated |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
398 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
399 if total > 0: |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
400 perc_annotated = (annotated / total) * 100 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
401 perc_unannotated = (unannotated / total) * 100 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
402 else: |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
403 perc_annotated = perc_unannotated = 0 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
404 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
405 f.write( |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
406 f"{cluster_num}\t{unannotated}\t{annotated}\t{total}\t{perc_unannotated:.2f}\t{perc_annotated:.2f}\n") |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
407 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
408 # Add full sample summary |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
409 total_unannotated = all_eval_data[0] |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
410 total_annotated = len(all_eval_data[1:]) |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
411 grand_total = total_unannotated + total_annotated |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
412 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
413 if grand_total > 0: |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
414 perc_annotated = (total_annotated / grand_total) * 100 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
415 perc_unannotated = (total_unannotated / grand_total) * 100 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
416 else: |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
417 perc_annotated = perc_unannotated = 0 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
418 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
419 f.write( |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
420 f"TOTAL\t{total_unannotated}\t{total_annotated}\t{grand_total}\t{perc_unannotated:.2f}\t{perc_annotated:.2f}\n") |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
421 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
422 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
423 def write_taxa_clusters_output(cluster_data_list, output_file): |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
424 """ |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
425 Write raw taxa information per cluster to an Excel file. |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
426 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
427 Each row contains the cluster number, read count, the full taxa string, |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
428 and the separate taxonomic levels (kingdom through species). |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
429 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
430 :param cluster_data_list: List of tuples (eval_list, simi_list, taxa_dict) per cluster. |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
431 :type cluster_data_list: list[tuple] |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
432 :param output_file: Path to the output Excel file. |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
433 :type output_file: str |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
434 :return: None |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
435 :rtype: None |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
436 """ |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
437 # Create main dataframe |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
438 data = [] |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
439 for cluster_num, (_, _, taxa_dict) in enumerate(cluster_data_list): |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
440 for taxa, count in taxa_dict.items(): |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
441 if count > 0: |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
442 # Split taxa into taxonomic levels |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
443 taxa_levels = taxa.split(' / ') if taxa else [] |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
444 taxa_levels += ['Unannotated read'] * (7 - len(taxa_levels)) |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
445 try: |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
446 data.append({ |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
447 'cluster': cluster_num, |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
448 'count': count, |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
449 'taxa_full': taxa, |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
450 'kingdom': taxa_levels[0], |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
451 'phylum': taxa_levels[1], |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
452 'class': taxa_levels[2], |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
453 'order': taxa_levels[3], |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
454 'family': taxa_levels[4], |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
455 'genus': taxa_levels[5], |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
456 'species': taxa_levels[6] |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
457 }) |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
458 except IndexError as e: |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
459 # Skip entries with incomplete taxonomic data |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
460 print(f"Skipped entry in cluster {cluster_num}: incomplete taxonomic data for '{taxa}, error: {e}'") |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
461 continue |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
462 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
463 df = pd.DataFrame(data) |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
464 # Write to Excel |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
465 temp_path = output_file + ".xlsx" |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
466 os.makedirs(os.path.dirname(temp_path), exist_ok=True) |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
467 with pd.ExcelWriter(temp_path, engine='openpyxl') as writer: |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
468 df.to_excel(writer, sheet_name='Raw_Taxa_Clusters', index=False, engine='openpyxl') |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
469 os.replace(temp_path, output_file) |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
470 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
471 def write_taxa_processed_output(cluster_data_list, args, output_file): |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
472 """ |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
473 Write processed (potentially split) taxa information to an Excel file. |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
474 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
475 This file contains the resulting sub-clusters from the taxonomic dominance |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
476 analysis and a separate sheet documenting the parameters used. |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
477 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
478 :param cluster_data_list: List of tuples (eval_list, simi_list, taxa_dict) per cluster. |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
479 :type cluster_data_list: list[tuple] |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
480 :param args: Parsed script arguments, used for taxa calculation and settings documentation. |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
481 :type args: argparse.Namespace |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
482 :param output_file: Path to the output Excel file. |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
483 :type output_file: str |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
484 :return: None |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
485 :rtype: None |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
486 """ |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
487 # Create main dataframe |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
488 data = [] |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
489 for cluster_num, (_, _, taxa_dict) in enumerate(cluster_data_list): |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
490 processed_taxa = calculate_cluster_taxa(taxa_dict, args) |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
491 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
492 for taxa_group in processed_taxa: |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
493 for taxa, count in taxa_group.items(): |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
494 if 'Uncertain taxa / Uncertain taxa / Uncertain taxa' in taxa: |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
495 if args.show_unannotated_clusters: |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
496 data.append({ |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
497 'cluster': cluster_num, |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
498 'count': count, |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
499 'taxa_full': 'Unannotated read', |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
500 'kingdom': 'Unannotated', |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
501 'phylum': 'Unannotated', |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
502 'class': 'Unannotated', |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
503 'order': 'Unannotated', |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
504 'family': 'Unannotated', |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
505 'genus': 'Unannotated', |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
506 'species': 'Unannotated' |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
507 }) |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
508 else: |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
509 # Split taxa into taxonomic levels |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
510 taxa_levels = taxa.split(' / ') if taxa else [] |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
511 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
512 try: |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
513 data.append({ |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
514 'cluster': cluster_num, |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
515 'count': count, |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
516 'taxa_full': taxa, |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
517 'kingdom': taxa_levels[0], |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
518 'phylum': taxa_levels[1], |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
519 'class': taxa_levels[2], |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
520 'order': taxa_levels[3], |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
521 'family': taxa_levels[4], |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
522 'genus': taxa_levels[5], |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
523 'species': taxa_levels[6] |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
524 }) |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
525 except IndexError: |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
526 # Skip entries with incomplete taxonomic data |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
527 print(f"Skipped entry in cluster {cluster_num}: incomplete taxonomic data for '{taxa}'") |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
528 continue |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
529 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
530 df = pd.DataFrame(data) |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
531 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
532 # Create settings dataframe |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
533 settings_data = [ |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
534 ['uncertain_taxa_use_ratio', args.uncertain_taxa_use_ratio], |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
535 ['min_to_split', args.min_to_split], |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
536 ['min_count_to_split', args.min_count_to_split] |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
537 ] |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
538 settings_df = pd.DataFrame(settings_data, columns=['Parameter', 'Value']) |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
539 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
540 # Write to Excel with multiple sheets |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
541 temp_path = output_file + ".xlsx" |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
542 os.makedirs(os.path.dirname(temp_path), exist_ok=True) |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
543 with pd.ExcelWriter(temp_path, engine='openpyxl') as writer: |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
544 df.to_excel(writer, sheet_name='Processed_Taxa_Clusters', index=False, engine='openpyxl') |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
545 settings_df.to_excel(writer, sheet_name='Settings', index=False, engine='openpyxl') |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
546 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
547 # Auto-adjust column widths for better readability |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
548 for sheet_name in writer.sheets: |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
549 worksheet = writer.sheets[sheet_name] |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
550 for column in worksheet.columns: |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
551 max_length = 0 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
552 column_letter = column[0].column_letter |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
553 for cell in column: |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
554 try: |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
555 if len(str(cell.value)) > max_length: |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
556 max_length = len(str(cell.value)) |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
557 except: |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
558 pass |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
559 adjusted_width = min(max_length + 2, 50) # Cap at 50 characters |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
560 worksheet.column_dimensions[column_letter].width = adjusted_width |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
561 os.replace(temp_path, output_file) |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
562 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
563 def create_similarity_plot(all_simi_data, cluster_simi_lengths, args, output_file): |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
564 """ |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
565 Create a bar plot showing the distribution of intra-cluster similarity values. |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
566 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
567 The plot uses different colors to distinguish reads belonging to different clusters. |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
568 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
569 :param all_simi_data: List of all similarity values, sorted descending. |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
570 :type all_simi_data: list[float] |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
571 :param cluster_simi_lengths: List of lengths of similarity data per cluster, used for coloring. |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
572 :type cluster_simi_lengths: list[int] |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
573 :param args: Parsed script arguments, used for plot y-limits. |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
574 :type args: argparse.Namespace |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
575 :param output_file: Path to the output plot file (e.g., .png). |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
576 :type output_file: str |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
577 :return: None |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
578 :rtype: None |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
579 """ |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
580 if not all_simi_data: |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
581 return |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
582 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
583 sorted_simi_list = sorted(all_simi_data, reverse=True) |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
584 bar_positions = list(range(len(sorted_simi_list))) |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
585 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
586 # Create colormap for different clusters |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
587 colormap_full = [] |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
588 for i, length in enumerate(cluster_simi_lengths): |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
589 color = COLORMAP[i % len(COLORMAP)] |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
590 colormap_full.extend([color] * length) |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
591 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
592 plt.figure(figsize=(12, 6)) |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
593 plt.bar(bar_positions, sorted_simi_list, width=1, color=colormap_full) |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
594 plt.grid(axis='y', linestyle='--', color='gray', alpha=0.7) |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
595 plt.ylabel("Similarity (%)") |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
596 plt.xlabel("Reads") |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
597 plt.title("Intra-cluster Similarity Distribution") |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
598 plt.ylim(ymin=args.simi_plot_y_min, ymax=args.simi_plot_y_max) |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
599 plt.tight_layout() |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
600 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
601 plt.savefig(output_file, format='png', dpi=300, bbox_inches='tight') |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
602 plt.close() |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
603 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
604 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
605 def create_evalue_plot(all_eval_data, cluster_eval_lengths, output_file): |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
606 """ |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
607 Create a bar plot showing the distribution of E-values. |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
608 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
609 The y-axis is log-scaled and displays ``1/E-values``. Reads are ordered |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
610 by E-value (ascending). The plot uses different colors to distinguish reads |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
611 belonging to different clusters. |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
612 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
613 :param all_eval_data: List of E-values from all reads. Assumes E-values start at index 1. |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
614 :type all_eval_data: list[float | int] |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
615 :param cluster_eval_lengths: List of lengths of annotated E-value data per cluster, used for coloring. |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
616 :type cluster_eval_lengths: list[int] |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
617 :param output_file: Path to the output plot file (e.g., .png). |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
618 :type output_file: str |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
619 :return: None |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
620 :rtype: None |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
621 """ |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
622 if len(all_eval_data) <= 1: # Only unannotated reads |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
623 return |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
624 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
625 sorted_eval_list = sorted(all_eval_data[1:]) # Skip unannotated count |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
626 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
627 if not sorted_eval_list: |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
628 return |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
629 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
630 bar_positions = list(range(len(sorted_eval_list))) |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
631 bar_heights = [1 / e if e > 0 else 0 for e in sorted_eval_list] |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
632 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
633 # Create colormap for different clusters |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
634 colormap_full = [] |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
635 for i, length in enumerate(cluster_eval_lengths): |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
636 color = COLORMAP[i % len(COLORMAP)] |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
637 colormap_full.extend([color] * length) |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
638 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
639 plt.figure(figsize=(12, 6)) |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
640 plt.bar(bar_positions, bar_heights, width=1, color=colormap_full) |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
641 plt.yscale('log') |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
642 plt.grid(axis='y', linestyle='--', color='gray', alpha=0.7) |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
643 plt.ylabel("1/E-values") |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
644 plt.xlabel("Reads") |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
645 plt.title("E-value Distribution") |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
646 plt.tight_layout() |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
647 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
648 plt.savefig(output_file, format='png', dpi=300, bbox_inches='tight') |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
649 plt.close() |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
650 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
651 def prepare_evalue_histogram(evalue_list, unannotated_list): |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
652 """ |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
653 Generate histogram data for E-value distributions. |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
654 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
655 This function processes a list of E-values from BLAST or similar search |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
656 results, filters out invalid or zero entries, and computes histogram data |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
657 suitable for plotting. The histogram represents the frequency distribution |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
658 of E-values across all annotated hits. |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
659 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
660 :param evalue_list: List of E-values from BLAST hits |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
661 :type evalue_list: list[float | int] |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
662 :param unannotated_list: List of unannotated E-values |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
663 :type unannotated_list: list |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
664 :return: Tuple containing: |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
665 - **counts** (*numpy.ndarray*): Number of entries per histogram bin. |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
666 - **bins** (*numpy.ndarray*): Bin edges corresponding to the histogram. |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
667 Returns ``(None, None)`` if no valid data is available. |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
668 :rtype: tuple[numpy.ndarray, numpy.ndarray] | tuple[None, None] |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
669 :note: |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
670 - Only positive numeric E-values are included in the histogram. |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
671 - Uses 50 bins in the range (0, 1) for visualization consistency. |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
672 """ |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
673 data = [ev for ev in evalue_list if isinstance(ev, (int, float)) and ev > 0] |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
674 if not data: |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
675 return None, None |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
676 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
677 counts, bins, _ = plt.hist(data, bins=50, range=(0, 1)) |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
678 plt.close() |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
679 return counts, bins |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
680 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
681 def create_evalue_plot_test(evalue_list, unannotated_list, output_file): |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
682 """ |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
683 Create and save an E-value distribution plot, returning the computed histogram data. |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
684 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
685 This function visualizes the frequency distribution of E-values from BLAST or |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
686 annotation results. It saves the plot to the specified output file and returns |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
687 the histogram data (counts and bins) for testing with pytests. |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
688 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
689 :param evalue_list: List of numeric E-values to plot |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
690 :type evalue_list: list[float | int] |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
691 :param unannotated_list: Optional list of E-values for unannotated sequences. |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
692 :type unannotated_list: list |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
693 :param output_file: Path where the histogram image will be saved. |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
694 :type output_file: str |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
695 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
696 :return: Tuple containing: |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
697 - **counts** (*numpy.ndarray*): Frequency counts per histogram bin. |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
698 - **bins** (*numpy.ndarray*): Histogram bin edges. |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
699 Returns ``(None, None)`` if no valid data was available for plotting. |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
700 :rtype: tuple[numpy.ndarray, numpy.ndarray] | tuple[None, None] |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
701 """ |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
702 counts, bins = prepare_evalue_histogram(evalue_list, unannotated_list) |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
703 if counts is None: |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
704 return None, None |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
705 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
706 plt.hist([ev for ev in evalue_list if isinstance(ev, (int, float)) and ev > 0], |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
707 bins=50, range=(0, 1)) |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
708 plt.xlabel("E-value") |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
709 plt.ylabel("Frequency") |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
710 plt.title("E-value Distribution") |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
711 plt.savefig(output_file) |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
712 plt.close() |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
713 return counts, bins |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
714 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
715 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
716 def main(arg_list=None): |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
717 """ |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
718 Main entry point of the script. |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
719 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
720 Parses arguments, processes cd-hit cluster data, aggregates results, |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
721 and generates requested outputs (text summaries, plots, and Excel reports). |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
722 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
723 :param arg_list: List of arguments for testing purposes. |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
724 :type arg_list: list, optional |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
725 :return: None |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
726 :rtype: None |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
727 """ |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
728 args = parse_arguments(arg_list) |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
729 # Parse cluster file |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
730 clusters = parse_cluster_file( |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
731 args.input_cluster, |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
732 args.input_annotation, |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
733 args.print_empty_files |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
734 ) |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
735 # Process each cluster |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
736 all_eval_data = [0] # For full sample statistics |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
737 all_simi_data = [] |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
738 cluster_eval_lengths = [] |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
739 cluster_simi_lengths = [] |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
740 cluster_data_list = [] |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
741 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
742 for cluster in clusters: |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
743 eval_list, simi_list, taxa_dict = process_cluster_data(cluster) |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
744 cluster_data_list.append((eval_list, simi_list, taxa_dict)) |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
745 # Collect data for full sample plots |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
746 all_eval_data[0] += eval_list[0] |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
747 if len(eval_list) > 1: |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
748 all_eval_data.extend(sorted(eval_list[1:])) |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
749 cluster_eval_lengths.append(len(eval_list[1:])) |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
750 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
751 if simi_list: |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
752 all_simi_data.extend(sorted(simi_list, reverse=True)) |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
753 cluster_simi_lengths.append(len(simi_list)) |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
754 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
755 # Generate outputs based on what was requested |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
756 if args.output_similarity_txt: |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
757 write_similarity_output(all_simi_data, args.output_similarity_txt) |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
758 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
759 if args.output_similarity_plot and all_simi_data: |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
760 create_similarity_plot(all_simi_data, cluster_simi_lengths, args, args.output_similarity_plot) |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
761 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
762 if args.output_evalue_txt: |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
763 write_evalue_output(all_eval_data, args.output_evalue_txt) |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
764 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
765 if args.output_evalue_plot and len(all_eval_data) > 1: |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
766 create_evalue_plot(all_eval_data, cluster_eval_lengths, args.output_evalue_plot) |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
767 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
768 if args.output_count: |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
769 write_count_output(all_eval_data, cluster_data_list, args.output_count) |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
770 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
771 if args.output_taxa_clusters: |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
772 write_taxa_clusters_output(cluster_data_list, args.output_taxa_clusters) |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
773 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
774 if args.output_taxa_processed: |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
775 write_taxa_processed_output(cluster_data_list, args, args.output_taxa_processed) |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
776 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
777 print(f"Processing complete. Processed {len(clusters)} clusters.") |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
778 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
779 |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
780 if __name__ == "__main__": |
|
00d56396b32a
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_clusters_tool commit c944fd5685f295acba06679e85b67973c173b137
onnodg
parents:
diff
changeset
|
781 main() |
