annotate spamr_vet_tools_v2/quast_get_fasta.py @ 2:d7b099fbb003 draft default tip

Corrected file names and updated tool wrappers for consistency.
author maciek
date Tue, 25 Mar 2025 13:35:00 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
2
d7b099fbb003 Corrected file names and updated tool wrappers for consistency.
maciek
parents:
diff changeset
1 import json
d7b099fbb003 Corrected file names and updated tool wrappers for consistency.
maciek
parents:
diff changeset
2 import csv
d7b099fbb003 Corrected file names and updated tool wrappers for consistency.
maciek
parents:
diff changeset
3 import sys
d7b099fbb003 Corrected file names and updated tool wrappers for consistency.
maciek
parents:
diff changeset
4 import os
d7b099fbb003 Corrected file names and updated tool wrappers for consistency.
maciek
parents:
diff changeset
5
d7b099fbb003 Corrected file names and updated tool wrappers for consistency.
maciek
parents:
diff changeset
def extract_software_data(json_data, software_name):
    """
    Write one tool's assembly metrics from the JSON report to a CSV file.

    ``json_data`` is either a single section (a dict) or a list of sections.
    For a list, the first dict whose "analysis_software_name" equals
    ``software_name`` is used. Each dict found inside a "content" list of the
    section's "results" becomes one row of ``<software_name>_output.csv``,
    written to the current working directory. Metrics absent from an item are
    written as empty strings.

    The "Filter_N50" column is computed, not copied: it is "pass" when "N50"
    is a number strictly greater than 20000, and "fail" otherwise (including
    a missing or non-numeric "N50").

    When no usable section is found, a message is printed and no file is
    written. The function always returns None.
    """
    # Pick the matching section; entries that are not dicts are ignored
    # instead of raising on the membership test.
    if isinstance(json_data, list):
        json_data = next(
            (
                entry
                for entry in json_data
                if isinstance(entry, dict)
                and entry.get("analysis_software_name") == software_name
            ),
            None,
        )

    if not isinstance(json_data, dict):
        print(f"Invalid JSON format for {software_name} extraction.")
        return

    # Column order of the CSV; also the keys looked up in every content item.
    headers = [
        "Assembly",
        "contigs_(>=_0_bp)",
        "contigs_(>=_1000_bp)",
        "Total_length_(>=_0_bp)",
        "Total_length_(>=_1000_bp)",
        "contigs",
        "Largest_contig",
        "Total_length",
        "GC",
        "N50",
        "Filter_N50",
        "N90",
        "auN",
        "L50",
        "L90",
        "total_reads",
        "left",
        "right",
        "Mapped",
        "Properly_paired",
        "Avg._coverage_depth",
        "Coverage_>=_1x",
        "N's_per_100_kbp",
    ]
    output_csv_file = f"{software_name}_output.csv"
    n50_threshold = 20000

    results = json_data.get("results", [])
    if not isinstance(results, list):
        # A null or scalar "results" carries no rows.
        results = []

    extracted_data = []
    for entry in results:
        content = entry.get("content") if isinstance(entry, dict) else None
        if not isinstance(content, list):
            continue
        for content_item in content:
            if not isinstance(content_item, dict):
                continue

            n50 = content_item.get("N50", "")
            try:
                n50_value = float(n50) if n50 else 0
                filter_n50 = "pass" if n50_value > n50_threshold else "fail"
            except (TypeError, ValueError):
                # Non-numeric text raises ValueError, containers raise
                # TypeError; either way the assembly cannot pass the filter.
                filter_n50 = "fail"

            # Build the row from the header list so the two cannot drift
            # apart; Filter_N50 is then replaced by the computed verdict.
            row = {column: content_item.get(column, "") for column in headers}
            row["Filter_N50"] = filter_n50
            extracted_data.append(row)

    with open(output_csv_file, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=headers)
        writer.writeheader()
        writer.writerows(extracted_data)

    print(f"CSV file successfully generated: {output_csv_file}")
d7b099fbb003 Corrected file names and updated tool wrappers for consistency.
maciek
parents:
diff changeset
90
d7b099fbb003 Corrected file names and updated tool wrappers for consistency.
maciek
parents:
diff changeset
def extract_contigs_to_fasta(json_data, software_name="shovill"):
    """
    Write the contigs of one tool's section of the JSON report as FASTA.

    ``json_data`` is either a single section (a dict) or a list of sections.
    For a list, the first dict whose "analysis_software_name" equals
    ``software_name`` (default "shovill") is used. Each dict found inside a
    "content" list of the section's "results" becomes one record in
    ``<software_name>_contigs.fasta`` in the current working directory:

        >{name}_{length}_{coverage}
        {sequence}

    A missing or null "name", "length" or "coverage" is written as
    "unknown"; a missing or null "sequence" is written as an empty line.

    When no usable section is found, a message is printed and no file is
    written. The function always returns None.
    """
    # Pick the matching section; entries that are not dicts are ignored
    # instead of raising on the membership test.
    if isinstance(json_data, list):
        json_data = next(
            (
                entry
                for entry in json_data
                if isinstance(entry, dict)
                and entry.get("analysis_software_name") == software_name
            ),
            None,
        )

    if not isinstance(json_data, dict):
        print(f"Invalid JSON format for {software_name} extraction.")
        return

    results = json_data.get("results", [])
    if not isinstance(results, list):
        # A null or scalar "results" carries no contigs.
        results = []
    output_fasta_file = f"{software_name}_contigs.fasta"

    with open(output_fasta_file, "w", encoding="utf-8") as f:
        for entry in results:
            content = entry.get("content") if isinstance(entry, dict) else None
            if not isinstance(content, list):
                continue
            for content_item in content:
                if not isinstance(content_item, dict):
                    continue

                # Compare against None explicitly: a JSON null must not end
                # up as the text "None", while a legitimate 0 must be kept.
                name, length, coverage = (
                    "unknown" if content_item.get(key) is None else content_item[key]
                    for key in ("name", "length", "coverage")
                )
                sequence = content_item.get("sequence")
                if sequence is None:
                    sequence = ""

                header = f">{name}_{length}_{coverage}"
                f.write(f"{header}\n{sequence}\n")

    print(f"FASTA file successfully generated: {output_fasta_file}")
d7b099fbb003 Corrected file names and updated tool wrappers for consistency.
maciek
parents:
diff changeset
118
d7b099fbb003 Corrected file names and updated tool wrappers for consistency.
maciek
parents:
diff changeset
if __name__ == "__main__":
    # Command-line entry point: takes the path of one JSON report, writes the
    # quast metrics CSV and the shovill contigs FASTA, and exits with status 1
    # on a usage error or any processing failure.
    if len(sys.argv) != 2:
        # Name the real script in the usage line and keep diagnostics on
        # stderr so they do not mix with the normal progress messages.
        script_name = os.path.basename(sys.argv[0]) or "script.py"
        print(f"Usage: python {script_name} input.json", file=sys.stderr)
        sys.exit(1)

    input_json_file = sys.argv[1]

    try:
        with open(input_json_file, "r", encoding="utf-8") as file:
            json_data = json.load(file)
        extract_software_data(json_data, "quast")
        extract_contigs_to_fasta(json_data)
    except Exception as e:
        # Top-level boundary: report any failure (unreadable file, invalid
        # JSON, write error) and signal it through the exit status.
        print(f"Error processing file: {e}", file=sys.stderr)
        sys.exit(1)

    sys.exit(0)