Mercurial > repos > maciek > spamr_vet_tools
annotate spamr_vet_tools_v2/quast_get_fasta.py @ 2:d7b099fbb003 draft default tip
Corrected file names and updated tool wrappers for consistency.
author | maciek |
---|---|
date | Tue, 25 Mar 2025 13:35:00 +0000 |
parents | |
children |
rev | line source |
---|---|
2
d7b099fbb003
Corrected file names and updated tool wrappers for consistency.
maciek
parents:
diff
changeset
|
1 import json |
d7b099fbb003
Corrected file names and updated tool wrappers for consistency.
maciek
parents:
diff
changeset
|
2 import csv |
d7b099fbb003
Corrected file names and updated tool wrappers for consistency.
maciek
parents:
diff
changeset
|
3 import sys |
d7b099fbb003
Corrected file names and updated tool wrappers for consistency.
maciek
parents:
diff
changeset
|
4 import os |
d7b099fbb003
Corrected file names and updated tool wrappers for consistency.
maciek
parents:
diff
changeset
|
5 |
d7b099fbb003
Corrected file names and updated tool wrappers for consistency.
maciek
parents:
diff
changeset
|
def _n50_filter(n50, threshold):
    """Return "pass" when *n50* is numeric and strictly above *threshold*, else "fail"."""
    try:
        # Missing/empty values (None, "", 0) are treated as 0.
        value = float(n50) if n50 else 0
    except (TypeError, ValueError):
        # Non-numeric strings raise ValueError; lists/dicts raise TypeError.
        return "fail"
    return "pass" if value > threshold else "fail"


def extract_software_data(json_data, software_name, n50_threshold=20000):
    """
    Write the assembly metrics found in *json_data* to "<software_name>_output.csv".

    Args:
        json_data: Either a dict with a "results" list, or a list of such
            dicts; in the latter case the first dict whose
            "analysis_software_name" equals *software_name* is used.
        software_name: Name to look up (the script passes "quast"); it is
            also used as the prefix of the output file name.
        n50_threshold: "Filter_N50" is "pass" when the item's "N50" is
            strictly greater than this value, otherwise "fail".

    Returns:
        None. When no usable dict is found a message is printed and no file
        is written.

    One CSV row is produced per dict found in the "content" list of each
    result entry. Columns absent from an item are left empty. Entries that
    are not dicts (or whose "content" is not a list) are skipped.
    """
    headers = [
        "Assembly",
        "contigs_(>=_0_bp)",
        "contigs_(>=_1000_bp)",
        "Total_length_(>=_0_bp)",
        "Total_length_(>=_1000_bp)",
        "contigs",
        "Largest_contig",
        "Total_length",
        "GC",
        "N50",
        "Filter_N50",
        "N90",
        "auN",
        "L50",
        "L90",
        "total_reads",
        "left",
        "right",
        "Mapped",
        "Properly_paired",
        "Avg._coverage_depth",
        "Coverage_>=_1x",
        "N's_per_100_kbp",
    ]

    # Ensure json_data is a dictionary: pick the matching section of a list.
    if isinstance(json_data, list):
        json_data = next(
            (
                entry
                for entry in json_data
                if isinstance(entry, dict)
                and entry.get("analysis_software_name") == software_name
            ),
            None,
        )

    if not isinstance(json_data, dict):
        print(f"Invalid JSON format for {software_name} extraction.")
        return

    results = json_data.get("results", [])
    if not isinstance(results, list):
        # A null or otherwise malformed "results" yields a header-only CSV.
        results = []

    output_csv_file = f"{software_name}_output.csv"
    extracted_data = []

    for entry in results:
        if not isinstance(entry, dict):
            continue
        content = entry.get("content")
        if not isinstance(content, list):
            continue
        for content_item in content:
            if not isinstance(content_item, dict):
                continue
            # Build the row from the header list so the two cannot drift apart.
            row = {column: content_item.get(column, "") for column in headers}
            # Filter_N50 is always computed, never copied from the input.
            row["Filter_N50"] = _n50_filter(content_item.get("N50", ""), n50_threshold)
            extracted_data.append(row)

    with open(output_csv_file, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=headers)
        writer.writeheader()
        writer.writerows(extracted_data)

    print(f"CSV file successfully generated: {output_csv_file}")
d7b099fbb003
Corrected file names and updated tool wrappers for consistency.
maciek
parents:
diff
changeset
|
90 |
d7b099fbb003
Corrected file names and updated tool wrappers for consistency.
maciek
parents:
diff
changeset
|
def extract_contigs_to_fasta(json_data):
    """
    Write the contigs found in the "shovill" section to "shovill_contigs.fasta".

    Args:
        json_data: Either a dict with a "results" list, or a list of such
            dicts; in the latter case the first dict whose
            "analysis_software_name" equals "shovill" is used.

    Returns:
        None. When no usable dict is found a message is printed and no file
        is written.

    Each dict in the "content" list of a result entry becomes one record
    with the header ">{name}_{length}_{coverage}" ("unknown" replaces a
    missing field) followed by its "sequence" on a single line. Items that
    are not dicts, or that carry no sequence, are skipped.
    """
    if isinstance(json_data, list):
        json_data = next(
            (
                entry
                for entry in json_data
                if isinstance(entry, dict)
                and entry.get("analysis_software_name") == "shovill"
            ),
            None,
        )

    if not isinstance(json_data, dict):
        print("Invalid JSON format for shovill extraction.")
        return

    results = json_data.get("results", [])
    if not isinstance(results, list):
        # A null or otherwise malformed "results" yields an empty FASTA file.
        results = []

    output_fasta_file = "shovill_contigs.fasta"

    with open(output_fasta_file, "w", encoding="utf-8") as f:
        for entry in results:
            if not isinstance(entry, dict):
                continue
            content = entry.get("content")
            if not isinstance(content, list):
                continue
            for content_item in content:
                if not isinstance(content_item, dict):
                    continue
                sequence = content_item.get("sequence", "")
                if not sequence:
                    # A header with no sequence is not a usable FASTA record.
                    continue
                name = content_item.get("name", "unknown")
                length = content_item.get("length", "unknown")
                coverage = content_item.get("coverage", "unknown")

                header = f">{name}_{length}_{coverage}"
                f.write(f"{header}\n{sequence}\n")

    print(f"FASTA file successfully generated: {output_fasta_file}")
d7b099fbb003
Corrected file names and updated tool wrappers for consistency.
maciek
parents:
diff
changeset
|
118 |
d7b099fbb003
Corrected file names and updated tool wrappers for consistency.
maciek
parents:
diff
changeset
|
if __name__ == "__main__":
    # Command-line entry point: expects exactly one argument, the input JSON path.
    if len(sys.argv) != 2:
        # Diagnostics go to stderr so they do not mix with normal output.
        print("Usage: python script.py input.json", file=sys.stderr)
        sys.exit(1)

    input_json_file = sys.argv[1]

    try:
        with open(input_json_file, "r", encoding="utf-8") as file:
            json_data = json.load(file)
        # Produce the QUAST metrics CSV and the shovill contigs FASTA.
        extract_software_data(json_data, "quast")
        extract_contigs_to_fasta(json_data)
    except Exception as e:
        # Top-level boundary: report any failure and signal it via the exit code.
        print(f"Error processing file: {e}", file=sys.stderr)
        sys.exit(1)

    sys.exit(0)