comparison add_taxonomic_labels.py @ 0:abd214795fa5 draft

planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/add_header_tool commit c944fd5685f295acba06679e85b67973c173b137
author onnodg
date Tue, 14 Oct 2025 09:07:01 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:abd214795fa5
1 """This script processes the output obtained from a curated BLAST database
2 and prepares the correct input format for downstream analysis.
3
4 The difference before and after running this script is that taxonomic labels
5 in the BLAST output are no longer marked as unknown, because the curated database
6 only includes taxonomy information in the headers of the output sequences.
7
8 To extract the correct taxonomy levels, it is necessary to know the exact
9 positions of the required taxonomic ranks. These positions must be specified
10 in the `--taxon_levels`, based on splitting each header string first on '=' and
11 then on whitespace.
12
13 Important:
14 This script is not needed for GenBank-based databases, since their output
15 is already usable.
16 `--taxon_levels` is a critical variable — do not modify it unless you
17 understand what you're doing.
18 """
19
20 import argparse
21
22 def parse_arguments(args_list=None):
23 """Parse the command line arguments."""
24 parser = argparse.ArgumentParser(description="Add taxonomix labels from curated database output")
25 parser.add_argument("-i", "--input_file", required=True)
26 parser.add_argument("-o", "--output_file", required=True)
27 parser.add_argument("-t", "--taxon_levels", type=int, nargs="+", default=[1, 2, 4, 7, 11, 12, 13])
28 return parser.parse_args(args_list)
29
30
31 def add_labels(input, output, taxon_levels):
32 """
33 Add taxonomic labels to BLAST output.
34
35 :param input: Path to BLAST output file.
36 :type input: str
37 :param output: Path to output file.
38 :type output: str
39 :param taxon_levels: List of taxonomic levels.
40 :type taxon_levels: list
41 :return: None
42 :rtype: None
43 :raises: ValueError: if certain fields are missing from the input
44 """
45 # Make header in file
46 with open(output, 'w') as outfile:
47 outfile.write('#Query ID #Subject #Subject accession #Subject Taxonomy ID #Identity percentage '
48 '#Coverage #evalue #bitscore #Source #Taxonomy\n')
49
50 with open(input, 'r') as infile, open(output, 'a') as outfile:
51 for line in infile:
52 if "#Query ID" not in line:
53 new_taxa = ''
54 src_val = ""
55 if "superkingdom=" in line and "markercode=" in line and "Genbank" in line:
56 taxonomy = line.split("superkingdom=")[-1].split("markercode=")[0].strip()
57 line = line.split("Genbank")[0].strip()
58 parts = line.strip().split()
59 else:
60 raise ValueError("Line does not contain expected fields: superkingdom, markercode, or Genbank")
61
62 for p in parts:
63 if p.startswith("source="):
64 src_val = p.split("=", 1)[1] # only keep the value after '='
65 break # we only need to get the source once
66 if src_val:
67 line = line.strip() + "\t" + src_val
68
69 for level in taxon_levels:
70 if level != taxon_levels[-1]:
71 new_taxa += taxonomy.split('=')[level].split(' ')[0]
72 new_taxa += ' / '
73 else:
74 # The species name already contains a space, so it
75 # should not be split further.
76 new_taxa += taxonomy.split('=')[level]
77 line += '\t'
78 line += new_taxa
79 outfile.write(f'{line}\n')
80
81
82 def main(arg_list=None):
83 args = parse_arguments(arg_list)
84 add_labels(args.input_file, args.output_file, args.taxon_levels)
85
86
87 if __name__ == '__main__':
88 main()