Mercurial > repos > onnodg > add_taxonomic_labels
comparison add_taxonomic_labels.py @ 0:abd214795fa5 draft
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/add_header_tool commit c944fd5685f295acba06679e85b67973c173b137
| author | onnodg |
|---|---|
| date | Tue, 14 Oct 2025 09:07:01 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:abd214795fa5 |
|---|---|
| 1 """This script processes the output obtained from a curated BLAST database | |
| 2 and prepares the correct input format for downstream analysis. | |
| 3 | |
| 4 The difference before and after running this script is that taxonomic labels | |
| 5 in the BLAST output are no longer marked as unknown, because the curated database | |
| 6 only includes taxonomy information in the headers of the output sequences. | |
| 7 | |
| 8 To extract the correct taxonomy levels, it is necessary to know the exact | |
| 9 positions of the required taxonomic ranks. These positions must be specified | |
| 10 in the `--taxon_levels`, based on splitting each header string first on '=' and | |
| 11 then on whitespace. | |
| 12 | |
| 13 Important: | |
| 14 This script is not needed for GenBank-based databases, since their output | |
| 15 is already usable. | |
| 16 `--taxon_levels` is a critical variable — do not modify it unless you | |
| 17 understand what you're doing. | |
| 18 """ | |
| 19 | |
| 20 import argparse | |
| 21 | |
| 22 def parse_arguments(args_list=None): | |
| 23 """Parse the command line arguments.""" | |
| 24 parser = argparse.ArgumentParser(description="Add taxonomix labels from curated database output") | |
| 25 parser.add_argument("-i", "--input_file", required=True) | |
| 26 parser.add_argument("-o", "--output_file", required=True) | |
| 27 parser.add_argument("-t", "--taxon_levels", type=int, nargs="+", default=[1, 2, 4, 7, 11, 12, 13]) | |
| 28 return parser.parse_args(args_list) | |
| 29 | |
| 30 | |
| 31 def add_labels(input, output, taxon_levels): | |
| 32 """ | |
| 33 Add taxonomic labels to BLAST output. | |
| 34 | |
| 35 :param input: Path to BLAST output file. | |
| 36 :type input: str | |
| 37 :param output: Path to output file. | |
| 38 :type output: str | |
| 39 :param taxon_levels: List of taxonomic levels. | |
| 40 :type taxon_levels: list | |
| 41 :return: None | |
| 42 :rtype: None | |
| 43 :raises: ValueError: if certain fields are missing from the input | |
| 44 """ | |
| 45 # Make header in file | |
| 46 with open(output, 'w') as outfile: | |
| 47 outfile.write('#Query ID #Subject #Subject accession #Subject Taxonomy ID #Identity percentage ' | |
| 48 '#Coverage #evalue #bitscore #Source #Taxonomy\n') | |
| 49 | |
| 50 with open(input, 'r') as infile, open(output, 'a') as outfile: | |
| 51 for line in infile: | |
| 52 if "#Query ID" not in line: | |
| 53 new_taxa = '' | |
| 54 src_val = "" | |
| 55 if "superkingdom=" in line and "markercode=" in line and "Genbank" in line: | |
| 56 taxonomy = line.split("superkingdom=")[-1].split("markercode=")[0].strip() | |
| 57 line = line.split("Genbank")[0].strip() | |
| 58 parts = line.strip().split() | |
| 59 else: | |
| 60 raise ValueError("Line does not contain expected fields: superkingdom, markercode, or Genbank") | |
| 61 | |
| 62 for p in parts: | |
| 63 if p.startswith("source="): | |
| 64 src_val = p.split("=", 1)[1] # only keep the value after '=' | |
| 65 break # we only need to get the source once | |
| 66 if src_val: | |
| 67 line = line.strip() + "\t" + src_val | |
| 68 | |
| 69 for level in taxon_levels: | |
| 70 if level != taxon_levels[-1]: | |
| 71 new_taxa += taxonomy.split('=')[level].split(' ')[0] | |
| 72 new_taxa += ' / ' | |
| 73 else: | |
| 74 # The species name already contains a space, so it | |
| 75 # should not be split further. | |
| 76 new_taxa += taxonomy.split('=')[level] | |
| 77 line += '\t' | |
| 78 line += new_taxa | |
| 79 outfile.write(f'{line}\n') | |
| 80 | |
| 81 | |
| 82 def main(arg_list=None): | |
| 83 args = parse_arguments(arg_list) | |
| 84 add_labels(args.input_file, args.output_file, args.taxon_levels) | |
| 85 | |
| 86 | |
| 87 if __name__ == '__main__': | |
| 88 main() |
