Mercurial > repos > onnodg > add_taxonomic_labels
view add_taxonomic_labels.py @ 1:5155c1c41198 draft default tip
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/add_header_tool commit d771f9fbfd42bcdeda1623d954550882a0863847-dirty
| author | onnodg |
|---|---|
| date | Mon, 20 Oct 2025 12:25:29 +0000 |
| parents | abd214795fa5 |
| children |
line wrap: on
line source
"""This script processes the output obtained from a curated BLAST database and prepares the correct input format for downstream analysis. The difference before and after running this script is that taxonomic labels in the BLAST output are no longer marked as unknown, because the curated database only includes taxonomy information in the headers of the output sequences. To extract the correct taxonomy levels, it is necessary to know the exact positions of the required taxonomic ranks. These positions must be specified in the `--taxon_levels`, based on splitting each header string first on '=' and then on whitespace. Important: This script is not needed for GenBank-based databases, since their output is already usable. `--taxon_levels` is a critical variable — do not modify it unless you understand what you're doing. """ import argparse def parse_arguments(args_list=None): """Parse the command line arguments.""" parser = argparse.ArgumentParser(description="Add taxonomix labels from curated database output") parser.add_argument("-i", "--input_file", required=True) parser.add_argument("-o", "--output_file", required=True) parser.add_argument("-t", "--taxon_levels", type=int, nargs="+", default=[1, 2, 4, 7, 11, 12, 13]) return parser.parse_args(args_list) def add_labels(input, output, taxon_levels): """ Add taxonomic labels to BLAST output. :param input: Path to BLAST output file. :type input: str :param output: Path to output file. :type output: str :param taxon_levels: List of taxonomic levels. :type taxon_levels: list :return: None :rtype: None :raises: ValueError: if certain fields are missing from the input """ # Make header in file with open(output, 'w') as outfile: outfile.write('#Query ID #Subject #Subject accession #Subject Taxonomy ID #Identity percentage ' '#Coverage #evalue #bitscore #Source #Taxonomy\n') with open(input, 'r') as infile, open(output, 'a') as outfile: for line in infile: if "#Query ID" not in line: new_taxa = '' src_val = "" if "superkingdom=" in line and "markercode=" in line and "Genbank" in line: taxonomy = line.split("superkingdom=")[-1].split("markercode=")[0].strip() line = line.split("Genbank")[0].strip() parts = line.strip().split() else: raise ValueError("Line does not contain expected fields: superkingdom, markercode, or Genbank") for p in parts: if p.startswith("source="): src_val = p.split("=", 1)[1] # only keep the value after '=' break # we only need to get the source once if src_val: line = line.strip() + "\t" + src_val for level in taxon_levels: if level != taxon_levels[-1]: new_taxa += taxonomy.split('=')[level].split(' ')[0] new_taxa += ' / ' else: # The species name already contains a space, so it # should not be split further. new_taxa += taxonomy.split('=')[level] line += '\t' line += new_taxa outfile.write(f'{line}\n') def main(arg_list=None): args = parse_arguments(arg_list) add_labels(args.input_file, args.output_file, args.taxon_levels) if __name__ == '__main__': main()
