Mercurial > repos > onnodg > add_taxonomic_labels
diff add_taxonomic_labels.py @ 0:abd214795fa5 draft
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/add_header_tool commit c944fd5685f295acba06679e85b67973c173b137
| author | onnodg |
|---|---|
| date | Tue, 14 Oct 2025 09:07:01 +0000 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/add_taxonomic_labels.py Tue Oct 14 09:07:01 2025 +0000 @@ -0,0 +1,88 @@ +"""This script processes the output obtained from a curated BLAST database +and prepares the correct input format for downstream analysis. + +The difference before and after running this script is that taxonomic labels +in the BLAST output are no longer marked as unknown, because the curated database +only includes taxonomy information in the headers of the output sequences. + +To extract the correct taxonomy levels, it is necessary to know the exact +positions of the required taxonomic ranks. These positions must be specified +in the `--taxon_levels`, based on splitting each header string first on '=' and +then on whitespace. + +Important: +This script is not needed for GenBank-based databases, since their output +is already usable. +`--taxon_levels` is a critical variable — do not modify it unless you +understand what you're doing. +""" + +import argparse + +def parse_arguments(args_list=None): + """Parse the command line arguments.""" + parser = argparse.ArgumentParser(description="Add taxonomix labels from curated database output") + parser.add_argument("-i", "--input_file", required=True) + parser.add_argument("-o", "--output_file", required=True) + parser.add_argument("-t", "--taxon_levels", type=int, nargs="+", default=[1, 2, 4, 7, 11, 12, 13]) + return parser.parse_args(args_list) + + +def add_labels(input, output, taxon_levels): + """ + Add taxonomic labels to BLAST output. + + :param input: Path to BLAST output file. + :type input: str + :param output: Path to output file. + :type output: str + :param taxon_levels: List of taxonomic levels. + :type taxon_levels: list + :return: None + :rtype: None + :raises: ValueError: if certain fields are missing from the input + """ + # Make header in file + with open(output, 'w') as outfile: + outfile.write('#Query ID #Subject #Subject accession #Subject Taxonomy ID #Identity percentage ' + '#Coverage #evalue #bitscore #Source #Taxonomy\n') + + with open(input, 'r') as infile, open(output, 'a') as outfile: + for line in infile: + if "#Query ID" not in line: + new_taxa = '' + src_val = "" + if "superkingdom=" in line and "markercode=" in line and "Genbank" in line: + taxonomy = line.split("superkingdom=")[-1].split("markercode=")[0].strip() + line = line.split("Genbank")[0].strip() + parts = line.strip().split() + else: + raise ValueError("Line does not contain expected fields: superkingdom, markercode, or Genbank") + + for p in parts: + if p.startswith("source="): + src_val = p.split("=", 1)[1] # only keep the value after '=' + break # we only need to get the source once + if src_val: + line = line.strip() + "\t" + src_val + + for level in taxon_levels: + if level != taxon_levels[-1]: + new_taxa += taxonomy.split('=')[level].split(' ')[0] + new_taxa += ' / ' + else: + # The species name already contains a space, so it + # should not be split further. + new_taxa += taxonomy.split('=')[level] + line += '\t' + line += new_taxa + outfile.write(f'{line}\n') + + +def main(arg_list=None): + args = parse_arguments(arg_list) + add_labels(args.input_file, args.output_file, args.taxon_levels) + + +if __name__ == '__main__': + main() \ No newline at end of file
