diff add_taxonomic_labels.py @ 0:abd214795fa5 draft

planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/add_header_tool commit c944fd5685f295acba06679e85b67973c173b137
author onnodg
date Tue, 14 Oct 2025 09:07:01 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/add_taxonomic_labels.py	Tue Oct 14 09:07:01 2025 +0000
@@ -0,0 +1,88 @@
+"""This script processes the output obtained from a curated BLAST database
+and prepares the correct input format for downstream analysis.
+
+The difference before and after running this script is that taxonomic labels
+in the BLAST output are no longer marked as unknown, because the curated database
+only includes taxonomy information in the headers of the output sequences.
+
+To extract the correct taxonomy levels, it is necessary to know the exact
+positions of the required taxonomic ranks. These positions must be specified
+in the `--taxon_levels`, based on splitting each header string first on '=' and
+then on whitespace.
+
+Important:
+This script is not needed for GenBank-based databases, since their output
+is already usable.  
+`--taxon_levels` is a critical variable — do not modify it unless you
+understand what you're doing.
+"""
+
+import argparse
+
+def parse_arguments(args_list=None):
+    """Parse the command line arguments."""
+    parser = argparse.ArgumentParser(description="Add taxonomix labels from curated database output")
+    parser.add_argument("-i", "--input_file", required=True)
+    parser.add_argument("-o", "--output_file", required=True)
+    parser.add_argument("-t", "--taxon_levels", type=int, nargs="+", default=[1, 2, 4, 7, 11, 12, 13])
+    return parser.parse_args(args_list)
+
+
+def add_labels(input, output, taxon_levels):
+    """
+    Add taxonomic labels to BLAST output.
+
+    :param input: Path to BLAST output file.
+    :type input: str
+    :param output: Path to output file.
+    :type output: str
+    :param taxon_levels: List of taxonomic levels.
+    :type taxon_levels: list
+    :return: None
+    :rtype: None
+    :raises: ValueError: if certain fields are missing from the input
+    """
+    # Make header in file
+    with open(output, 'w') as outfile:
+        outfile.write('#Query ID	#Subject	#Subject accession	#Subject Taxonomy ID	#Identity percentage	'
+                      '#Coverage	#evalue	#bitscore	#Source	#Taxonomy\n')
+
+    with open(input, 'r') as infile, open(output, 'a') as outfile:
+        for line in infile:
+            if "#Query ID" not in line:
+                new_taxa = ''
+                src_val = ""
+                if "superkingdom=" in line and "markercode="  in line and "Genbank" in line:
+                    taxonomy = line.split("superkingdom=")[-1].split("markercode=")[0].strip()
+                    line = line.split("Genbank")[0].strip()
+                    parts = line.strip().split()
+                else:
+                    raise ValueError("Line does not contain expected fields: superkingdom, markercode, or Genbank")
+
+                for p in parts:
+                    if p.startswith("source="):
+                        src_val = p.split("=", 1)[1]  # only keep the value after '='
+                        break # we only need to get the source once
+                if src_val:
+                    line = line.strip() + "\t" + src_val
+
+                for level in taxon_levels:
+                    if level != taxon_levels[-1]:
+                        new_taxa += taxonomy.split('=')[level].split(' ')[0]
+                        new_taxa += ' / '
+                    else:
+                        # The species name already contains a space, so it
+                        # should not be split further.
+                        new_taxa += taxonomy.split('=')[level]
+                line += '\t'
+                line += new_taxa
+                outfile.write(f'{line}\n')
+
+
+def main(arg_list=None):
+    args = parse_arguments(arg_list)
+    add_labels(args.input_file, args.output_file, args.taxon_levels)
+
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file