changeset 4:bb29ae8708b5 draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/virAnnot commit 7036ce0e06b6dc64332b1a5642fc58928523c5c6
author iuc
date Tue, 13 May 2025 11:52:17 +0000
parents f8ebd1e802d7
children
files macros.xml otu.py rps2tsv.py test-data/blast2tsv_output.tab test-data/blast2tsv_output_with_rn.tab test-data/blast2tsv_reads_with_rn.txt test-data/input_otu_rps_s1.tab test-data/input_otu_rps_s2.tab test-data/rps_test.tab virAnnot_blast2tsv.xml
diffstat 10 files changed, 203 insertions(+), 142 deletions(-) [+]
line wrap: on
line diff
--- a/macros.xml	Sun Sep 08 14:09:19 2024 +0000
+++ b/macros.xml	Tue May 13 11:52:17 2025 +0000
@@ -18,9 +18,9 @@
             <requirement type="package" version="2.2.0">pandas</requirement>
             <requirement type="package" version="2.8.1">krona</requirement>
             <requirement type="package" version="3.0">zip</requirement>
-            <yield />
         </requirements>
     </xml>
+    <token name="@TOOL_VERSION@">1.2.0</token>
     <token name="@HEADLESS@"><![CDATA[export QT_QPA_PLATFORM='offscreen' &&]]></token>
     <xml name="citations">
         <citations>
--- a/otu.py	Sun Sep 08 14:09:19 2024 +0000
+++ b/otu.py	Tue May 13 11:52:17 2025 +0000
@@ -4,8 +4,8 @@
 # Name: virAnnot_otu
 # Author: Marie Lefebvre - INRAE
 # Reuirements: Ete3 toolkit and external apps
-# Aims: Create viral OTUs based on RPS and Blast annotations
 
+"""Create viral OTUs based on RPS and Blast annotations"""
 
 import argparse
 import csv
@@ -65,9 +65,14 @@
                         frame = float(row[7])
                         description = row[8]
                         superkingdom = row[9]
+                        try:
+                            pident = row[10]
+                        except IndexError:
+                            log.info(rps_file[0])
+                            log.info(row)
                         match = re.search("Viruses", superkingdom)
                         # if contig is viral then retrieve sequence
-                        if match:
+                        if match and float(pident) >= options.viral_portion:
                             options.fasta.sort()
                             seq = _retrieve_fasta_seq(options.fasta[i][0], query_id)
                             seq_length = len(seq)
@@ -103,13 +108,23 @@
                                                 if "taxonomy" not in collection[cdd_id][query_id]:
                                                     collection[cdd_id][query_id]["taxonomy"] = "Unknown"
                                 else:
-                                    log.info("No blast file")
+                                    log.debug("No blast file")
                                     collection[cdd_id][query_id]["taxonomy"] = "Unknown"
                                     collection[cdd_id][query_id]["nb"] = 0
-
-                                collection[cdd_id]["short_description"] = description.split(",")[0] + description.split(",")[1]  # keep pfamXXX and RdRp 1
+                                # keep pfamXXX and RdRp 1
+                                collection[cdd_id]["short_description"] = description.split(",")[0] + description.split(",")[1]
                                 collection[cdd_id]["full_description"] = description
         i += 1
+    if options.merge_rdrp == "yes":
+        rdrp_list = ["pfam00680", "pfam02123", "pfam00978", "pfam00998"]
+        collection["RdRp_merge"] = {}
+        for cdd_id in collection:
+            if cdd_id in rdrp_list and cdd_id != "RdRp_merge":
+                log.info("Add " + cdd_id + " in merge")
+                for query_id in collection[cdd_id]:
+                    if query_id not in collection["RdRp_merge"]:
+                        collection["RdRp_merge"][query_id] = {}
+                        collection["RdRp_merge"][query_id] = collection[cdd_id][query_id]
     return collection
 
 
@@ -181,7 +196,11 @@
         os.mkdir(options.output)
     color_by_sample = {}
     for cdd_id in hits_collection:
-        cdd_output = options.output + "/" + hits_collection[cdd_id]["short_description"].replace(" ", "_")
+        log.info("align seq for " + cdd_id)
+        if cdd_id == "RdRp_merge":
+            cdd_output = options.output + "/" + cdd_id
+        else:
+            cdd_output = options.output + "/" + hits_collection[cdd_id]["short_description"].replace(" ", "_")
         if not os.path.exists(cdd_output):
             os.mkdir(cdd_output)
         if os.path.exists(cdd_output + "/seq_to_align.fasta"):
@@ -223,7 +242,7 @@
             file_matrix = cdd_output + "/identity_matrix.csv"
             log.info("Create tree...")
             _create_tree(tree_file, file_seq_aligned, tree_file + '.png', file_color_config)
-            _compute_pairwise_distance(options, file_seq_aligned, file_matrix, cdd_id)
+            _compute_pairwise_distance(file_seq_aligned, file_matrix, cdd_id)
             log.info("Retrieve OTUs...")
             # if os.path.exists(file_cluster):
             #     os.remove(file_cluster)
@@ -241,7 +260,7 @@
             f.close()
 
 
-def _compute_pairwise_distance(options, file_seq_aligned, file_matrix, cdd_id):
+def _compute_pairwise_distance(file_seq_aligned, file_matrix, cdd_id):
     """
     Calculate paiwise distance between aligned protein sequences
     from a cdd_id
@@ -297,8 +316,13 @@
     log.info("Writing stats to " + file_xlsx)
     for cdd_id in hits_collection:
         otu_collection = {}
-        cdd_output = options.output + "/" + hits_collection[cdd_id]["short_description"].replace(" ", "_")
-        worksheet = workbook.add_worksheet(hits_collection[cdd_id]["short_description"])  # add a worksheet
+        if cdd_id == "RdRp_merge":
+            cdd_output = options.output + "/" + cdd_id
+            worksheet = workbook.add_worksheet(cdd_id)
+        else:
+
+            cdd_output = options.output + "/" + hits_collection[cdd_id]["short_description"].replace(" ", "_")
+            worksheet = workbook.add_worksheet(hits_collection[cdd_id]["short_description"])  # add a worksheet
         file_cluster = cdd_output + '/otu_cluster.csv'
         file_fasta_nucc = cdd_output + '/representative_nucc.fasta'
         with open(file_cluster, 'r') as clust:
@@ -315,25 +339,31 @@
                         otu_collection[row[0]][sample] = {}
                         otu_collection[row[0]][sample][contig] = {}
                         # add read number of the contig and annotation
-                        if 'nb' in hits_collection[cdd_id][contig]:
-                            otu_collection[row[0]][sample][contig]['nb'] = hits_collection[cdd_id][contig]["nb"]
+                        if contig in hits_collection[cdd_id]:
+                            if 'nb' in hits_collection[cdd_id][contig]:
+                                otu_collection[row[0]][sample][contig]['nb'] = hits_collection[cdd_id][contig]["nb"]
+                            else:
+                                otu_collection[row[0]][sample][contig]['nb'] = 0
+                            if 'taxonomy' in hits_collection[cdd_id][contig]:
+                                otu_collection[row[0]][sample][contig]['taxonomy'] = hits_collection[cdd_id][contig]["taxonomy"]
+                            else:
+                                otu_collection[row[0]][sample][contig]['taxonomy'] = 'unknown'
                         else:
-                            otu_collection[row[0]][sample][contig]['nb'] = 0
-                        if 'taxonomy' in hits_collection[cdd_id][contig]:
-                            otu_collection[row[0]][sample][contig]['taxonomy'] = hits_collection[cdd_id][contig]["taxonomy"]
-                        else:
-                            otu_collection[row[0]][sample][contig]['taxonomy'] = 'unknown'
+                            otu_collection[row[0]][sample][contig] = {'nb': 0, 'taxonomy': 'unknown'}
                     else:
                         otu_collection[row[0]][sample][contig] = {}
                         # add read number of the contig and annotation
-                        if 'nb' in hits_collection[cdd_id][contig]:
-                            otu_collection[row[0]][sample][contig]['nb'] = hits_collection[cdd_id][contig]["nb"]
+                        if contig in hits_collection[cdd_id]:
+                            if 'nb' in hits_collection[cdd_id][contig]:
+                                otu_collection[row[0]][sample][contig]['nb'] = hits_collection[cdd_id][contig]["nb"]
+                            else:
+                                otu_collection[row[0]][sample][contig]['nb'] = 0
+                            if 'taxonomy' in hits_collection[cdd_id][contig]:
+                                otu_collection[row[0]][sample][contig]['taxonomy'] = hits_collection[cdd_id][contig]["taxonomy"]
+                            else:
+                                otu_collection[row[0]][sample][contig]['taxonomy'] = 'unknown'
                         else:
-                            otu_collection[row[0]][sample][contig]['nb'] = 0
-                        if 'taxonomy' in hits_collection[cdd_id][contig]:
-                            otu_collection[row[0]][sample][contig]['taxonomy'] = hits_collection[cdd_id][contig]["taxonomy"]
-                        else:
-                            otu_collection[row[0]][sample][contig]['taxonomy'] = 'unknown'
+                            otu_collection[row[0]][sample][contig] = {'nb': 0, 'taxonomy': 'unknown'}
                     if 'taxonomy' in hits_collection[cdd_id][contig]:
                         otu_collection[row[0]]['global_taxonomy'] = hits_collection[cdd_id][contig]["taxonomy"]
                     else:
@@ -362,7 +392,6 @@
         # column = 0
         with open(file_fasta_nucc, "w+") as f_nucc:
             for otu in otu_collection:
-                log.info(otu)
                 if isinstance(otu_collection[otu], dict):
                     column = 0
                     worksheet.write(row, column, otu)
@@ -405,7 +434,10 @@
         headers = ['#cdd_id', 'align_files', 'tree_files', 'cluster_files', 'cluster_nb_reads_files', 'pairwise_files', 'description', 'full_description\n']
         map_file.write("\t".join(headers))
         for cdd_id in hits_collection:
-            cdd_output = hits_collection[cdd_id]["short_description"].replace(" ", "_")
+            if cdd_id == "RdRp_merge":
+                cdd_output = "RdRp_merge"
+            else:
+                cdd_output = hits_collection[cdd_id]["short_description"].replace(" ", "_")
             short_description = cdd_output
             file_seq_aligned = cdd_output + '/seq_aligned.final_tree.fa'
             tree_file = cdd_output + '/tree.dnd.png'
@@ -422,6 +454,9 @@
 
 
 def _set_options():
+    """
+    Set parameters
+    """
     parser = argparse.ArgumentParser()
     parser.add_argument('-b', '--blast', help='TAB blast file from blast2ecsv module.', action='append', required=False, dest='blast', nargs='+')
     parser.add_argument('-r', '--rps', help='TAB rpsblast file from rps2ecsv module.', action='append', required=True, dest='rps', nargs='+')
@@ -429,6 +464,7 @@
     parser.add_argument('-p', '--percentage', help='Percentage similarity threshold for OTUs cutoff.', action='store', type=int, default=90, dest='perc')
     parser.add_argument('-vp', '--viral_portion', help='Minimun portion of viral sequences in RPS domain to be included.', action='store', type=float, default=0.3, dest='viral_portion')
     parser.add_argument('-mpl', '--min_protein_length', help='Minimum query protein length.', action='store', type=int, default=100, dest='min_protein_length')
+    parser.add_argument('-m', '--merge_rdrp', help='Merge RdRp1, 2, 3 and 4 to create otu on it.', action='store', type=str, default="no", dest='merge_rdrp')
     parser.add_argument('-tp', '--tool_path', help='Path to otu_seek.R', action='store', type=str, default='./', dest='tool_path')
     parser.add_argument('-o', '--out', help='The output directory', action='store', type=str, default='./Rps2tree_OTU', dest='output')
     parser.add_argument('-rgb', '--rgb-conf', help='Color palette for contigs coloration', action='store', type=str, default='rgb.txt', dest='file_rgb')
@@ -438,6 +474,9 @@
 
 
 def _set_log_level(verbosity):
+    """
+    Configure logging according to the verbosity level
+    """
     if verbosity == 1:
         log_format = '%(asctime)s %(levelname)-8s %(message)s'
         log.basicConfig(level=log.INFO, format=log_format)
--- a/rps2tsv.py	Sun Sep 08 14:09:19 2024 +0000
+++ b/rps2tsv.py	Tue May 13 11:52:17 2025 +0000
@@ -5,6 +5,7 @@
 # Author: Marie Lefebvre - INRAE
 # Aims: Convert rpsblast xml output to csv and add taxonomy
 
+"""Module which converts rpsblast xml output to tsv and adds taxonomy"""
 
 import argparse
 import json
@@ -19,6 +20,9 @@
 
 
 def main():
+    """
+    Main function
+    """
     options = _set_options()
     _set_log_level(options.verbosity)
     hits = _read_xml(options)
@@ -44,6 +48,12 @@
                 hit_evalue = hit.expect  # evalue
                 hit_startQ = hit.query_start
                 hit_endQ = hit.query_end
+                hit_identity = hit.identities
+                hit_aln_length = hit.align_length
+                pident = "%0.3f" % (100 * float(hit_identity) / float(hit_aln_length))
+            if float(pident) < 0.1:
+                continue
+            hsp["pident"] = pident
             hsp["frame"] = hit_frame
             hsp["evalue"] = hit_evalue
             hsp["startQ"] = hit_startQ
@@ -83,7 +93,8 @@
                             taxonomy = names
                         if len(taxonomy) != 0:
                             kingdoms.append(taxonomy[0])
-                frequency = {kingdom: kingdoms.count(kingdom) for kingdom in kingdoms}  # {'Pseudomonadota': 9, 'cellular organisms': 4}
+                # {'Pseudomonadota': 9, 'cellular organisms': 4}
+                frequency = {kingdom: kingdoms.count(kingdom) for kingdom in kingdoms}
                 sorted_freq = dict(sorted(frequency.items(), key=lambda x: x[1], reverse=True))
                 concat_freq = ";".join("{}({})".format(k, v) for k, v in sorted_freq.items())
                 hsp["taxonomy"] = concat_freq
@@ -96,29 +107,40 @@
     Write output
     """
     log.info("Write output file " + options.output)
-    headers = "#query_id\tquery_length\tcdd_id\thit_id\tevalue\tstartQ\tendQ\tframe\tdescription\tsuperkingdom\n"
+    headers = "#query_id\tquery_length\tcdd_id\thit_id\tevalue\tstartQ\tendQ\tframe\tdescription\tsuperkingdom\tpident\n"
     f = open(options.output, "w+")
     f.write(headers)
     for h in hits:
         f.write(h + "\t" + str(hits[h]["query_length"]) + "\t")
         f.write(hits[h]["cdd_id"] + "\t" + hits[h]["hit_id"] + "\t" + str(hits[h]["evalue"]) + "\t")
-        f.write(str(hits[h]["startQ"]) + "\t" + str(hits[h]["endQ"]) + "\t" + str(hits[h]["frame"]) + "\t")
-        f.write(hits[h]["description"] + "\t" + hits[h]["taxonomy"])
+        f.write(str(hits[h]["startQ"]) + "\t" + str(hits[h]["endQ"]) + "\t"
+                + str(hits[h]["frame"]) + "\t")
+        f.write(hits[h]["description"] + "\t" + hits[h]["taxonomy"] + "\t" + hits[h]["pident"])
         f.write("\n")
     f.close()
 
 
 def _set_options():
+    """
+    Script parameters
+    """
     parser = argparse.ArgumentParser()
-    parser.add_argument('-x', '--xml', help='XML files with results of blast', action='store', required=True, dest='xml_file')
-    parser.add_argument('-e', '--max_evalue', help='Max evalue', action='store', type=float, default=0.0001, dest='max_evalue')
-    parser.add_argument('-o', '--out', help='The output file (.tab).', action='store', type=str, default='./rps2tsv_output.tab', dest='output')
-    parser.add_argument('-v', '--verbosity', help='Verbose level', action='store', type=int, choices=[1, 2, 3, 4], default=1)
+    parser.add_argument('-x', '--xml', help='XML files with results of blast', action='store',
+                        required=True, dest='xml_file')
+    parser.add_argument('-e', '--max_evalue', help='Max evalue', action='store',
+                        type=float, default=0.0001, dest='max_evalue')
+    parser.add_argument('-o', '--out', help='The output file (.tab).', action='store',
+                        type=str, default='./rps2tsv_output.tab', dest='output')
+    parser.add_argument('-v', '--verbosity', help='Verbose level', action='store',
+                        type=int, choices=[1, 2, 3, 4], default=1)
     args = parser.parse_args()
     return args
 
 
 def _set_log_level(verbosity):
+    """
+    Configure logging according to the verbosity level
+    """
     if verbosity == 1:
         log_format = '%(asctime)s %(levelname)-8s %(message)s'
         log.basicConfig(level=log.INFO, format=log_format)
--- a/test-data/blast2tsv_output.tab	Sun Sep 08 14:09:19 2024 +0000
+++ b/test-data/blast2tsv_output.tab	Tue May 13 11:52:17 2025 +0000
@@ -2,11 +2,11 @@
 TBLASTX	NODE_13_length_295_cov_0.945833		295	316155	pfam13603, tRNA-synt_1_2, Leucyl-tRNA synthetase, Domain 2.  This is a family of the conserved region of Leucine-tRNA ligase or Leucyl-tRNA synthetase, EC:6.1.1.4.	Tursiops truncatus papillomavirus 2	41.5	1	100	67.0	2.277e-05	38.6378	316155	Viruses;Monodnaviria;Shotokuvirae;Cossaviricota;Papovaviricetes;Zurhausenvirales;Papillomaviridae;Firstpapillomavirinae;Upsilonpapillomavirus;Upsilonpapillomavirus 2;Tursiops truncatus papillomavirus 2	TGTGTTGGGTGTGTTTGGTTTCCGGTTACCATAATCGCTATTCTTTCAAACAGAAAGCGCATGCTAAGTATTCTCACCCAGAGGAATATGCTGACAAGCCCTCCTCAAAAGGCTATTTTTACAATGCCACCTATGAGAATGCACGAACTCTTATTCACTTCATTAAGCAATATGGATTGCCCTTCAATCCTGTTATTGCACCAGAAGATGCTGAACTAACTGATGAACAGATTCAATCTTACATCAACACAGCAAACTCCTTCTTTAATGATTATCCGACGTTACTGTTCACCCG
 TBLASTX	NODE_16_length_278_cov_0.901345		278	306845	pfam00421, PSII, Photosystem II protein.  		65.8	1	100	47.0	7.65615e-39	132.634			GTCTAACCTGTGTTGGGTGTGTTTGGGCTGTAATCGAGGTATAGTGTCGAACAAGTCGGTGTCACTGTTGAATTCTATGGCGGCGAACTCAATGGAGTCAGTTATAGTGATCCTGCTACTGTGAAAAAATATGCTAGACGTGCTCAATTGGGTGAAATTTTTGAATTAGATCGTGCTACTTTAAAATCGGATGGTGTTTTTCGTAGCAGTCCAAGGGGTTGGTTTACTTTTGGACATGCGTCGTTTGCTCTGCTCTTCTTCCAAACACACCCAACACA
 TBLASTX	NODE_19_length_271_cov_0.879630		271	306845	pfam00421, PSII, Photosystem II protein.  		32.9	1	100	42.0	1.69015e-11	56.3644			GTCTAACCTGTGTTGGGTGTGTTTGGTATGGAGGGAGGTGTATATGATACCTGGGCACCCGGAGGGGGAGATGTAAGAAAAATTACCAACTTGACCCTTAACCCAAGCGTGATATTTGGTTATTTACTAAAATCTCCTTTTGGGGGAGAAGGATGGATTGTTAGTGTGGACGATTTAGAAGATATAATTGGAGGACATGTCTGGTTAGGCTCCATTTGTATACTTGGTGGAATTTGGCATATCTTAACCAAACACACCCAACACAGGTTAG
-TBLASTX	NODE_20_length_267_cov_1.429245		267	287774	pfam10839, DUF2647, Protein of unknown function (DUF2647).  This eukaryotic family of proteins are annotated as ycf68 but have no known function.	Desulfovibrio sp. G100IX	91.3	1	100	99.0	7.70073e-10	48.4966	287774	cellular organisms;Bacteria;Thermodesulfobacteriota;Desulfovibrionia;Desulfovibrionales;Desulfovibrionaceae;Desulfovibrio;unclassified Desulfovibrio;Desulfovibrio sp. G100IX	CTGTGTTGGGTGTGTTTGGACTTGGTATGTGAAGATACGTTGTTAGGTGCTCCGTTTTATTTTCCCATTGAGGCCGAACCTAAACCTGTGCTCGAGAGATAGCTGTCCATATACTGATAAGGGATGTATGGATTCTCGAGAAGAGAGGAGCCATGGTGGTCCCTCCCGGACCGCCCGGATCCCACGAGTGAATAGAAAGTTGGATCTACATTGGATCTCACCTGAATCGCCCCATAAACAACCAAACACACCCAACACAGGTTAGAC
+TBLASTX	NODE_20_length_267_cov_1.429245		267	287774	pfam10839, DUF2647, Protein of unknown function (DUF2647).  This eukaryotic family of proteins are annotated as ycf68 but have no known function.	Desulfovibrio sp. G100IX	91.3	1	100	99.0	7.70073e-10	48.4966	287774	cellular organisms;Bacteria;Pseudomonadati;Thermodesulfobacteriota;Desulfovibrionia;Desulfovibrionales;Desulfovibrionaceae;Desulfovibrio;unclassified Desulfovibrio;Desulfovibrio sp. G100IX	CTGTGTTGGGTGTGTTTGGACTTGGTATGTGAAGATACGTTGTTAGGTGCTCCGTTTTATTTTCCCATTGAGGCCGAACCTAAACCTGTGCTCGAGAGATAGCTGTCCATATACTGATAAGGGATGTATGGATTCTCGAGAAGAGAGGAGCCATGGTGGTCCCTCCCGGACCGCCCGGATCCCACGAGTGAATAGAAAGTTGGATCTACATTGGATCTCACCTGAATCGCCCCATAAACAACCAAACACACCCAACACAGGTTAGAC
 TBLASTX	NODE_22_length_262_cov_1.053140		262	306604	pfam00124, Photo_RC, Photosynthetic reaction centre protein.  	Heterotermes sp. TMJ-2004j	40.9	1	100	77.0	4.94039e-28	99.6256	306604	cellular organisms;Eukaryota;Opisthokonta;Metazoa;Eumetazoa;Bilateria;Protostomia;Ecdysozoa;Panarthropoda;Arthropoda;Mandibulata;Pancrustacea;Hexapoda;Insecta;Dicondylia;Pterygota;Neoptera;Polyneoptera;Dictyoptera;Blattodea;Blattoidea;Termitoidae;Rhinotermitidae;Heterotermitinae;Heterotermes;unclassified Heterotermes;Heterotermes sp. TMJ-2004j	GTCTAACCTGTGTTGGGTGTGTTTGGCTAGTCAGTAGCTTGTTATATGGGTCGTGAGTGGGAAGTTAGCTTCCGTCTGGGTATGCGCCCGTGGATTGCTGTTGCATATTCAGCTCCTGTTGCAGCTGCTACTGCTGTTTTCTTGATTTACCCAATTGGTCAAGGAAGTTTTTCTGATGGTATGCCTCTAGGAATCTCTGGTACTTTCAACTTCATGATTGTATTCCAGGAGAGCACCCAAACACACCCAACACAGGTTAGAC
 TBLASTX	NODE_24_length_258_cov_0.935961		258	307679	pfam01660, Vmethyltransf, Viral methyltransferase.  This RNA methyltransferase domain is found in a wide range of ssRNA viruses, including Hordei-, Tobra-, Tobamo-, Bromo-, Clostero- and Caliciviruses. This methyltransferase is involved in mRNA capping. Capping of mRNA enhances its stability. This usually occurs in the nucleus. Therefore, many viruses that replicate in the cytoplasm encode their own. This is a specific guanine-7-methyltransferase domain involved in viral mRNA cap0 synthesis. Specificity for guanine 7 position is shown by NMR in and in vivo role in cap synthesis. Based on secondary structure prediction, the basic fold is believed to be similar to the common AdoMet-dependent methyltransferase fold. A curious feature of this methyltransferase domain is that it together with flanking sequences seems to have guanylyltransferase activity coupled to the methyltransferase activity. The domain is found throughout the so-called Alphavirus superfamily, (including alphaviruses and several other groups). It forms the defining, unique feature of this superfamily.	Tetrastemma peltatum	39.4	1	100	70.0	8.38713e-15	65.0021	307679	cellular organisms;Eukaryota;Opisthokonta;Metazoa;Eumetazoa;Bilateria;Protostomia;Spiralia;Lophotrochozoa;Nemertea;Enopla;Hoplonemertea;Monostilifera;Eumonostilifera;Tetrastemmatidae;Tetrastemma;Tetrastemma peltatum	GTGTTGGGTGTGTTTGGTTGGTGAACGCGCACCATTTAGTGGCAATCACGCGCGGGGAGGCTGAAAACTGCAAGCATAGATCTTTCGGCCCTTTCGAAGCTACCGCTTCCGAGAGCCTGGCTAAACTCTGCCCAGATTATCCGATCTGCTTGCCTGTACCTTACGACGTGATCAATAAAGTGTATAGGTATCTCAGAACGCTTAAGAAGCCTGATGTGCAGTCGCCCCACTACCAAACACACCCAACACAGGTTAGAC
 TBLASTX	NODE_29_length_250_cov_0.851282		250	278700	pfam00283, Cytochrom_B559, Cytochrome b559, alpha (gene psbE) and beta (gene psbF)subunits.  	uncultured archaeon CRE-PA11a	58.6	1	100	100	7.31211e-08	42.0012	278700	cellular organisms;Archaea;environmental samples;uncultured archaeon CRE-PA11a	GTCTAACCTGTGTTGGGTGTGTTTGGGTTTCTTTGGAGCAACTCGATGAATTTAGTAAATCCTTTTAGGAGGTTCCCAATGACCATAGATCGAACCTATCCAATTTTTACAGTGCGATGGTTGGCTGTTCACGGACTGGCTGTACCTACTGTTTCTTTTTTAGGGTCAATATCAGCAATGCAGTTCATCCAACGATAAACCTAATTCAAATTATAGAGCTAGCACACCAAACACACCCAACACAGGTTAG
-TBLASTX	NODE_34_length_245_cov_1.000000		245	250270	pfam00978, RdRP_2, RNA dependent RNA polymerase.  This family may represent an RNA dependent RNA polymerase. The family also contains the following proteins: 2A protein from bromoviruses putative RNA dependent RNA polymerase from tobamoviruses Non structural polyprotein from togaviruses.	Nocardia sp. 431D04	37.5	1	100	38.0	6.42106e-08	45.7137	250270	cellular organisms;Bacteria;Terrabacteria group;Actinomycetota;Actinomycetes;Mycobacteriales;Nocardiaceae;Nocardia;unclassified Nocardia;Nocardia sp. 431D04	GTCTAACCTGTGTTGGGTGTGTTTGGATGGTGATCGGCAATTTAAAGGAATGTGTGCGATATTTTCACCAATATTCAAGGAGCTTAAAAATCGACTAAAAAGCGTGTTAGATATTAAATACATGTATGCAGACGGATTGAGACCTGATCAGTTGTCGGAGCGCATGTCACAGATAGGTGCAGGTAAATATTTTATAGAGAATGATATGGAACAGCATCTCGCCAAACACACCCAACACAGGTTAG
+TBLASTX	NODE_34_length_245_cov_1.000000		245	250270	pfam00978, RdRP_2, RNA dependent RNA polymerase.  This family may represent an RNA dependent RNA polymerase. The family also contains the following proteins: 2A protein from bromoviruses putative RNA dependent RNA polymerase from tobamoviruses Non structural polyprotein from togaviruses.	Nocardia sp. 431D04	37.5	1	100	38.0	6.42106e-08	45.7137	250270	cellular organisms;Bacteria;Bacillati;Actinomycetota;Actinomycetes;Mycobacteriales;Nocardiaceae;Nocardia;unclassified Nocardia;Nocardia sp. 431D04	GTCTAACCTGTGTTGGGTGTGTTTGGATGGTGATCGGCAATTTAAAGGAATGTGTGCGATATTTTCACCAATATTCAAGGAGCTTAAAAATCGACTAAAAAGCGTGTTAGATATTAAATACATGTATGCAGACGGATTGAGACCTGATCAGTTGTCGGAGCGCATGTCACAGATAGGTGCAGGTAAATATTTTATAGAGAATGATATGGAACAGCATCTCGCCAAACACACCCAACACAGGTTAG
 TBLASTX	NODE_46_length_229_cov_1.091954		229	306604	pfam00124, Photo_RC, Photosynthetic reaction centre protein.  	Heterotermes sp. TMJ-2004j	43.9	1	100	66.0	4.26406e-23	86.1436	306604	cellular organisms;Eukaryota;Opisthokonta;Metazoa;Eumetazoa;Bilateria;Protostomia;Ecdysozoa;Panarthropoda;Arthropoda;Mandibulata;Pancrustacea;Hexapoda;Insecta;Dicondylia;Pterygota;Neoptera;Polyneoptera;Dictyoptera;Blattodea;Blattoidea;Termitoidae;Rhinotermitidae;Heterotermitinae;Heterotermes;unclassified Heterotermes;Heterotermes sp. TMJ-2004j	TGTGTTGGGTGTGTTTGGTTGGATGCCTGGAATACAATCATGAAATTGAAAGTACCAGATATTCCTAAAGGCATGCCATCTGAAAAACTTCCTTGACCAATAGGGTAGATCAAGAAAACAGCTGTAGCAGCCGCGACAGGAGCTGAATATGCAACAGCAATCCAAGGACGCATACCCAGACGGAAACTAAGCTCCCTCTCGCTCCAAACACACCCAACACAGGTTAGAC
 TBLASTX	NODE_47_length_229_cov_0.816092		229	306687	pfam00223, PsaA_PsaB, Photosystem I psaA/psaB protein.  		66.7	1	100	14.0	1.79906e-13	61.3066			TTGGTAAATTGGCGGAAAGAGGAGGACTCAATGATTATTCGTTCGCCGGAACCAGAAGTAAAAATTTTGGTAGATAGGGATCACATAAAAACTTCTTTCGAGGAATGGGCCAGGCCGGGTCATTTCTCAAGAACACTAGCTAAAGGCCCTGACACTACCACTTGGATCTGGAACCTACATGCTGATGCTCACGATCTTAATAGCCAAACACACCCAACACAGGTTAGAC
 TBLASTX	NODE_50_length_226_cov_2.269006		226	306845	pfam00421, PSII, Photosystem II protein.  		60.3	1	100	41.0	2.77182e-23	89.1064			GTCAACGGTGTGTTGGGTGTGTTTGGGAAAGGTCCTGGAATATGGGTGTCCGATCCTTATGGACTAACCGGAACAGTGCAACCTGTAAATCCGGCGTGGGGCGTGGAAGGTTTTGATCCTTTTGTCCCGGGAGGAATAGCTTCTCATCATATTGCAGCAGGTACATTGGGCATATTAGCGGGCCTATTCCATCTTAGCGTACGGTCACCCCAGCCAAACACACCCA
--- a/test-data/blast2tsv_output_with_rn.tab	Sun Sep 08 14:09:19 2024 +0000
+++ b/test-data/blast2tsv_output_with_rn.tab	Tue May 13 11:52:17 2025 +0000
@@ -2,11 +2,11 @@
 TBLASTX	NODE_13_length_295_cov_0.945833	264	295	316155	pfam13603, tRNA-synt_1_2, Leucyl-tRNA synthetase, Domain 2.  This is a family of the conserved region of Leucine-tRNA ligase or Leucyl-tRNA synthetase, EC:6.1.1.4.	Tursiops truncatus papillomavirus 2	41.5	1	100	67.0	2.277e-05	38.6378	316155	Viruses;Monodnaviria;Shotokuvirae;Cossaviricota;Papovaviricetes;Zurhausenvirales;Papillomaviridae;Firstpapillomavirinae;Upsilonpapillomavirus;Upsilonpapillomavirus 2;Tursiops truncatus papillomavirus 2	TGTGTTGGGTGTGTTTGGTTTCCGGTTACCATAATCGCTATTCTTTCAAACAGAAAGCGCATGCTAAGTATTCTCACCCAGAGGAATATGCTGACAAGCCCTCCTCAAAAGGCTATTTTTACAATGCCACCTATGAGAATGCACGAACTCTTATTCACTTCATTAAGCAATATGGATTGCCCTTCAATCCTGTTATTGCACCAGAAGATGCTGAACTAACTGATGAACAGATTCAATCTTACATCAACACAGCAAACTCCTTCTTTAATGATTATCCGACGTTACTGTTCACCCG
 TBLASTX	NODE_16_length_278_cov_0.901345	377	278	306845	pfam00421, PSII, Photosystem II protein.  		65.8	1	100	47.0	7.65615e-39	132.634			GTCTAACCTGTGTTGGGTGTGTTTGGGCTGTAATCGAGGTATAGTGTCGAACAAGTCGGTGTCACTGTTGAATTCTATGGCGGCGAACTCAATGGAGTCAGTTATAGTGATCCTGCTACTGTGAAAAAATATGCTAGACGTGCTCAATTGGGTGAAATTTTTGAATTAGATCGTGCTACTTTAAAATCGGATGGTGTTTTTCGTAGCAGTCCAAGGGGTTGGTTTACTTTTGGACATGCGTCGTTTGCTCTGCTCTTCTTCCAAACACACCCAACACA
 TBLASTX	NODE_19_length_271_cov_0.879630	67	271	306845	pfam00421, PSII, Photosystem II protein.  		32.9	1	100	42.0	1.69015e-11	56.3644			GTCTAACCTGTGTTGGGTGTGTTTGGTATGGAGGGAGGTGTATATGATACCTGGGCACCCGGAGGGGGAGATGTAAGAAAAATTACCAACTTGACCCTTAACCCAAGCGTGATATTTGGTTATTTACTAAAATCTCCTTTTGGGGGAGAAGGATGGATTGTTAGTGTGGACGATTTAGAAGATATAATTGGAGGACATGTCTGGTTAGGCTCCATTTGTATACTTGGTGGAATTTGGCATATCTTAACCAAACACACCCAACACAGGTTAG
-TBLASTX	NODE_20_length_267_cov_1.429245	2	267	287774	pfam10839, DUF2647, Protein of unknown function (DUF2647).  This eukaryotic family of proteins are annotated as ycf68 but have no known function.	Desulfovibrio sp. G100IX	91.3	1	100	99.0	7.70073e-10	48.4966	287774	cellular organisms;Bacteria;Thermodesulfobacteriota;Desulfovibrionia;Desulfovibrionales;Desulfovibrionaceae;Desulfovibrio;unclassified Desulfovibrio;Desulfovibrio sp. G100IX	CTGTGTTGGGTGTGTTTGGACTTGGTATGTGAAGATACGTTGTTAGGTGCTCCGTTTTATTTTCCCATTGAGGCCGAACCTAAACCTGTGCTCGAGAGATAGCTGTCCATATACTGATAAGGGATGTATGGATTCTCGAGAAGAGAGGAGCCATGGTGGTCCCTCCCGGACCGCCCGGATCCCACGAGTGAATAGAAAGTTGGATCTACATTGGATCTCACCTGAATCGCCCCATAAACAACCAAACACACCCAACACAGGTTAGAC
+TBLASTX	NODE_20_length_267_cov_1.429245	2	267	287774	pfam10839, DUF2647, Protein of unknown function (DUF2647).  This eukaryotic family of proteins are annotated as ycf68 but have no known function.	Desulfovibrio sp. G100IX	91.3	1	100	99.0	7.70073e-10	48.4966	287774	cellular organisms;Bacteria;Pseudomonadati;Thermodesulfobacteriota;Desulfovibrionia;Desulfovibrionales;Desulfovibrionaceae;Desulfovibrio;unclassified Desulfovibrio;Desulfovibrio sp. G100IX	CTGTGTTGGGTGTGTTTGGACTTGGTATGTGAAGATACGTTGTTAGGTGCTCCGTTTTATTTTCCCATTGAGGCCGAACCTAAACCTGTGCTCGAGAGATAGCTGTCCATATACTGATAAGGGATGTATGGATTCTCGAGAAGAGAGGAGCCATGGTGGTCCCTCCCGGACCGCCCGGATCCCACGAGTGAATAGAAAGTTGGATCTACATTGGATCTCACCTGAATCGCCCCATAAACAACCAAACACACCCAACACAGGTTAGAC
 TBLASTX	NODE_22_length_262_cov_1.053140	262	262	306604	pfam00124, Photo_RC, Photosynthetic reaction centre protein.  	Heterotermes sp. TMJ-2004j	40.9	1	100	77.0	4.94039e-28	99.6256	306604	cellular organisms;Eukaryota;Opisthokonta;Metazoa;Eumetazoa;Bilateria;Protostomia;Ecdysozoa;Panarthropoda;Arthropoda;Mandibulata;Pancrustacea;Hexapoda;Insecta;Dicondylia;Pterygota;Neoptera;Polyneoptera;Dictyoptera;Blattodea;Blattoidea;Termitoidae;Rhinotermitidae;Heterotermitinae;Heterotermes;unclassified Heterotermes;Heterotermes sp. TMJ-2004j	GTCTAACCTGTGTTGGGTGTGTTTGGCTAGTCAGTAGCTTGTTATATGGGTCGTGAGTGGGAAGTTAGCTTCCGTCTGGGTATGCGCCCGTGGATTGCTGTTGCATATTCAGCTCCTGTTGCAGCTGCTACTGCTGTTTTCTTGATTTACCCAATTGGTCAAGGAAGTTTTTCTGATGGTATGCCTCTAGGAATCTCTGGTACTTTCAACTTCATGATTGTATTCCAGGAGAGCACCCAAACACACCCAACACAGGTTAGAC
 TBLASTX	NODE_24_length_258_cov_0.935961	101	258	307679	pfam01660, Vmethyltransf, Viral methyltransferase.  This RNA methyltransferase domain is found in a wide range of ssRNA viruses, including Hordei-, Tobra-, Tobamo-, Bromo-, Clostero- and Caliciviruses. This methyltransferase is involved in mRNA capping. Capping of mRNA enhances its stability. This usually occurs in the nucleus. Therefore, many viruses that replicate in the cytoplasm encode their own. This is a specific guanine-7-methyltransferase domain involved in viral mRNA cap0 synthesis. Specificity for guanine 7 position is shown by NMR in and in vivo role in cap synthesis. Based on secondary structure prediction, the basic fold is believed to be similar to the common AdoMet-dependent methyltransferase fold. A curious feature of this methyltransferase domain is that it together with flanking sequences seems to have guanylyltransferase activity coupled to the methyltransferase activity. The domain is found throughout the so-called Alphavirus superfamily, (including alphaviruses and several other groups). It forms the defining, unique feature of this superfamily.	Tetrastemma peltatum	39.4	1	100	70.0	8.38713e-15	65.0021	307679	cellular organisms;Eukaryota;Opisthokonta;Metazoa;Eumetazoa;Bilateria;Protostomia;Spiralia;Lophotrochozoa;Nemertea;Enopla;Hoplonemertea;Monostilifera;Eumonostilifera;Tetrastemmatidae;Tetrastemma;Tetrastemma peltatum	GTGTTGGGTGTGTTTGGTTGGTGAACGCGCACCATTTAGTGGCAATCACGCGCGGGGAGGCTGAAAACTGCAAGCATAGATCTTTCGGCCCTTTCGAAGCTACCGCTTCCGAGAGCCTGGCTAAACTCTGCCCAGATTATCCGATCTGCTTGCCTGTACCTTACGACGTGATCAATAAAGTGTATAGGTATCTCAGAACGCTTAAGAAGCCTGATGTGCAGTCGCCCCACTACCAAACACACCCAACACAGGTTAGAC
 TBLASTX	NODE_29_length_250_cov_0.851282	428	250	278700	pfam00283, Cytochrom_B559, Cytochrome b559, alpha (gene psbE) and beta (gene psbF)subunits.  	uncultured archaeon CRE-PA11a	58.6	1	100	100	7.31211e-08	42.0012	278700	cellular organisms;Archaea;environmental samples;uncultured archaeon CRE-PA11a	GTCTAACCTGTGTTGGGTGTGTTTGGGTTTCTTTGGAGCAACTCGATGAATTTAGTAAATCCTTTTAGGAGGTTCCCAATGACCATAGATCGAACCTATCCAATTTTTACAGTGCGATGGTTGGCTGTTCACGGACTGGCTGTACCTACTGTTTCTTTTTTAGGGTCAATATCAGCAATGCAGTTCATCCAACGATAAACCTAATTCAAATTATAGAGCTAGCACACCAAACACACCCAACACAGGTTAG
-TBLASTX	NODE_34_length_245_cov_1.000000	183	245	250270	pfam00978, RdRP_2, RNA dependent RNA polymerase.  This family may represent an RNA dependent RNA polymerase. The family also contains the following proteins: 2A protein from bromoviruses putative RNA dependent RNA polymerase from tobamoviruses Non structural polyprotein from togaviruses.	Nocardia sp. 431D04	37.5	1	100	38.0	6.42106e-08	45.7137	250270	cellular organisms;Bacteria;Terrabacteria group;Actinomycetota;Actinomycetes;Mycobacteriales;Nocardiaceae;Nocardia;unclassified Nocardia;Nocardia sp. 431D04	GTCTAACCTGTGTTGGGTGTGTTTGGATGGTGATCGGCAATTTAAAGGAATGTGTGCGATATTTTCACCAATATTCAAGGAGCTTAAAAATCGACTAAAAAGCGTGTTAGATATTAAATACATGTATGCAGACGGATTGAGACCTGATCAGTTGTCGGAGCGCATGTCACAGATAGGTGCAGGTAAATATTTTATAGAGAATGATATGGAACAGCATCTCGCCAAACACACCCAACACAGGTTAG
+TBLASTX	NODE_34_length_245_cov_1.000000	183	245	250270	pfam00978, RdRP_2, RNA dependent RNA polymerase.  This family may represent an RNA dependent RNA polymerase. The family also contains the following proteins: 2A protein from bromoviruses putative RNA dependent RNA polymerase from tobamoviruses Non structural polyprotein from togaviruses.	Nocardia sp. 431D04	37.5	1	100	38.0	6.42106e-08	45.7137	250270	cellular organisms;Bacteria;Bacillati;Actinomycetota;Actinomycetes;Mycobacteriales;Nocardiaceae;Nocardia;unclassified Nocardia;Nocardia sp. 431D04	GTCTAACCTGTGTTGGGTGTGTTTGGATGGTGATCGGCAATTTAAAGGAATGTGTGCGATATTTTCACCAATATTCAAGGAGCTTAAAAATCGACTAAAAAGCGTGTTAGATATTAAATACATGTATGCAGACGGATTGAGACCTGATCAGTTGTCGGAGCGCATGTCACAGATAGGTGCAGGTAAATATTTTATAGAGAATGATATGGAACAGCATCTCGCCAAACACACCCAACACAGGTTAG
 TBLASTX	NODE_46_length_229_cov_1.091954	471	229	306604	pfam00124, Photo_RC, Photosynthetic reaction centre protein.  	Heterotermes sp. TMJ-2004j	43.9	1	100	66.0	4.26406e-23	86.1436	306604	cellular organisms;Eukaryota;Opisthokonta;Metazoa;Eumetazoa;Bilateria;Protostomia;Ecdysozoa;Panarthropoda;Arthropoda;Mandibulata;Pancrustacea;Hexapoda;Insecta;Dicondylia;Pterygota;Neoptera;Polyneoptera;Dictyoptera;Blattodea;Blattoidea;Termitoidae;Rhinotermitidae;Heterotermitinae;Heterotermes;unclassified Heterotermes;Heterotermes sp. TMJ-2004j	TGTGTTGGGTGTGTTTGGTTGGATGCCTGGAATACAATCATGAAATTGAAAGTACCAGATATTCCTAAAGGCATGCCATCTGAAAAACTTCCTTGACCAATAGGGTAGATCAAGAAAACAGCTGTAGCAGCCGCGACAGGAGCTGAATATGCAACAGCAATCCAAGGACGCATACCCAGACGGAAACTAAGCTCCCTCTCGCTCCAAACACACCCAACACAGGTTAGAC
 TBLASTX	NODE_47_length_229_cov_0.816092	470	229	306687	pfam00223, PsaA_PsaB, Photosystem I psaA/psaB protein.  		66.7	1	100	14.0	1.79906e-13	61.3066			TTGGTAAATTGGCGGAAAGAGGAGGACTCAATGATTATTCGTTCGCCGGAACCAGAAGTAAAAATTTTGGTAGATAGGGATCACATAAAAACTTCTTTCGAGGAATGGGCCAGGCCGGGTCATTTCTCAAGAACACTAGCTAAAGGCCCTGACACTACCACTTGGATCTGGAACCTACATGCTGATGCTCACGATCTTAATAGCCAAACACACCCAACACAGGTTAGAC
 TBLASTX	NODE_50_length_226_cov_2.269006	315	226	306845	pfam00421, PSII, Photosystem II protein.  		60.3	1	100	41.0	2.77182e-23	89.1064			GTCAACGGTGTGTTGGGTGTGTTTGGGAAAGGTCCTGGAATATGGGTGTCCGATCCTTATGGACTAACCGGAACAGTGCAACCTGTAAATCCGGCGTGGGGCGTGGAAGGTTTTGATCCTTTTGTCCCGGGAGGAATAGCTTCTCATCATATTGCAGCAGGTACATTGGGCATATTAGCGGGCCTATTCCATCTTAGCGTACGGTCACCCCAGCCAAACACACCCA
--- a/test-data/blast2tsv_reads_with_rn.txt	Sun Sep 08 14:09:19 2024 +0000
+++ b/test-data/blast2tsv_reads_with_rn.txt	Tue May 13 11:52:17 2025 +0000
@@ -2,11 +2,11 @@
 TBLASTX	NODE_13_length_295_cov_0.945833	264	295	316155	pfam13603, tRNA-synt_1_2, Leucyl-tRNA synthetase, Domain 2.  This is a family of the conserved region of Leucine-tRNA ligase or Leucyl-tRNA synthetase, EC:6.1.1.4.	Tursiops truncatus papillomavirus 2	41.5	1	100	67.0	2.277e-05	38.6378	316155	Viruses;Monodnaviria;Shotokuvirae;Cossaviricota;Papovaviricetes;Zurhausenvirales;Papillomaviridae;Firstpapillomavirinae;Upsilonpapillomavirus;Upsilonpapillomavirus 2;Tursiops truncatus papillomavirus 2	TGTGTTGGGTGTGTTTGGTTTCCGGTTACCATAATCGCTATTCTTTCAAACAGAAAGCGCATGCTAAGTATTCTCACCCAGAGGAATATGCTGACAAGCCCTCCTCAAAAGGCTATTTTTACAATGCCACCTATGAGAATGCACGAACTCTTATTCACTTCATTAAGCAATATGGATTGCCCTTCAATCCTGTTATTGCACCAGAAGATGCTGAACTAACTGATGAACAGATTCAATCTTACATCAACACAGCAAACTCCTTCTTTAATGATTATCCGACGTTACTGTTCACCCG
 TBLASTX	NODE_16_length_278_cov_0.901345	377	278	306845	pfam00421, PSII, Photosystem II protein.  		65.8	1	100	47.0	7.65615e-39	132.634			GTCTAACCTGTGTTGGGTGTGTTTGGGCTGTAATCGAGGTATAGTGTCGAACAAGTCGGTGTCACTGTTGAATTCTATGGCGGCGAACTCAATGGAGTCAGTTATAGTGATCCTGCTACTGTGAAAAAATATGCTAGACGTGCTCAATTGGGTGAAATTTTTGAATTAGATCGTGCTACTTTAAAATCGGATGGTGTTTTTCGTAGCAGTCCAAGGGGTTGGTTTACTTTTGGACATGCGTCGTTTGCTCTGCTCTTCTTCCAAACACACCCAACACA
 TBLASTX	NODE_19_length_271_cov_0.879630	67	271	306845	pfam00421, PSII, Photosystem II protein.  		32.9	1	100	42.0	1.69015e-11	56.3644			GTCTAACCTGTGTTGGGTGTGTTTGGTATGGAGGGAGGTGTATATGATACCTGGGCACCCGGAGGGGGAGATGTAAGAAAAATTACCAACTTGACCCTTAACCCAAGCGTGATATTTGGTTATTTACTAAAATCTCCTTTTGGGGGAGAAGGATGGATTGTTAGTGTGGACGATTTAGAAGATATAATTGGAGGACATGTCTGGTTAGGCTCCATTTGTATACTTGGTGGAATTTGGCATATCTTAACCAAACACACCCAACACAGGTTAG
-TBLASTX	NODE_20_length_267_cov_1.429245	2	267	287774	pfam10839, DUF2647, Protein of unknown function (DUF2647).  This eukaryotic family of proteins are annotated as ycf68 but have no known function.	Desulfovibrio sp. G100IX	91.3	1	100	99.0	7.70073e-10	48.4966	287774	cellular organisms;Bacteria;Thermodesulfobacteriota;Desulfovibrionia;Desulfovibrionales;Desulfovibrionaceae;Desulfovibrio;unclassified Desulfovibrio;Desulfovibrio sp. G100IX	CTGTGTTGGGTGTGTTTGGACTTGGTATGTGAAGATACGTTGTTAGGTGCTCCGTTTTATTTTCCCATTGAGGCCGAACCTAAACCTGTGCTCGAGAGATAGCTGTCCATATACTGATAAGGGATGTATGGATTCTCGAGAAGAGAGGAGCCATGGTGGTCCCTCCCGGACCGCCCGGATCCCACGAGTGAATAGAAAGTTGGATCTACATTGGATCTCACCTGAATCGCCCCATAAACAACCAAACACACCCAACACAGGTTAGAC
+TBLASTX	NODE_20_length_267_cov_1.429245	2	267	287774	pfam10839, DUF2647, Protein of unknown function (DUF2647).  This eukaryotic family of proteins are annotated as ycf68 but have no known function.	Desulfovibrio sp. G100IX	91.3	1	100	99.0	7.70073e-10	48.4966	287774	cellular organisms;Bacteria;Pseudomonadati;Thermodesulfobacteriota;Desulfovibrionia;Desulfovibrionales;Desulfovibrionaceae;Desulfovibrio;unclassified Desulfovibrio;Desulfovibrio sp. G100IX	CTGTGTTGGGTGTGTTTGGACTTGGTATGTGAAGATACGTTGTTAGGTGCTCCGTTTTATTTTCCCATTGAGGCCGAACCTAAACCTGTGCTCGAGAGATAGCTGTCCATATACTGATAAGGGATGTATGGATTCTCGAGAAGAGAGGAGCCATGGTGGTCCCTCCCGGACCGCCCGGATCCCACGAGTGAATAGAAAGTTGGATCTACATTGGATCTCACCTGAATCGCCCCATAAACAACCAAACACACCCAACACAGGTTAGAC
 TBLASTX	NODE_22_length_262_cov_1.053140	262	262	306604	pfam00124, Photo_RC, Photosynthetic reaction centre protein.  	Heterotermes sp. TMJ-2004j	40.9	1	100	77.0	4.94039e-28	99.6256	306604	cellular organisms;Eukaryota;Opisthokonta;Metazoa;Eumetazoa;Bilateria;Protostomia;Ecdysozoa;Panarthropoda;Arthropoda;Mandibulata;Pancrustacea;Hexapoda;Insecta;Dicondylia;Pterygota;Neoptera;Polyneoptera;Dictyoptera;Blattodea;Blattoidea;Termitoidae;Rhinotermitidae;Heterotermitinae;Heterotermes;unclassified Heterotermes;Heterotermes sp. TMJ-2004j	GTCTAACCTGTGTTGGGTGTGTTTGGCTAGTCAGTAGCTTGTTATATGGGTCGTGAGTGGGAAGTTAGCTTCCGTCTGGGTATGCGCCCGTGGATTGCTGTTGCATATTCAGCTCCTGTTGCAGCTGCTACTGCTGTTTTCTTGATTTACCCAATTGGTCAAGGAAGTTTTTCTGATGGTATGCCTCTAGGAATCTCTGGTACTTTCAACTTCATGATTGTATTCCAGGAGAGCACCCAAACACACCCAACACAGGTTAGAC
 TBLASTX	NODE_24_length_258_cov_0.935961	101	258	307679	pfam01660, Vmethyltransf, Viral methyltransferase.  This RNA methyltransferase domain is found in a wide range of ssRNA viruses, including Hordei-, Tobra-, Tobamo-, Bromo-, Clostero- and Caliciviruses. This methyltransferase is involved in mRNA capping. Capping of mRNA enhances its stability. This usually occurs in the nucleus. Therefore, many viruses that replicate in the cytoplasm encode their own. This is a specific guanine-7-methyltransferase domain involved in viral mRNA cap0 synthesis. Specificity for guanine 7 position is shown by NMR in and in vivo role in cap synthesis. Based on secondary structure prediction, the basic fold is believed to be similar to the common AdoMet-dependent methyltransferase fold. A curious feature of this methyltransferase domain is that it together with flanking sequences seems to have guanylyltransferase activity coupled to the methyltransferase activity. The domain is found throughout the so-called Alphavirus superfamily, (including alphaviruses and several other groups). It forms the defining, unique feature of this superfamily.	Tetrastemma peltatum	39.4	1	100	70.0	8.38713e-15	65.0021	307679	cellular organisms;Eukaryota;Opisthokonta;Metazoa;Eumetazoa;Bilateria;Protostomia;Spiralia;Lophotrochozoa;Nemertea;Enopla;Hoplonemertea;Monostilifera;Eumonostilifera;Tetrastemmatidae;Tetrastemma;Tetrastemma peltatum	GTGTTGGGTGTGTTTGGTTGGTGAACGCGCACCATTTAGTGGCAATCACGCGCGGGGAGGCTGAAAACTGCAAGCATAGATCTTTCGGCCCTTTCGAAGCTACCGCTTCCGAGAGCCTGGCTAAACTCTGCCCAGATTATCCGATCTGCTTGCCTGTACCTTACGACGTGATCAATAAAGTGTATAGGTATCTCAGAACGCTTAAGAAGCCTGATGTGCAGTCGCCCCACTACCAAACACACCCAACACAGGTTAGAC
 TBLASTX	NODE_29_length_250_cov_0.851282	428	250	278700	pfam00283, Cytochrom_B559, Cytochrome b559, alpha (gene psbE) and beta (gene psbF)subunits.  	uncultured archaeon CRE-PA11a	58.6	1	100	100	7.31211e-08	42.0012	278700	cellular organisms;Archaea;environmental samples;uncultured archaeon CRE-PA11a	GTCTAACCTGTGTTGGGTGTGTTTGGGTTTCTTTGGAGCAACTCGATGAATTTAGTAAATCCTTTTAGGAGGTTCCCAATGACCATAGATCGAACCTATCCAATTTTTACAGTGCGATGGTTGGCTGTTCACGGACTGGCTGTACCTACTGTTTCTTTTTTAGGGTCAATATCAGCAATGCAGTTCATCCAACGATAAACCTAATTCAAATTATAGAGCTAGCACACCAAACACACCCAACACAGGTTAG
-TBLASTX	NODE_34_length_245_cov_1.000000	183	245	250270	pfam00978, RdRP_2, RNA dependent RNA polymerase.  This family may represent an RNA dependent RNA polymerase. The family also contains the following proteins: 2A protein from bromoviruses putative RNA dependent RNA polymerase from tobamoviruses Non structural polyprotein from togaviruses.	Nocardia sp. 431D04	37.5	1	100	38.0	6.42106e-08	45.7137	250270	cellular organisms;Bacteria;Terrabacteria group;Actinomycetota;Actinomycetes;Mycobacteriales;Nocardiaceae;Nocardia;unclassified Nocardia;Nocardia sp. 431D04	GTCTAACCTGTGTTGGGTGTGTTTGGATGGTGATCGGCAATTTAAAGGAATGTGTGCGATATTTTCACCAATATTCAAGGAGCTTAAAAATCGACTAAAAAGCGTGTTAGATATTAAATACATGTATGCAGACGGATTGAGACCTGATCAGTTGTCGGAGCGCATGTCACAGATAGGTGCAGGTAAATATTTTATAGAGAATGATATGGAACAGCATCTCGCCAAACACACCCAACACAGGTTAG
+TBLASTX	NODE_34_length_245_cov_1.000000	183	245	250270	pfam00978, RdRP_2, RNA dependent RNA polymerase.  This family may represent an RNA dependent RNA polymerase. The family also contains the following proteins: 2A protein from bromoviruses putative RNA dependent RNA polymerase from tobamoviruses Non structural polyprotein from togaviruses.	Nocardia sp. 431D04	37.5	1	100	38.0	6.42106e-08	45.7137	250270	cellular organisms;Bacteria;Bacillati;Actinomycetota;Actinomycetes;Mycobacteriales;Nocardiaceae;Nocardia;unclassified Nocardia;Nocardia sp. 431D04	GTCTAACCTGTGTTGGGTGTGTTTGGATGGTGATCGGCAATTTAAAGGAATGTGTGCGATATTTTCACCAATATTCAAGGAGCTTAAAAATCGACTAAAAAGCGTGTTAGATATTAAATACATGTATGCAGACGGATTGAGACCTGATCAGTTGTCGGAGCGCATGTCACAGATAGGTGCAGGTAAATATTTTATAGAGAATGATATGGAACAGCATCTCGCCAAACACACCCAACACAGGTTAG
 TBLASTX	NODE_46_length_229_cov_1.091954	471	229	306604	pfam00124, Photo_RC, Photosynthetic reaction centre protein.  	Heterotermes sp. TMJ-2004j	43.9	1	100	66.0	4.26406e-23	86.1436	306604	cellular organisms;Eukaryota;Opisthokonta;Metazoa;Eumetazoa;Bilateria;Protostomia;Ecdysozoa;Panarthropoda;Arthropoda;Mandibulata;Pancrustacea;Hexapoda;Insecta;Dicondylia;Pterygota;Neoptera;Polyneoptera;Dictyoptera;Blattodea;Blattoidea;Termitoidae;Rhinotermitidae;Heterotermitinae;Heterotermes;unclassified Heterotermes;Heterotermes sp. TMJ-2004j	TGTGTTGGGTGTGTTTGGTTGGATGCCTGGAATACAATCATGAAATTGAAAGTACCAGATATTCCTAAAGGCATGCCATCTGAAAAACTTCCTTGACCAATAGGGTAGATCAAGAAAACAGCTGTAGCAGCCGCGACAGGAGCTGAATATGCAACAGCAATCCAAGGACGCATACCCAGACGGAAACTAAGCTCCCTCTCGCTCCAAACACACCCAACACAGGTTAGAC
 TBLASTX	NODE_47_length_229_cov_0.816092	470	229	306687	pfam00223, PsaA_PsaB, Photosystem I psaA/psaB protein.  		66.7	1	100	14.0	1.79906e-13	61.3066			TTGGTAAATTGGCGGAAAGAGGAGGACTCAATGATTATTCGTTCGCCGGAACCAGAAGTAAAAATTTTGGTAGATAGGGATCACATAAAAACTTCTTTCGAGGAATGGGCCAGGCCGGGTCATTTCTCAAGAACACTAGCTAAAGGCCCTGACACTACCACTTGGATCTGGAACCTACATGCTGATGCTCACGATCTTAATAGCCAAACACACCCAACACAGGTTAGAC
 TBLASTX	NODE_50_length_226_cov_2.269006	315	226	306845	pfam00421, PSII, Photosystem II protein.  		60.3	1	100	41.0	2.77182e-23	89.1064			GTCAACGGTGTGTTGGGTGTGTTTGGGAAAGGTCCTGGAATATGGGTGTCCGATCCTTATGGACTAACCGGAACAGTGCAACCTGTAAATCCGGCGTGGGGCGTGGAAGGTTTTGATCCTTTTGTCCCGGGAGGAATAGCTTCTCATCATATTGCAGCAGGTACATTGGGCATATTAGCGGGCCTATTCCATCTTAGCGTACGGTCACCCCAGCCAAACACACCCA
--- a/test-data/input_otu_rps_s1.tab	Sun Sep 08 14:09:19 2024 +0000
+++ b/test-data/input_otu_rps_s1.tab	Tue May 13 11:52:17 2025 +0000
@@ -1,45 +1,45 @@
-#query_id	query_length	cdd_id	hit_id	evalue	startQ	endQ	frame	description	superkingdom
-Query_2	2436	pfam02123	gnl|CDD|280316	2.04111e-21	184	1476	1	pfam02123, RdRP_4, Viral RNA-directed RNA-polymerase.  This family includes RNA-dependent RNA polymerase proteins (RdRPs) from Luteovirus, Totivirus and Rotavirus.	Viruses(1);Riboviria(1);Orthornavirae(1);Duplornaviricota(1)
-Query_4	2297	pfam00680	gnl|CDD|279070	3.12197e-05	995	1873	-2	pfam00680, RdRP_1, RNA dependent RNA polymerase.  	Viruses(1);Riboviria(1);Orthornavirae(1);Pisuviricota(1)
-Query_5	2029	pfam00680	gnl|CDD|279070	8.86955e-06	840	1706	3	pfam00680, RdRP_1, RNA dependent RNA polymerase.  	Viruses(1);Riboviria(1);Orthornavirae(1);Pisuviricota(1)
-Query_6	1860	pfam02123	gnl|CDD|280316	1.27376e-17	1147	1764	-1	pfam02123, RdRP_4, Viral RNA-directed RNA-polymerase.  This family includes RNA-dependent RNA polymerase proteins (RdRPs) from Luteovirus, Totivirus and Rotavirus.	Viruses(1);Riboviria(1);Orthornavirae(1);Duplornaviricota(1)
-Query_8	1703	pfam00680	gnl|CDD|279070	3.19349e-12	685	1458	-3	pfam00680, RdRP_1, RNA dependent RNA polymerase.  	Viruses(1);Riboviria(1);Orthornavirae(1);Pisuviricota(1)
-Query_19	425	pfam00005	gnl|CDD|306511	3.70622e-07	129	275	-1	pfam00005, ABC_tran, ABC transporter.  ABC transporters for a large family of proteins responsible for translocation of a variety of compounds across biological membranes. ABC transporters are the largest family of proteins in many completely sequenced bacteria. ABC transporters are composed of two copies of this domain and two copies of a transmembrane domain pfam00664. These four domains may belong to a single polypeptide as in CFTR, or belong in different polypeptide chains.	Bacteria(2);cellular organisms(1);Terrabacteria group(1)
-Query_38	386	pfam01347	gnl|CDD|279663	0.000262768	129	275	-1	pfam01347, Vitellogenin_N, Lipoprotein amino terminal region.  This family contains regions from: Vitellogenin, Microsomal triglyceride transfer protein and apolipoprotein B-100. These proteins are all involved in lipid transport. This family contains the LV1n chain from lipovitellin, that contains two structural domains.	cellular organisms(1);Eukaryota(1);Opisthokonta(1);Metazoa(1)
-Query_41	380	pfam04879	gnl|CDD|282703	2.77416e-08	125	274	-2	pfam04879, Molybdop_Fe4S4, Molybdopterin oxidoreductase Fe4S4 domain.  This domain is found in formate dehydrogenase H for which the structure is known. This first domain (residues 1 to 60) of Structure 1aa6 is an Fe4S4 cluster just below the protein surface.	Bacteria(2);cellular organisms(1);Pseudomonadota(1)
-Query_42	379	pfam16203	gnl|CDD|318443	8.05104e-30	131	280	-1	pfam16203, ERCC3_RAD25_C, ERCC3/RAD25/XPB C-terminal helicase.  This is the C-terminal helicase domain of ERCC3, RAD25 and XPB helicases.	cellular organisms(2);Bacteria(1);Terrabacteria group(1)
-Query_44	376	pfam00401	gnl|CDD|306831	6.62013e-05	81	215	-3	pfam00401, ATP-synt_DE, ATP synthase, Delta/Epsilon chain, long alpha-helix domain.  Part of the ATP synthase CF(1). These subunits are part of the head unit of the ATP synthase. This subunit is called epsilon in bacteria and delta in mitochondria. In bacteria the delta (D) subunit is equivalent to the mitochondrial Oligomycin sensitive subunit, OSCP (pfam00213).	cellular organisms(2);Eukaryota(1);Viridiplantae(1)
-Query_58	347	pfam00471	gnl|CDD|306877	8.86568e-13	132	302	3	pfam00471, Ribosomal_L33, Ribosomal protein L33.  	cellular organisms(2);Bacteria(1);Eukaryota(1)
-Query_61	344	pfam00252	gnl|CDD|306711	1.17482e-22	107	295	2	pfam00252, Ribosomal_L16, Ribosomal protein L16p/L10e.  	cellular organisms(2);Eukaryota(1);Viridiplantae(1)
-Query_62	343	pfam00421	gnl|CDD|306845	7.93928e-41	92	337	-1	pfam00421, PSII, Photosystem II protein.  	cellular organisms(1);Eukaryota(1);Viridiplantae(1);Streptophyta(1)
-Query_64	339	pfam01333	gnl|CDD|307480	0.000362606	197	325	-3	pfam01333, Apocytochr_F_C, Apocytochrome F, C-terminal.  This is a sub-family of cytochrome C. See pfam00034.	cellular organisms(1);Eukaryota(1);Viridiplantae(1);Streptophyta(1)
-Query_74	330	pfam00680	gnl|CDD|279070	4.51414e-05	124	282	1	pfam00680, RdRP_1, RNA dependent RNA polymerase.  	Viruses(1);Riboviria(1);Orthornavirae(1);Pisuviricota(1)
-Query_83	320	pfam05860	gnl|CDD|310447	1.29746e-13	167	298	2	pfam05860, Haemagg_act, haemagglutination activity domain.  This domain is suggested to be a carbohydrate- dependent haemagglutination activity site. It is found in a range of haemagglutinins and haemolysins.	Bacteria(2);cellular organisms(1);Pseudomonadota(1)
-Query_87	252	pfam00585	gnl|CDD|278982	1.42752e-05	29	166	2	pfam00585, Thr_dehydrat_C, C-terminal regulatory domain of Threonine dehydratase.  Threonine dehydratases pfam00291 all contain a carboxy terminal region. This region may have a regulatory role. Some members contain two copies of this region. This family is homologous to the pfam01842 domain.	Bacteria(2);cellular organisms(1);Pseudomonadota(1)
-Query_90	251	pfam13188	gnl|CDD|315779	0.000739897	32	241	2	pfam13188, PAS_8, PAS domain.  	Bacteria(2);cellular organisms(1);Pseudomonadota(1)
-Query_91	251	pfam02123	gnl|CDD|280316	3.2928e-08	28	228	-3	pfam02123, RdRP_4, Viral RNA-directed RNA-polymerase.  This family includes RNA-dependent RNA polymerase proteins (RdRPs) from Luteovirus, Totivirus and Rotavirus.	Viruses(1);Riboviria(1);Orthornavirae(1);Duplornaviricota(1)
-Query_93	251	pfam00252	gnl|CDD|306711	7.50297e-12	78	206	-1	pfam00252, Ribosomal_L16, Ribosomal protein L16p/L10e.  	cellular organisms(2);Eukaryota(1);Viridiplantae(1)
-Query_98	250	pfam00227	gnl|CDD|306690	4.91252e-09	10	150	-2	pfam00227, Proteasome, Proteasome subunit.  The proteasome is a multisubunit structure that degrades proteins. Protein degradation is an essential component of regulation because proteins can become misfolded, damaged, or unnecessary. Proteasomes and their homologs vary greatly in complexity: from HslV (heat shock locus v), which is encoded by 1 gene in bacteria, to the eukaryotic 20S proteasome, which is encoded by more than 14 genes. Recently evidence of two novel groups of bacterial proteasomes was proposed. The first is Anbu, which is sparsely distributed among cyanobacteria and proteobacteria. The second is call beta-proteobacteria proteasome homolog (BPH).	cellular organisms(2);Eukaryota(1);Opisthokonta(1)
-Query_104	249	pfam13173	gnl|CDD|315764	2.6724e-08	106	249	1	pfam13173, AAA_14, AAA domain.  This family of domains contain a P-loop motif that is characteristic of the AAA superfamily.	Bacteria(2);cellular organisms(1);FCB group(1)
-Query_111	248	pfam00113	gnl|CDD|278539	3.9331e-13	15	116	-1	pfam00113, Enolase_C, Enolase, C-terminal TIM barrel domain.  	cellular organisms(2);Bacteria(2)
-Query_127	245	pfam00946	gnl|CDD|307203	3.13472e-05	1	141	1	pfam00946, Mononeg_RNA_pol, Mononegavirales RNA dependent RNA polymerase.  Members of the Mononegavirales including the Paramyxoviridae, like other non-segmented negative strand RNA viruses, have an RNA-dependent RNA polymerase composed of two subunits, a large protein L and a phosphoprotein P. This is a protein family of the L protein. The L protein confers the RNA polymerase activity on the complex. The P protein acts as a transcription factor.	Viruses(1);Riboviria(1);Orthornavirae(1);Negarnaviricota(1)
-Query_138	243	pfam00416	gnl|CDD|306841	5.30772e-05	15	134	-2	pfam00416, Ribosomal_S13, Ribosomal protein S13/S18.  This family includes ribosomal protein S13 from prokaryotes and S18 from eukaryotes.	cellular organisms(2);Bacteria(2)
-Query_139	243	pfam00216	gnl|CDD|306682	1.89202e-10	134	241	-3	pfam00216, Bac_DNA_binding, Bacterial DNA-binding protein.  	Bacteria(2);cellular organisms(1);Pseudomonadota(1)
-Query_140	243	pfam13041	gnl|CDD|315669	0.000344884	134	241	-3	pfam13041, PPR_2, PPR repeat family.  This repeat has no known function. It is about 35 amino acids long and is found in up to 18 copies in some proteins. The family appears to be greatly expanded in plants and fungi. The repeat has been called PPR.	cellular organisms(1);Eukaryota(1);Viridiplantae(1);Streptophyta(1)
-Query_144	243	pfam12137	gnl|CDD|314930	3.71293e-05	137	217	-3	pfam12137, RapA_C, RNA polymerase recycling family C-terminal.  This domain is found in bacteria. This domain is about 360 amino acids in length. This domain is found associated with pfam00271, pfam00176. The function of this domain is not known, but structurally it forms an alpha-beta fold in nature with a central beta-sheet flanked by helices and loops, the beta-sheet being mainly antiparallel and flanked by four alpha helices, among which the two longer helices exhibit a coiled-coil arrangement.	cellular organisms(1);Bacteria(1);Pseudomonadota(1);Gammaproteobacteria(1)
-Query_145	242	pfam00146	gnl|CDD|306623	2.12078e-10	22	111	1	pfam00146, NADHdh, NADH dehydrogenase.  	cellular organisms(1);Eukaryota(1);Opisthokonta(1);Metazoa(1)
-Query_149	242	pfam00124	gnl|CDD|306604	4.44151e-07	21	125	3	pfam00124, Photo_RC, Photosynthetic reaction centre protein.  	cellular organisms(1);Eukaryota(1);Viridiplantae(1);Streptophyta(1)
-Query_163	241	pfam02123	gnl|CDD|280316	5.78854e-08	35	214	-1	pfam02123, RdRP_4, Viral RNA-directed RNA-polymerase.  This family includes RNA-dependent RNA polymerase proteins (RdRPs) from Luteovirus, Totivirus and Rotavirus.	Viruses(1);Riboviria(1);Orthornavirae(1);Duplornaviricota(1)
-Query_177	239	pfam06122	gnl|CDD|310603	1.30391e-05	29	172	2	pfam06122, TraH, Conjugative relaxosome accessory transposon protein.  The TraH protein is thought to be a relaxosome accessory component, also necessary for transfer but not for H-pilus synthesis within the conjugative transposon.	cellular organisms(1);Bacteria(1);Pseudomonadota(1);Gammaproteobacteria(1)
-Query_179	239	pfam00361	gnl|CDD|306795	3.63199e-05	70	219	1	pfam00361, Proton_antipo_M, Proton-conducting membrane transporter.  This is a family of membrane transporters that inlcudes some 7 of potentially 14-16 TM regions. In many instances the family forms part of complex I that catalyzes the transfer of two electrons from NADH to ubiquinone in a reaction that is associated with proton translocation across the membrane, and in this context is a combination predominantly of subunits 2, 4, 5, 14, L, M and N. In many bacterial species these proteins are probable stand-alone transporters not coupled with oxidoreduction. The family in total represents homologs across the phyla.	cellular organisms(1);Eukaryota(1);Opisthokonta(1);Metazoa(1)
-Query_182	239	pfam00177	gnl|CDD|306646	1.05327e-06	28	126	1	pfam00177, Ribosomal_S7, Ribosomal protein S7p/S5e.  This family contains ribosomal protein S7 from prokaryotes and S5 from eukaryotes.	cellular organisms(2);Eukaryota(1);Viridiplantae(1)
-Query_202	235	pfam03154	gnl|CDD|308660	0.000842762	28	126	1	pfam03154, Atrophin-1, Atrophin-1 family.  Atrophin-1 is the protein product of the dentatorubral-pallidoluysian atrophy (DRPLA) gene. DRPLA OMIM:125370 is a progressive neurodegenerative disorder. It is caused by the expansion of a CAG repeat in the DRPLA gene on chromosome 12p. This results in an extended polyglutamine region in atrophin-1, that is thought to confer toxicity to the protein, possibly through altering its interactions with other proteins. The expansion of a CAG repeat is also the underlying defect in six other neurodegenerative disorders, including Huntington's disease. One interaction of expanded polyglutamine repeats that is thought to be pathogenic is that with the short glutamine repeat in the transcriptional coactivator CREB binding protein, CBP. This interaction draws CBP away from its usual nuclear location to the expanded polyglutamine repeat protein aggregates that are characteristic of the polyglutamine neurodegenerative disorders. This interferes with CBP-mediated transcription and causes cytotoxicity.	Eukaryota(1);cellular organisms(1);Opisthokonta(1);Metazoa(1)
-Query_203	235	pfam00164	gnl|CDD|278589	1.83229e-23	3	182	3	pfam00164, Ribosom_S12_S23, Ribosomal protein S12/S23.  This protein is known as S12 in bacteria and archaea and S23 in eukaryotes.	cellular organisms(2);Eukaryota(1);Viridiplantae(1)
-Query_211	234	pfam00155	gnl|CDD|306629	0.000251531	3	182	3	pfam00155, Aminotran_1_2, Aminotransferase class I and II.  	Bacteria(2);cellular organisms(1);Pseudomonadota(1)
-Query_219	233	pfam00680	gnl|CDD|279070	0.000703744	3	182	3	pfam00680, RdRP_1, RNA dependent RNA polymerase.  	Viruses(1);Riboviria(1);Orthornavirae(1);Pisuviricota(1)
-Query_232	231	pfam00481	gnl|CDD|306885	0.00063843	3	182	3	pfam00481, PP2C, Protein phosphatase 2C.  Protein phosphatase 2C is a Mn++ or Mg++ dependent protein serine/threonine phosphatase.	Eukaryota(2);cellular organisms(1);Viridiplantae(1)
-Query_241	230	pfam00072	gnl|CDD|306560	5.30837e-08	50	208	2	pfam00072, Response_reg, Response regulator receiver domain.  This domain receives the signal from the sensor partner in bacterial two-component systems. It is usually found N-terminal to a DNA binding effector domain.	Bacteria(2);cellular organisms(1);Pseudomonadota(1)
-Query_246	230	pfam00201	gnl|CDD|278624	2.93544e-07	46	210	1	pfam00201, UDPGT, UDP-glucoronosyl and UDP-glucosyl transferase.  	cellular organisms(1);Eukaryota(1);Viridiplantae(1);Streptophytina(1)
-Query_261	228	pfam17035	gnl|CDD|319097	3.87403e-09	108	203	3	pfam17035, BET, Bromodomain extra-terminal - transcription regulation.  The BET, or bromodomain extra-terminal domain, is found on bromodomain proteins that play key roles in development, cancer progression and virus-host pathogenesis. It interacts with NSD3, JMJD6, CHD4, GLTSCR1, and ATAD5 all of which are shown to impart a pTEFb-independent transcriptional activation function on the bromodomain proteins.	cellular organisms(1);Eukaryota(1);Opisthokonta(1);Metazoa(1)
-Query_280	207	pfam04061	gnl|CDD|309259	7.30581e-19	1	159	1	pfam04061, ORMDL, ORMDL family.  Evidence form suggests that ORMDLs are involved in protein folding in the ER. Orm proteins have been identified as negative regulators of sphingolipid synthesis that form a conserved complex with serine palmitoyltransferase, the first and rate-limiting enzyme in sphingolipid production. This novel and conserved protein complex, has been termed the SPOTS complex (serine palmitoyltransferase, Orm1/2, Tsc3, and Sac1).	cellular organisms(1);Eukaryota(1);Opisthokonta(1);Metazoa(1)
-Query_326	206	pfam10775	gnl|CDD|313884	0.00091969	1	159	1	pfam10775, ATP_sub_h, ATP synthase complex subunit h.  Subunit h is a component of the yeast mitochondrial F1-F0 ATP synthase. It is essential for the correct assembly and functioning of this enzyme. Subunit h occupies a central place in the peripheral stalk between the F1 sector and the membrane.	cellular organisms(1);Eukaryota(1);Opisthokonta(1);Fungi(1)
+#query_id	query_length	cdd_id	hit_id	evalue	startQ	endQ	frame	description	superkingdom	pident
+ds2020-267_5	2436	pfam02123	gnl|CDD|280316	2.04111e-21	184	1476	1	pfam02123, RdRP_4, Viral RNA-directed RNA-polymerase.  This family includes RNA-dependent RNA polymerase proteins (RdRPs) from Luteovirus, Totivirus and Rotavirus.	Viruses(1);Riboviria(1);Orthornavirae(1);Duplornaviricota(1)	22.535
+ds2020-267_7	2297	pfam00680	gnl|CDD|279070	3.12197e-05	995	1873	-2	pfam00680, RdRP_1, RNA dependent RNA polymerase.  	Viruses(1);Riboviria(1);Orthornavirae(1);Pisuviricota(1)	19.742
+ds2020-267_8	2029	pfam00680	gnl|CDD|279070	8.86955e-06	840	1706	3	pfam00680, RdRP_1, RNA dependent RNA polymerase.  	Viruses(1);Riboviria(1);Orthornavirae(1);Pisuviricota(1)	25.314
+ds2020-267_10	1860	pfam02123	gnl|CDD|280316	1.27376e-17	1147	1764	-1	pfam02123, RdRP_4, Viral RNA-directed RNA-polymerase.  This family includes RNA-dependent RNA polymerase proteins (RdRPs) from Luteovirus, Totivirus and Rotavirus.	Viruses(1);Riboviria(1);Orthornavirae(1);Duplornaviricota(1)	18.868
+ds2020-267_12	1703	pfam00680	gnl|CDD|279070	3.19349e-12	685	1458	-3	pfam00680, RdRP_1, RNA dependent RNA polymerase.  	Viruses(1);Riboviria(1);Orthornavirae(1);Pisuviricota(1)	27.456
+ds2020-267_75	425	pfam00005	gnl|CDD|306511	3.70622e-07	129	275	-1	pfam00005, ABC_tran, ABC transporter.  ABC transporters for a large family of proteins responsible for translocation of a variety of compounds across biological membranes. ABC transporters are the largest family of proteins in many completely sequenced bacteria. ABC transporters are composed of two copies of this domain and two copies of a transmembrane domain pfam00664. These four domains may belong to a single polypeptide as in CFTR, or belong in different polypeptide chains.	Bacteria(2);cellular organisms(1);Terrabacteria group(1)	33.974
+ds2020-267_76	386	pfam01347	gnl|CDD|279663	0.000262768	129	275	-1	pfam01347, Vitellogenin_N, Lipoprotein amino terminal region.  This family contains regions from: Vitellogenin, Microsomal triglyceride transfer protein and apolipoprotein B-100. These proteins are all involved in lipid transport. This family contains the LV1n chain from lipovitellin, that contains two structural domains.	cellular organisms(1);Eukaryota(1);Opisthokonta(1);Metazoa(1)	24.167
+ds2020-267_79	380	pfam04879	gnl|CDD|282703	2.77416e-08	125	274	-2	pfam04879, Molybdop_Fe4S4, Molybdopterin oxidoreductase Fe4S4 domain.  This domain is found in formate dehydrogenase H for which the structure is known. This first domain (residues 1 to 60) of Structure 1aa6 is an Fe4S4 cluster just below the protein surface.	Bacteria(2);cellular organisms(1);Pseudomonadota(1)	22.921
+ds2020-267_80	379	pfam16203	gnl|CDD|318443	8.05104e-30	131	280	-1	pfam16203, ERCC3_RAD25_C, ERCC3/RAD25/XPB C-terminal helicase.  This is the C-terminal helicase domain of ERCC3, RAD25 and XPB helicases.	cellular organisms(2);Bacteria(1);Terrabacteria group(1)	29.017
+ds2020-267_81	376	pfam00401	gnl|CDD|306831	6.62013e-05	81	215	-3	pfam00401, ATP-synt_DE, ATP synthase, Delta/Epsilon chain, long alpha-helix domain.  Part of the ATP synthase CF(1). These subunits are part of the head unit of the ATP synthase. This subunit is called epsilon in bacteria and delta in mitochondria. In bacteria the delta (D) subunit is equivalent to the mitochondrial Oligomycin sensitive subunit, OSCP (pfam00213).	cellular organisms(2);Eukaryota(1);Viridiplantae(1)	27.296
+ds2020-267_320	347	pfam00471	gnl|CDD|306877	8.86568e-13	132	302	3	pfam00471, Ribosomal_L33, Ribosomal protein L33.  	cellular organisms(2);Bacteria(1);Eukaryota(1)	27.649
+ds2020-267_322	344	pfam00252	gnl|CDD|306711	1.17482e-22	107	295	2	pfam00252, Ribosomal_L16, Ribosomal protein L16p/L10e.  	cellular organisms(2);Eukaryota(1);Viridiplantae(1)	18.354
+ds2020-267_323	343	pfam00421	gnl|CDD|306845	7.93928e-41	92	337	-1	pfam00421, PSII, Photosystem II protein.  	cellular organisms(1);Eukaryota(1);Viridiplantae(1);Streptophyta(1)	21.070
+ds2020-267_324	339	pfam01333	gnl|CDD|307480	0.000362606	197	325	-3	pfam01333, Apocytochr_F_C, Apocytochrome F, C-terminal.  This is a sub-family of cytochrome C. See pfam00034.	cellular organisms(1);Eukaryota(1);Viridiplantae(1);Streptophyta(1)	26.684
+ds2020-267_327	330	pfam00680	gnl|CDD|279070	4.51414e-05	124	282	1	pfam00680, RdRP_1, RNA dependent RNA polymerase.  	Viruses(1);Riboviria(1);Orthornavirae(1);Pisuviricota(1)	24.942
+ds2020-267_332	320	pfam05860	gnl|CDD|310447	1.29746e-13	167	298	2	pfam05860, Haemagg_act, haemagglutination activity domain.  This domain is suggested to be a carbohydrate- dependent haemagglutination activity site. It is found in a range of haemagglutinins and haemolysins.	Bacteria(2);cellular organisms(1);Pseudomonadota(1)	22.222
+ds2020-267_333	252	pfam00585	gnl|CDD|278982	1.42752e-05	29	166	2	pfam00585, Thr_dehydrat_C, C-terminal regulatory domain of Threonine dehydratase.  Threonine dehydratases pfam00291 all contain a carboxy terminal region. This region may have a regulatory role. Some members contain two copies of this region. This family is homologous to the pfam01842 domain.	Bacteria(2);cellular organisms(1);Pseudomonadota(1)	25.916
+ds2020-267_336	251	pfam13188	gnl|CDD|315779	0.000739897	32	241	2	pfam13188, PAS_8, PAS domain.  	Bacteria(2);cellular organisms(1);Pseudomonadota(1)	27.014
+ds2020-267_337	251	pfam02123	gnl|CDD|280316	3.2928e-08	28	228	-3	pfam02123, RdRP_4, Viral RNA-directed RNA-polymerase.  This family includes RNA-dependent RNA polymerase proteins (RdRPs) from Luteovirus, Totivirus and Rotavirus.	Viruses(1);Riboviria(1);Orthornavirae(1);Duplornaviricota(1)	37.500
+ds2020-267_338	251	pfam00252	gnl|CDD|306711	7.50297e-12	78	206	-1	pfam00252, Ribosomal_L16, Ribosomal protein L16p/L10e.  	cellular organisms(2);Eukaryota(1);Viridiplantae(1)	17.308
+ds2020-267_339	250	pfam00227	gnl|CDD|306690	4.91252e-09	10	150	-2	pfam00227, Proteasome, Proteasome subunit.  The proteasome is a multisubunit structure that degrades proteins. Protein degradation is an essential component of regulation because proteins can become misfolded, damaged, or unnecessary. Proteasomes and their homologs vary greatly in complexity: from HslV (heat shock locus v), which is encoded by 1 gene in bacteria, to the eukaryotic 20S proteasome, which is encoded by more than 14 genes. Recently evidence of two novel groups of bacterial proteasomes was proposed. The first is Anbu, which is sparsely distributed among cyanobacteria and proteobacteria. The second is call beta-proteobacteria proteasome homolog (BPH).	cellular organisms(2);Eukaryota(1);Opisthokonta(1)	21.244
+ds2020-267_343	249	pfam13173	gnl|CDD|315764	2.6724e-08	106	249	1	pfam13173, AAA_14, AAA domain.  This family of domains contain a P-loop motif that is characteristic of the AAA superfamily.	Bacteria(2);cellular organisms(1);FCB group(1)	24.583
+ds2020-267_362	248	pfam00113	gnl|CDD|278539	3.9331e-13	15	116	-1	pfam00113, Enolase_C, Enolase, C-terminal TIM barrel domain.  	cellular organisms(2);Bacteria(2)	21.656
+ds2020-267_363	245	pfam00946	gnl|CDD|307203	3.13472e-05	1	141	1	pfam00946, Mononeg_RNA_pol, Mononegavirales RNA dependent RNA polymerase.  Members of the Mononegavirales including the Paramyxoviridae, like other non-segmented negative strand RNA viruses, have an RNA-dependent RNA polymerase composed of two subunits, a large protein L and a phosphoprotein P. This is a protein family of the L protein. The L protein confers the RNA polymerase activity on the complex. The P protein acts as a transcription factor.	Viruses(1);Riboviria(1);Orthornavirae(1);Negarnaviricota(1)	26.562
+ds2020-267_364	243	pfam00416	gnl|CDD|306841	5.30772e-05	15	134	-2	pfam00416, Ribosomal_S13, Ribosomal protein S13/S18.  This family includes ribosomal protein S13 from prokaryotes and S18 from eukaryotes.	cellular organisms(2);Bacteria(2)	26.276
+ds2020-267_365	243	pfam00216	gnl|CDD|306682	1.89202e-10	134	241	-3	pfam00216, Bac_DNA_binding, Bacterial DNA-binding protein.  	Bacteria(2);cellular organisms(1);Pseudomonadota(1)	25.178
+ds2020-267_366	243	pfam13041	gnl|CDD|315669	0.000344884	134	241	-3	pfam13041, PPR_2, PPR repeat family.  This repeat has no known function. It is about 35 amino acids long and is found in up to 18 copies in some proteins. The family appears to be greatly expanded in plants and fungi. The repeat has been called PPR.	cellular organisms(1);Eukaryota(1);Viridiplantae(1);Streptophyta(1)	17.600
+ds2020-267_370	243	pfam12137	gnl|CDD|314930	3.71293e-05	137	217	-3	pfam12137, RapA_C, RNA polymerase recycling family C-terminal.  This domain is found in bacteria. This domain is about 360 amino acids in length. This domain is found associated with pfam00271, pfam00176. The function of this domain is not known, but structurally it forms an alpha-beta fold in nature with a central beta-sheet flanked by helices and loops, the beta-sheet being mainly antiparallel and flanked by four alpha helices, among which the two longer helices exhibit a coiled-coil arrangement.	cellular organisms(1);Bacteria(1);Pseudomonadota(1);Gammaproteobacteria(1)	24.942
+ds2020-267_372	242	pfam00146	gnl|CDD|306623	2.12078e-10	22	111	1	pfam00146, NADHdh, NADH dehydrogenase.  	cellular organisms(1);Eukaryota(1);Opisthokonta(1);Metazoa(1)	24.942
+ds2020-267_373	242	pfam00124	gnl|CDD|306604	4.44151e-07	21	125	3	pfam00124, Photo_RC, Photosynthetic reaction centre protein.  	cellular organisms(1);Eukaryota(1);Viridiplantae(1);Streptophyta(1)	33.663
+ds2020-267_374	241	pfam02123	gnl|CDD|280316	5.78854e-08	35	214	-1	pfam02123, RdRP_4, Viral RNA-directed RNA-polymerase.  This family includes RNA-dependent RNA polymerase proteins (RdRPs) from Luteovirus, Totivirus and Rotavirus.	Viruses(1);Riboviria(1);Orthornavirae(1);Duplornaviricota(1)	21.831
+ds2020-267_380	239	pfam06122	gnl|CDD|310603	1.30391e-05	29	172	2	pfam06122, TraH, Conjugative relaxosome accessory transposon protein.  The TraH protein is thought to be a relaxosome accessory component, also necessary for transfer but not for H-pilus synthesis within the conjugative transposon.	cellular organisms(1);Bacteria(1);Pseudomonadota(1);Gammaproteobacteria(1)	37.888
+ds2020-267_385	239	pfam00361	gnl|CDD|306795	3.63199e-05	70	219	1	pfam00361, Proton_antipo_M, Proton-conducting membrane transporter.  This is a family of membrane transporters that inlcudes some 7 of potentially 14-16 TM regions. In many instances the family forms part of complex I that catalyzes the transfer of two electrons from NADH to ubiquinone in a reaction that is associated with proton translocation across the membrane, and in this context is a combination predominantly of subunits 2, 4, 5, 14, L, M and N. In many bacterial species these proteins are probable stand-alone transporters not coupled with oxidoreduction. The family in total represents homologs across the phyla.	cellular organisms(1);Eukaryota(1);Opisthokonta(1);Metazoa(1)	18.868
+ds2020-267_386	239	pfam00177	gnl|CDD|306646	1.05327e-06	28	126	1	pfam00177, Ribosomal_S7, Ribosomal protein S7p/S5e.  This family contains ribosomal protein S7 from prokaryotes and S5 from eukaryotes.	cellular organisms(2);Eukaryota(1);Viridiplantae(1)	29.545
+ds2020-267_395	235	pfam03154	gnl|CDD|308660	0.000842762	28	126	1	pfam03154, Atrophin-1, Atrophin-1 family.  Atrophin-1 is the protein product of the dentatorubral-pallidoluysian atrophy (DRPLA) gene. DRPLA OMIM:125370 is a progressive neurodegenerative disorder. It is caused by the expansion of a CAG repeat in the DRPLA gene on chromosome 12p. This results in an extended polyglutamine region in atrophin-1, that is thought to confer toxicity to the protein, possibly through altering its interactions with other proteins. The expansion of a CAG repeat is also the underlying defect in six other neurodegenerative disorders, including Huntington's disease. One interaction of expanded polyglutamine repeats that is thought to be pathogenic is that with the short glutamine repeat in the transcriptional coactivator CREB binding protein, CBP. This interaction draws CBP away from its usual nuclear location to the expanded polyglutamine repeat protein aggregates that are characteristic of the polyglutamine neurodegenerative disorders. This interferes with CBP-mediated transcription and causes cytotoxicity.	Eukaryota(1);cellular organisms(1);Opisthokonta(1);Metazoa(1)	36.317
+ds2020-267_403	235	pfam00164	gnl|CDD|278589	1.83229e-23	3	182	3	pfam00164, Ribosom_S12_S23, Ribosomal protein S12/S23.  This protein is known as S12 in bacteria and archaea and S23 in eukaryotes.	cellular organisms(2);Eukaryota(1);Viridiplantae(1)	21.831
+ds2020-267_404	234	pfam00155	gnl|CDD|306629	0.000251531	3	182	3	pfam00155, Aminotran_1_2, Aminotransferase class I and II.  	Bacteria(2);cellular organisms(1);Pseudomonadota(1)	25.314
+ds2020-267_835	233	pfam00680	gnl|CDD|279070	0.000703744	3	182	3	pfam00680, RdRP_1, RNA dependent RNA polymerase.  	Viruses(1);Riboviria(1);Orthornavirae(1);Pisuviricota(1)	28.244
+ds2020-267_837	231	pfam00481	gnl|CDD|306885	0.00063843	3	182	3	pfam00481, PP2C, Protein phosphatase 2C.  Protein phosphatase 2C is a Mn++ or Mg++ dependent protein serine/threonine phosphatase.	Eukaryota(2);cellular organisms(1);Viridiplantae(1)	22.921
+ds2020-267_838	230	pfam00072	gnl|CDD|306560	5.30837e-08	50	208	2	pfam00072, Response_reg, Response regulator receiver domain.  This domain receives the signal from the sensor partner in bacterial two-component systems. It is usually found N-terminal to a DNA binding effector domain.	Bacteria(2);cellular organisms(1);Pseudomonadota(1)	34.356
+ds2020-267_843	230	pfam00201	gnl|CDD|278624	2.93544e-07	46	210	1	pfam00201, UDPGT, UDP-glucoronosyl and UDP-glucosyl transferase.  	cellular organisms(1);Eukaryota(1);Viridiplantae(1);Streptophytina(1)	26.684
+ds2020-267_852	228	pfam17035	gnl|CDD|319097	3.87403e-09	108	203	3	pfam17035, BET, Bromodomain extra-terminal - transcription regulation.  The BET, or bromodomain extra-terminal domain, is found on bromodomain proteins that play key roles in development, cancer progression and virus-host pathogenesis. It interacts with NSD3, JMJD6, CHD4, GLTSCR1, and ATAD5 all of which are shown to impart a pTEFb-independent transcriptional activation function on the bromodomain proteins.	cellular organisms(1);Eukaryota(1);Opisthokonta(1);Metazoa(1)	34.188
+ds2020-267_855	207	pfam04061	gnl|CDD|309259	7.30581e-19	1	159	1	pfam04061, ORMDL, ORMDL family.  Evidence form suggests that ORMDLs are involved in protein folding in the ER. Orm proteins have been identified as negative regulators of sphingolipid synthesis that form a conserved complex with serine palmitoyltransferase, the first and rate-limiting enzyme in sphingolipid production. This novel and conserved protein complex, has been termed the SPOTS complex (serine palmitoyltransferase, Orm1/2, Tsc3, and Sac1).	cellular organisms(1);Eukaryota(1);Opisthokonta(1);Metazoa(1)	21.368
+ds2020-267_858	206	pfam10775	gnl|CDD|313884	0.00091969	1	159	1	pfam10775, ATP_sub_h, ATP synthase complex subunit h.  Subunit h is a component of the yeast mitochondrial F1-F0 ATP synthase. It is essential for the correct assembly and functioning of this enzyme. Subunit h occupies a central place in the peripheral stalk between the F1 sector and the membrane.	cellular organisms(1);Eukaryota(1);Opisthokonta(1);Fungi(1)	32.258
--- a/test-data/input_otu_rps_s2.tab	Sun Sep 08 14:09:19 2024 +0000
+++ b/test-data/input_otu_rps_s2.tab	Tue May 13 11:52:17 2025 +0000
@@ -1,50 +1,50 @@
-#query_id	query_length	cdd_id	hit_id	evalue	startQ	endQ	frame	description	superkingdom
-Query_1	2975	pfam02874	gnl|CDD|308490	6.56656e-19	2202	2405	-1	pfam02874, ATP-synt_ab_N, ATP synthase alpha/beta family, beta-barrel domain.  This family includes the ATP synthase alpha and beta subunits the ATP synthase associated with flagella.	cellular organisms(2);Eukaryota(1);Viridiplantae(1)
-Query_8	1120	pfam00146	gnl|CDD|306623	6.73934e-18	936	1097	-3	pfam00146, NADHdh, NADH dehydrogenase.  	cellular organisms(1);Eukaryota(1);Opisthokonta(1);Metazoa(1)
-Query_19	872	pfam01443	gnl|CDD|307550	7.69575e-33	10	696	-3	pfam01443, Viral_helicase1, Viral (Superfamily 1) RNA helicase.  Helicase activity for this family has been demonstrated and NTPase activity. This helicase has multiple roles at different stages of viral RNA replication, as dissected by mutational analysis.	Viruses(1);Riboviria(1);Orthornavirae(1);Kitrinoviricota(1)
-Query_22	847	pfam13456	gnl|CDD|316018	1.2307e-09	176	397	2	pfam13456, RVT_3, Reverse transcriptase-like.  This domain is found in plants and appears to be part of a retrotransposon.	cellular organisms(1);Eukaryota(1);Viridiplantae(1);Streptophyta(1)
-Query_30	681	pfam00416	gnl|CDD|306841	7.7464e-31	92	409	-3	pfam00416, Ribosomal_S13, Ribosomal protein S13/S18.  This family includes ribosomal protein S13 from prokaryotes and S18 from eukaryotes.	cellular organisms(2);Bacteria(2)
-Query_36	644	pfam00078	gnl|CDD|306564	2.13234e-08	190	636	-3	pfam00078, RVT_1, Reverse transcriptase (RNA-dependent DNA polymerase).  A reverse transcriptase gene is usually indicative of a mobile element such as a retrotransposon or retrovirus. Reverse transcriptases occur in a variety of mobile elements, including retrotransposons, retroviruses, group II introns, bacterial msDNAs, hepadnaviruses, and caulimoviruses.	Viruses(1);Riboviria(1);Pararnavirae(1);Artverviricota(1)
-Query_40	623	pfam00346	gnl|CDD|306783	6.5049e-56	191	496	-2	pfam00346, Complex1_49kDa, Respiratory-chain NADH dehydrogenase, 49 Kd subunit.  	cellular organisms(2);Bacteria(1);Eukaryota(1)
-Query_43	620	pfam00115	gnl|CDD|306596	2.19638e-51	78	548	3	pfam00115, COX1, Cytochrome C and Quinol oxidase polypeptide I.  	cellular organisms(1);Eukaryota(1);Opisthokonta(1);Metazoa(1)
-Query_45	598	pfam00115	gnl|CDD|306596	4.78609e-34	21	302	3	pfam00115, COX1, Cytochrome C and Quinol oxidase polypeptide I.  	cellular organisms(1);Eukaryota(1);Opisthokonta(1);Metazoa(1)
-Query_50	458	pfam02123	gnl|CDD|280316	1.82963e-26	27	443	-1	pfam02123, RdRP_4, Viral RNA-directed RNA-polymerase.  This family includes RNA-dependent RNA polymerase proteins (RdRPs) from Luteovirus, Totivirus and Rotavirus.	Viruses(1);Riboviria(1);Orthornavirae(1);Duplornaviricota(1)
-Query_51	458	pfam03732	gnl|CDD|309014	1.12045e-06	256	441	1	pfam03732, Retrotrans_gag, Retrotransposon gag protein.  Gag or Capsid-like proteins from LTR retrotransposons. There is a central motif QGXXEXXXXXFXXLXXH that is common to Retroviridae gag-proteins, but is poorly conserved.	cellular organisms(1);Eukaryota(1);Viridiplantae(1);Streptophyta(1)
-Query_53	454	pfam14111	gnl|CDD|316622	3.40587e-07	213	353	3	pfam14111, DUF4283, Domain of unknown function (DUF4283).  This domain family is found in plants, and is approximately 100 amino acids in length. Considering the very diverse range of other domains it is associated with it is possible that this domain is a binding/guiding region. There are two highly conserved tryptophan residues.	cellular organisms(1);Eukaryota(1);Streptophytina(1);Viridiplantae(1)
-Query_58	446	pfam01348	gnl|CDD|279664	1.01441e-09	40	303	-3	pfam01348, Intron_maturas2, Type II intron maturase.  Group II introns use intron-encoded reverse transcriptase, maturase and DNA endonuclease activities for site-specific insertion into DNA. Although this type of intron is self splicing in vitro they require a maturase protein for splicing in vivo. It has been shown that a specific region of the aI2 intron is needed for the maturase function. This region was found to be conserved in group II introns and called domain X.	cellular organisms(1);Eukaryota(1);Viridiplantae(1);Streptophyta(1)
-Query_61	442	pfam02123	gnl|CDD|280316	1.50074e-23	115	429	-2	pfam02123, RdRP_4, Viral RNA-directed RNA-polymerase.  This family includes RNA-dependent RNA polymerase proteins (RdRPs) from Luteovirus, Totivirus and Rotavirus.	Viruses(1);Riboviria(1);Orthornavirae(1);Duplornaviricota(1)
-Query_65	433	pfam00253	gnl|CDD|306712	1.66195e-07	329	415	2	pfam00253, Ribosomal_S14, Ribosomal protein S14p/S29e.  This family includes both ribosomal S14 from prokaryotes and S29 from eukaryotes.	cellular organisms(2);Bacteria(1);Eukaryota(1)
-Query_67	426	pfam00078	gnl|CDD|306564	9.00965e-09	268	405	-1	pfam00078, RVT_1, Reverse transcriptase (RNA-dependent DNA polymerase).  A reverse transcriptase gene is usually indicative of a mobile element such as a retrotransposon or retrovirus. Reverse transcriptases occur in a variety of mobile elements, including retrotransposons, retroviruses, group II introns, bacterial msDNAs, hepadnaviruses, and caulimoviruses.	Viruses(1);Riboviria(1);Pararnavirae(1);Artverviricota(1)
-Query_70	424	pfam00665	gnl|CDD|307008	1.57397e-23	93	413	3	pfam00665, rve, Integrase core domain.  Integrase mediates integration of a DNA copy of the viral genome into the host chromosome. Integrase is composed of three domains. The amino-terminal domain is a zinc binding domain pfam02022. This domain is the central catalytic domain. The carboxyl terminal domain that is a non-specific DNA binding domain pfam00552. The catalytic domain acts as an endonuclease when two nucleotides are removed from the 3' ends of the blunt-ended viral DNA made by reverse transcription. This domain also catalyzes the DNA strand transfer reaction of the 3' ends of the viral DNA to the 5' ends of the integration site.	cellular organisms(2);Viruses(1)
-Query_76	406	pfam00361	gnl|CDD|306795	9.79473e-05	212	379	2	pfam00361, Proton_antipo_M, Proton-conducting membrane transporter.  This is a family of membrane transporters that inlcudes some 7 of potentially 14-16 TM regions. In many instances the family forms part of complex I that catalyzes the transfer of two electrons from NADH to ubiquinone in a reaction that is associated with proton translocation across the membrane, and in this context is a combination predominantly of subunits 2, 4, 5, 14, L, M and N. In many bacterial species these proteins are probable stand-alone transporters not coupled with oxidoreduction. The family in total represents homologs across the phyla.	cellular organisms(1);Eukaryota(1);Opisthokonta(1);Metazoa(1)
-Query_88	372	pfam02123	gnl|CDD|280316	7.63867e-10	160	363	-1	pfam02123, RdRP_4, Viral RNA-directed RNA-polymerase.  This family includes RNA-dependent RNA polymerase proteins (RdRPs) from Luteovirus, Totivirus and Rotavirus.	Viruses(1);Riboviria(1);Orthornavirae(1);Duplornaviricota(1)
-Query_108	229	pfam02123	gnl|CDD|280316	5.2142e-12	25	213	-2	pfam02123, RdRP_4, Viral RNA-directed RNA-polymerase.  This family includes RNA-dependent RNA polymerase proteins (RdRPs) from Luteovirus, Totivirus and Rotavirus.	Viruses(1);Riboviria(1);Orthornavirae(1);Duplornaviricota(1)
-Query_111	229	pfam00421	gnl|CDD|306845	5.07684e-21	15	218	3	pfam00421, PSII, Photosystem II protein.  	cellular organisms(1);Eukaryota(1);Viridiplantae(1);Streptophyta(1)
-Query_114	229	pfam05518	gnl|CDD|253234	1.06567e-09	26	229	2	pfam05518, Totivirus_coat, Totivirus coat protein.  	Viruses(1);Riboviria(1);Duplornaviricota(1);Orthornavirae(1)
-Query_118	228	pfam01333	gnl|CDD|307480	2.57329e-21	54	218	3	pfam01333, Apocytochr_F_C, Apocytochrome F, C-terminal.  This is a sub-family of cytochrome C. See pfam00034.	cellular organisms(1);Eukaryota(1);Viridiplantae(1);Streptophyta(1)
-Query_123	228	pfam00006	gnl|CDD|306512	7.83347e-19	20	211	2	pfam00006, ATP-synt_ab, ATP synthase alpha/beta family, nucleotide-binding domain.  This entry includes the ATP synthase alpha and beta subunits, the ATP synthase associated with flagella and the termination factor Rho.	cellular organisms(2);Bacteria(2)
-Query_125	228	pfam00421	gnl|CDD|306845	1.91926e-10	23	226	-3	pfam00421, PSII, Photosystem II protein.  	cellular organisms(1);Eukaryota(1);Viridiplantae(1);Streptophyta(1)
-Query_138	226	pfam00421	gnl|CDD|306845	4.1109e-19	14	193	2	pfam00421, PSII, Photosystem II protein.  	cellular organisms(1);Eukaryota(1);Viridiplantae(1);Streptophyta(1)
-Query_139	226	pfam01660	gnl|CDD|307679	1.36829e-05	15	209	3	pfam01660, Vmethyltransf, Viral methyltransferase.  This RNA methyltransferase domain is found in a wide range of ssRNA viruses, including Hordei-, Tobra-, Tobamo-, Bromo-, Clostero- and Caliciviruses. This methyltransferase is involved in mRNA capping. Capping of mRNA enhances its stability. This usually occurs in the nucleus. Therefore, many viruses that replicate in the cytoplasm encode their own. This is a specific guanine-7-methyltransferase domain involved in viral mRNA cap0 synthesis. Specificity for guanine 7 position is shown by NMR in and in vivo role in cap synthesis. Based on secondary structure prediction, the basic fold is believed to be similar to the common AdoMet-dependent methyltransferase fold. A curious feature of this methyltransferase domain is that it together with flanking sequences seems to have guanylyltransferase activity coupled to the methyltransferase activity. The domain is found throughout the so-called Alphavirus superfamily, (including alphaviruses and several other groups). It forms the defining, unique feature of this superfamily.	Viruses(1);Riboviria(1);Orthornavirae(1);Kitrinoviricota(1)
-Query_148	225	pfam13041	gnl|CDD|315669	1.22135e-07	54	185	-2	pfam13041, PPR_2, PPR repeat family.  This repeat has no known function. It is about 35 amino acids long and is found in up to 18 copies in some proteins. The family appears to be greatly expanded in plants and fungi. The repeat has been called PPR.	cellular organisms(1);Eukaryota(1);Viridiplantae(1);Streptophyta(1)
-Query_155	225	pfam04392	gnl|CDD|282274	7.01878e-09	28	177	-1	pfam04392, ABC_sub_bind, ABC transporter substrate binding protein.  This family contains many hypothetical proteins and some ABC transporter substrate binding proteins.	Bacteria(2);cellular organisms(1);Terrabacteria group(1)
-Query_168	224	pfam00223	gnl|CDD|306687	7.46218e-21	41	205	-2	pfam00223, PsaA_PsaB, Photosystem I psaA/psaB protein.  	cellular organisms(1);Eukaryota(1);Viridiplantae(1);Streptophyta(1)
-Query_171	223	pfam13683	gnl|CDD|316225	0.000987396	41	205	-2	pfam13683, rve_3, Integrase core domain.  	Bacteria(2);cellular organisms(1);Terrabacteria group(1)
-Query_173	223	pfam01809	gnl|CDD|307773	1.03441e-07	121	189	-2	pfam01809, Haemolytic, Haemolytic domain.  This domain has haemolytic activity. It is found in short (73-103 amino acid) proteins and contains three conserved cysteine residues.	Bacteria(2);cellular organisms(1);Terrabacteria group(1)
-Query_189	222	pfam02468	gnl|CDD|280606	3.26069e-17	22	123	1	pfam02468, PsbN, Photosystem II reaction centre N protein (psbN).  This is a family of small proteins encoded on the chloroplast genome. psbN is involved in photosystem II during photosynthesis, but its exact role is unknown.	cellular organisms(1);Eukaryota(1);Viridiplantae(1);Streptophyta(1)
-Query_191	222	pfam00978	gnl|CDD|250270	1.6261e-12	24	206	-2	pfam00978, RdRP_2, RNA dependent RNA polymerase.  This family may represent an RNA dependent RNA polymerase. The family also contains the following proteins: 2A protein from bromoviruses putative RNA dependent RNA polymerase from tobamoviruses Non structural polyprotein from togaviruses.	Viruses(1);Riboviria(1);Orthornavirae(1);Kitrinoviricota(1)
-Query_195	222	pfam00562	gnl|CDD|306936	2.01964e-09	26	145	-3	pfam00562, RNA_pol_Rpb2_6, RNA polymerase Rpb2, domain 6.  RNA polymerases catalyze the DNA dependent polymerization of RNA. Prokaryotes contain a single RNA polymerase compared to three in eukaryotes (not including mitochondrial. and chloroplast polymerases). This domain represents the hybrid binding domain and the wall domain. The hybrid binding domain binds the nascent RNA strand / template DNA strand in the Pol II transcription elongation complex. This domain contains the important structural motifs, switch 3 and the flap loop and binds an active site metal ion. This domain is also involved in binding to Rpb1 and Rpb3. Many of the bacterial members contain large insertions within this domain, as region known as dispensable region 2 (DRII).	cellular organisms(2);Eukaryota(1);Viridiplantae(1)
-Query_201	222	pfam00201	gnl|CDD|278624	0.000513014	26	145	-3	pfam00201, UDPGT, UDP-glucoronosyl and UDP-glucosyl transferase.  	cellular organisms(1);Eukaryota(1);Viridiplantae(1);Streptophytina(1)
-Query_203	222	pfam02670	gnl|CDD|308348	4.98265e-14	25	135	1	pfam02670, DXP_reductoisom, 1-deoxy-D-xylulose 5-phosphate reductoisomerase.  This is a family of 1-deoxy-D-xylulose 5-phosphate reductoisomerases. This enzyme catalyzes the formation of 2-C-methyl-D-erythritol 4-phosphate from 1-deoxy-D-xylulose-5-phosphate in the presence of NADPH. This reaction is part of the terpenoid biosynthesis pathway.	Bacteria(2);cellular organisms(1);Pseudomonadota(1)
-Query_205	221	pfam00329	gnl|CDD|306769	1.68456e-18	81	197	-1	pfam00329, Complex1_30kDa, Respiratory-chain NADH dehydrogenase, 30 Kd subunit.  	cellular organisms(2);Bacteria(1);Eukaryota(1)
-Query_218	220	pfam05724	gnl|CDD|310379	3.15883e-08	38	196	2	pfam05724, TPMT, Thiopurine S-methyltransferase (TPMT).  This family consists of thiopurine S-methyltransferase proteins from both eukaryotes and prokaryotes. Thiopurine S-methyltransferase (TPMT) is a cytosolic enzyme that catalyzes S-methylation of aromatic and heterocyclic sulfhydryl compounds, including anticancer and immunosuppressive thiopurines.	cellular organisms(2);Bacteria(1);Eukaryota(1)
-Query_238	219	pfam02123	gnl|CDD|280316	1.42892e-13	35	199	-3	pfam02123, RdRP_4, Viral RNA-directed RNA-polymerase.  This family includes RNA-dependent RNA polymerase proteins (RdRPs) from Luteovirus, Totivirus and Rotavirus.	Viruses(1);Riboviria(1);Orthornavirae(1);Duplornaviricota(1)
-Query_260	217	pfam02123	gnl|CDD|280316	4.65988e-13	13	210	-2	pfam02123, RdRP_4, Viral RNA-directed RNA-polymerase.  This family includes RNA-dependent RNA polymerase proteins (RdRPs) from Luteovirus, Totivirus and Rotavirus.	Viruses(1);Riboviria(1);Orthornavirae(1);Duplornaviricota(1)
-Query_264	216	pfam02123	gnl|CDD|280316	7.05387e-17	8	214	-3	pfam02123, RdRP_4, Viral RNA-directed RNA-polymerase.  This family includes RNA-dependent RNA polymerase proteins (RdRPs) from Luteovirus, Totivirus and Rotavirus.	Viruses(1);Riboviria(1);Orthornavirae(1);Duplornaviricota(1)
-Query_275	215	pfam02123	gnl|CDD|280316	3.8356e-09	37	198	-3	pfam02123, RdRP_4, Viral RNA-directed RNA-polymerase.  This family includes RNA-dependent RNA polymerase proteins (RdRPs) from Luteovirus, Totivirus and Rotavirus.	Viruses(1);Riboviria(1);Orthornavirae(1);Duplornaviricota(1)
-Query_277	215	pfam00201	gnl|CDD|278624	5.96981e-07	113	193	-2	pfam00201, UDPGT, UDP-glucoronosyl and UDP-glucosyl transferase.  	cellular organisms(1);Eukaryota(1);Viridiplantae(1);Streptophytina(1)
-Query_282	215	pfam02123	gnl|CDD|280316	4.70874e-08	33	209	3	pfam02123, RdRP_4, Viral RNA-directed RNA-polymerase.  This family includes RNA-dependent RNA polymerase proteins (RdRPs) from Luteovirus, Totivirus and Rotavirus.	Viruses(1);Riboviria(1);Orthornavirae(1);Duplornaviricota(1)
-Query_289	214	pfam00361	gnl|CDD|306795	1.62395e-10	59	196	-1	pfam00361, Proton_antipo_M, Proton-conducting membrane transporter.  This is a family of membrane transporters that inlcudes some 7 of potentially 14-16 TM regions. In many instances the family forms part of complex I that catalyzes the transfer of two electrons from NADH to ubiquinone in a reaction that is associated with proton translocation across the membrane, and in this context is a combination predominantly of subunits 2, 4, 5, 14, L, M and N. In many bacterial species these proteins are probable stand-alone transporters not coupled with oxidoreduction. The family in total represents homologs across the phyla.	cellular organisms(1);Eukaryota(1);Opisthokonta(1);Metazoa(1)
-Query_292	211	pfam05892	gnl|CDD|283531	0.000183874	59	196	-1	pfam05892, Tricho_coat, Trichovirus coat protein.  This family consists of several coat proteins which are specific to the ssRNA positive-strand, no DNA stage viruses such as the Trichovirus and Vitivirus.	Viruses(1);Kitrinoviricota(1);Orthornavirae(1);Tymovirales(1)
-Query_293	211	pfam07727	gnl|CDD|311594	9.19953e-05	43	120	1	pfam07727, RVT_2, Reverse transcriptase (RNA-dependent DNA polymerase).  A reverse transcriptase gene is usually indicative of a mobile element such as a retrotransposon or retrovirus. Reverse transcriptases occur in a variety of mobile elements, including retrotransposons, retroviruses, group II introns, bacterial msDNAs, hepadnaviruses, and caulimoviruses. This Pfam entry includes reverse transcriptases not recognized by the pfam00078 model.	cellular organisms(1);Eukaryota(1);Viridiplantae(1);Streptophyta(1)
-Query_297	211	pfam00978	gnl|CDD|250270	2.21971e-14	16	201	1	pfam00978, RdRP_2, RNA dependent RNA polymerase.  This family may represent an RNA dependent RNA polymerase. The family also contains the following proteins: 2A protein from bromoviruses putative RNA dependent RNA polymerase from tobamoviruses Non structural polyprotein from togaviruses.	Viruses(1);Riboviria(1);Orthornavirae(1);Kitrinoviricota(1)
+#query_id	query_length	cdd_id	hit_id	evalue	startQ	endQ	frame	description	superkingdom	pident
+ds2020-328_2	2975	pfam02874	gnl|CDD|308490	6.56656e-19	2202	2405	-1	pfam02874, ATP-synt_ab_N, ATP synthase alpha/beta family, beta-barrel domain.  This family includes the ATP synthase alpha and beta subunits the ATP synthase associated with flagella.	cellular organisms(2);Eukaryota(1);Viridiplantae(1)	23.821
+ds2020-328_16	1120	pfam00146	gnl|CDD|306623	6.73934e-18	936	1097	-3	pfam00146, NADHdh, NADH dehydrogenase.  	cellular organisms(1);Eukaryota(1);Opisthokonta(1);Metazoa(1)	34.959
+ds2020-328_27	872	pfam01443	gnl|CDD|307550	7.69575e-33	10	696	-3	pfam01443, Viral_helicase1, Viral (Superfamily 1) RNA helicase.  Helicase activity for this family has been demonstrated and NTPase activity. This helicase has multiple roles at different stages of viral RNA replication, as dissected by mutational analysis.	Viruses(1);Riboviria(1);Orthornavirae(1);Kitrinoviricota(1)	30.000
+ds2020-328_38	847	pfam13456	gnl|CDD|316018	1.2307e-09	176	397	2	pfam13456, RVT_3, Reverse transcriptase-like.  This domain is found in plants and appears to be part of a retrotransposon.	cellular organisms(1);Eukaryota(1);Viridiplantae(1);Streptophyta(1)	33.486
+ds2020-328_39	681	pfam00416	gnl|CDD|306841	7.7464e-31	92	409	-3	pfam00416, Ribosomal_S13, Ribosomal protein S13/S18.  This family includes ribosomal protein S13 from prokaryotes and S18 from eukaryotes.	cellular organisms(2);Bacteria(2)	34.188
+ds2020-328_40	644	pfam00078	gnl|CDD|306564	2.13234e-08	190	636	-3	pfam00078, RVT_1, Reverse transcriptase (RNA-dependent DNA polymerase).  A reverse transcriptase gene is usually indicative of a mobile element such as a retrotransposon or retrovirus. Reverse transcriptases occur in a variety of mobile elements, including retrotransposons, retroviruses, group II introns, bacterial msDNAs, hepadnaviruses, and caulimoviruses.	Viruses(1);Riboviria(1);Pararnavirae(1);Artverviricota(1)	18.868
+ds2020-328_41	623	pfam00346	gnl|CDD|306783	6.5049e-56	191	496	-2	pfam00346, Complex1_49kDa, Respiratory-chain NADH dehydrogenase, 49 Kd subunit.  	cellular organisms(2);Bacteria(1);Eukaryota(1)	28.244
+ds2020-328_42	620	pfam00115	gnl|CDD|306596	2.19638e-51	78	548	3	pfam00115, COX1, Cytochrome C and Quinol oxidase polypeptide I.  	cellular organisms(1);Eukaryota(1);Opisthokonta(1);Metazoa(1)	26.543
+ds2020-328_43	598	pfam00115	gnl|CDD|306596	4.78609e-34	21	302	3	pfam00115, COX1, Cytochrome C and Quinol oxidase polypeptide I.  	cellular organisms(1);Eukaryota(1);Opisthokonta(1);Metazoa(1)	23.849
+ds2020-328_44	458	pfam02123	gnl|CDD|280316	1.82963e-26	27	443	-1	pfam02123, RdRP_4, Viral RNA-directed RNA-polymerase.  This family includes RNA-dependent RNA polymerase proteins (RdRPs) from Luteovirus, Totivirus and Rotavirus.	Viruses(1);Riboviria(1);Orthornavirae(1);Duplornaviricota(1)	28.405
+ds2020-328_45	458	pfam03732	gnl|CDD|309014	1.12045e-06	256	441	1	pfam03732, Retrotrans_gag, Retrotransposon gag protein.  Gag or Capsid-like proteins from LTR retrotransposons. There is a central motif QGXXEXXXXXFXXLXXH that is common to Retroviridae gag-proteins, but is poorly conserved.	cellular organisms(1);Eukaryota(1);Viridiplantae(1);Streptophyta(1)	37.970
+ds2020-328_46	454	pfam14111	gnl|CDD|316622	3.40587e-07	213	353	3	pfam14111, DUF4283, Domain of unknown function (DUF4283).  This domain family is found in plants, and is approximately 100 amino acids in length. Considering the very diverse range of other domains it is associated with it is possible that this domain is a binding/guiding region. There are two highly conserved tryptophan residues.	cellular organisms(1);Eukaryota(1);Streptophytina(1);Viridiplantae(1)	27.296
+ds2020-328_47	446	pfam01348	gnl|CDD|279664	1.01441e-09	40	303	-3	pfam01348, Intron_maturas2, Type II intron maturase.  Group II introns use intron-encoded reverse transcriptase, maturase and DNA endonuclease activities for site-specific insertion into DNA. Although this type of intron is self splicing in vitro they require a maturase protein for splicing in vivo. It has been shown that a specific region of the aI2 intron is needed for the maturase function. This region was found to be conserved in group II introns and called domain X.	cellular organisms(1);Eukaryota(1);Viridiplantae(1);Streptophyta(1)	18.868
+ds2020-328_48	442	pfam02123	gnl|CDD|280316	1.50074e-23	115	429	-2	pfam02123, RdRP_4, Viral RNA-directed RNA-polymerase.  This family includes RNA-dependent RNA polymerase proteins (RdRPs) from Luteovirus, Totivirus and Rotavirus.	Viruses(1);Riboviria(1);Orthornavirae(1);Duplornaviricota(1)	20.084
+ds2020-328_49	433	pfam00253	gnl|CDD|306712	1.66195e-07	329	415	2	pfam00253, Ribosomal_S14, Ribosomal protein S14p/S29e.  This family includes both ribosomal S14 from prokaryotes and S29 from eukaryotes.	cellular organisms(2);Bacteria(1);Eukaryota(1)	28.428
+ds2020-328_50	426	pfam00078	gnl|CDD|306564	9.00965e-09	268	405	-1	pfam00078, RVT_1, Reverse transcriptase (RNA-dependent DNA polymerase).  A reverse transcriptase gene is usually indicative of a mobile element such as a retrotransposon or retrovirus. Reverse transcriptases occur in a variety of mobile elements, including retrotransposons, retroviruses, group II introns, bacterial msDNAs, hepadnaviruses, and caulimoviruses.	Viruses(1);Riboviria(1);Pararnavirae(1);Artverviricota(1)	28.428
+ds2020-328_51	424	pfam00665	gnl|CDD|307008	1.57397e-23	93	413	3	pfam00665, rve, Integrase core domain.  Integrase mediates integration of a DNA copy of the viral genome into the host chromosome. Integrase is composed of three domains. The amino-terminal domain is a zinc binding domain pfam02022. This domain is the central catalytic domain. The carboxyl terminal domain that is a non-specific DNA binding domain pfam00552. The catalytic domain acts as an endonuclease when two nucleotides are removed from the 3' ends of the blunt-ended viral DNA made by reverse transcription. This domain also catalyzes the DNA strand transfer reaction of the 3' ends of the viral DNA to the 5' ends of the integration site.	cellular organisms(2);Viruses(1)	21.127
+ds2020-328_52	406	pfam00361	gnl|CDD|306795	9.79473e-05	212	379	2	pfam00361, Proton_antipo_M, Proton-conducting membrane transporter.  This is a family of membrane transporters that inlcudes some 7 of potentially 14-16 TM regions. In many instances the family forms part of complex I that catalyzes the transfer of two electrons from NADH to ubiquinone in a reaction that is associated with proton translocation across the membrane, and in this context is a combination predominantly of subunits 2, 4, 5, 14, L, M and N. In many bacterial species these proteins are probable stand-alone transporters not coupled with oxidoreduction. The family in total represents homologs across the phyla.	cellular organisms(1);Eukaryota(1);Opisthokonta(1);Metazoa(1)	34.188
+ds2020-328_53	372	pfam02123	gnl|CDD|280316	7.63867e-10	160	363	-1	pfam02123, RdRP_4, Viral RNA-directed RNA-polymerase.  This family includes RNA-dependent RNA polymerase proteins (RdRPs) from Luteovirus, Totivirus and Rotavirus.	Viruses(1);Riboviria(1);Orthornavirae(1);Duplornaviricota(1)	20.084
+ds2020-328_54	229	pfam02123	gnl|CDD|280316	5.2142e-12	25	213	-2	pfam02123, RdRP_4, Viral RNA-directed RNA-polymerase.  This family includes RNA-dependent RNA polymerase proteins (RdRPs) from Luteovirus, Totivirus and Rotavirus.	Viruses(1);Riboviria(1);Orthornavirae(1);Duplornaviricota(1)	34.746
+ds2020-328_55	229	pfam00421	gnl|CDD|306845	5.07684e-21	15	218	3	pfam00421, PSII, Photosystem II protein.  	cellular organisms(1);Eukaryota(1);Viridiplantae(1);Streptophyta(1)	34.836
+ds2020-328_56	229	pfam05518	gnl|CDD|253234	1.06567e-09	26	229	2	pfam05518, Totivirus_coat, Totivirus coat protein.  	Viruses(1);Riboviria(1);Duplornaviricota(1);Orthornavirae(1)	20.084
+ds2020-328_98	228	pfam01333	gnl|CDD|307480	2.57329e-21	54	218	3	pfam01333, Apocytochr_F_C, Apocytochrome F, C-terminal.  This is a sub-family of cytochrome C. See pfam00034.	cellular organisms(1);Eukaryota(1);Viridiplantae(1);Streptophyta(1)	22.535
+ds2020-328_99	228	pfam00006	gnl|CDD|306512	7.83347e-19	20	211	2	pfam00006, ATP-synt_ab, ATP synthase alpha/beta family, nucleotide-binding domain.  This entry includes the ATP synthase alpha and beta subunits, the ATP synthase associated with flagella and the termination factor Rho.	cellular organisms(2);Bacteria(2)	20.084
+ds2020-328_612	228	pfam00421	gnl|CDD|306845	1.91926e-10	23	226	-3	pfam00421, PSII, Photosystem II protein.  	cellular organisms(1);Eukaryota(1);Viridiplantae(1);Streptophyta(1)	15.909
+ds2020-328_613	226	pfam00421	gnl|CDD|306845	4.1109e-19	14	193	2	pfam00421, PSII, Photosystem II protein.  	cellular organisms(1);Eukaryota(1);Viridiplantae(1);Streptophyta(1)	21.831
+ds2020-328_614	226	pfam01660	gnl|CDD|307679	1.36829e-05	15	209	3	pfam01660, Vmethyltransf, Viral methyltransferase.  This RNA methyltransferase domain is found in a wide range of ssRNA viruses, including Hordei-, Tobra-, Tobamo-, Bromo-, Clostero- and Caliciviruses. This methyltransferase is involved in mRNA capping. Capping of mRNA enhances its stability. This usually occurs in the nucleus. Therefore, many viruses that replicate in the cytoplasm encode their own. This is a specific guanine-7-methyltransferase domain involved in viral mRNA cap0 synthesis. Specificity for guanine 7 position is shown by NMR in and in vivo role in cap synthesis. Based on secondary structure prediction, the basic fold is believed to be similar to the common AdoMet-dependent methyltransferase fold. A curious feature of this methyltransferase domain is that it together with flanking sequences seems to have guanylyltransferase activity coupled to the methyltransferase activity. The domain is found throughout the so-called Alphavirus superfamily, (including alphaviruses and several other groups). It forms the defining, unique feature of this superfamily.	Viruses(1);Riboviria(1);Orthornavirae(1);Kitrinoviricota(1)	35.041
+ds2020-328_615	225	pfam13041	gnl|CDD|315669	1.22135e-07	54	185	-2	pfam13041, PPR_2, PPR repeat family.  This repeat has no known function. It is about 35 amino acids long and is found in up to 18 copies in some proteins. The family appears to be greatly expanded in plants and fungi. The repeat has been called PPR.	cellular organisms(1);Eukaryota(1);Viridiplantae(1);Streptophyta(1)	28.244
+ds2020-328_616	225	pfam04392	gnl|CDD|282274	7.01878e-09	28	177	-1	pfam04392, ABC_sub_bind, ABC transporter substrate binding protein.  This family contains many hypothetical proteins and some ABC transporter substrate binding proteins.	Bacteria(2);cellular organisms(1);Terrabacteria group(1)	25.314
+ds2020-328_617	224	pfam00223	gnl|CDD|306687	7.46218e-21	41	205	-2	pfam00223, PsaA_PsaB, Photosystem I psaA/psaB protein.  	cellular organisms(1);Eukaryota(1);Viridiplantae(1);Streptophyta(1)	29.688
+ds2020-328_618	223	pfam13683	gnl|CDD|316225	0.000987396	41	205	-2	pfam13683, rve_3, Integrase core domain.  	Bacteria(2);cellular organisms(1);Terrabacteria group(1)	33.894
+ds2020-328_619	223	pfam01809	gnl|CDD|307773	1.03441e-07	121	189	-2	pfam01809, Haemolytic, Haemolytic domain.  This domain has haemolytic activity. It is found in short (73-103 amino acid) proteins and contains three conserved cysteine residues.	Bacteria(2);cellular organisms(1);Terrabacteria group(1)	27.296
+ds2020-328_620	222	pfam02468	gnl|CDD|280606	3.26069e-17	22	123	1	pfam02468, PsbN, Photosystem II reaction centre N protein (psbN).  This is a family of small proteins encoded on the chloroplast genome. psbN is involved in photosystem II during photosynthesis, but its exact role is unknown.	cellular organisms(1);Eukaryota(1);Viridiplantae(1);Streptophyta(1)	27.848
+ds2020-328_621	222	pfam00978	gnl|CDD|250270	1.6261e-12	24	206	-2	pfam00978, RdRP_2, RNA dependent RNA polymerase.  This family may represent an RNA dependent RNA polymerase. The family also contains the following proteins: 2A protein from bromoviruses putative RNA dependent RNA polymerase from tobamoviruses Non structural polyprotein from togaviruses.	Viruses(1);Riboviria(1);Orthornavirae(1);Kitrinoviricota(1)	22.772
+ds2020-328_682	222	pfam00562	gnl|CDD|306936	2.01964e-09	26	145	-3	pfam00562, RNA_pol_Rpb2_6, RNA polymerase Rpb2, domain 6.  RNA polymerases catalyze the DNA dependent polymerization of RNA. Prokaryotes contain a single RNA polymerase compared to three in eukaryotes (not including mitochondrial. and chloroplast polymerases). This domain represents the hybrid binding domain and the wall domain. The hybrid binding domain binds the nascent RNA strand / template DNA strand in the Pol II transcription elongation complex. This domain contains the important structural motifs, switch 3 and the flap loop and binds an active site metal ion. This domain is also involved in binding to Rpb1 and Rpb3. Many of the bacterial members contain large insertions within this domain, as region known as dispensable region 2 (DRII).	cellular organisms(2);Eukaryota(1);Viridiplantae(1)	23.864
+ds2020-328_688	222	pfam00201	gnl|CDD|278624	0.000513014	26	145	-3	pfam00201, UDPGT, UDP-glucoronosyl and UDP-glucosyl transferase.  	cellular organisms(1);Eukaryota(1);Viridiplantae(1);Streptophytina(1)	26.154
+ds2020-328_690	222	pfam02670	gnl|CDD|308348	4.98265e-14	25	135	1	pfam02670, DXP_reductoisom, 1-deoxy-D-xylulose 5-phosphate reductoisomerase.  This is a family of 1-deoxy-D-xylulose 5-phosphate reductoisomerases. This enzyme catalyzes the formation of 2-C-methyl-D-erythritol 4-phosphate from 1-deoxy-D-xylulose-5-phosphate in the presence of NADPH. This reaction is part of the terpenoid biosynthesis pathway.	Bacteria(2);cellular organisms(1);Pseudomonadota(1)	21.111
+ds2020-328_692	221	pfam00329	gnl|CDD|306769	1.68456e-18	81	197	-1	pfam00329, Complex1_30kDa, Respiratory-chain NADH dehydrogenase, 30 Kd subunit.  	cellular organisms(2);Bacteria(1);Eukaryota(1)	24.942
+ds2020-328_705	220	pfam05724	gnl|CDD|310379	3.15883e-08	38	196	2	pfam05724, TPMT, Thiopurine S-methyltransferase (TPMT).  This family consists of thiopurine S-methyltransferase proteins from both eukaryotes and prokaryotes. Thiopurine S-methyltransferase (TPMT) is a cytosolic enzyme that catalyzes S-methylation of aromatic and heterocyclic sulfhydryl compounds, including anticancer and immunosuppressive thiopurines.	cellular organisms(2);Bacteria(1);Eukaryota(1)	25.191
+ds2020-328_725	219	pfam02123	gnl|CDD|280316	1.42892e-13	35	199	-3	pfam02123, RdRP_4, Viral RNA-directed RNA-polymerase.  This family includes RNA-dependent RNA polymerase proteins (RdRPs) from Luteovirus, Totivirus and Rotavirus.	Viruses(1);Riboviria(1);Orthornavirae(1);Duplornaviricota(1)	20.084
+ds2020-328_747	217	pfam02123	gnl|CDD|280316	4.65988e-13	13	210	-2	pfam02123, RdRP_4, Viral RNA-directed RNA-polymerase.  This family includes RNA-dependent RNA polymerase proteins (RdRPs) from Luteovirus, Totivirus and Rotavirus.	Viruses(1);Riboviria(1);Orthornavirae(1);Duplornaviricota(1)	34.560
+ds2020-328_751	216	pfam02123	gnl|CDD|280316	7.05387e-17	8	214	-3	pfam02123, RdRP_4, Viral RNA-directed RNA-polymerase.  This family includes RNA-dependent RNA polymerase proteins (RdRPs) from Luteovirus, Totivirus and Rotavirus.	Viruses(1);Riboviria(1);Orthornavirae(1);Duplornaviricota(1)	21.831
+ds2020-328_762	215	pfam02123	gnl|CDD|280316	3.8356e-09	37	198	-3	pfam02123, RdRP_4, Viral RNA-directed RNA-polymerase.  This family includes RNA-dependent RNA polymerase proteins (RdRPs) from Luteovirus, Totivirus and Rotavirus.	Viruses(1);Riboviria(1);Orthornavirae(1);Duplornaviricota(1)	21.244
+ds2020-328_764	215	pfam00201	gnl|CDD|278624	5.96981e-07	113	193	-2	pfam00201, UDPGT, UDP-glucoronosyl and UDP-glucosyl transferase.  	cellular organisms(1);Eukaryota(1);Viridiplantae(1);Streptophytina(1)	20.149
+ds2020-328_769	215	pfam02123	gnl|CDD|280316	4.70874e-08	33	209	3	pfam02123, RdRP_4, Viral RNA-directed RNA-polymerase.  This family includes RNA-dependent RNA polymerase proteins (RdRPs) from Luteovirus, Totivirus and Rotavirus.	Viruses(1);Riboviria(1);Orthornavirae(1);Duplornaviricota(1)	34.188
+ds2020-328_776	214	pfam00361	gnl|CDD|306795	1.62395e-10	59	196	-1	pfam00361, Proton_antipo_M, Proton-conducting membrane transporter.  This is a family of membrane transporters that inlcudes some 7 of potentially 14-16 TM regions. In many instances the family forms part of complex I that catalyzes the transfer of two electrons from NADH to ubiquinone in a reaction that is associated with proton translocation across the membrane, and in this context is a combination predominantly of subunits 2, 4, 5, 14, L, M and N. In many bacterial species these proteins are probable stand-alone transporters not coupled with oxidoreduction. The family in total represents homologs across the phyla.	cellular organisms(1);Eukaryota(1);Opisthokonta(1);Metazoa(1)	21.311
+ds2020-328_826	211	pfam05892	gnl|CDD|283531	0.000183874	59	196	-1	pfam05892, Tricho_coat, Trichovirus coat protein.  This family consists of several coat proteins which are specific to the ssRNA positive-strand, no DNA stage viruses such as the Trichovirus and Vitivirus.	Viruses(1);Kitrinoviricota(1);Orthornavirae(1);Tymovirales(1)	21.782
+ds2020-328_827	211	pfam07727	gnl|CDD|311594	9.19953e-05	43	120	1	pfam07727, RVT_2, Reverse transcriptase (RNA-dependent DNA polymerase).  A reverse transcriptase gene is usually indicative of a mobile element such as a retrotransposon or retrovirus. Reverse transcriptases occur in a variety of mobile elements, including retrotransposons, retroviruses, group II introns, bacterial msDNAs, hepadnaviruses, and caulimoviruses. This Pfam entry includes reverse transcriptases not recognized by the pfam00078 model.	cellular organisms(1);Eukaryota(1);Viridiplantae(1);Streptophyta(1)	22.535
+ds2020-328_831	211	pfam00978	gnl|CDD|250270	2.21971e-14	16	201	1	pfam00978, RdRP_2, RNA dependent RNA polymerase.  This family may represent an RNA dependent RNA polymerase. The family also contains the following proteins: 2A protein from bromoviruses putative RNA dependent RNA polymerase from tobamoviruses Non structural polyprotein from togaviruses.	Viruses(1);Riboviria(1);Orthornavirae(1);Kitrinoviricota(1)	18.868
--- a/test-data/rps_test.tab	Sun Sep 08 14:09:19 2024 +0000
+++ b/test-data/rps_test.tab	Tue May 13 11:52:17 2025 +0000
@@ -1,5 +1,5 @@
-#query_id	query_length	cdd_id	hit_id	evalue	startQ	endQ	frame	description	superkingdom
-No definition line	211	pfam01490	gnl|CDD|279788	0.000177299	15	134	-2	pfam01490, Aa_trans, Transmembrane amino acid transporter protein.  This transmembrane region is found in many amino acid transporters including UNC-47 and MTR. UNC-47 encodes a vesicular amino butyric acid (GABA) transporter, (VGAT). UNC-47 is predicted to have 10 transmembrane domains. MTR is a N system amino acid transporter system protein involved in methyltryptophan resistance. Other members of this family include proline transporters and amino acid permeases.	cellular organisms(1);Eukaryota(1);Opisthokonta(1);Metazoa(1)
-ds2020-267_4	2297	pfam00680	gnl|CDD|279070	3.12197e-05	995	1873	-2	pfam00680, RdRP_1, RNA dependent RNA polymerase.  	Viruses(1);Riboviria(1);Orthornavirae(1);Pisuviricota(1)
-ds2020-267_5	2029	pfam00680	gnl|CDD|279070	8.86955e-06	840	1706	3	pfam00680, RdRP_1, RNA dependent RNA polymerase.  	Viruses(1);Riboviria(1);Orthornavirae(1);Pisuviricota(1)
-ds2020-267_6	1860	pfam02123	gnl|CDD|280316	1.27376e-17	1147	1764	-1	pfam02123, RdRP_4, Viral RNA-directed RNA-polymerase.  This family includes RNA-dependent RNA polymerase proteins (RdRPs) from Luteovirus, Totivirus and Rotavirus.	Viruses(1);Riboviria(1);Orthornavirae(1);Duplornaviricota(1)
+#query_id	query_length	cdd_id	hit_id	evalue	startQ	endQ	frame	description	superkingdom	pident
+No definition line	211	pfam01490	gnl|CDD|279788	0.000177299	15	134	-2	pfam01490, Aa_trans, Transmembrane amino acid transporter protein.  This transmembrane region is found in many amino acid transporters including UNC-47 and MTR. UNC-47 encodes a vesicular amino butyric acid (GABA) transporter, (VGAT). UNC-47 is predicted to have 10 transmembrane domains. MTR is a N system amino acid transporter system protein involved in methyltryptophan resistance. Other members of this family include proline transporters and amino acid permeases.	cellular organisms(1);Eukaryota(1);Opisthokonta(1);Metazoa(1)	35.000
+ds2020-267_4	2297	pfam00680	gnl|CDD|279070	3.12197e-05	995	1873	-2	pfam00680, RdRP_1, RNA dependent RNA polymerase.  	Viruses(1);Riboviria(1);Orthornavirae(1);Pisuviricota(1)	16.986
+ds2020-267_5	2029	pfam00680	gnl|CDD|279070	8.86955e-06	840	1706	3	pfam00680, RdRP_1, RNA dependent RNA polymerase.  	Viruses(1);Riboviria(1);Orthornavirae(1);Pisuviricota(1)	17.974
+ds2020-267_6	1860	pfam02123	gnl|CDD|280316	1.27376e-17	1147	1764	-1	pfam02123, RdRP_4, Viral RNA-directed RNA-polymerase.  This family includes RNA-dependent RNA polymerase proteins (RdRPs) from Luteovirus, Totivirus and Rotavirus.	Viruses(1);Riboviria(1);Orthornavirae(1);Resentoviricetes(1)	23.671
--- a/virAnnot_blast2tsv.xml	Sun Sep 08 14:09:19 2024 +0000
+++ b/virAnnot_blast2tsv.xml	Tue May 13 11:52:17 2025 +0000
@@ -1,4 +1,4 @@
-<tool id="virannot_blast2tsv" name="virAnnot Blast2tsv" version="1.1.0+galaxy0" profile="21.05">
+<tool id="virannot_blast2tsv" name="virAnnot Blast2tsv" version="@TOOL_VERSION@+galaxy0" profile="21.05">
     <description>convert XML blast results to tabular file with taxonomic informations</description>
     <macros>
         <import>macros.xml</import>