Mercurial > repos > iuc > virannot_rps2tsv
changeset 4:998724a43694 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/virAnnot commit 7036ce0e06b6dc64332b1a5642fc58928523c5c6
| author | iuc | 
|---|---|
| date | Tue, 13 May 2025 11:52:07 +0000 | 
| parents | d1fd5579469d | 
| children | aac02a3a395c | 
| files | macros.xml otu.py rps2tsv.py test-data/blast2tsv_output.tab test-data/blast2tsv_output_with_rn.tab test-data/blast2tsv_reads_with_rn.txt test-data/input_otu_rps_s1.tab test-data/input_otu_rps_s2.tab test-data/rps_test.tab virAnnot_rps2tsv.xml | 
| diffstat | 10 files changed, 209 insertions(+), 145 deletions(-) [+] | 
line wrap: on
 line diff
--- a/macros.xml Sun Sep 08 14:09:07 2024 +0000 +++ b/macros.xml Tue May 13 11:52:07 2025 +0000 @@ -18,9 +18,9 @@ <requirement type="package" version="2.2.0">pandas</requirement> <requirement type="package" version="2.8.1">krona</requirement> <requirement type="package" version="3.0">zip</requirement> - <yield /> </requirements> </xml> + <token name="@TOOL_VERSION@">1.2.0</token> <token name="@HEADLESS@"><![CDATA[export QT_QPA_PLATFORM='offscreen' &&]]></token> <xml name="citations"> <citations>
--- a/otu.py Sun Sep 08 14:09:07 2024 +0000 +++ b/otu.py Tue May 13 11:52:07 2025 +0000 @@ -4,8 +4,8 @@ # Name: virAnnot_otu # Author: Marie Lefebvre - INRAE # Reuirements: Ete3 toolkit and external apps -# Aims: Create viral OTUs based on RPS and Blast annotations +"""Create viral OTUs based on RPS and Blast annotations""" import argparse import csv @@ -65,9 +65,14 @@ frame = float(row[7]) description = row[8] superkingdom = row[9] + try: + pident = row[10] + except IndexError: + log.info(rps_file[0]) + log.info(row) match = re.search("Viruses", superkingdom) # if contig is viral then retrieve sequence - if match: + if match and float(pident) >= options.viral_portion: options.fasta.sort() seq = _retrieve_fasta_seq(options.fasta[i][0], query_id) seq_length = len(seq) @@ -103,13 +108,23 @@ if "taxonomy" not in collection[cdd_id][query_id]: collection[cdd_id][query_id]["taxonomy"] = "Unknown" else: - log.info("No blast file") + log.debug("No blast file") collection[cdd_id][query_id]["taxonomy"] = "Unknown" collection[cdd_id][query_id]["nb"] = 0 - - collection[cdd_id]["short_description"] = description.split(",")[0] + description.split(",")[1] # keep pfamXXX and RdRp 1 + # keep pfamXXX and RdRp 1 + collection[cdd_id]["short_description"] = description.split(",")[0] + description.split(",")[1] collection[cdd_id]["full_description"] = description i += 1 + if options.merge_rdrp == "yes": + rdrp_list = ["pfam00680", "pfam02123", "pfam00978", "pfam00998"] + collection["RdRp_merge"] = {} + for cdd_id in collection: + if cdd_id in rdrp_list and cdd_id != "RdRp_merge": + log.info("Add " + cdd_id + " in merge") + for query_id in collection[cdd_id]: + if query_id not in collection["RdRp_merge"]: + collection["RdRp_merge"][query_id] = {} + collection["RdRp_merge"][query_id] = collection[cdd_id][query_id] return collection @@ -181,7 +196,11 @@ os.mkdir(options.output) color_by_sample = {} for cdd_id in hits_collection: - cdd_output = options.output + "/" + hits_collection[cdd_id]["short_description"].replace(" ", "_") + log.info("align seq for " + cdd_id) + if cdd_id == "RdRp_merge": + cdd_output = options.output + "/" + cdd_id + else: + cdd_output = options.output + "/" + hits_collection[cdd_id]["short_description"].replace(" ", "_") if not os.path.exists(cdd_output): os.mkdir(cdd_output) if os.path.exists(cdd_output + "/seq_to_align.fasta"): @@ -223,7 +242,7 @@ file_matrix = cdd_output + "/identity_matrix.csv" log.info("Create tree...") _create_tree(tree_file, file_seq_aligned, tree_file + '.png', file_color_config) - _compute_pairwise_distance(options, file_seq_aligned, file_matrix, cdd_id) + _compute_pairwise_distance(file_seq_aligned, file_matrix, cdd_id) log.info("Retrieve OTUs...") # if os.path.exists(file_cluster): # os.remove(file_cluster) @@ -241,7 +260,7 @@ f.close() -def _compute_pairwise_distance(options, file_seq_aligned, file_matrix, cdd_id): +def _compute_pairwise_distance(file_seq_aligned, file_matrix, cdd_id): """ Calculate paiwise distance between aligned protein sequences from a cdd_id @@ -297,8 +316,13 @@ log.info("Writing stats to " + file_xlsx) for cdd_id in hits_collection: otu_collection = {} - cdd_output = options.output + "/" + hits_collection[cdd_id]["short_description"].replace(" ", "_") - worksheet = workbook.add_worksheet(hits_collection[cdd_id]["short_description"]) # add a worksheet + if cdd_id == "RdRp_merge": + cdd_output = options.output + "/" + cdd_id + worksheet = workbook.add_worksheet(cdd_id) + else: + + cdd_output = options.output + "/" + hits_collection[cdd_id]["short_description"].replace(" ", "_") + worksheet = workbook.add_worksheet(hits_collection[cdd_id]["short_description"]) # add a worksheet file_cluster = cdd_output + '/otu_cluster.csv' file_fasta_nucc = cdd_output + '/representative_nucc.fasta' with open(file_cluster, 'r') as clust: @@ -315,25 +339,31 @@ otu_collection[row[0]][sample] = {} otu_collection[row[0]][sample][contig] = {} # add read number of the contig and annotation - if 'nb' in hits_collection[cdd_id][contig]: - otu_collection[row[0]][sample][contig]['nb'] = hits_collection[cdd_id][contig]["nb"] + if contig in hits_collection[cdd_id]: + if 'nb' in hits_collection[cdd_id][contig]: + otu_collection[row[0]][sample][contig]['nb'] = hits_collection[cdd_id][contig]["nb"] + else: + otu_collection[row[0]][sample][contig]['nb'] = 0 + if 'taxonomy' in hits_collection[cdd_id][contig]: + otu_collection[row[0]][sample][contig]['taxonomy'] = hits_collection[cdd_id][contig]["taxonomy"] + else: + otu_collection[row[0]][sample][contig]['taxonomy'] = 'unknown' else: - otu_collection[row[0]][sample][contig]['nb'] = 0 - if 'taxonomy' in hits_collection[cdd_id][contig]: - otu_collection[row[0]][sample][contig]['taxonomy'] = hits_collection[cdd_id][contig]["taxonomy"] - else: - otu_collection[row[0]][sample][contig]['taxonomy'] = 'unknown' + otu_collection[row[0]][sample][contig] = {'nb': 0, 'taxonomy': 'unknown'} else: otu_collection[row[0]][sample][contig] = {} # add read number of the contig and annotation - if 'nb' in hits_collection[cdd_id][contig]: - otu_collection[row[0]][sample][contig]['nb'] = hits_collection[cdd_id][contig]["nb"] + if contig in hits_collection[cdd_id]: + if 'nb' in hits_collection[cdd_id][contig]: + otu_collection[row[0]][sample][contig]['nb'] = hits_collection[cdd_id][contig]["nb"] + else: + otu_collection[row[0]][sample][contig]['nb'] = 0 + if 'taxonomy' in hits_collection[cdd_id][contig]: + otu_collection[row[0]][sample][contig]['taxonomy'] = hits_collection[cdd_id][contig]["taxonomy"] + else: + otu_collection[row[0]][sample][contig]['taxonomy'] = 'unknown' else: - otu_collection[row[0]][sample][contig]['nb'] = 0 - if 'taxonomy' in hits_collection[cdd_id][contig]: - otu_collection[row[0]][sample][contig]['taxonomy'] = hits_collection[cdd_id][contig]["taxonomy"] - else: - otu_collection[row[0]][sample][contig]['taxonomy'] = 'unknown' + otu_collection[row[0]][sample][contig] = {'nb': 0, 'taxonomy': 'unknown'} if 'taxonomy' in hits_collection[cdd_id][contig]: otu_collection[row[0]]['global_taxonomy'] = hits_collection[cdd_id][contig]["taxonomy"] else: @@ -362,7 +392,6 @@ # column = 0 with open(file_fasta_nucc, "w+") as f_nucc: for otu in otu_collection: - log.info(otu) if isinstance(otu_collection[otu], dict): column = 0 worksheet.write(row, column, otu) @@ -405,7 +434,10 @@ headers = ['#cdd_id', 'align_files', 'tree_files', 'cluster_files', 'cluster_nb_reads_files', 'pairwise_files', 'description', 'full_description\n'] map_file.write("\t".join(headers)) for cdd_id in hits_collection: - cdd_output = hits_collection[cdd_id]["short_description"].replace(" ", "_") + if cdd_id == "RdRp_merge": + cdd_output = "RdRp_merge" + else: + cdd_output = hits_collection[cdd_id]["short_description"].replace(" ", "_") short_description = cdd_output file_seq_aligned = cdd_output + '/seq_aligned.final_tree.fa' tree_file = cdd_output + '/tree.dnd.png' @@ -422,6 +454,9 @@ def _set_options(): + """ + Set parameters + """ parser = argparse.ArgumentParser() parser.add_argument('-b', '--blast', help='TAB blast file from blast2ecsv module.', action='append', required=False, dest='blast', nargs='+') parser.add_argument('-r', '--rps', help='TAB rpsblast file from rps2ecsv module.', action='append', required=True, dest='rps', nargs='+') @@ -429,6 +464,7 @@ parser.add_argument('-p', '--percentage', help='Percentage similarity threshold for OTUs cutoff.', action='store', type=int, default=90, dest='perc') parser.add_argument('-vp', '--viral_portion', help='Minimun portion of viral sequences in RPS domain to be included.', action='store', type=float, default=0.3, dest='viral_portion') parser.add_argument('-mpl', '--min_protein_length', help='Minimum query protein length.', action='store', type=int, default=100, dest='min_protein_length') + parser.add_argument('-m', '--merge_rdrp', help='Merge RdRp1, 2, 3 and 4 to create otu on it.', action='store', type=str, default="no", dest='merge_rdrp') parser.add_argument('-tp', '--tool_path', help='Path to otu_seek.R', action='store', type=str, default='./', dest='tool_path') parser.add_argument('-o', '--out', help='The output directory', action='store', type=str, default='./Rps2tree_OTU', dest='output') parser.add_argument('-rgb', '--rgb-conf', help='Color palette for contigs coloration', action='store', type=str, default='rgb.txt', dest='file_rgb') @@ -438,6 +474,9 @@ def _set_log_level(verbosity): + """ + Debbug + """ if verbosity == 1: log_format = '%(asctime)s %(levelname)-8s %(message)s' log.basicConfig(level=log.INFO, format=log_format)
--- a/rps2tsv.py Sun Sep 08 14:09:07 2024 +0000 +++ b/rps2tsv.py Tue May 13 11:52:07 2025 +0000 @@ -5,6 +5,7 @@ # Author: Marie Lefebvre - INRAE # Aims: Convert rpsblast xml output to csv and add taxonomy +"""Module which converts rpsblast xml output to tsv and add taxonomy""" import argparse import json @@ -19,6 +20,9 @@ def main(): + """ + Main function + """ options = _set_options() _set_log_level(options.verbosity) hits = _read_xml(options) @@ -44,6 +48,12 @@ hit_evalue = hit.expect # evalue hit_startQ = hit.query_start hit_endQ = hit.query_end + hit_identity = hit.identities + hit_aln_length = hit.align_length + pident = "%0.3f" % (100 * float(hit_identity) / float(hit_aln_length)) + if float(pident) < 0.1: + continue + hsp["pident"] = pident hsp["frame"] = hit_frame hsp["evalue"] = hit_evalue hsp["startQ"] = hit_startQ @@ -83,7 +93,8 @@ taxonomy = names if len(taxonomy) != 0: kingdoms.append(taxonomy[0]) - frequency = {kingdom: kingdoms.count(kingdom) for kingdom in kingdoms} # {'Pseudomonadota': 9, 'cellular organisms': 4} + # {'Pseudomonadota': 9, 'cellular organisms': 4} + frequency = {kingdom: kingdoms.count(kingdom) for kingdom in kingdoms} sorted_freq = dict(sorted(frequency.items(), key=lambda x: x[1], reverse=True)) concat_freq = ";".join("{}({})".format(k, v) for k, v in sorted_freq.items()) hsp["taxonomy"] = concat_freq @@ -96,29 +107,40 @@ Write output """ log.info("Write output file " + options.output) - headers = "#query_id\tquery_length\tcdd_id\thit_id\tevalue\tstartQ\tendQ\tframe\tdescription\tsuperkingdom\n" + headers = "#query_id\tquery_length\tcdd_id\thit_id\tevalue\tstartQ\tendQ\tframe\tdescription\tsuperkingdom\tpident\n" f = open(options.output, "w+") f.write(headers) for h in hits: f.write(h + "\t" + str(hits[h]["query_length"]) + "\t") f.write(hits[h]["cdd_id"] + "\t" + hits[h]["hit_id"] + "\t" + str(hits[h]["evalue"]) + "\t") - f.write(str(hits[h]["startQ"]) + "\t" + str(hits[h]["endQ"]) + "\t" + str(hits[h]["frame"]) + "\t") - f.write(hits[h]["description"] + "\t" + hits[h]["taxonomy"]) + f.write(str(hits[h]["startQ"]) + "\t" + str(hits[h]["endQ"]) + "\t" + + str(hits[h]["frame"]) + "\t") + f.write(hits[h]["description"] + "\t" + hits[h]["taxonomy"] + "\t" + hits[h]["pident"]) f.write("\n") f.close() def _set_options(): + """ + Script parameters + """ parser = argparse.ArgumentParser() - parser.add_argument('-x', '--xml', help='XML files with results of blast', action='store', required=True, dest='xml_file') - parser.add_argument('-e', '--max_evalue', help='Max evalue', action='store', type=float, default=0.0001, dest='max_evalue') - parser.add_argument('-o', '--out', help='The output file (.tab).', action='store', type=str, default='./rps2tsv_output.tab', dest='output') - parser.add_argument('-v', '--verbosity', help='Verbose level', action='store', type=int, choices=[1, 2, 3, 4], default=1) + parser.add_argument('-x', '--xml', help='XML files with results of blast', action='store', + required=True, dest='xml_file') + parser.add_argument('-e', '--max_evalue', help='Max evalue', action='store', + type=float, default=0.0001, dest='max_evalue') + parser.add_argument('-o', '--out', help='The output file (.tab).', action='store', + type=str, default='./rps2tsv_output.tab', dest='output') + parser.add_argument('-v', '--verbosity', help='Verbose level', action='store', + type=int, choices=[1, 2, 3, 4], default=1) args = parser.parse_args() return args def _set_log_level(verbosity): + """ + Debbug + """ if verbosity == 1: log_format = '%(asctime)s %(levelname)-8s %(message)s' log.basicConfig(level=log.INFO, format=log_format)
--- a/test-data/blast2tsv_output.tab Sun Sep 08 14:09:07 2024 +0000 +++ b/test-data/blast2tsv_output.tab Tue May 13 11:52:07 2025 +0000 @@ -2,11 +2,11 @@ TBLASTX NODE_13_length_295_cov_0.945833 295 316155 pfam13603, tRNA-synt_1_2, Leucyl-tRNA synthetase, Domain 2. This is a family of the conserved region of Leucine-tRNA ligase or Leucyl-tRNA synthetase, EC:6.1.1.4. Tursiops truncatus papillomavirus 2 41.5 1 100 67.0 2.277e-05 38.6378 316155 Viruses;Monodnaviria;Shotokuvirae;Cossaviricota;Papovaviricetes;Zurhausenvirales;Papillomaviridae;Firstpapillomavirinae;Upsilonpapillomavirus;Upsilonpapillomavirus 2;Tursiops truncatus papillomavirus 2 TGTGTTGGGTGTGTTTGGTTTCCGGTTACCATAATCGCTATTCTTTCAAACAGAAAGCGCATGCTAAGTATTCTCACCCAGAGGAATATGCTGACAAGCCCTCCTCAAAAGGCTATTTTTACAATGCCACCTATGAGAATGCACGAACTCTTATTCACTTCATTAAGCAATATGGATTGCCCTTCAATCCTGTTATTGCACCAGAAGATGCTGAACTAACTGATGAACAGATTCAATCTTACATCAACACAGCAAACTCCTTCTTTAATGATTATCCGACGTTACTGTTCACCCG TBLASTX NODE_16_length_278_cov_0.901345 278 306845 pfam00421, PSII, Photosystem II protein. 65.8 1 100 47.0 7.65615e-39 132.634 GTCTAACCTGTGTTGGGTGTGTTTGGGCTGTAATCGAGGTATAGTGTCGAACAAGTCGGTGTCACTGTTGAATTCTATGGCGGCGAACTCAATGGAGTCAGTTATAGTGATCCTGCTACTGTGAAAAAATATGCTAGACGTGCTCAATTGGGTGAAATTTTTGAATTAGATCGTGCTACTTTAAAATCGGATGGTGTTTTTCGTAGCAGTCCAAGGGGTTGGTTTACTTTTGGACATGCGTCGTTTGCTCTGCTCTTCTTCCAAACACACCCAACACA TBLASTX NODE_19_length_271_cov_0.879630 271 306845 pfam00421, PSII, Photosystem II protein. 32.9 1 100 42.0 1.69015e-11 56.3644 GTCTAACCTGTGTTGGGTGTGTTTGGTATGGAGGGAGGTGTATATGATACCTGGGCACCCGGAGGGGGAGATGTAAGAAAAATTACCAACTTGACCCTTAACCCAAGCGTGATATTTGGTTATTTACTAAAATCTCCTTTTGGGGGAGAAGGATGGATTGTTAGTGTGGACGATTTAGAAGATATAATTGGAGGACATGTCTGGTTAGGCTCCATTTGTATACTTGGTGGAATTTGGCATATCTTAACCAAACACACCCAACACAGGTTAG -TBLASTX NODE_20_length_267_cov_1.429245 267 287774 pfam10839, DUF2647, Protein of unknown function (DUF2647). This eukaryotic family of proteins are annotated as ycf68 but have no known function. Desulfovibrio sp. G100IX 91.3 1 100 99.0 7.70073e-10 48.4966 287774 cellular organisms;Bacteria;Thermodesulfobacteriota;Desulfovibrionia;Desulfovibrionales;Desulfovibrionaceae;Desulfovibrio;unclassified Desulfovibrio;Desulfovibrio sp. G100IX CTGTGTTGGGTGTGTTTGGACTTGGTATGTGAAGATACGTTGTTAGGTGCTCCGTTTTATTTTCCCATTGAGGCCGAACCTAAACCTGTGCTCGAGAGATAGCTGTCCATATACTGATAAGGGATGTATGGATTCTCGAGAAGAGAGGAGCCATGGTGGTCCCTCCCGGACCGCCCGGATCCCACGAGTGAATAGAAAGTTGGATCTACATTGGATCTCACCTGAATCGCCCCATAAACAACCAAACACACCCAACACAGGTTAGAC +TBLASTX NODE_20_length_267_cov_1.429245 267 287774 pfam10839, DUF2647, Protein of unknown function (DUF2647). This eukaryotic family of proteins are annotated as ycf68 but have no known function. Desulfovibrio sp. G100IX 91.3 1 100 99.0 7.70073e-10 48.4966 287774 cellular organisms;Bacteria;Pseudomonadati;Thermodesulfobacteriota;Desulfovibrionia;Desulfovibrionales;Desulfovibrionaceae;Desulfovibrio;unclassified Desulfovibrio;Desulfovibrio sp. G100IX CTGTGTTGGGTGTGTTTGGACTTGGTATGTGAAGATACGTTGTTAGGTGCTCCGTTTTATTTTCCCATTGAGGCCGAACCTAAACCTGTGCTCGAGAGATAGCTGTCCATATACTGATAAGGGATGTATGGATTCTCGAGAAGAGAGGAGCCATGGTGGTCCCTCCCGGACCGCCCGGATCCCACGAGTGAATAGAAAGTTGGATCTACATTGGATCTCACCTGAATCGCCCCATAAACAACCAAACACACCCAACACAGGTTAGAC TBLASTX NODE_22_length_262_cov_1.053140 262 306604 pfam00124, Photo_RC, Photosynthetic reaction centre protein. Heterotermes sp. TMJ-2004j 40.9 1 100 77.0 4.94039e-28 99.6256 306604 cellular organisms;Eukaryota;Opisthokonta;Metazoa;Eumetazoa;Bilateria;Protostomia;Ecdysozoa;Panarthropoda;Arthropoda;Mandibulata;Pancrustacea;Hexapoda;Insecta;Dicondylia;Pterygota;Neoptera;Polyneoptera;Dictyoptera;Blattodea;Blattoidea;Termitoidae;Rhinotermitidae;Heterotermitinae;Heterotermes;unclassified Heterotermes;Heterotermes sp. TMJ-2004j GTCTAACCTGTGTTGGGTGTGTTTGGCTAGTCAGTAGCTTGTTATATGGGTCGTGAGTGGGAAGTTAGCTTCCGTCTGGGTATGCGCCCGTGGATTGCTGTTGCATATTCAGCTCCTGTTGCAGCTGCTACTGCTGTTTTCTTGATTTACCCAATTGGTCAAGGAAGTTTTTCTGATGGTATGCCTCTAGGAATCTCTGGTACTTTCAACTTCATGATTGTATTCCAGGAGAGCACCCAAACACACCCAACACAGGTTAGAC TBLASTX NODE_24_length_258_cov_0.935961 258 307679 pfam01660, Vmethyltransf, Viral methyltransferase. This RNA methyltransferase domain is found in a wide range of ssRNA viruses, including Hordei-, Tobra-, Tobamo-, Bromo-, Clostero- and Caliciviruses. This methyltransferase is involved in mRNA capping. Capping of mRNA enhances its stability. This usually occurs in the nucleus. Therefore, many viruses that replicate in the cytoplasm encode their own. This is a specific guanine-7-methyltransferase domain involved in viral mRNA cap0 synthesis. Specificity for guanine 7 position is shown by NMR in and in vivo role in cap synthesis. Based on secondary structure prediction, the basic fold is believed to be similar to the common AdoMet-dependent methyltransferase fold. A curious feature of this methyltransferase domain is that it together with flanking sequences seems to have guanylyltransferase activity coupled to the methyltransferase activity. The domain is found throughout the so-called Alphavirus superfamily, (including alphaviruses and several other groups). It forms the defining, unique feature of this superfamily. Tetrastemma peltatum 39.4 1 100 70.0 8.38713e-15 65.0021 307679 cellular organisms;Eukaryota;Opisthokonta;Metazoa;Eumetazoa;Bilateria;Protostomia;Spiralia;Lophotrochozoa;Nemertea;Enopla;Hoplonemertea;Monostilifera;Eumonostilifera;Tetrastemmatidae;Tetrastemma;Tetrastemma peltatum GTGTTGGGTGTGTTTGGTTGGTGAACGCGCACCATTTAGTGGCAATCACGCGCGGGGAGGCTGAAAACTGCAAGCATAGATCTTTCGGCCCTTTCGAAGCTACCGCTTCCGAGAGCCTGGCTAAACTCTGCCCAGATTATCCGATCTGCTTGCCTGTACCTTACGACGTGATCAATAAAGTGTATAGGTATCTCAGAACGCTTAAGAAGCCTGATGTGCAGTCGCCCCACTACCAAACACACCCAACACAGGTTAGAC TBLASTX NODE_29_length_250_cov_0.851282 250 278700 pfam00283, Cytochrom_B559, Cytochrome b559, alpha (gene psbE) and beta (gene psbF)subunits. uncultured archaeon CRE-PA11a 58.6 1 100 100 7.31211e-08 42.0012 278700 cellular organisms;Archaea;environmental samples;uncultured archaeon CRE-PA11a GTCTAACCTGTGTTGGGTGTGTTTGGGTTTCTTTGGAGCAACTCGATGAATTTAGTAAATCCTTTTAGGAGGTTCCCAATGACCATAGATCGAACCTATCCAATTTTTACAGTGCGATGGTTGGCTGTTCACGGACTGGCTGTACCTACTGTTTCTTTTTTAGGGTCAATATCAGCAATGCAGTTCATCCAACGATAAACCTAATTCAAATTATAGAGCTAGCACACCAAACACACCCAACACAGGTTAG -TBLASTX NODE_34_length_245_cov_1.000000 245 250270 pfam00978, RdRP_2, RNA dependent RNA polymerase. This family may represent an RNA dependent RNA polymerase. The family also contains the following proteins: 2A protein from bromoviruses putative RNA dependent RNA polymerase from tobamoviruses Non structural polyprotein from togaviruses. Nocardia sp. 431D04 37.5 1 100 38.0 6.42106e-08 45.7137 250270 cellular organisms;Bacteria;Terrabacteria group;Actinomycetota;Actinomycetes;Mycobacteriales;Nocardiaceae;Nocardia;unclassified Nocardia;Nocardia sp. 431D04 GTCTAACCTGTGTTGGGTGTGTTTGGATGGTGATCGGCAATTTAAAGGAATGTGTGCGATATTTTCACCAATATTCAAGGAGCTTAAAAATCGACTAAAAAGCGTGTTAGATATTAAATACATGTATGCAGACGGATTGAGACCTGATCAGTTGTCGGAGCGCATGTCACAGATAGGTGCAGGTAAATATTTTATAGAGAATGATATGGAACAGCATCTCGCCAAACACACCCAACACAGGTTAG +TBLASTX NODE_34_length_245_cov_1.000000 245 250270 pfam00978, RdRP_2, RNA dependent RNA polymerase. This family may represent an RNA dependent RNA polymerase. The family also contains the following proteins: 2A protein from bromoviruses putative RNA dependent RNA polymerase from tobamoviruses Non structural polyprotein from togaviruses. Nocardia sp. 431D04 37.5 1 100 38.0 6.42106e-08 45.7137 250270 cellular organisms;Bacteria;Bacillati;Actinomycetota;Actinomycetes;Mycobacteriales;Nocardiaceae;Nocardia;unclassified Nocardia;Nocardia sp. 431D04 GTCTAACCTGTGTTGGGTGTGTTTGGATGGTGATCGGCAATTTAAAGGAATGTGTGCGATATTTTCACCAATATTCAAGGAGCTTAAAAATCGACTAAAAAGCGTGTTAGATATTAAATACATGTATGCAGACGGATTGAGACCTGATCAGTTGTCGGAGCGCATGTCACAGATAGGTGCAGGTAAATATTTTATAGAGAATGATATGGAACAGCATCTCGCCAAACACACCCAACACAGGTTAG TBLASTX NODE_46_length_229_cov_1.091954 229 306604 pfam00124, Photo_RC, Photosynthetic reaction centre protein. Heterotermes sp. TMJ-2004j 43.9 1 100 66.0 4.26406e-23 86.1436 306604 cellular organisms;Eukaryota;Opisthokonta;Metazoa;Eumetazoa;Bilateria;Protostomia;Ecdysozoa;Panarthropoda;Arthropoda;Mandibulata;Pancrustacea;Hexapoda;Insecta;Dicondylia;Pterygota;Neoptera;Polyneoptera;Dictyoptera;Blattodea;Blattoidea;Termitoidae;Rhinotermitidae;Heterotermitinae;Heterotermes;unclassified Heterotermes;Heterotermes sp. TMJ-2004j TGTGTTGGGTGTGTTTGGTTGGATGCCTGGAATACAATCATGAAATTGAAAGTACCAGATATTCCTAAAGGCATGCCATCTGAAAAACTTCCTTGACCAATAGGGTAGATCAAGAAAACAGCTGTAGCAGCCGCGACAGGAGCTGAATATGCAACAGCAATCCAAGGACGCATACCCAGACGGAAACTAAGCTCCCTCTCGCTCCAAACACACCCAACACAGGTTAGAC TBLASTX NODE_47_length_229_cov_0.816092 229 306687 pfam00223, PsaA_PsaB, Photosystem I psaA/psaB protein. 66.7 1 100 14.0 1.79906e-13 61.3066 TTGGTAAATTGGCGGAAAGAGGAGGACTCAATGATTATTCGTTCGCCGGAACCAGAAGTAAAAATTTTGGTAGATAGGGATCACATAAAAACTTCTTTCGAGGAATGGGCCAGGCCGGGTCATTTCTCAAGAACACTAGCTAAAGGCCCTGACACTACCACTTGGATCTGGAACCTACATGCTGATGCTCACGATCTTAATAGCCAAACACACCCAACACAGGTTAGAC TBLASTX NODE_50_length_226_cov_2.269006 226 306845 pfam00421, PSII, Photosystem II protein. 60.3 1 100 41.0 2.77182e-23 89.1064 GTCAACGGTGTGTTGGGTGTGTTTGGGAAAGGTCCTGGAATATGGGTGTCCGATCCTTATGGACTAACCGGAACAGTGCAACCTGTAAATCCGGCGTGGGGCGTGGAAGGTTTTGATCCTTTTGTCCCGGGAGGAATAGCTTCTCATCATATTGCAGCAGGTACATTGGGCATATTAGCGGGCCTATTCCATCTTAGCGTACGGTCACCCCAGCCAAACACACCCA
--- a/test-data/blast2tsv_output_with_rn.tab Sun Sep 08 14:09:07 2024 +0000 +++ b/test-data/blast2tsv_output_with_rn.tab Tue May 13 11:52:07 2025 +0000 @@ -2,11 +2,11 @@ TBLASTX NODE_13_length_295_cov_0.945833 264 295 316155 pfam13603, tRNA-synt_1_2, Leucyl-tRNA synthetase, Domain 2. This is a family of the conserved region of Leucine-tRNA ligase or Leucyl-tRNA synthetase, EC:6.1.1.4. Tursiops truncatus papillomavirus 2 41.5 1 100 67.0 2.277e-05 38.6378 316155 Viruses;Monodnaviria;Shotokuvirae;Cossaviricota;Papovaviricetes;Zurhausenvirales;Papillomaviridae;Firstpapillomavirinae;Upsilonpapillomavirus;Upsilonpapillomavirus 2;Tursiops truncatus papillomavirus 2 TGTGTTGGGTGTGTTTGGTTTCCGGTTACCATAATCGCTATTCTTTCAAACAGAAAGCGCATGCTAAGTATTCTCACCCAGAGGAATATGCTGACAAGCCCTCCTCAAAAGGCTATTTTTACAATGCCACCTATGAGAATGCACGAACTCTTATTCACTTCATTAAGCAATATGGATTGCCCTTCAATCCTGTTATTGCACCAGAAGATGCTGAACTAACTGATGAACAGATTCAATCTTACATCAACACAGCAAACTCCTTCTTTAATGATTATCCGACGTTACTGTTCACCCG TBLASTX NODE_16_length_278_cov_0.901345 377 278 306845 pfam00421, PSII, Photosystem II protein. 65.8 1 100 47.0 7.65615e-39 132.634 GTCTAACCTGTGTTGGGTGTGTTTGGGCTGTAATCGAGGTATAGTGTCGAACAAGTCGGTGTCACTGTTGAATTCTATGGCGGCGAACTCAATGGAGTCAGTTATAGTGATCCTGCTACTGTGAAAAAATATGCTAGACGTGCTCAATTGGGTGAAATTTTTGAATTAGATCGTGCTACTTTAAAATCGGATGGTGTTTTTCGTAGCAGTCCAAGGGGTTGGTTTACTTTTGGACATGCGTCGTTTGCTCTGCTCTTCTTCCAAACACACCCAACACA TBLASTX NODE_19_length_271_cov_0.879630 67 271 306845 pfam00421, PSII, Photosystem II protein. 32.9 1 100 42.0 1.69015e-11 56.3644 GTCTAACCTGTGTTGGGTGTGTTTGGTATGGAGGGAGGTGTATATGATACCTGGGCACCCGGAGGGGGAGATGTAAGAAAAATTACCAACTTGACCCTTAACCCAAGCGTGATATTTGGTTATTTACTAAAATCTCCTTTTGGGGGAGAAGGATGGATTGTTAGTGTGGACGATTTAGAAGATATAATTGGAGGACATGTCTGGTTAGGCTCCATTTGTATACTTGGTGGAATTTGGCATATCTTAACCAAACACACCCAACACAGGTTAG -TBLASTX NODE_20_length_267_cov_1.429245 2 267 287774 pfam10839, DUF2647, Protein of unknown function (DUF2647). This eukaryotic family of proteins are annotated as ycf68 but have no known function. Desulfovibrio sp. G100IX 91.3 1 100 99.0 7.70073e-10 48.4966 287774 cellular organisms;Bacteria;Thermodesulfobacteriota;Desulfovibrionia;Desulfovibrionales;Desulfovibrionaceae;Desulfovibrio;unclassified Desulfovibrio;Desulfovibrio sp. G100IX CTGTGTTGGGTGTGTTTGGACTTGGTATGTGAAGATACGTTGTTAGGTGCTCCGTTTTATTTTCCCATTGAGGCCGAACCTAAACCTGTGCTCGAGAGATAGCTGTCCATATACTGATAAGGGATGTATGGATTCTCGAGAAGAGAGGAGCCATGGTGGTCCCTCCCGGACCGCCCGGATCCCACGAGTGAATAGAAAGTTGGATCTACATTGGATCTCACCTGAATCGCCCCATAAACAACCAAACACACCCAACACAGGTTAGAC +TBLASTX NODE_20_length_267_cov_1.429245 2 267 287774 pfam10839, DUF2647, Protein of unknown function (DUF2647). This eukaryotic family of proteins are annotated as ycf68 but have no known function. Desulfovibrio sp. G100IX 91.3 1 100 99.0 7.70073e-10 48.4966 287774 cellular organisms;Bacteria;Pseudomonadati;Thermodesulfobacteriota;Desulfovibrionia;Desulfovibrionales;Desulfovibrionaceae;Desulfovibrio;unclassified Desulfovibrio;Desulfovibrio sp. G100IX CTGTGTTGGGTGTGTTTGGACTTGGTATGTGAAGATACGTTGTTAGGTGCTCCGTTTTATTTTCCCATTGAGGCCGAACCTAAACCTGTGCTCGAGAGATAGCTGTCCATATACTGATAAGGGATGTATGGATTCTCGAGAAGAGAGGAGCCATGGTGGTCCCTCCCGGACCGCCCGGATCCCACGAGTGAATAGAAAGTTGGATCTACATTGGATCTCACCTGAATCGCCCCATAAACAACCAAACACACCCAACACAGGTTAGAC TBLASTX NODE_22_length_262_cov_1.053140 262 262 306604 pfam00124, Photo_RC, Photosynthetic reaction centre protein. Heterotermes sp. TMJ-2004j 40.9 1 100 77.0 4.94039e-28 99.6256 306604 cellular organisms;Eukaryota;Opisthokonta;Metazoa;Eumetazoa;Bilateria;Protostomia;Ecdysozoa;Panarthropoda;Arthropoda;Mandibulata;Pancrustacea;Hexapoda;Insecta;Dicondylia;Pterygota;Neoptera;Polyneoptera;Dictyoptera;Blattodea;Blattoidea;Termitoidae;Rhinotermitidae;Heterotermitinae;Heterotermes;unclassified Heterotermes;Heterotermes sp. TMJ-2004j GTCTAACCTGTGTTGGGTGTGTTTGGCTAGTCAGTAGCTTGTTATATGGGTCGTGAGTGGGAAGTTAGCTTCCGTCTGGGTATGCGCCCGTGGATTGCTGTTGCATATTCAGCTCCTGTTGCAGCTGCTACTGCTGTTTTCTTGATTTACCCAATTGGTCAAGGAAGTTTTTCTGATGGTATGCCTCTAGGAATCTCTGGTACTTTCAACTTCATGATTGTATTCCAGGAGAGCACCCAAACACACCCAACACAGGTTAGAC TBLASTX NODE_24_length_258_cov_0.935961 101 258 307679 pfam01660, Vmethyltransf, Viral methyltransferase. This RNA methyltransferase domain is found in a wide range of ssRNA viruses, including Hordei-, Tobra-, Tobamo-, Bromo-, Clostero- and Caliciviruses. This methyltransferase is involved in mRNA capping. Capping of mRNA enhances its stability. This usually occurs in the nucleus. Therefore, many viruses that replicate in the cytoplasm encode their own. This is a specific guanine-7-methyltransferase domain involved in viral mRNA cap0 synthesis. Specificity for guanine 7 position is shown by NMR in and in vivo role in cap synthesis. Based on secondary structure prediction, the basic fold is believed to be similar to the common AdoMet-dependent methyltransferase fold. A curious feature of this methyltransferase domain is that it together with flanking sequences seems to have guanylyltransferase activity coupled to the methyltransferase activity. The domain is found throughout the so-called Alphavirus superfamily, (including alphaviruses and several other groups). It forms the defining, unique feature of this superfamily. Tetrastemma peltatum 39.4 1 100 70.0 8.38713e-15 65.0021 307679 cellular organisms;Eukaryota;Opisthokonta;Metazoa;Eumetazoa;Bilateria;Protostomia;Spiralia;Lophotrochozoa;Nemertea;Enopla;Hoplonemertea;Monostilifera;Eumonostilifera;Tetrastemmatidae;Tetrastemma;Tetrastemma peltatum GTGTTGGGTGTGTTTGGTTGGTGAACGCGCACCATTTAGTGGCAATCACGCGCGGGGAGGCTGAAAACTGCAAGCATAGATCTTTCGGCCCTTTCGAAGCTACCGCTTCCGAGAGCCTGGCTAAACTCTGCCCAGATTATCCGATCTGCTTGCCTGTACCTTACGACGTGATCAATAAAGTGTATAGGTATCTCAGAACGCTTAAGAAGCCTGATGTGCAGTCGCCCCACTACCAAACACACCCAACACAGGTTAGAC TBLASTX NODE_29_length_250_cov_0.851282 428 250 278700 pfam00283, Cytochrom_B559, Cytochrome b559, alpha (gene psbE) and beta (gene psbF)subunits. uncultured archaeon CRE-PA11a 58.6 1 100 100 7.31211e-08 42.0012 278700 cellular organisms;Archaea;environmental samples;uncultured archaeon CRE-PA11a GTCTAACCTGTGTTGGGTGTGTTTGGGTTTCTTTGGAGCAACTCGATGAATTTAGTAAATCCTTTTAGGAGGTTCCCAATGACCATAGATCGAACCTATCCAATTTTTACAGTGCGATGGTTGGCTGTTCACGGACTGGCTGTACCTACTGTTTCTTTTTTAGGGTCAATATCAGCAATGCAGTTCATCCAACGATAAACCTAATTCAAATTATAGAGCTAGCACACCAAACACACCCAACACAGGTTAG -TBLASTX NODE_34_length_245_cov_1.000000 183 245 250270 pfam00978, RdRP_2, RNA dependent RNA polymerase. This family may represent an RNA dependent RNA polymerase. The family also contains the following proteins: 2A protein from bromoviruses putative RNA dependent RNA polymerase from tobamoviruses Non structural polyprotein from togaviruses. Nocardia sp. 431D04 37.5 1 100 38.0 6.42106e-08 45.7137 250270 cellular organisms;Bacteria;Terrabacteria group;Actinomycetota;Actinomycetes;Mycobacteriales;Nocardiaceae;Nocardia;unclassified Nocardia;Nocardia sp. 431D04 GTCTAACCTGTGTTGGGTGTGTTTGGATGGTGATCGGCAATTTAAAGGAATGTGTGCGATATTTTCACCAATATTCAAGGAGCTTAAAAATCGACTAAAAAGCGTGTTAGATATTAAATACATGTATGCAGACGGATTGAGACCTGATCAGTTGTCGGAGCGCATGTCACAGATAGGTGCAGGTAAATATTTTATAGAGAATGATATGGAACAGCATCTCGCCAAACACACCCAACACAGGTTAG +TBLASTX NODE_34_length_245_cov_1.000000 183 245 250270 pfam00978, RdRP_2, RNA dependent RNA polymerase. This family may represent an RNA dependent RNA polymerase. The family also contains the following proteins: 2A protein from bromoviruses putative RNA dependent RNA polymerase from tobamoviruses Non structural polyprotein from togaviruses. Nocardia sp. 431D04 37.5 1 100 38.0 6.42106e-08 45.7137 250270 cellular organisms;Bacteria;Bacillati;Actinomycetota;Actinomycetes;Mycobacteriales;Nocardiaceae;Nocardia;unclassified Nocardia;Nocardia sp. 431D04 GTCTAACCTGTGTTGGGTGTGTTTGGATGGTGATCGGCAATTTAAAGGAATGTGTGCGATATTTTCACCAATATTCAAGGAGCTTAAAAATCGACTAAAAAGCGTGTTAGATATTAAATACATGTATGCAGACGGATTGAGACCTGATCAGTTGTCGGAGCGCATGTCACAGATAGGTGCAGGTAAATATTTTATAGAGAATGATATGGAACAGCATCTCGCCAAACACACCCAACACAGGTTAG TBLASTX NODE_46_length_229_cov_1.091954 471 229 306604 pfam00124, Photo_RC, Photosynthetic reaction centre protein. Heterotermes sp. TMJ-2004j 43.9 1 100 66.0 4.26406e-23 86.1436 306604 cellular organisms;Eukaryota;Opisthokonta;Metazoa;Eumetazoa;Bilateria;Protostomia;Ecdysozoa;Panarthropoda;Arthropoda;Mandibulata;Pancrustacea;Hexapoda;Insecta;Dicondylia;Pterygota;Neoptera;Polyneoptera;Dictyoptera;Blattodea;Blattoidea;Termitoidae;Rhinotermitidae;Heterotermitinae;Heterotermes;unclassified Heterotermes;Heterotermes sp. TMJ-2004j TGTGTTGGGTGTGTTTGGTTGGATGCCTGGAATACAATCATGAAATTGAAAGTACCAGATATTCCTAAAGGCATGCCATCTGAAAAACTTCCTTGACCAATAGGGTAGATCAAGAAAACAGCTGTAGCAGCCGCGACAGGAGCTGAATATGCAACAGCAATCCAAGGACGCATACCCAGACGGAAACTAAGCTCCCTCTCGCTCCAAACACACCCAACACAGGTTAGAC TBLASTX NODE_47_length_229_cov_0.816092 470 229 306687 pfam00223, PsaA_PsaB, Photosystem I psaA/psaB protein. 66.7 1 100 14.0 1.79906e-13 61.3066 TTGGTAAATTGGCGGAAAGAGGAGGACTCAATGATTATTCGTTCGCCGGAACCAGAAGTAAAAATTTTGGTAGATAGGGATCACATAAAAACTTCTTTCGAGGAATGGGCCAGGCCGGGTCATTTCTCAAGAACACTAGCTAAAGGCCCTGACACTACCACTTGGATCTGGAACCTACATGCTGATGCTCACGATCTTAATAGCCAAACACACCCAACACAGGTTAGAC TBLASTX NODE_50_length_226_cov_2.269006 315 226 306845 pfam00421, PSII, Photosystem II protein. 60.3 1 100 41.0 2.77182e-23 89.1064 GTCAACGGTGTGTTGGGTGTGTTTGGGAAAGGTCCTGGAATATGGGTGTCCGATCCTTATGGACTAACCGGAACAGTGCAACCTGTAAATCCGGCGTGGGGCGTGGAAGGTTTTGATCCTTTTGTCCCGGGAGGAATAGCTTCTCATCATATTGCAGCAGGTACATTGGGCATATTAGCGGGCCTATTCCATCTTAGCGTACGGTCACCCCAGCCAAACACACCCA
--- a/test-data/blast2tsv_reads_with_rn.txt Sun Sep 08 14:09:07 2024 +0000 +++ b/test-data/blast2tsv_reads_with_rn.txt Tue May 13 11:52:07 2025 +0000 @@ -2,11 +2,11 @@ TBLASTX NODE_13_length_295_cov_0.945833 264 295 316155 pfam13603, tRNA-synt_1_2, Leucyl-tRNA synthetase, Domain 2. This is a family of the conserved region of Leucine-tRNA ligase or Leucyl-tRNA synthetase, EC:6.1.1.4. Tursiops truncatus papillomavirus 2 41.5 1 100 67.0 2.277e-05 38.6378 316155 Viruses;Monodnaviria;Shotokuvirae;Cossaviricota;Papovaviricetes;Zurhausenvirales;Papillomaviridae;Firstpapillomavirinae;Upsilonpapillomavirus;Upsilonpapillomavirus 2;Tursiops truncatus papillomavirus 2 TGTGTTGGGTGTGTTTGGTTTCCGGTTACCATAATCGCTATTCTTTCAAACAGAAAGCGCATGCTAAGTATTCTCACCCAGAGGAATATGCTGACAAGCCCTCCTCAAAAGGCTATTTTTACAATGCCACCTATGAGAATGCACGAACTCTTATTCACTTCATTAAGCAATATGGATTGCCCTTCAATCCTGTTATTGCACCAGAAGATGCTGAACTAACTGATGAACAGATTCAATCTTACATCAACACAGCAAACTCCTTCTTTAATGATTATCCGACGTTACTGTTCACCCG TBLASTX NODE_16_length_278_cov_0.901345 377 278 306845 pfam00421, PSII, Photosystem II protein. 65.8 1 100 47.0 7.65615e-39 132.634 GTCTAACCTGTGTTGGGTGTGTTTGGGCTGTAATCGAGGTATAGTGTCGAACAAGTCGGTGTCACTGTTGAATTCTATGGCGGCGAACTCAATGGAGTCAGTTATAGTGATCCTGCTACTGTGAAAAAATATGCTAGACGTGCTCAATTGGGTGAAATTTTTGAATTAGATCGTGCTACTTTAAAATCGGATGGTGTTTTTCGTAGCAGTCCAAGGGGTTGGTTTACTTTTGGACATGCGTCGTTTGCTCTGCTCTTCTTCCAAACACACCCAACACA TBLASTX NODE_19_length_271_cov_0.879630 67 271 306845 pfam00421, PSII, Photosystem II protein. 32.9 1 100 42.0 1.69015e-11 56.3644 GTCTAACCTGTGTTGGGTGTGTTTGGTATGGAGGGAGGTGTATATGATACCTGGGCACCCGGAGGGGGAGATGTAAGAAAAATTACCAACTTGACCCTTAACCCAAGCGTGATATTTGGTTATTTACTAAAATCTCCTTTTGGGGGAGAAGGATGGATTGTTAGTGTGGACGATTTAGAAGATATAATTGGAGGACATGTCTGGTTAGGCTCCATTTGTATACTTGGTGGAATTTGGCATATCTTAACCAAACACACCCAACACAGGTTAG -TBLASTX NODE_20_length_267_cov_1.429245 2 267 287774 pfam10839, DUF2647, Protein of unknown function (DUF2647). This eukaryotic family of proteins are annotated as ycf68 but have no known function. Desulfovibrio sp. G100IX 91.3 1 100 99.0 7.70073e-10 48.4966 287774 cellular organisms;Bacteria;Thermodesulfobacteriota;Desulfovibrionia;Desulfovibrionales;Desulfovibrionaceae;Desulfovibrio;unclassified Desulfovibrio;Desulfovibrio sp. G100IX CTGTGTTGGGTGTGTTTGGACTTGGTATGTGAAGATACGTTGTTAGGTGCTCCGTTTTATTTTCCCATTGAGGCCGAACCTAAACCTGTGCTCGAGAGATAGCTGTCCATATACTGATAAGGGATGTATGGATTCTCGAGAAGAGAGGAGCCATGGTGGTCCCTCCCGGACCGCCCGGATCCCACGAGTGAATAGAAAGTTGGATCTACATTGGATCTCACCTGAATCGCCCCATAAACAACCAAACACACCCAACACAGGTTAGAC +TBLASTX NODE_20_length_267_cov_1.429245 2 267 287774 pfam10839, DUF2647, Protein of unknown function (DUF2647). This eukaryotic family of proteins are annotated as ycf68 but have no known function. Desulfovibrio sp. G100IX 91.3 1 100 99.0 7.70073e-10 48.4966 287774 cellular organisms;Bacteria;Pseudomonadati;Thermodesulfobacteriota;Desulfovibrionia;Desulfovibrionales;Desulfovibrionaceae;Desulfovibrio;unclassified Desulfovibrio;Desulfovibrio sp. G100IX CTGTGTTGGGTGTGTTTGGACTTGGTATGTGAAGATACGTTGTTAGGTGCTCCGTTTTATTTTCCCATTGAGGCCGAACCTAAACCTGTGCTCGAGAGATAGCTGTCCATATACTGATAAGGGATGTATGGATTCTCGAGAAGAGAGGAGCCATGGTGGTCCCTCCCGGACCGCCCGGATCCCACGAGTGAATAGAAAGTTGGATCTACATTGGATCTCACCTGAATCGCCCCATAAACAACCAAACACACCCAACACAGGTTAGAC TBLASTX NODE_22_length_262_cov_1.053140 262 262 306604 pfam00124, Photo_RC, Photosynthetic reaction centre protein. Heterotermes sp. TMJ-2004j 40.9 1 100 77.0 4.94039e-28 99.6256 306604 cellular organisms;Eukaryota;Opisthokonta;Metazoa;Eumetazoa;Bilateria;Protostomia;Ecdysozoa;Panarthropoda;Arthropoda;Mandibulata;Pancrustacea;Hexapoda;Insecta;Dicondylia;Pterygota;Neoptera;Polyneoptera;Dictyoptera;Blattodea;Blattoidea;Termitoidae;Rhinotermitidae;Heterotermitinae;Heterotermes;unclassified Heterotermes;Heterotermes sp. TMJ-2004j GTCTAACCTGTGTTGGGTGTGTTTGGCTAGTCAGTAGCTTGTTATATGGGTCGTGAGTGGGAAGTTAGCTTCCGTCTGGGTATGCGCCCGTGGATTGCTGTTGCATATTCAGCTCCTGTTGCAGCTGCTACTGCTGTTTTCTTGATTTACCCAATTGGTCAAGGAAGTTTTTCTGATGGTATGCCTCTAGGAATCTCTGGTACTTTCAACTTCATGATTGTATTCCAGGAGAGCACCCAAACACACCCAACACAGGTTAGAC TBLASTX NODE_24_length_258_cov_0.935961 101 258 307679 pfam01660, Vmethyltransf, Viral methyltransferase. This RNA methyltransferase domain is found in a wide range of ssRNA viruses, including Hordei-, Tobra-, Tobamo-, Bromo-, Clostero- and Caliciviruses. This methyltransferase is involved in mRNA capping. Capping of mRNA enhances its stability. This usually occurs in the nucleus. Therefore, many viruses that replicate in the cytoplasm encode their own. This is a specific guanine-7-methyltransferase domain involved in viral mRNA cap0 synthesis. Specificity for guanine 7 position is shown by NMR in and in vivo role in cap synthesis. Based on secondary structure prediction, the basic fold is believed to be similar to the common AdoMet-dependent methyltransferase fold. A curious feature of this methyltransferase domain is that it together with flanking sequences seems to have guanylyltransferase activity coupled to the methyltransferase activity. The domain is found throughout the so-called Alphavirus superfamily, (including alphaviruses and several other groups). It forms the defining, unique feature of this superfamily. Tetrastemma peltatum 39.4 1 100 70.0 8.38713e-15 65.0021 307679 cellular organisms;Eukaryota;Opisthokonta;Metazoa;Eumetazoa;Bilateria;Protostomia;Spiralia;Lophotrochozoa;Nemertea;Enopla;Hoplonemertea;Monostilifera;Eumonostilifera;Tetrastemmatidae;Tetrastemma;Tetrastemma peltatum GTGTTGGGTGTGTTTGGTTGGTGAACGCGCACCATTTAGTGGCAATCACGCGCGGGGAGGCTGAAAACTGCAAGCATAGATCTTTCGGCCCTTTCGAAGCTACCGCTTCCGAGAGCCTGGCTAAACTCTGCCCAGATTATCCGATCTGCTTGCCTGTACCTTACGACGTGATCAATAAAGTGTATAGGTATCTCAGAACGCTTAAGAAGCCTGATGTGCAGTCGCCCCACTACCAAACACACCCAACACAGGTTAGAC TBLASTX NODE_29_length_250_cov_0.851282 428 250 278700 pfam00283, Cytochrom_B559, Cytochrome b559, alpha (gene psbE) and beta (gene psbF)subunits. uncultured archaeon CRE-PA11a 58.6 1 100 100 7.31211e-08 42.0012 278700 cellular organisms;Archaea;environmental samples;uncultured archaeon CRE-PA11a GTCTAACCTGTGTTGGGTGTGTTTGGGTTTCTTTGGAGCAACTCGATGAATTTAGTAAATCCTTTTAGGAGGTTCCCAATGACCATAGATCGAACCTATCCAATTTTTACAGTGCGATGGTTGGCTGTTCACGGACTGGCTGTACCTACTGTTTCTTTTTTAGGGTCAATATCAGCAATGCAGTTCATCCAACGATAAACCTAATTCAAATTATAGAGCTAGCACACCAAACACACCCAACACAGGTTAG -TBLASTX NODE_34_length_245_cov_1.000000 183 245 250270 pfam00978, RdRP_2, RNA dependent RNA polymerase. This family may represent an RNA dependent RNA polymerase. The family also contains the following proteins: 2A protein from bromoviruses putative RNA dependent RNA polymerase from tobamoviruses Non structural polyprotein from togaviruses. Nocardia sp. 431D04 37.5 1 100 38.0 6.42106e-08 45.7137 250270 cellular organisms;Bacteria;Terrabacteria group;Actinomycetota;Actinomycetes;Mycobacteriales;Nocardiaceae;Nocardia;unclassified Nocardia;Nocardia sp. 431D04 GTCTAACCTGTGTTGGGTGTGTTTGGATGGTGATCGGCAATTTAAAGGAATGTGTGCGATATTTTCACCAATATTCAAGGAGCTTAAAAATCGACTAAAAAGCGTGTTAGATATTAAATACATGTATGCAGACGGATTGAGACCTGATCAGTTGTCGGAGCGCATGTCACAGATAGGTGCAGGTAAATATTTTATAGAGAATGATATGGAACAGCATCTCGCCAAACACACCCAACACAGGTTAG +TBLASTX NODE_34_length_245_cov_1.000000 183 245 250270 pfam00978, RdRP_2, RNA dependent RNA polymerase. This family may represent an RNA dependent RNA polymerase. The family also contains the following proteins: 2A protein from bromoviruses putative RNA dependent RNA polymerase from tobamoviruses Non structural polyprotein from togaviruses. Nocardia sp. 431D04 37.5 1 100 38.0 6.42106e-08 45.7137 250270 cellular organisms;Bacteria;Bacillati;Actinomycetota;Actinomycetes;Mycobacteriales;Nocardiaceae;Nocardia;unclassified Nocardia;Nocardia sp. 431D04 GTCTAACCTGTGTTGGGTGTGTTTGGATGGTGATCGGCAATTTAAAGGAATGTGTGCGATATTTTCACCAATATTCAAGGAGCTTAAAAATCGACTAAAAAGCGTGTTAGATATTAAATACATGTATGCAGACGGATTGAGACCTGATCAGTTGTCGGAGCGCATGTCACAGATAGGTGCAGGTAAATATTTTATAGAGAATGATATGGAACAGCATCTCGCCAAACACACCCAACACAGGTTAG TBLASTX NODE_46_length_229_cov_1.091954 471 229 306604 pfam00124, Photo_RC, Photosynthetic reaction centre protein. Heterotermes sp. TMJ-2004j 43.9 1 100 66.0 4.26406e-23 86.1436 306604 cellular organisms;Eukaryota;Opisthokonta;Metazoa;Eumetazoa;Bilateria;Protostomia;Ecdysozoa;Panarthropoda;Arthropoda;Mandibulata;Pancrustacea;Hexapoda;Insecta;Dicondylia;Pterygota;Neoptera;Polyneoptera;Dictyoptera;Blattodea;Blattoidea;Termitoidae;Rhinotermitidae;Heterotermitinae;Heterotermes;unclassified Heterotermes;Heterotermes sp. TMJ-2004j TGTGTTGGGTGTGTTTGGTTGGATGCCTGGAATACAATCATGAAATTGAAAGTACCAGATATTCCTAAAGGCATGCCATCTGAAAAACTTCCTTGACCAATAGGGTAGATCAAGAAAACAGCTGTAGCAGCCGCGACAGGAGCTGAATATGCAACAGCAATCCAAGGACGCATACCCAGACGGAAACTAAGCTCCCTCTCGCTCCAAACACACCCAACACAGGTTAGAC TBLASTX NODE_47_length_229_cov_0.816092 470 229 306687 pfam00223, PsaA_PsaB, Photosystem I psaA/psaB protein. 66.7 1 100 14.0 1.79906e-13 61.3066 TTGGTAAATTGGCGGAAAGAGGAGGACTCAATGATTATTCGTTCGCCGGAACCAGAAGTAAAAATTTTGGTAGATAGGGATCACATAAAAACTTCTTTCGAGGAATGGGCCAGGCCGGGTCATTTCTCAAGAACACTAGCTAAAGGCCCTGACACTACCACTTGGATCTGGAACCTACATGCTGATGCTCACGATCTTAATAGCCAAACACACCCAACACAGGTTAGAC TBLASTX NODE_50_length_226_cov_2.269006 315 226 306845 pfam00421, PSII, Photosystem II protein. 60.3 1 100 41.0 2.77182e-23 89.1064 GTCAACGGTGTGTTGGGTGTGTTTGGGAAAGGTCCTGGAATATGGGTGTCCGATCCTTATGGACTAACCGGAACAGTGCAACCTGTAAATCCGGCGTGGGGCGTGGAAGGTTTTGATCCTTTTGTCCCGGGAGGAATAGCTTCTCATCATATTGCAGCAGGTACATTGGGCATATTAGCGGGCCTATTCCATCTTAGCGTACGGTCACCCCAGCCAAACACACCCA
--- a/test-data/input_otu_rps_s1.tab Sun Sep 08 14:09:07 2024 +0000 +++ b/test-data/input_otu_rps_s1.tab Tue May 13 11:52:07 2025 +0000 @@ -1,45 +1,45 @@ -#query_id query_length cdd_id hit_id evalue startQ endQ frame description superkingdom -Query_2 2436 pfam02123 gnl|CDD|280316 2.04111e-21 184 1476 1 pfam02123, RdRP_4, Viral RNA-directed RNA-polymerase. This family includes RNA-dependent RNA polymerase proteins (RdRPs) from Luteovirus, Totivirus and Rotavirus. Viruses(1);Riboviria(1);Orthornavirae(1);Duplornaviricota(1) -Query_4 2297 pfam00680 gnl|CDD|279070 3.12197e-05 995 1873 -2 pfam00680, RdRP_1, RNA dependent RNA polymerase. Viruses(1);Riboviria(1);Orthornavirae(1);Pisuviricota(1) -Query_5 2029 pfam00680 gnl|CDD|279070 8.86955e-06 840 1706 3 pfam00680, RdRP_1, RNA dependent RNA polymerase. Viruses(1);Riboviria(1);Orthornavirae(1);Pisuviricota(1) -Query_6 1860 pfam02123 gnl|CDD|280316 1.27376e-17 1147 1764 -1 pfam02123, RdRP_4, Viral RNA-directed RNA-polymerase. This family includes RNA-dependent RNA polymerase proteins (RdRPs) from Luteovirus, Totivirus and Rotavirus. Viruses(1);Riboviria(1);Orthornavirae(1);Duplornaviricota(1) -Query_8 1703 pfam00680 gnl|CDD|279070 3.19349e-12 685 1458 -3 pfam00680, RdRP_1, RNA dependent RNA polymerase. Viruses(1);Riboviria(1);Orthornavirae(1);Pisuviricota(1) -Query_19 425 pfam00005 gnl|CDD|306511 3.70622e-07 129 275 -1 pfam00005, ABC_tran, ABC transporter. ABC transporters for a large family of proteins responsible for translocation of a variety of compounds across biological membranes. ABC transporters are the largest family of proteins in many completely sequenced bacteria. ABC transporters are composed of two copies of this domain and two copies of a transmembrane domain pfam00664. These four domains may belong to a single polypeptide as in CFTR, or belong in different polypeptide chains. Bacteria(2);cellular organisms(1);Terrabacteria group(1) -Query_38 386 pfam01347 gnl|CDD|279663 0.000262768 129 275 -1 pfam01347, Vitellogenin_N, Lipoprotein amino terminal region. This family contains regions from: Vitellogenin, Microsomal triglyceride transfer protein and apolipoprotein B-100. These proteins are all involved in lipid transport. This family contains the LV1n chain from lipovitellin, that contains two structural domains. cellular organisms(1);Eukaryota(1);Opisthokonta(1);Metazoa(1) -Query_41 380 pfam04879 gnl|CDD|282703 2.77416e-08 125 274 -2 pfam04879, Molybdop_Fe4S4, Molybdopterin oxidoreductase Fe4S4 domain. This domain is found in formate dehydrogenase H for which the structure is known. This first domain (residues 1 to 60) of Structure 1aa6 is an Fe4S4 cluster just below the protein surface. Bacteria(2);cellular organisms(1);Pseudomonadota(1) -Query_42 379 pfam16203 gnl|CDD|318443 8.05104e-30 131 280 -1 pfam16203, ERCC3_RAD25_C, ERCC3/RAD25/XPB C-terminal helicase. This is the C-terminal helicase domain of ERCC3, RAD25 and XPB helicases. cellular organisms(2);Bacteria(1);Terrabacteria group(1) -Query_44 376 pfam00401 gnl|CDD|306831 6.62013e-05 81 215 -3 pfam00401, ATP-synt_DE, ATP synthase, Delta/Epsilon chain, long alpha-helix domain. Part of the ATP synthase CF(1). These subunits are part of the head unit of the ATP synthase. This subunit is called epsilon in bacteria and delta in mitochondria. In bacteria the delta (D) subunit is equivalent to the mitochondrial Oligomycin sensitive subunit, OSCP (pfam00213). cellular organisms(2);Eukaryota(1);Viridiplantae(1) -Query_58 347 pfam00471 gnl|CDD|306877 8.86568e-13 132 302 3 pfam00471, Ribosomal_L33, Ribosomal protein L33. cellular organisms(2);Bacteria(1);Eukaryota(1) -Query_61 344 pfam00252 gnl|CDD|306711 1.17482e-22 107 295 2 pfam00252, Ribosomal_L16, Ribosomal protein L16p/L10e. cellular organisms(2);Eukaryota(1);Viridiplantae(1) -Query_62 343 pfam00421 gnl|CDD|306845 7.93928e-41 92 337 -1 pfam00421, PSII, Photosystem II protein. cellular organisms(1);Eukaryota(1);Viridiplantae(1);Streptophyta(1) -Query_64 339 pfam01333 gnl|CDD|307480 0.000362606 197 325 -3 pfam01333, Apocytochr_F_C, Apocytochrome F, C-terminal. This is a sub-family of cytochrome C. See pfam00034. cellular organisms(1);Eukaryota(1);Viridiplantae(1);Streptophyta(1) -Query_74 330 pfam00680 gnl|CDD|279070 4.51414e-05 124 282 1 pfam00680, RdRP_1, RNA dependent RNA polymerase. Viruses(1);Riboviria(1);Orthornavirae(1);Pisuviricota(1) -Query_83 320 pfam05860 gnl|CDD|310447 1.29746e-13 167 298 2 pfam05860, Haemagg_act, haemagglutination activity domain. This domain is suggested to be a carbohydrate- dependent haemagglutination activity site. It is found in a range of haemagglutinins and haemolysins. Bacteria(2);cellular organisms(1);Pseudomonadota(1) -Query_87 252 pfam00585 gnl|CDD|278982 1.42752e-05 29 166 2 pfam00585, Thr_dehydrat_C, C-terminal regulatory domain of Threonine dehydratase. Threonine dehydratases pfam00291 all contain a carboxy terminal region. This region may have a regulatory role. Some members contain two copies of this region. This family is homologous to the pfam01842 domain. Bacteria(2);cellular organisms(1);Pseudomonadota(1) -Query_90 251 pfam13188 gnl|CDD|315779 0.000739897 32 241 2 pfam13188, PAS_8, PAS domain. Bacteria(2);cellular organisms(1);Pseudomonadota(1) -Query_91 251 pfam02123 gnl|CDD|280316 3.2928e-08 28 228 -3 pfam02123, RdRP_4, Viral RNA-directed RNA-polymerase. This family includes RNA-dependent RNA polymerase proteins (RdRPs) from Luteovirus, Totivirus and Rotavirus. Viruses(1);Riboviria(1);Orthornavirae(1);Duplornaviricota(1) -Query_93 251 pfam00252 gnl|CDD|306711 7.50297e-12 78 206 -1 pfam00252, Ribosomal_L16, Ribosomal protein L16p/L10e. cellular organisms(2);Eukaryota(1);Viridiplantae(1) -Query_98 250 pfam00227 gnl|CDD|306690 4.91252e-09 10 150 -2 pfam00227, Proteasome, Proteasome subunit. The proteasome is a multisubunit structure that degrades proteins. Protein degradation is an essential component of regulation because proteins can become misfolded, damaged, or unnecessary. Proteasomes and their homologs vary greatly in complexity: from HslV (heat shock locus v), which is encoded by 1 gene in bacteria, to the eukaryotic 20S proteasome, which is encoded by more than 14 genes. Recently evidence of two novel groups of bacterial proteasomes was proposed. The first is Anbu, which is sparsely distributed among cyanobacteria and proteobacteria. The second is call beta-proteobacteria proteasome homolog (BPH). cellular organisms(2);Eukaryota(1);Opisthokonta(1) -Query_104 249 pfam13173 gnl|CDD|315764 2.6724e-08 106 249 1 pfam13173, AAA_14, AAA domain. This family of domains contain a P-loop motif that is characteristic of the AAA superfamily. Bacteria(2);cellular organisms(1);FCB group(1) -Query_111 248 pfam00113 gnl|CDD|278539 3.9331e-13 15 116 -1 pfam00113, Enolase_C, Enolase, C-terminal TIM barrel domain. cellular organisms(2);Bacteria(2) -Query_127 245 pfam00946 gnl|CDD|307203 3.13472e-05 1 141 1 pfam00946, Mononeg_RNA_pol, Mononegavirales RNA dependent RNA polymerase. Members of the Mononegavirales including the Paramyxoviridae, like other non-segmented negative strand RNA viruses, have an RNA-dependent RNA polymerase composed of two subunits, a large protein L and a phosphoprotein P. This is a protein family of the L protein. The L protein confers the RNA polymerase activity on the complex. The P protein acts as a transcription factor. Viruses(1);Riboviria(1);Orthornavirae(1);Negarnaviricota(1) -Query_138 243 pfam00416 gnl|CDD|306841 5.30772e-05 15 134 -2 pfam00416, Ribosomal_S13, Ribosomal protein S13/S18. This family includes ribosomal protein S13 from prokaryotes and S18 from eukaryotes. cellular organisms(2);Bacteria(2) -Query_139 243 pfam00216 gnl|CDD|306682 1.89202e-10 134 241 -3 pfam00216, Bac_DNA_binding, Bacterial DNA-binding protein. Bacteria(2);cellular organisms(1);Pseudomonadota(1) -Query_140 243 pfam13041 gnl|CDD|315669 0.000344884 134 241 -3 pfam13041, PPR_2, PPR repeat family. This repeat has no known function. It is about 35 amino acids long and is found in up to 18 copies in some proteins. The family appears to be greatly expanded in plants and fungi. The repeat has been called PPR. cellular organisms(1);Eukaryota(1);Viridiplantae(1);Streptophyta(1) -Query_144 243 pfam12137 gnl|CDD|314930 3.71293e-05 137 217 -3 pfam12137, RapA_C, RNA polymerase recycling family C-terminal. This domain is found in bacteria. This domain is about 360 amino acids in length. This domain is found associated with pfam00271, pfam00176. The function of this domain is not known, but structurally it forms an alpha-beta fold in nature with a central beta-sheet flanked by helices and loops, the beta-sheet being mainly antiparallel and flanked by four alpha helices, among which the two longer helices exhibit a coiled-coil arrangement. cellular organisms(1);Bacteria(1);Pseudomonadota(1);Gammaproteobacteria(1) -Query_145 242 pfam00146 gnl|CDD|306623 2.12078e-10 22 111 1 pfam00146, NADHdh, NADH dehydrogenase. cellular organisms(1);Eukaryota(1);Opisthokonta(1);Metazoa(1) -Query_149 242 pfam00124 gnl|CDD|306604 4.44151e-07 21 125 3 pfam00124, Photo_RC, Photosynthetic reaction centre protein. cellular organisms(1);Eukaryota(1);Viridiplantae(1);Streptophyta(1) -Query_163 241 pfam02123 gnl|CDD|280316 5.78854e-08 35 214 -1 pfam02123, RdRP_4, Viral RNA-directed RNA-polymerase. This family includes RNA-dependent RNA polymerase proteins (RdRPs) from Luteovirus, Totivirus and Rotavirus. Viruses(1);Riboviria(1);Orthornavirae(1);Duplornaviricota(1) -Query_177 239 pfam06122 gnl|CDD|310603 1.30391e-05 29 172 2 pfam06122, TraH, Conjugative relaxosome accessory transposon protein. The TraH protein is thought to be a relaxosome accessory component, also necessary for transfer but not for H-pilus synthesis within the conjugative transposon. cellular organisms(1);Bacteria(1);Pseudomonadota(1);Gammaproteobacteria(1) -Query_179 239 pfam00361 gnl|CDD|306795 3.63199e-05 70 219 1 pfam00361, Proton_antipo_M, Proton-conducting membrane transporter. This is a family of membrane transporters that inlcudes some 7 of potentially 14-16 TM regions. In many instances the family forms part of complex I that catalyzes the transfer of two electrons from NADH to ubiquinone in a reaction that is associated with proton translocation across the membrane, and in this context is a combination predominantly of subunits 2, 4, 5, 14, L, M and N. In many bacterial species these proteins are probable stand-alone transporters not coupled with oxidoreduction. The family in total represents homologs across the phyla. cellular organisms(1);Eukaryota(1);Opisthokonta(1);Metazoa(1) -Query_182 239 pfam00177 gnl|CDD|306646 1.05327e-06 28 126 1 pfam00177, Ribosomal_S7, Ribosomal protein S7p/S5e. This family contains ribosomal protein S7 from prokaryotes and S5 from eukaryotes. cellular organisms(2);Eukaryota(1);Viridiplantae(1) -Query_202 235 pfam03154 gnl|CDD|308660 0.000842762 28 126 1 pfam03154, Atrophin-1, Atrophin-1 family. Atrophin-1 is the protein product of the dentatorubral-pallidoluysian atrophy (DRPLA) gene. DRPLA OMIM:125370 is a progressive neurodegenerative disorder. It is caused by the expansion of a CAG repeat in the DRPLA gene on chromosome 12p. This results in an extended polyglutamine region in atrophin-1, that is thought to confer toxicity to the protein, possibly through altering its interactions with other proteins. The expansion of a CAG repeat is also the underlying defect in six other neurodegenerative disorders, including Huntington's disease. One interaction of expanded polyglutamine repeats that is thought to be pathogenic is that with the short glutamine repeat in the transcriptional coactivator CREB binding protein, CBP. This interaction draws CBP away from its usual nuclear location to the expanded polyglutamine repeat protein aggregates that are characteristic of the polyglutamine neurodegenerative disorders. This interferes with CBP-mediated transcription and causes cytotoxicity. Eukaryota(1);cellular organisms(1);Opisthokonta(1);Metazoa(1) -Query_203 235 pfam00164 gnl|CDD|278589 1.83229e-23 3 182 3 pfam00164, Ribosom_S12_S23, Ribosomal protein S12/S23. This protein is known as S12 in bacteria and archaea and S23 in eukaryotes. cellular organisms(2);Eukaryota(1);Viridiplantae(1) -Query_211 234 pfam00155 gnl|CDD|306629 0.000251531 3 182 3 pfam00155, Aminotran_1_2, Aminotransferase class I and II. Bacteria(2);cellular organisms(1);Pseudomonadota(1) -Query_219 233 pfam00680 gnl|CDD|279070 0.000703744 3 182 3 pfam00680, RdRP_1, RNA dependent RNA polymerase. Viruses(1);Riboviria(1);Orthornavirae(1);Pisuviricota(1) -Query_232 231 pfam00481 gnl|CDD|306885 0.00063843 3 182 3 pfam00481, PP2C, Protein phosphatase 2C. Protein phosphatase 2C is a Mn++ or Mg++ dependent protein serine/threonine phosphatase. Eukaryota(2);cellular organisms(1);Viridiplantae(1) -Query_241 230 pfam00072 gnl|CDD|306560 5.30837e-08 50 208 2 pfam00072, Response_reg, Response regulator receiver domain. This domain receives the signal from the sensor partner in bacterial two-component systems. It is usually found N-terminal to a DNA binding effector domain. Bacteria(2);cellular organisms(1);Pseudomonadota(1) -Query_246 230 pfam00201 gnl|CDD|278624 2.93544e-07 46 210 1 pfam00201, UDPGT, UDP-glucoronosyl and UDP-glucosyl transferase. cellular organisms(1);Eukaryota(1);Viridiplantae(1);Streptophytina(1) -Query_261 228 pfam17035 gnl|CDD|319097 3.87403e-09 108 203 3 pfam17035, BET, Bromodomain extra-terminal - transcription regulation. The BET, or bromodomain extra-terminal domain, is found on bromodomain proteins that play key roles in development, cancer progression and virus-host pathogenesis. It interacts with NSD3, JMJD6, CHD4, GLTSCR1, and ATAD5 all of which are shown to impart a pTEFb-independent transcriptional activation function on the bromodomain proteins. cellular organisms(1);Eukaryota(1);Opisthokonta(1);Metazoa(1) -Query_280 207 pfam04061 gnl|CDD|309259 7.30581e-19 1 159 1 pfam04061, ORMDL, ORMDL family. Evidence form suggests that ORMDLs are involved in protein folding in the ER. Orm proteins have been identified as negative regulators of sphingolipid synthesis that form a conserved complex with serine palmitoyltransferase, the first and rate-limiting enzyme in sphingolipid production. This novel and conserved protein complex, has been termed the SPOTS complex (serine palmitoyltransferase, Orm1/2, Tsc3, and Sac1). cellular organisms(1);Eukaryota(1);Opisthokonta(1);Metazoa(1) -Query_326 206 pfam10775 gnl|CDD|313884 0.00091969 1 159 1 pfam10775, ATP_sub_h, ATP synthase complex subunit h. Subunit h is a component of the yeast mitochondrial F1-F0 ATP synthase. It is essential for the correct assembly and functioning of this enzyme. Subunit h occupies a central place in the peripheral stalk between the F1 sector and the membrane. cellular organisms(1);Eukaryota(1);Opisthokonta(1);Fungi(1) +#query_id query_length cdd_id hit_id evalue startQ endQ frame description superkingdom pident +ds2020-267_5 2436 pfam02123 gnl|CDD|280316 2.04111e-21 184 1476 1 pfam02123, RdRP_4, Viral RNA-directed RNA-polymerase. This family includes RNA-dependent RNA polymerase proteins (RdRPs) from Luteovirus, Totivirus and Rotavirus. Viruses(1);Riboviria(1);Orthornavirae(1);Duplornaviricota(1) 22.535 +ds2020-267_7 2297 pfam00680 gnl|CDD|279070 3.12197e-05 995 1873 -2 pfam00680, RdRP_1, RNA dependent RNA polymerase. Viruses(1);Riboviria(1);Orthornavirae(1);Pisuviricota(1) 19.742 +ds2020-267_8 2029 pfam00680 gnl|CDD|279070 8.86955e-06 840 1706 3 pfam00680, RdRP_1, RNA dependent RNA polymerase. Viruses(1);Riboviria(1);Orthornavirae(1);Pisuviricota(1) 25.314 +ds2020-267_10 1860 pfam02123 gnl|CDD|280316 1.27376e-17 1147 1764 -1 pfam02123, RdRP_4, Viral RNA-directed RNA-polymerase. This family includes RNA-dependent RNA polymerase proteins (RdRPs) from Luteovirus, Totivirus and Rotavirus. Viruses(1);Riboviria(1);Orthornavirae(1);Duplornaviricota(1) 18.868 +ds2020-267_12 1703 pfam00680 gnl|CDD|279070 3.19349e-12 685 1458 -3 pfam00680, RdRP_1, RNA dependent RNA polymerase. Viruses(1);Riboviria(1);Orthornavirae(1);Pisuviricota(1) 27.456 +ds2020-267_75 425 pfam00005 gnl|CDD|306511 3.70622e-07 129 275 -1 pfam00005, ABC_tran, ABC transporter. ABC transporters for a large family of proteins responsible for translocation of a variety of compounds across biological membranes. ABC transporters are the largest family of proteins in many completely sequenced bacteria. ABC transporters are composed of two copies of this domain and two copies of a transmembrane domain pfam00664. These four domains may belong to a single polypeptide as in CFTR, or belong in different polypeptide chains. Bacteria(2);cellular organisms(1);Terrabacteria group(1) 33.974 +ds2020-267_76 386 pfam01347 gnl|CDD|279663 0.000262768 129 275 -1 pfam01347, Vitellogenin_N, Lipoprotein amino terminal region. This family contains regions from: Vitellogenin, Microsomal triglyceride transfer protein and apolipoprotein B-100. These proteins are all involved in lipid transport. This family contains the LV1n chain from lipovitellin, that contains two structural domains. cellular organisms(1);Eukaryota(1);Opisthokonta(1);Metazoa(1) 24.167 +ds2020-267_79 380 pfam04879 gnl|CDD|282703 2.77416e-08 125 274 -2 pfam04879, Molybdop_Fe4S4, Molybdopterin oxidoreductase Fe4S4 domain. This domain is found in formate dehydrogenase H for which the structure is known. This first domain (residues 1 to 60) of Structure 1aa6 is an Fe4S4 cluster just below the protein surface. Bacteria(2);cellular organisms(1);Pseudomonadota(1) 22.921 +ds2020-267_80 379 pfam16203 gnl|CDD|318443 8.05104e-30 131 280 -1 pfam16203, ERCC3_RAD25_C, ERCC3/RAD25/XPB C-terminal helicase. This is the C-terminal helicase domain of ERCC3, RAD25 and XPB helicases. cellular organisms(2);Bacteria(1);Terrabacteria group(1) 29.017 +ds2020-267_81 376 pfam00401 gnl|CDD|306831 6.62013e-05 81 215 -3 pfam00401, ATP-synt_DE, ATP synthase, Delta/Epsilon chain, long alpha-helix domain. Part of the ATP synthase CF(1). These subunits are part of the head unit of the ATP synthase. This subunit is called epsilon in bacteria and delta in mitochondria. In bacteria the delta (D) subunit is equivalent to the mitochondrial Oligomycin sensitive subunit, OSCP (pfam00213). cellular organisms(2);Eukaryota(1);Viridiplantae(1) 27.296 +ds2020-267_320 347 pfam00471 gnl|CDD|306877 8.86568e-13 132 302 3 pfam00471, Ribosomal_L33, Ribosomal protein L33. cellular organisms(2);Bacteria(1);Eukaryota(1) 27.649 +ds2020-267_322 344 pfam00252 gnl|CDD|306711 1.17482e-22 107 295 2 pfam00252, Ribosomal_L16, Ribosomal protein L16p/L10e. cellular organisms(2);Eukaryota(1);Viridiplantae(1) 18.354 +ds2020-267_323 343 pfam00421 gnl|CDD|306845 7.93928e-41 92 337 -1 pfam00421, PSII, Photosystem II protein. cellular organisms(1);Eukaryota(1);Viridiplantae(1);Streptophyta(1) 21.070 +ds2020-267_324 339 pfam01333 gnl|CDD|307480 0.000362606 197 325 -3 pfam01333, Apocytochr_F_C, Apocytochrome F, C-terminal. This is a sub-family of cytochrome C. See pfam00034. cellular organisms(1);Eukaryota(1);Viridiplantae(1);Streptophyta(1) 26.684 +ds2020-267_327 330 pfam00680 gnl|CDD|279070 4.51414e-05 124 282 1 pfam00680, RdRP_1, RNA dependent RNA polymerase. Viruses(1);Riboviria(1);Orthornavirae(1);Pisuviricota(1) 24.942 +ds2020-267_332 320 pfam05860 gnl|CDD|310447 1.29746e-13 167 298 2 pfam05860, Haemagg_act, haemagglutination activity domain. This domain is suggested to be a carbohydrate- dependent haemagglutination activity site. It is found in a range of haemagglutinins and haemolysins. Bacteria(2);cellular organisms(1);Pseudomonadota(1) 22.222 +ds2020-267_333 252 pfam00585 gnl|CDD|278982 1.42752e-05 29 166 2 pfam00585, Thr_dehydrat_C, C-terminal regulatory domain of Threonine dehydratase. Threonine dehydratases pfam00291 all contain a carboxy terminal region. This region may have a regulatory role. Some members contain two copies of this region. This family is homologous to the pfam01842 domain. Bacteria(2);cellular organisms(1);Pseudomonadota(1) 25.916 +ds2020-267_336 251 pfam13188 gnl|CDD|315779 0.000739897 32 241 2 pfam13188, PAS_8, PAS domain. Bacteria(2);cellular organisms(1);Pseudomonadota(1) 27.014 +ds2020-267_337 251 pfam02123 gnl|CDD|280316 3.2928e-08 28 228 -3 pfam02123, RdRP_4, Viral RNA-directed RNA-polymerase. This family includes RNA-dependent RNA polymerase proteins (RdRPs) from Luteovirus, Totivirus and Rotavirus. Viruses(1);Riboviria(1);Orthornavirae(1);Duplornaviricota(1) 37.500 +ds2020-267_338 251 pfam00252 gnl|CDD|306711 7.50297e-12 78 206 -1 pfam00252, Ribosomal_L16, Ribosomal protein L16p/L10e. cellular organisms(2);Eukaryota(1);Viridiplantae(1) 17.308 +ds2020-267_339 250 pfam00227 gnl|CDD|306690 4.91252e-09 10 150 -2 pfam00227, Proteasome, Proteasome subunit. The proteasome is a multisubunit structure that degrades proteins. Protein degradation is an essential component of regulation because proteins can become misfolded, damaged, or unnecessary. Proteasomes and their homologs vary greatly in complexity: from HslV (heat shock locus v), which is encoded by 1 gene in bacteria, to the eukaryotic 20S proteasome, which is encoded by more than 14 genes. Recently evidence of two novel groups of bacterial proteasomes was proposed. The first is Anbu, which is sparsely distributed among cyanobacteria and proteobacteria. The second is call beta-proteobacteria proteasome homolog (BPH). cellular organisms(2);Eukaryota(1);Opisthokonta(1) 21.244 +ds2020-267_343 249 pfam13173 gnl|CDD|315764 2.6724e-08 106 249 1 pfam13173, AAA_14, AAA domain. This family of domains contain a P-loop motif that is characteristic of the AAA superfamily. Bacteria(2);cellular organisms(1);FCB group(1) 24.583 +ds2020-267_362 248 pfam00113 gnl|CDD|278539 3.9331e-13 15 116 -1 pfam00113, Enolase_C, Enolase, C-terminal TIM barrel domain. cellular organisms(2);Bacteria(2) 21.656 +ds2020-267_363 245 pfam00946 gnl|CDD|307203 3.13472e-05 1 141 1 pfam00946, Mononeg_RNA_pol, Mononegavirales RNA dependent RNA polymerase. Members of the Mononegavirales including the Paramyxoviridae, like other non-segmented negative strand RNA viruses, have an RNA-dependent RNA polymerase composed of two subunits, a large protein L and a phosphoprotein P. This is a protein family of the L protein. The L protein confers the RNA polymerase activity on the complex. The P protein acts as a transcription factor. Viruses(1);Riboviria(1);Orthornavirae(1);Negarnaviricota(1) 26.562 +ds2020-267_364 243 pfam00416 gnl|CDD|306841 5.30772e-05 15 134 -2 pfam00416, Ribosomal_S13, Ribosomal protein S13/S18. This family includes ribosomal protein S13 from prokaryotes and S18 from eukaryotes. cellular organisms(2);Bacteria(2) 26.276 +ds2020-267_365 243 pfam00216 gnl|CDD|306682 1.89202e-10 134 241 -3 pfam00216, Bac_DNA_binding, Bacterial DNA-binding protein. Bacteria(2);cellular organisms(1);Pseudomonadota(1) 25.178 +ds2020-267_366 243 pfam13041 gnl|CDD|315669 0.000344884 134 241 -3 pfam13041, PPR_2, PPR repeat family. This repeat has no known function. It is about 35 amino acids long and is found in up to 18 copies in some proteins. The family appears to be greatly expanded in plants and fungi. The repeat has been called PPR. cellular organisms(1);Eukaryota(1);Viridiplantae(1);Streptophyta(1) 17.600 +ds2020-267_370 243 pfam12137 gnl|CDD|314930 3.71293e-05 137 217 -3 pfam12137, RapA_C, RNA polymerase recycling family C-terminal. This domain is found in bacteria. This domain is about 360 amino acids in length. This domain is found associated with pfam00271, pfam00176. The function of this domain is not known, but structurally it forms an alpha-beta fold in nature with a central beta-sheet flanked by helices and loops, the beta-sheet being mainly antiparallel and flanked by four alpha helices, among which the two longer helices exhibit a coiled-coil arrangement. cellular organisms(1);Bacteria(1);Pseudomonadota(1);Gammaproteobacteria(1) 24.942 +ds2020-267_372 242 pfam00146 gnl|CDD|306623 2.12078e-10 22 111 1 pfam00146, NADHdh, NADH dehydrogenase. cellular organisms(1);Eukaryota(1);Opisthokonta(1);Metazoa(1) 24.942 +ds2020-267_373 242 pfam00124 gnl|CDD|306604 4.44151e-07 21 125 3 pfam00124, Photo_RC, Photosynthetic reaction centre protein. cellular organisms(1);Eukaryota(1);Viridiplantae(1);Streptophyta(1) 33.663 +ds2020-267_374 241 pfam02123 gnl|CDD|280316 5.78854e-08 35 214 -1 pfam02123, RdRP_4, Viral RNA-directed RNA-polymerase. This family includes RNA-dependent RNA polymerase proteins (RdRPs) from Luteovirus, Totivirus and Rotavirus. Viruses(1);Riboviria(1);Orthornavirae(1);Duplornaviricota(1) 21.831 +ds2020-267_380 239 pfam06122 gnl|CDD|310603 1.30391e-05 29 172 2 pfam06122, TraH, Conjugative relaxosome accessory transposon protein. The TraH protein is thought to be a relaxosome accessory component, also necessary for transfer but not for H-pilus synthesis within the conjugative transposon. cellular organisms(1);Bacteria(1);Pseudomonadota(1);Gammaproteobacteria(1) 37.888 +ds2020-267_385 239 pfam00361 gnl|CDD|306795 3.63199e-05 70 219 1 pfam00361, Proton_antipo_M, Proton-conducting membrane transporter. This is a family of membrane transporters that inlcudes some 7 of potentially 14-16 TM regions. In many instances the family forms part of complex I that catalyzes the transfer of two electrons from NADH to ubiquinone in a reaction that is associated with proton translocation across the membrane, and in this context is a combination predominantly of subunits 2, 4, 5, 14, L, M and N. In many bacterial species these proteins are probable stand-alone transporters not coupled with oxidoreduction. The family in total represents homologs across the phyla. cellular organisms(1);Eukaryota(1);Opisthokonta(1);Metazoa(1) 18.868 +ds2020-267_386 239 pfam00177 gnl|CDD|306646 1.05327e-06 28 126 1 pfam00177, Ribosomal_S7, Ribosomal protein S7p/S5e. This family contains ribosomal protein S7 from prokaryotes and S5 from eukaryotes. cellular organisms(2);Eukaryota(1);Viridiplantae(1) 29.545 +ds2020-267_395 235 pfam03154 gnl|CDD|308660 0.000842762 28 126 1 pfam03154, Atrophin-1, Atrophin-1 family. Atrophin-1 is the protein product of the dentatorubral-pallidoluysian atrophy (DRPLA) gene. DRPLA OMIM:125370 is a progressive neurodegenerative disorder. It is caused by the expansion of a CAG repeat in the DRPLA gene on chromosome 12p. This results in an extended polyglutamine region in atrophin-1, that is thought to confer toxicity to the protein, possibly through altering its interactions with other proteins. The expansion of a CAG repeat is also the underlying defect in six other neurodegenerative disorders, including Huntington's disease. One interaction of expanded polyglutamine repeats that is thought to be pathogenic is that with the short glutamine repeat in the transcriptional coactivator CREB binding protein, CBP. This interaction draws CBP away from its usual nuclear location to the expanded polyglutamine repeat protein aggregates that are characteristic of the polyglutamine neurodegenerative disorders. This interferes with CBP-mediated transcription and causes cytotoxicity. Eukaryota(1);cellular organisms(1);Opisthokonta(1);Metazoa(1) 36.317 +ds2020-267_403 235 pfam00164 gnl|CDD|278589 1.83229e-23 3 182 3 pfam00164, Ribosom_S12_S23, Ribosomal protein S12/S23. This protein is known as S12 in bacteria and archaea and S23 in eukaryotes. cellular organisms(2);Eukaryota(1);Viridiplantae(1) 21.831 +ds2020-267_404 234 pfam00155 gnl|CDD|306629 0.000251531 3 182 3 pfam00155, Aminotran_1_2, Aminotransferase class I and II. Bacteria(2);cellular organisms(1);Pseudomonadota(1) 25.314 +ds2020-267_835 233 pfam00680 gnl|CDD|279070 0.000703744 3 182 3 pfam00680, RdRP_1, RNA dependent RNA polymerase. Viruses(1);Riboviria(1);Orthornavirae(1);Pisuviricota(1) 28.244 +ds2020-267_837 231 pfam00481 gnl|CDD|306885 0.00063843 3 182 3 pfam00481, PP2C, Protein phosphatase 2C. Protein phosphatase 2C is a Mn++ or Mg++ dependent protein serine/threonine phosphatase. Eukaryota(2);cellular organisms(1);Viridiplantae(1) 22.921 +ds2020-267_838 230 pfam00072 gnl|CDD|306560 5.30837e-08 50 208 2 pfam00072, Response_reg, Response regulator receiver domain. This domain receives the signal from the sensor partner in bacterial two-component systems. It is usually found N-terminal to a DNA binding effector domain. Bacteria(2);cellular organisms(1);Pseudomonadota(1) 34.356 +ds2020-267_843 230 pfam00201 gnl|CDD|278624 2.93544e-07 46 210 1 pfam00201, UDPGT, UDP-glucoronosyl and UDP-glucosyl transferase. cellular organisms(1);Eukaryota(1);Viridiplantae(1);Streptophytina(1) 26.684 +ds2020-267_852 228 pfam17035 gnl|CDD|319097 3.87403e-09 108 203 3 pfam17035, BET, Bromodomain extra-terminal - transcription regulation. The BET, or bromodomain extra-terminal domain, is found on bromodomain proteins that play key roles in development, cancer progression and virus-host pathogenesis. It interacts with NSD3, JMJD6, CHD4, GLTSCR1, and ATAD5 all of which are shown to impart a pTEFb-independent transcriptional activation function on the bromodomain proteins. cellular organisms(1);Eukaryota(1);Opisthokonta(1);Metazoa(1) 34.188 +ds2020-267_855 207 pfam04061 gnl|CDD|309259 7.30581e-19 1 159 1 pfam04061, ORMDL, ORMDL family. Evidence form suggests that ORMDLs are involved in protein folding in the ER. Orm proteins have been identified as negative regulators of sphingolipid synthesis that form a conserved complex with serine palmitoyltransferase, the first and rate-limiting enzyme in sphingolipid production. This novel and conserved protein complex, has been termed the SPOTS complex (serine palmitoyltransferase, Orm1/2, Tsc3, and Sac1). cellular organisms(1);Eukaryota(1);Opisthokonta(1);Metazoa(1) 21.368 +ds2020-267_858 206 pfam10775 gnl|CDD|313884 0.00091969 1 159 1 pfam10775, ATP_sub_h, ATP synthase complex subunit h. Subunit h is a component of the yeast mitochondrial F1-F0 ATP synthase. It is essential for the correct assembly and functioning of this enzyme. Subunit h occupies a central place in the peripheral stalk between the F1 sector and the membrane. cellular organisms(1);Eukaryota(1);Opisthokonta(1);Fungi(1) 32.258
--- a/test-data/input_otu_rps_s2.tab Sun Sep 08 14:09:07 2024 +0000 +++ b/test-data/input_otu_rps_s2.tab Tue May 13 11:52:07 2025 +0000 @@ -1,50 +1,50 @@ -#query_id query_length cdd_id hit_id evalue startQ endQ frame description superkingdom -Query_1 2975 pfam02874 gnl|CDD|308490 6.56656e-19 2202 2405 -1 pfam02874, ATP-synt_ab_N, ATP synthase alpha/beta family, beta-barrel domain. This family includes the ATP synthase alpha and beta subunits the ATP synthase associated with flagella. cellular organisms(2);Eukaryota(1);Viridiplantae(1) -Query_8 1120 pfam00146 gnl|CDD|306623 6.73934e-18 936 1097 -3 pfam00146, NADHdh, NADH dehydrogenase. cellular organisms(1);Eukaryota(1);Opisthokonta(1);Metazoa(1) -Query_19 872 pfam01443 gnl|CDD|307550 7.69575e-33 10 696 -3 pfam01443, Viral_helicase1, Viral (Superfamily 1) RNA helicase. Helicase activity for this family has been demonstrated and NTPase activity. This helicase has multiple roles at different stages of viral RNA replication, as dissected by mutational analysis. Viruses(1);Riboviria(1);Orthornavirae(1);Kitrinoviricota(1) -Query_22 847 pfam13456 gnl|CDD|316018 1.2307e-09 176 397 2 pfam13456, RVT_3, Reverse transcriptase-like. This domain is found in plants and appears to be part of a retrotransposon. cellular organisms(1);Eukaryota(1);Viridiplantae(1);Streptophyta(1) -Query_30 681 pfam00416 gnl|CDD|306841 7.7464e-31 92 409 -3 pfam00416, Ribosomal_S13, Ribosomal protein S13/S18. This family includes ribosomal protein S13 from prokaryotes and S18 from eukaryotes. cellular organisms(2);Bacteria(2) -Query_36 644 pfam00078 gnl|CDD|306564 2.13234e-08 190 636 -3 pfam00078, RVT_1, Reverse transcriptase (RNA-dependent DNA polymerase). A reverse transcriptase gene is usually indicative of a mobile element such as a retrotransposon or retrovirus. Reverse transcriptases occur in a variety of mobile elements, including retrotransposons, retroviruses, group II introns, bacterial msDNAs, hepadnaviruses, and caulimoviruses. Viruses(1);Riboviria(1);Pararnavirae(1);Artverviricota(1) -Query_40 623 pfam00346 gnl|CDD|306783 6.5049e-56 191 496 -2 pfam00346, Complex1_49kDa, Respiratory-chain NADH dehydrogenase, 49 Kd subunit. cellular organisms(2);Bacteria(1);Eukaryota(1) -Query_43 620 pfam00115 gnl|CDD|306596 2.19638e-51 78 548 3 pfam00115, COX1, Cytochrome C and Quinol oxidase polypeptide I. cellular organisms(1);Eukaryota(1);Opisthokonta(1);Metazoa(1) -Query_45 598 pfam00115 gnl|CDD|306596 4.78609e-34 21 302 3 pfam00115, COX1, Cytochrome C and Quinol oxidase polypeptide I. cellular organisms(1);Eukaryota(1);Opisthokonta(1);Metazoa(1) -Query_50 458 pfam02123 gnl|CDD|280316 1.82963e-26 27 443 -1 pfam02123, RdRP_4, Viral RNA-directed RNA-polymerase. This family includes RNA-dependent RNA polymerase proteins (RdRPs) from Luteovirus, Totivirus and Rotavirus. Viruses(1);Riboviria(1);Orthornavirae(1);Duplornaviricota(1) -Query_51 458 pfam03732 gnl|CDD|309014 1.12045e-06 256 441 1 pfam03732, Retrotrans_gag, Retrotransposon gag protein. Gag or Capsid-like proteins from LTR retrotransposons. There is a central motif QGXXEXXXXXFXXLXXH that is common to Retroviridae gag-proteins, but is poorly conserved. cellular organisms(1);Eukaryota(1);Viridiplantae(1);Streptophyta(1) -Query_53 454 pfam14111 gnl|CDD|316622 3.40587e-07 213 353 3 pfam14111, DUF4283, Domain of unknown function (DUF4283). This domain family is found in plants, and is approximately 100 amino acids in length. Considering the very diverse range of other domains it is associated with it is possible that this domain is a binding/guiding region. There are two highly conserved tryptophan residues. cellular organisms(1);Eukaryota(1);Streptophytina(1);Viridiplantae(1) -Query_58 446 pfam01348 gnl|CDD|279664 1.01441e-09 40 303 -3 pfam01348, Intron_maturas2, Type II intron maturase. Group II introns use intron-encoded reverse transcriptase, maturase and DNA endonuclease activities for site-specific insertion into DNA. Although this type of intron is self splicing in vitro they require a maturase protein for splicing in vivo. It has been shown that a specific region of the aI2 intron is needed for the maturase function. This region was found to be conserved in group II introns and called domain X. cellular organisms(1);Eukaryota(1);Viridiplantae(1);Streptophyta(1) -Query_61 442 pfam02123 gnl|CDD|280316 1.50074e-23 115 429 -2 pfam02123, RdRP_4, Viral RNA-directed RNA-polymerase. This family includes RNA-dependent RNA polymerase proteins (RdRPs) from Luteovirus, Totivirus and Rotavirus. Viruses(1);Riboviria(1);Orthornavirae(1);Duplornaviricota(1) -Query_65 433 pfam00253 gnl|CDD|306712 1.66195e-07 329 415 2 pfam00253, Ribosomal_S14, Ribosomal protein S14p/S29e. This family includes both ribosomal S14 from prokaryotes and S29 from eukaryotes. cellular organisms(2);Bacteria(1);Eukaryota(1) -Query_67 426 pfam00078 gnl|CDD|306564 9.00965e-09 268 405 -1 pfam00078, RVT_1, Reverse transcriptase (RNA-dependent DNA polymerase). A reverse transcriptase gene is usually indicative of a mobile element such as a retrotransposon or retrovirus. Reverse transcriptases occur in a variety of mobile elements, including retrotransposons, retroviruses, group II introns, bacterial msDNAs, hepadnaviruses, and caulimoviruses. Viruses(1);Riboviria(1);Pararnavirae(1);Artverviricota(1) -Query_70 424 pfam00665 gnl|CDD|307008 1.57397e-23 93 413 3 pfam00665, rve, Integrase core domain. Integrase mediates integration of a DNA copy of the viral genome into the host chromosome. Integrase is composed of three domains. The amino-terminal domain is a zinc binding domain pfam02022. This domain is the central catalytic domain. The carboxyl terminal domain that is a non-specific DNA binding domain pfam00552. The catalytic domain acts as an endonuclease when two nucleotides are removed from the 3' ends of the blunt-ended viral DNA made by reverse transcription. This domain also catalyzes the DNA strand transfer reaction of the 3' ends of the viral DNA to the 5' ends of the integration site. cellular organisms(2);Viruses(1) -Query_76 406 pfam00361 gnl|CDD|306795 9.79473e-05 212 379 2 pfam00361, Proton_antipo_M, Proton-conducting membrane transporter. This is a family of membrane transporters that inlcudes some 7 of potentially 14-16 TM regions. In many instances the family forms part of complex I that catalyzes the transfer of two electrons from NADH to ubiquinone in a reaction that is associated with proton translocation across the membrane, and in this context is a combination predominantly of subunits 2, 4, 5, 14, L, M and N. In many bacterial species these proteins are probable stand-alone transporters not coupled with oxidoreduction. The family in total represents homologs across the phyla. cellular organisms(1);Eukaryota(1);Opisthokonta(1);Metazoa(1) -Query_88 372 pfam02123 gnl|CDD|280316 7.63867e-10 160 363 -1 pfam02123, RdRP_4, Viral RNA-directed RNA-polymerase. This family includes RNA-dependent RNA polymerase proteins (RdRPs) from Luteovirus, Totivirus and Rotavirus. Viruses(1);Riboviria(1);Orthornavirae(1);Duplornaviricota(1) -Query_108 229 pfam02123 gnl|CDD|280316 5.2142e-12 25 213 -2 pfam02123, RdRP_4, Viral RNA-directed RNA-polymerase. This family includes RNA-dependent RNA polymerase proteins (RdRPs) from Luteovirus, Totivirus and Rotavirus. Viruses(1);Riboviria(1);Orthornavirae(1);Duplornaviricota(1) -Query_111 229 pfam00421 gnl|CDD|306845 5.07684e-21 15 218 3 pfam00421, PSII, Photosystem II protein. cellular organisms(1);Eukaryota(1);Viridiplantae(1);Streptophyta(1) -Query_114 229 pfam05518 gnl|CDD|253234 1.06567e-09 26 229 2 pfam05518, Totivirus_coat, Totivirus coat protein. Viruses(1);Riboviria(1);Duplornaviricota(1);Orthornavirae(1) -Query_118 228 pfam01333 gnl|CDD|307480 2.57329e-21 54 218 3 pfam01333, Apocytochr_F_C, Apocytochrome F, C-terminal. This is a sub-family of cytochrome C. See pfam00034. cellular organisms(1);Eukaryota(1);Viridiplantae(1);Streptophyta(1) -Query_123 228 pfam00006 gnl|CDD|306512 7.83347e-19 20 211 2 pfam00006, ATP-synt_ab, ATP synthase alpha/beta family, nucleotide-binding domain. This entry includes the ATP synthase alpha and beta subunits, the ATP synthase associated with flagella and the termination factor Rho. cellular organisms(2);Bacteria(2) -Query_125 228 pfam00421 gnl|CDD|306845 1.91926e-10 23 226 -3 pfam00421, PSII, Photosystem II protein. cellular organisms(1);Eukaryota(1);Viridiplantae(1);Streptophyta(1) -Query_138 226 pfam00421 gnl|CDD|306845 4.1109e-19 14 193 2 pfam00421, PSII, Photosystem II protein. cellular organisms(1);Eukaryota(1);Viridiplantae(1);Streptophyta(1) -Query_139 226 pfam01660 gnl|CDD|307679 1.36829e-05 15 209 3 pfam01660, Vmethyltransf, Viral methyltransferase. This RNA methyltransferase domain is found in a wide range of ssRNA viruses, including Hordei-, Tobra-, Tobamo-, Bromo-, Clostero- and Caliciviruses. This methyltransferase is involved in mRNA capping. Capping of mRNA enhances its stability. This usually occurs in the nucleus. Therefore, many viruses that replicate in the cytoplasm encode their own. This is a specific guanine-7-methyltransferase domain involved in viral mRNA cap0 synthesis. Specificity for guanine 7 position is shown by NMR in and in vivo role in cap synthesis. Based on secondary structure prediction, the basic fold is believed to be similar to the common AdoMet-dependent methyltransferase fold. A curious feature of this methyltransferase domain is that it together with flanking sequences seems to have guanylyltransferase activity coupled to the methyltransferase activity. The domain is found throughout the so-called Alphavirus superfamily, (including alphaviruses and several other groups). It forms the defining, unique feature of this superfamily. Viruses(1);Riboviria(1);Orthornavirae(1);Kitrinoviricota(1) -Query_148 225 pfam13041 gnl|CDD|315669 1.22135e-07 54 185 -2 pfam13041, PPR_2, PPR repeat family. This repeat has no known function. It is about 35 amino acids long and is found in up to 18 copies in some proteins. The family appears to be greatly expanded in plants and fungi. The repeat has been called PPR. cellular organisms(1);Eukaryota(1);Viridiplantae(1);Streptophyta(1) -Query_155 225 pfam04392 gnl|CDD|282274 7.01878e-09 28 177 -1 pfam04392, ABC_sub_bind, ABC transporter substrate binding protein. This family contains many hypothetical proteins and some ABC transporter substrate binding proteins. Bacteria(2);cellular organisms(1);Terrabacteria group(1) -Query_168 224 pfam00223 gnl|CDD|306687 7.46218e-21 41 205 -2 pfam00223, PsaA_PsaB, Photosystem I psaA/psaB protein. cellular organisms(1);Eukaryota(1);Viridiplantae(1);Streptophyta(1) -Query_171 223 pfam13683 gnl|CDD|316225 0.000987396 41 205 -2 pfam13683, rve_3, Integrase core domain. Bacteria(2);cellular organisms(1);Terrabacteria group(1) -Query_173 223 pfam01809 gnl|CDD|307773 1.03441e-07 121 189 -2 pfam01809, Haemolytic, Haemolytic domain. This domain has haemolytic activity. It is found in short (73-103 amino acid) proteins and contains three conserved cysteine residues. Bacteria(2);cellular organisms(1);Terrabacteria group(1) -Query_189 222 pfam02468 gnl|CDD|280606 3.26069e-17 22 123 1 pfam02468, PsbN, Photosystem II reaction centre N protein (psbN). This is a family of small proteins encoded on the chloroplast genome. psbN is involved in photosystem II during photosynthesis, but its exact role is unknown. cellular organisms(1);Eukaryota(1);Viridiplantae(1);Streptophyta(1) -Query_191 222 pfam00978 gnl|CDD|250270 1.6261e-12 24 206 -2 pfam00978, RdRP_2, RNA dependent RNA polymerase. This family may represent an RNA dependent RNA polymerase. The family also contains the following proteins: 2A protein from bromoviruses putative RNA dependent RNA polymerase from tobamoviruses Non structural polyprotein from togaviruses. Viruses(1);Riboviria(1);Orthornavirae(1);Kitrinoviricota(1) -Query_195 222 pfam00562 gnl|CDD|306936 2.01964e-09 26 145 -3 pfam00562, RNA_pol_Rpb2_6, RNA polymerase Rpb2, domain 6. RNA polymerases catalyze the DNA dependent polymerization of RNA. Prokaryotes contain a single RNA polymerase compared to three in eukaryotes (not including mitochondrial. and chloroplast polymerases). This domain represents the hybrid binding domain and the wall domain. The hybrid binding domain binds the nascent RNA strand / template DNA strand in the Pol II transcription elongation complex. This domain contains the important structural motifs, switch 3 and the flap loop and binds an active site metal ion. This domain is also involved in binding to Rpb1 and Rpb3. Many of the bacterial members contain large insertions within this domain, as region known as dispensable region 2 (DRII). cellular organisms(2);Eukaryota(1);Viridiplantae(1) -Query_201 222 pfam00201 gnl|CDD|278624 0.000513014 26 145 -3 pfam00201, UDPGT, UDP-glucoronosyl and UDP-glucosyl transferase. cellular organisms(1);Eukaryota(1);Viridiplantae(1);Streptophytina(1) -Query_203 222 pfam02670 gnl|CDD|308348 4.98265e-14 25 135 1 pfam02670, DXP_reductoisom, 1-deoxy-D-xylulose 5-phosphate reductoisomerase. This is a family of 1-deoxy-D-xylulose 5-phosphate reductoisomerases. This enzyme catalyzes the formation of 2-C-methyl-D-erythritol 4-phosphate from 1-deoxy-D-xylulose-5-phosphate in the presence of NADPH. This reaction is part of the terpenoid biosynthesis pathway. Bacteria(2);cellular organisms(1);Pseudomonadota(1) -Query_205 221 pfam00329 gnl|CDD|306769 1.68456e-18 81 197 -1 pfam00329, Complex1_30kDa, Respiratory-chain NADH dehydrogenase, 30 Kd subunit. cellular organisms(2);Bacteria(1);Eukaryota(1) -Query_218 220 pfam05724 gnl|CDD|310379 3.15883e-08 38 196 2 pfam05724, TPMT, Thiopurine S-methyltransferase (TPMT). This family consists of thiopurine S-methyltransferase proteins from both eukaryotes and prokaryotes. Thiopurine S-methyltransferase (TPMT) is a cytosolic enzyme that catalyzes S-methylation of aromatic and heterocyclic sulfhydryl compounds, including anticancer and immunosuppressive thiopurines. cellular organisms(2);Bacteria(1);Eukaryota(1) -Query_238 219 pfam02123 gnl|CDD|280316 1.42892e-13 35 199 -3 pfam02123, RdRP_4, Viral RNA-directed RNA-polymerase. This family includes RNA-dependent RNA polymerase proteins (RdRPs) from Luteovirus, Totivirus and Rotavirus. Viruses(1);Riboviria(1);Orthornavirae(1);Duplornaviricota(1) -Query_260 217 pfam02123 gnl|CDD|280316 4.65988e-13 13 210 -2 pfam02123, RdRP_4, Viral RNA-directed RNA-polymerase. This family includes RNA-dependent RNA polymerase proteins (RdRPs) from Luteovirus, Totivirus and Rotavirus. Viruses(1);Riboviria(1);Orthornavirae(1);Duplornaviricota(1) -Query_264 216 pfam02123 gnl|CDD|280316 7.05387e-17 8 214 -3 pfam02123, RdRP_4, Viral RNA-directed RNA-polymerase. This family includes RNA-dependent RNA polymerase proteins (RdRPs) from Luteovirus, Totivirus and Rotavirus. Viruses(1);Riboviria(1);Orthornavirae(1);Duplornaviricota(1) -Query_275 215 pfam02123 gnl|CDD|280316 3.8356e-09 37 198 -3 pfam02123, RdRP_4, Viral RNA-directed RNA-polymerase. This family includes RNA-dependent RNA polymerase proteins (RdRPs) from Luteovirus, Totivirus and Rotavirus. Viruses(1);Riboviria(1);Orthornavirae(1);Duplornaviricota(1) -Query_277 215 pfam00201 gnl|CDD|278624 5.96981e-07 113 193 -2 pfam00201, UDPGT, UDP-glucoronosyl and UDP-glucosyl transferase. cellular organisms(1);Eukaryota(1);Viridiplantae(1);Streptophytina(1) -Query_282 215 pfam02123 gnl|CDD|280316 4.70874e-08 33 209 3 pfam02123, RdRP_4, Viral RNA-directed RNA-polymerase. This family includes RNA-dependent RNA polymerase proteins (RdRPs) from Luteovirus, Totivirus and Rotavirus. Viruses(1);Riboviria(1);Orthornavirae(1);Duplornaviricota(1) -Query_289 214 pfam00361 gnl|CDD|306795 1.62395e-10 59 196 -1 pfam00361, Proton_antipo_M, Proton-conducting membrane transporter. This is a family of membrane transporters that inlcudes some 7 of potentially 14-16 TM regions. In many instances the family forms part of complex I that catalyzes the transfer of two electrons from NADH to ubiquinone in a reaction that is associated with proton translocation across the membrane, and in this context is a combination predominantly of subunits 2, 4, 5, 14, L, M and N. In many bacterial species these proteins are probable stand-alone transporters not coupled with oxidoreduction. The family in total represents homologs across the phyla. cellular organisms(1);Eukaryota(1);Opisthokonta(1);Metazoa(1) -Query_292 211 pfam05892 gnl|CDD|283531 0.000183874 59 196 -1 pfam05892, Tricho_coat, Trichovirus coat protein. This family consists of several coat proteins which are specific to the ssRNA positive-strand, no DNA stage viruses such as the Trichovirus and Vitivirus. Viruses(1);Kitrinoviricota(1);Orthornavirae(1);Tymovirales(1) -Query_293 211 pfam07727 gnl|CDD|311594 9.19953e-05 43 120 1 pfam07727, RVT_2, Reverse transcriptase (RNA-dependent DNA polymerase). A reverse transcriptase gene is usually indicative of a mobile element such as a retrotransposon or retrovirus. Reverse transcriptases occur in a variety of mobile elements, including retrotransposons, retroviruses, group II introns, bacterial msDNAs, hepadnaviruses, and caulimoviruses. This Pfam entry includes reverse transcriptases not recognized by the pfam00078 model. cellular organisms(1);Eukaryota(1);Viridiplantae(1);Streptophyta(1) -Query_297 211 pfam00978 gnl|CDD|250270 2.21971e-14 16 201 1 pfam00978, RdRP_2, RNA dependent RNA polymerase. This family may represent an RNA dependent RNA polymerase. The family also contains the following proteins: 2A protein from bromoviruses putative RNA dependent RNA polymerase from tobamoviruses Non structural polyprotein from togaviruses. Viruses(1);Riboviria(1);Orthornavirae(1);Kitrinoviricota(1) +#query_id query_length cdd_id hit_id evalue startQ endQ frame description superkingdom pident +ds2020-328_2 2975 pfam02874 gnl|CDD|308490 6.56656e-19 2202 2405 -1 pfam02874, ATP-synt_ab_N, ATP synthase alpha/beta family, beta-barrel domain. This family includes the ATP synthase alpha and beta subunits the ATP synthase associated with flagella. cellular organisms(2);Eukaryota(1);Viridiplantae(1) 23.821 +ds2020-328_16 1120 pfam00146 gnl|CDD|306623 6.73934e-18 936 1097 -3 pfam00146, NADHdh, NADH dehydrogenase. cellular organisms(1);Eukaryota(1);Opisthokonta(1);Metazoa(1) 34.959 +ds2020-328_27 872 pfam01443 gnl|CDD|307550 7.69575e-33 10 696 -3 pfam01443, Viral_helicase1, Viral (Superfamily 1) RNA helicase. Helicase activity for this family has been demonstrated and NTPase activity. This helicase has multiple roles at different stages of viral RNA replication, as dissected by mutational analysis. Viruses(1);Riboviria(1);Orthornavirae(1);Kitrinoviricota(1) 30.000 +ds2020-328_38 847 pfam13456 gnl|CDD|316018 1.2307e-09 176 397 2 pfam13456, RVT_3, Reverse transcriptase-like. This domain is found in plants and appears to be part of a retrotransposon. cellular organisms(1);Eukaryota(1);Viridiplantae(1);Streptophyta(1) 33.486 +ds2020-328_39 681 pfam00416 gnl|CDD|306841 7.7464e-31 92 409 -3 pfam00416, Ribosomal_S13, Ribosomal protein S13/S18. This family includes ribosomal protein S13 from prokaryotes and S18 from eukaryotes. cellular organisms(2);Bacteria(2) 34.188 +ds2020-328_40 644 pfam00078 gnl|CDD|306564 2.13234e-08 190 636 -3 pfam00078, RVT_1, Reverse transcriptase (RNA-dependent DNA polymerase). A reverse transcriptase gene is usually indicative of a mobile element such as a retrotransposon or retrovirus. Reverse transcriptases occur in a variety of mobile elements, including retrotransposons, retroviruses, group II introns, bacterial msDNAs, hepadnaviruses, and caulimoviruses. Viruses(1);Riboviria(1);Pararnavirae(1);Artverviricota(1) 18.868 +ds2020-328_41 623 pfam00346 gnl|CDD|306783 6.5049e-56 191 496 -2 pfam00346, Complex1_49kDa, Respiratory-chain NADH dehydrogenase, 49 Kd subunit. cellular organisms(2);Bacteria(1);Eukaryota(1) 28.244 +ds2020-328_42 620 pfam00115 gnl|CDD|306596 2.19638e-51 78 548 3 pfam00115, COX1, Cytochrome C and Quinol oxidase polypeptide I. cellular organisms(1);Eukaryota(1);Opisthokonta(1);Metazoa(1) 26.543 +ds2020-328_43 598 pfam00115 gnl|CDD|306596 4.78609e-34 21 302 3 pfam00115, COX1, Cytochrome C and Quinol oxidase polypeptide I. cellular organisms(1);Eukaryota(1);Opisthokonta(1);Metazoa(1) 23.849 +ds2020-328_44 458 pfam02123 gnl|CDD|280316 1.82963e-26 27 443 -1 pfam02123, RdRP_4, Viral RNA-directed RNA-polymerase. This family includes RNA-dependent RNA polymerase proteins (RdRPs) from Luteovirus, Totivirus and Rotavirus. Viruses(1);Riboviria(1);Orthornavirae(1);Duplornaviricota(1) 28.405 +ds2020-328_45 458 pfam03732 gnl|CDD|309014 1.12045e-06 256 441 1 pfam03732, Retrotrans_gag, Retrotransposon gag protein. Gag or Capsid-like proteins from LTR retrotransposons. There is a central motif QGXXEXXXXXFXXLXXH that is common to Retroviridae gag-proteins, but is poorly conserved. cellular organisms(1);Eukaryota(1);Viridiplantae(1);Streptophyta(1) 37.970 +ds2020-328_46 454 pfam14111 gnl|CDD|316622 3.40587e-07 213 353 3 pfam14111, DUF4283, Domain of unknown function (DUF4283). This domain family is found in plants, and is approximately 100 amino acids in length. Considering the very diverse range of other domains it is associated with it is possible that this domain is a binding/guiding region. There are two highly conserved tryptophan residues. cellular organisms(1);Eukaryota(1);Streptophytina(1);Viridiplantae(1) 27.296 +ds2020-328_47 446 pfam01348 gnl|CDD|279664 1.01441e-09 40 303 -3 pfam01348, Intron_maturas2, Type II intron maturase. Group II introns use intron-encoded reverse transcriptase, maturase and DNA endonuclease activities for site-specific insertion into DNA. Although this type of intron is self splicing in vitro they require a maturase protein for splicing in vivo. It has been shown that a specific region of the aI2 intron is needed for the maturase function. This region was found to be conserved in group II introns and called domain X. cellular organisms(1);Eukaryota(1);Viridiplantae(1);Streptophyta(1) 18.868 +ds2020-328_48 442 pfam02123 gnl|CDD|280316 1.50074e-23 115 429 -2 pfam02123, RdRP_4, Viral RNA-directed RNA-polymerase. This family includes RNA-dependent RNA polymerase proteins (RdRPs) from Luteovirus, Totivirus and Rotavirus. Viruses(1);Riboviria(1);Orthornavirae(1);Duplornaviricota(1) 20.084 +ds2020-328_49 433 pfam00253 gnl|CDD|306712 1.66195e-07 329 415 2 pfam00253, Ribosomal_S14, Ribosomal protein S14p/S29e. This family includes both ribosomal S14 from prokaryotes and S29 from eukaryotes. cellular organisms(2);Bacteria(1);Eukaryota(1) 28.428 +ds2020-328_50 426 pfam00078 gnl|CDD|306564 9.00965e-09 268 405 -1 pfam00078, RVT_1, Reverse transcriptase (RNA-dependent DNA polymerase). A reverse transcriptase gene is usually indicative of a mobile element such as a retrotransposon or retrovirus. Reverse transcriptases occur in a variety of mobile elements, including retrotransposons, retroviruses, group II introns, bacterial msDNAs, hepadnaviruses, and caulimoviruses. Viruses(1);Riboviria(1);Pararnavirae(1);Artverviricota(1) 28.428 +ds2020-328_51 424 pfam00665 gnl|CDD|307008 1.57397e-23 93 413 3 pfam00665, rve, Integrase core domain. Integrase mediates integration of a DNA copy of the viral genome into the host chromosome. Integrase is composed of three domains. The amino-terminal domain is a zinc binding domain pfam02022. This domain is the central catalytic domain. The carboxyl terminal domain that is a non-specific DNA binding domain pfam00552. The catalytic domain acts as an endonuclease when two nucleotides are removed from the 3' ends of the blunt-ended viral DNA made by reverse transcription. This domain also catalyzes the DNA strand transfer reaction of the 3' ends of the viral DNA to the 5' ends of the integration site. cellular organisms(2);Viruses(1) 21.127 +ds2020-328_52 406 pfam00361 gnl|CDD|306795 9.79473e-05 212 379 2 pfam00361, Proton_antipo_M, Proton-conducting membrane transporter. This is a family of membrane transporters that inlcudes some 7 of potentially 14-16 TM regions. In many instances the family forms part of complex I that catalyzes the transfer of two electrons from NADH to ubiquinone in a reaction that is associated with proton translocation across the membrane, and in this context is a combination predominantly of subunits 2, 4, 5, 14, L, M and N. In many bacterial species these proteins are probable stand-alone transporters not coupled with oxidoreduction. The family in total represents homologs across the phyla. cellular organisms(1);Eukaryota(1);Opisthokonta(1);Metazoa(1) 34.188 +ds2020-328_53 372 pfam02123 gnl|CDD|280316 7.63867e-10 160 363 -1 pfam02123, RdRP_4, Viral RNA-directed RNA-polymerase. This family includes RNA-dependent RNA polymerase proteins (RdRPs) from Luteovirus, Totivirus and Rotavirus. Viruses(1);Riboviria(1);Orthornavirae(1);Duplornaviricota(1) 20.084 +ds2020-328_54 229 pfam02123 gnl|CDD|280316 5.2142e-12 25 213 -2 pfam02123, RdRP_4, Viral RNA-directed RNA-polymerase. This family includes RNA-dependent RNA polymerase proteins (RdRPs) from Luteovirus, Totivirus and Rotavirus. Viruses(1);Riboviria(1);Orthornavirae(1);Duplornaviricota(1) 34.746 +ds2020-328_55 229 pfam00421 gnl|CDD|306845 5.07684e-21 15 218 3 pfam00421, PSII, Photosystem II protein. cellular organisms(1);Eukaryota(1);Viridiplantae(1);Streptophyta(1) 34.836 +ds2020-328_56 229 pfam05518 gnl|CDD|253234 1.06567e-09 26 229 2 pfam05518, Totivirus_coat, Totivirus coat protein. Viruses(1);Riboviria(1);Duplornaviricota(1);Orthornavirae(1) 20.084 +ds2020-328_98 228 pfam01333 gnl|CDD|307480 2.57329e-21 54 218 3 pfam01333, Apocytochr_F_C, Apocytochrome F, C-terminal. This is a sub-family of cytochrome C. See pfam00034. cellular organisms(1);Eukaryota(1);Viridiplantae(1);Streptophyta(1) 22.535 +ds2020-328_99 228 pfam00006 gnl|CDD|306512 7.83347e-19 20 211 2 pfam00006, ATP-synt_ab, ATP synthase alpha/beta family, nucleotide-binding domain. This entry includes the ATP synthase alpha and beta subunits, the ATP synthase associated with flagella and the termination factor Rho. cellular organisms(2);Bacteria(2) 20.084 +ds2020-328_612 228 pfam00421 gnl|CDD|306845 1.91926e-10 23 226 -3 pfam00421, PSII, Photosystem II protein. cellular organisms(1);Eukaryota(1);Viridiplantae(1);Streptophyta(1) 15.909 +ds2020-328_613 226 pfam00421 gnl|CDD|306845 4.1109e-19 14 193 2 pfam00421, PSII, Photosystem II protein. cellular organisms(1);Eukaryota(1);Viridiplantae(1);Streptophyta(1) 21.831 +ds2020-328_614 226 pfam01660 gnl|CDD|307679 1.36829e-05 15 209 3 pfam01660, Vmethyltransf, Viral methyltransferase. This RNA methyltransferase domain is found in a wide range of ssRNA viruses, including Hordei-, Tobra-, Tobamo-, Bromo-, Clostero- and Caliciviruses. This methyltransferase is involved in mRNA capping. Capping of mRNA enhances its stability. This usually occurs in the nucleus. Therefore, many viruses that replicate in the cytoplasm encode their own. This is a specific guanine-7-methyltransferase domain involved in viral mRNA cap0 synthesis. Specificity for guanine 7 position is shown by NMR in and in vivo role in cap synthesis. Based on secondary structure prediction, the basic fold is believed to be similar to the common AdoMet-dependent methyltransferase fold. A curious feature of this methyltransferase domain is that it together with flanking sequences seems to have guanylyltransferase activity coupled to the methyltransferase activity. The domain is found throughout the so-called Alphavirus superfamily, (including alphaviruses and several other groups). It forms the defining, unique feature of this superfamily. Viruses(1);Riboviria(1);Orthornavirae(1);Kitrinoviricota(1) 35.041 +ds2020-328_615 225 pfam13041 gnl|CDD|315669 1.22135e-07 54 185 -2 pfam13041, PPR_2, PPR repeat family. This repeat has no known function. It is about 35 amino acids long and is found in up to 18 copies in some proteins. The family appears to be greatly expanded in plants and fungi. The repeat has been called PPR. cellular organisms(1);Eukaryota(1);Viridiplantae(1);Streptophyta(1) 28.244 +ds2020-328_616 225 pfam04392 gnl|CDD|282274 7.01878e-09 28 177 -1 pfam04392, ABC_sub_bind, ABC transporter substrate binding protein. This family contains many hypothetical proteins and some ABC transporter substrate binding proteins. Bacteria(2);cellular organisms(1);Terrabacteria group(1) 25.314 +ds2020-328_617 224 pfam00223 gnl|CDD|306687 7.46218e-21 41 205 -2 pfam00223, PsaA_PsaB, Photosystem I psaA/psaB protein. cellular organisms(1);Eukaryota(1);Viridiplantae(1);Streptophyta(1) 29.688 +ds2020-328_618 223 pfam13683 gnl|CDD|316225 0.000987396 41 205 -2 pfam13683, rve_3, Integrase core domain. Bacteria(2);cellular organisms(1);Terrabacteria group(1) 33.894 +ds2020-328_619 223 pfam01809 gnl|CDD|307773 1.03441e-07 121 189 -2 pfam01809, Haemolytic, Haemolytic domain. This domain has haemolytic activity. It is found in short (73-103 amino acid) proteins and contains three conserved cysteine residues. Bacteria(2);cellular organisms(1);Terrabacteria group(1) 27.296 +ds2020-328_620 222 pfam02468 gnl|CDD|280606 3.26069e-17 22 123 1 pfam02468, PsbN, Photosystem II reaction centre N protein (psbN). This is a family of small proteins encoded on the chloroplast genome. psbN is involved in photosystem II during photosynthesis, but its exact role is unknown. cellular organisms(1);Eukaryota(1);Viridiplantae(1);Streptophyta(1) 27.848 +ds2020-328_621 222 pfam00978 gnl|CDD|250270 1.6261e-12 24 206 -2 pfam00978, RdRP_2, RNA dependent RNA polymerase. This family may represent an RNA dependent RNA polymerase. The family also contains the following proteins: 2A protein from bromoviruses putative RNA dependent RNA polymerase from tobamoviruses Non structural polyprotein from togaviruses. Viruses(1);Riboviria(1);Orthornavirae(1);Kitrinoviricota(1) 22.772 +ds2020-328_682 222 pfam00562 gnl|CDD|306936 2.01964e-09 26 145 -3 pfam00562, RNA_pol_Rpb2_6, RNA polymerase Rpb2, domain 6. RNA polymerases catalyze the DNA dependent polymerization of RNA. Prokaryotes contain a single RNA polymerase compared to three in eukaryotes (not including mitochondrial. and chloroplast polymerases). This domain represents the hybrid binding domain and the wall domain. The hybrid binding domain binds the nascent RNA strand / template DNA strand in the Pol II transcription elongation complex. This domain contains the important structural motifs, switch 3 and the flap loop and binds an active site metal ion. This domain is also involved in binding to Rpb1 and Rpb3. Many of the bacterial members contain large insertions within this domain, as region known as dispensable region 2 (DRII). cellular organisms(2);Eukaryota(1);Viridiplantae(1) 23.864 +ds2020-328_688 222 pfam00201 gnl|CDD|278624 0.000513014 26 145 -3 pfam00201, UDPGT, UDP-glucoronosyl and UDP-glucosyl transferase. cellular organisms(1);Eukaryota(1);Viridiplantae(1);Streptophytina(1) 26.154 +ds2020-328_690 222 pfam02670 gnl|CDD|308348 4.98265e-14 25 135 1 pfam02670, DXP_reductoisom, 1-deoxy-D-xylulose 5-phosphate reductoisomerase. This is a family of 1-deoxy-D-xylulose 5-phosphate reductoisomerases. This enzyme catalyzes the formation of 2-C-methyl-D-erythritol 4-phosphate from 1-deoxy-D-xylulose-5-phosphate in the presence of NADPH. This reaction is part of the terpenoid biosynthesis pathway. Bacteria(2);cellular organisms(1);Pseudomonadota(1) 21.111 +ds2020-328_692 221 pfam00329 gnl|CDD|306769 1.68456e-18 81 197 -1 pfam00329, Complex1_30kDa, Respiratory-chain NADH dehydrogenase, 30 Kd subunit. cellular organisms(2);Bacteria(1);Eukaryota(1) 24.942 +ds2020-328_705 220 pfam05724 gnl|CDD|310379 3.15883e-08 38 196 2 pfam05724, TPMT, Thiopurine S-methyltransferase (TPMT). This family consists of thiopurine S-methyltransferase proteins from both eukaryotes and prokaryotes. Thiopurine S-methyltransferase (TPMT) is a cytosolic enzyme that catalyzes S-methylation of aromatic and heterocyclic sulfhydryl compounds, including anticancer and immunosuppressive thiopurines. cellular organisms(2);Bacteria(1);Eukaryota(1) 25.191 +ds2020-328_725 219 pfam02123 gnl|CDD|280316 1.42892e-13 35 199 -3 pfam02123, RdRP_4, Viral RNA-directed RNA-polymerase. This family includes RNA-dependent RNA polymerase proteins (RdRPs) from Luteovirus, Totivirus and Rotavirus. Viruses(1);Riboviria(1);Orthornavirae(1);Duplornaviricota(1) 20.084 +ds2020-328_747 217 pfam02123 gnl|CDD|280316 4.65988e-13 13 210 -2 pfam02123, RdRP_4, Viral RNA-directed RNA-polymerase. This family includes RNA-dependent RNA polymerase proteins (RdRPs) from Luteovirus, Totivirus and Rotavirus. Viruses(1);Riboviria(1);Orthornavirae(1);Duplornaviricota(1) 34.560 +ds2020-328_751 216 pfam02123 gnl|CDD|280316 7.05387e-17 8 214 -3 pfam02123, RdRP_4, Viral RNA-directed RNA-polymerase. This family includes RNA-dependent RNA polymerase proteins (RdRPs) from Luteovirus, Totivirus and Rotavirus. Viruses(1);Riboviria(1);Orthornavirae(1);Duplornaviricota(1) 21.831 +ds2020-328_762 215 pfam02123 gnl|CDD|280316 3.8356e-09 37 198 -3 pfam02123, RdRP_4, Viral RNA-directed RNA-polymerase. This family includes RNA-dependent RNA polymerase proteins (RdRPs) from Luteovirus, Totivirus and Rotavirus. Viruses(1);Riboviria(1);Orthornavirae(1);Duplornaviricota(1) 21.244 +ds2020-328_764 215 pfam00201 gnl|CDD|278624 5.96981e-07 113 193 -2 pfam00201, UDPGT, UDP-glucoronosyl and UDP-glucosyl transferase. cellular organisms(1);Eukaryota(1);Viridiplantae(1);Streptophytina(1) 20.149 +ds2020-328_769 215 pfam02123 gnl|CDD|280316 4.70874e-08 33 209 3 pfam02123, RdRP_4, Viral RNA-directed RNA-polymerase. This family includes RNA-dependent RNA polymerase proteins (RdRPs) from Luteovirus, Totivirus and Rotavirus. Viruses(1);Riboviria(1);Orthornavirae(1);Duplornaviricota(1) 34.188 +ds2020-328_776 214 pfam00361 gnl|CDD|306795 1.62395e-10 59 196 -1 pfam00361, Proton_antipo_M, Proton-conducting membrane transporter. This is a family of membrane transporters that inlcudes some 7 of potentially 14-16 TM regions. In many instances the family forms part of complex I that catalyzes the transfer of two electrons from NADH to ubiquinone in a reaction that is associated with proton translocation across the membrane, and in this context is a combination predominantly of subunits 2, 4, 5, 14, L, M and N. In many bacterial species these proteins are probable stand-alone transporters not coupled with oxidoreduction. The family in total represents homologs across the phyla. cellular organisms(1);Eukaryota(1);Opisthokonta(1);Metazoa(1) 21.311 +ds2020-328_826 211 pfam05892 gnl|CDD|283531 0.000183874 59 196 -1 pfam05892, Tricho_coat, Trichovirus coat protein. This family consists of several coat proteins which are specific to the ssRNA positive-strand, no DNA stage viruses such as the Trichovirus and Vitivirus. Viruses(1);Kitrinoviricota(1);Orthornavirae(1);Tymovirales(1) 21.782 +ds2020-328_827 211 pfam07727 gnl|CDD|311594 9.19953e-05 43 120 1 pfam07727, RVT_2, Reverse transcriptase (RNA-dependent DNA polymerase). A reverse transcriptase gene is usually indicative of a mobile element such as a retrotransposon or retrovirus. Reverse transcriptases occur in a variety of mobile elements, including retrotransposons, retroviruses, group II introns, bacterial msDNAs, hepadnaviruses, and caulimoviruses. This Pfam entry includes reverse transcriptases not recognized by the pfam00078 model. cellular organisms(1);Eukaryota(1);Viridiplantae(1);Streptophyta(1) 22.535 +ds2020-328_831 211 pfam00978 gnl|CDD|250270 2.21971e-14 16 201 1 pfam00978, RdRP_2, RNA dependent RNA polymerase. This family may represent an RNA dependent RNA polymerase. The family also contains the following proteins: 2A protein from bromoviruses putative RNA dependent RNA polymerase from tobamoviruses Non structural polyprotein from togaviruses. Viruses(1);Riboviria(1);Orthornavirae(1);Kitrinoviricota(1) 18.868
--- a/test-data/rps_test.tab Sun Sep 08 14:09:07 2024 +0000 +++ b/test-data/rps_test.tab Tue May 13 11:52:07 2025 +0000 @@ -1,5 +1,5 @@ -#query_id query_length cdd_id hit_id evalue startQ endQ frame description superkingdom -No definition line 211 pfam01490 gnl|CDD|279788 0.000177299 15 134 -2 pfam01490, Aa_trans, Transmembrane amino acid transporter protein. This transmembrane region is found in many amino acid transporters including UNC-47 and MTR. UNC-47 encodes a vesicular amino butyric acid (GABA) transporter, (VGAT). UNC-47 is predicted to have 10 transmembrane domains. MTR is a N system amino acid transporter system protein involved in methyltryptophan resistance. Other members of this family include proline transporters and amino acid permeases. cellular organisms(1);Eukaryota(1);Opisthokonta(1);Metazoa(1) -ds2020-267_4 2297 pfam00680 gnl|CDD|279070 3.12197e-05 995 1873 -2 pfam00680, RdRP_1, RNA dependent RNA polymerase. Viruses(1);Riboviria(1);Orthornavirae(1);Pisuviricota(1) -ds2020-267_5 2029 pfam00680 gnl|CDD|279070 8.86955e-06 840 1706 3 pfam00680, RdRP_1, RNA dependent RNA polymerase. Viruses(1);Riboviria(1);Orthornavirae(1);Pisuviricota(1) -ds2020-267_6 1860 pfam02123 gnl|CDD|280316 1.27376e-17 1147 1764 -1 pfam02123, RdRP_4, Viral RNA-directed RNA-polymerase. This family includes RNA-dependent RNA polymerase proteins (RdRPs) from Luteovirus, Totivirus and Rotavirus. Viruses(1);Riboviria(1);Orthornavirae(1);Duplornaviricota(1) +#query_id query_length cdd_id hit_id evalue startQ endQ frame description superkingdom pident +No definition line 211 pfam01490 gnl|CDD|279788 0.000177299 15 134 -2 pfam01490, Aa_trans, Transmembrane amino acid transporter protein. This transmembrane region is found in many amino acid transporters including UNC-47 and MTR. UNC-47 encodes a vesicular amino butyric acid (GABA) transporter, (VGAT). UNC-47 is predicted to have 10 transmembrane domains. MTR is a N system amino acid transporter system protein involved in methyltryptophan resistance. Other members of this family include proline transporters and amino acid permeases. cellular organisms(1);Eukaryota(1);Opisthokonta(1);Metazoa(1) 35.000 +ds2020-267_4 2297 pfam00680 gnl|CDD|279070 3.12197e-05 995 1873 -2 pfam00680, RdRP_1, RNA dependent RNA polymerase. Viruses(1);Riboviria(1);Orthornavirae(1);Pisuviricota(1) 16.986 +ds2020-267_5 2029 pfam00680 gnl|CDD|279070 8.86955e-06 840 1706 3 pfam00680, RdRP_1, RNA dependent RNA polymerase. Viruses(1);Riboviria(1);Orthornavirae(1);Pisuviricota(1) 17.974 +ds2020-267_6 1860 pfam02123 gnl|CDD|280316 1.27376e-17 1147 1764 -1 pfam02123, RdRP_4, Viral RNA-directed RNA-polymerase. This family includes RNA-dependent RNA polymerase proteins (RdRPs) from Luteovirus, Totivirus and Rotavirus. Viruses(1);Riboviria(1);Orthornavirae(1);Resentoviricetes(1) 23.671
--- a/virAnnot_rps2tsv.xml Sun Sep 08 14:09:07 2024 +0000 +++ b/virAnnot_rps2tsv.xml Tue May 13 11:52:07 2025 +0000 @@ -1,4 +1,4 @@ -<tool id="virAnnot_rps2tsv" name="virAnnot Rps2tsv" version="1.1.0+galaxy0" profile="21.05"> +<tool id="virAnnot_rps2tsv" name="virAnnot Rps2tsv" version="@TOOL_VERSION@+galaxy0" profile="21.05"> <description>Convert xml rpstblast results to tab file with taxonomic informations</description> <macros> <import>macros.xml</import> @@ -30,7 +30,7 @@ <param name="max_evalue" value="0.0001"/> <output name="output" file="rps_test.tab"> <assert_contents> - <has_n_columns n="10" /> + <has_n_columns n="11" /> <has_n_lines n="5" /> <has_text text="pfam00680, RdRP_1" /> </assert_contents> @@ -42,8 +42,11 @@ This module takes as input rps XML file from rps motives annotation. The standard maximum evalue is 0.0001 [default value]. The expected result is a tabular file. See example: -#query_id query_length cdd_id hit_id evalue startQ endQ frame description superkingdom -ds2020-267_120 339 pfam01333 gnl|CDD|366578 0.000848733 197 325 -3 pfam01333, Apocytochr_F_C, Apocytochrome F, C-terminal. This is a sub-family of cytochrome C. See pfam00034. Eukaryota(227);Bacteria(73); +#query_id query_length cdd_id hit_id evalue startQ endQ frame description superkingdom pident +ds2020-267_4 2297 pfam00680 gnl|CDD|279070 3.12197e-05 995 1873 -2 pfam00680, RdRP_1, RNA dependent RNA polymerase. Viruses(1);Riboviria(1);Orthornavirae(1);Pisuviricota(1) 16.986 +ds2020-267_5 2029 pfam00680 gnl|CDD|279070 8.86955e-06 840 1706 3 pfam00680, RdRP_1, RNA dependent RNA polymerase. Viruses(1);Riboviria(1);Orthornavirae(1);Pisuviricota(1) 17.974 +ds2020-267_6 1860 pfam02123 gnl|CDD|280316 1.27376e-17 1147 1764 -1 pfam02123, RdRP_4, Viral RNA-directed RNA-polymerase. This family includes RNA-dependent RNA polymerase proteins (RdRPs) from Luteovirus, Totivirus and Rotavirus. Viruses(1);Riboviria(1);Orthornavirae(1);Resentoviricetes(1) 23.671 + ]]></help> <expand macro="citations" />
