| Next changeset 1:84d20d9c292d (2024-05-18) |
|
Commit message:
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/virAnnot commit 3a3b40c15ae5e82334f016e88b1f3c5bbbb3b2cd |
|
added:
blast2tsv.py macros.xml otu.py rps2tree_html.py rps2tsv.py seek_otu.R test-data/blast2tsv_contigs.fa test-data/blast2tsv_contigs.txt test-data/blast2tsv_input.xml test-data/blast2tsv_output.tab test-data/blast2tsv_output_with_rn.tab test-data/blast2tsv_read_nb.tab test-data/blast2tsv_reads.txt test-data/blast2tsv_reads_with_rn.txt test-data/index.html test-data/otu_s1.fa test-data/otu_s1_rps.tab test-data/otu_s1_tblastx.tab test-data/otu_s2.fa test-data/otu_s2_rps.tab test-data/otu_s2_tblastx.tab test-data/rps_test.tab test-data/rps_test.xml virAnnot_rps2tsv.xml |
| b |
| diff -r 000000000000 -r bbaa89f070f4 blast2tsv.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/blast2tsv.py Mon Mar 04 19:56:16 2024 +0000 |
| [ |
| b'@@ -0,0 +1,284 @@\n+#!/usr/bin/env python3\n+\n+\n+# Name: blast2tsv\n+# Author(s): Sebastien Theil, Marie Lefebvre - INRAE\n+# Aims: Convert blast xml output to tsv and add taxonomy\n+\n+\n+import argparse\n+import csv\n+import logging as log\n+import os\n+\n+from Bio import Entrez\n+from Bio import SeqIO\n+from Bio.Blast import NCBIXML\n+from ete3 import NCBITaxa\n+\n+ncbi = NCBITaxa()\n+\n+\n+def main():\n+ options = _set_options()\n+ _set_log_level(options.verbosity)\n+ hits = _read_xml(options)\n+ _write_tsv(options, hits)\n+\n+\n+def _guess_database(accession):\n+ """Guess the correct database for querying based off the format of the accession"""\n+ database_mappings_refseq = {\'AC_\': \'nuccore\', \'NC_\': \'nuccore\', \'NG_\': \'nuccore\',\n+ \'NT_\': \'nuccore\', \'NW_\': \'nuccore\', \'NZ_\': \'nuccore\',\n+ \'AP_\': \'protein\', \'NP_\': \'protein\', \'YP_\': \'protein\',\n+ \'XP_\': \'protein\', \'WP_\': \'protein\'}\n+ return database_mappings_refseq[accession[0:3]]\n+\n+\n+def _read_xml(options):\n+ """\n+ Parse XML blast results file\n+ Keep only the first hit\n+ """\n+ log.info("Read XML file.")\n+ results = open(options.xml_file, \'r\')\n+ records = NCBIXML.parse(results)\n+ xml_results = {}\n+ for blast_record in records:\n+ for aln in blast_record.alignments:\n+ hit_count = 1\n+ for hit in aln.hsps:\n+ hsp = {}\n+ if hit_count == 1:\n+ first_hit_frame = hit.frame[1] if len(hit.frame) > 0 else 0 # strand\n+ cumul_hit_identity = hit.identities if hit.identities else 0\n+ cumul_hit_score = hit.bits # hit score\n+ cumul_hit_evalue = hit.expect # evalue\n+ cumul_hit_length = hit.align_length if hit.align_length is not None else 0\n+ hit_count = hit_count + 1\n+ else:\n+ # all HSPs in different strand than 1st HSPs will be discarded.\n+ if (first_hit_frame > 0 and hit.frame[1] > 0) or (first_hit_frame < 0 and hit.frame[1] < 0):\n+ cumul_hit_identity = cumul_hit_identity + hit.identities\n+ cumul_hit_length = cumul_hit_length + hit.align_length\n+ 
cumul_hit_evalue = cumul_hit_evalue + hit.expect\n+ cumul_hit_score = cumul_hit_score + hit.bits\n+ hit_count = hit_count + 1\n+ if hit_count == 1:\n+ final_hit_count = hit_count\n+ elif hit_count > 1:\n+ final_hit_count = hit_count - 1\n+ hsp["evalue"] = cumul_hit_evalue / final_hit_count # The smaller the E-value, the better the match\n+ hsp["query_id"] = blast_record.query_id\n+ hsp["query_length"] = blast_record.query_length # length of the query\n+ hsp["accession"] = aln.accession.replace("ref|", "")\n+ hsp["description"] = aln.hit_def\n+ hsp["hit_length"] = aln.length # length of the hit\n+ hsp["hsp_length"] = hit.align_length # length of the hsp alignment\n+ hsp["queryOverlap"] = _get_overlap_value(options.algo, hsp, \'hsp\', hsp["query_length"])[0]\n+ if cumul_hit_length == 0:\n+ hsp["percentIdentity"] = round(cumul_hit_identity, 1) # identity percentage\n+ else:\n+ hsp["percentIdentity"] = round(cumul_hit_identity / cumul_hit_length * 100, 1) # identity percentage\n+ hsp["score"] = cumul_hit_score # The higher the bit-score, the better the sequence similarity\n+ hsp["num_hsps"] = final_hit_count\n+ hsp["hit_cumul_length"] = cumul_hit_length\n+ hsp["hitOverlap"] = _get_overlap_value(options.algo, hsp, \'hit\', hsp["query_length"])[1]\n+ db = _guess_database(hsp["accession"])\n+ try:\n+ handle = Entrez.esummary(db=db, id=hsp["accession"])\n+ '..b' line_count += 1\n+ else:\n+ # no annotation\n+ if len(row) == 16:\n+ if row[14] != "":\n+ nb_reads = row[2]\n+ if nb_reads == "":\n+ current_reads_nb = 0\n+ log.debug("No reads number for " + row[1])\n+ else:\n+ current_reads_nb = int(nb_reads)\n+ contig_id = row[14]\n+ if contig_id in abundance:\n+ # add reads\n+ abundance[contig_id]["reads_nb"] = abundance[row[14]]["reads_nb"] + current_reads_nb\n+ abundance[contig_id]["contigs_nb"] = abundance[row[14]]["contigs_nb"] + 1\n+ else:\n+ # init reads for this taxo\n+ abundance[contig_id] = {}\n+ abundance[contig_id]["reads_nb"] = current_reads_nb\n+ 
abundance[contig_id]["contigs_nb"] = 1\n+ else:\n+ log.debug("No annotations for contig " + row[1])\n+ else:\n+ log.debug("No annotations for contig " + row[1])\n+ log.debug(abundance)\n+ reads_file = open(options.output + "/blast2tsv_reads.txt", "w+")\n+ for taxo in abundance:\n+ reads_file.write(str(abundance[taxo]["reads_nb"]))\n+ reads_file.write("\\t")\n+ reads_file.write("\\t".join(taxo.split(";")))\n+ reads_file.write("\\n")\n+ reads_file.close()\n+ log.info("Abundance file created " + options.output + "/blast2tsv_reads.txt")\n+ contigs_file = open(options.output + "/blast2tsv_contigs.txt", "w+")\n+ for taxo in abundance:\n+ contigs_file.write(str(abundance[taxo]["contigs_nb"]))\n+ contigs_file.write("\\t")\n+ contigs_file.write("\\t".join(taxo.split(";")))\n+ contigs_file.write("\\n")\n+ contigs_file.close()\n+ log.info("Abundance file created " + options.output + "/blast2tsv_contigs.txt")\n+\n+\n+def _set_options():\n+ parser = argparse.ArgumentParser()\n+ parser.add_argument(\'-x\', \'--xml\', help=\'XML files with results of blast\', action=\'store\', required=True, dest=\'xml_file\')\n+ parser.add_argument(\'-rn\', \'--read-count\', help=\'Tab-delimited file associating seqID with read number.\', action=\'store\', dest=\'rn_file\')\n+ parser.add_argument(\'-c\', \'--contigs\', help=\'FASTA file with contigs sequence.\', action=\'store\', required=True, dest=\'fasta_file\')\n+ parser.add_argument(\'-me\', \'--max_evalue\', help=\'Max evalue\', action=\'store\', type=float, default=0.0001, dest=\'max_evalue\')\n+ parser.add_argument(\'-qov\', \'--min_query_overlap\', help=\'Minimum query overlap\', action=\'store\', type=int, default=5, dest=\'min_qov\')\n+ parser.add_argument(\'-mhov\', \'--min_hit_overlap\', help=\'Minimum hit overlap\', action=\'store\', type=int, default=5, dest=\'min_hov\')\n+ parser.add_argument(\'-s\', \'--min_score\', help=\'Minimum score\', action=\'store\', type=int, default=30, dest=\'min_score\')\n+ parser.add_argument(\'-a\', 
\'--algo\', help=\'Blast type detection (BLASTN|BLASTP|BLASTX|TBLASTX|TBLASTN|DIAMONDX).\', action=\'store\', type=str, default=\'BLASTX\', dest=\'algo\')\n+ parser.add_argument(\'-o\', \'--out\', help=\'The output file (.csv).\', action=\'store\', type=str, default=\'./blast2tsv\', dest=\'output\')\n+ parser.add_argument(\'-v\', \'--verbosity\', help=\'Verbose level\', action=\'store\', type=int, choices=[1, 2, 3, 4], default=1)\n+ args = parser.parse_args()\n+ return args\n+\n+\n+def _set_log_level(verbosity):\n+ if verbosity == 1:\n+ log_format = \'%(asctime)s %(levelname)-8s %(message)s\'\n+ log.basicConfig(level=log.INFO, format=log_format)\n+ elif verbosity == 3:\n+ log_format = \'%(filename)s:%(lineno)s - %(asctime)s %(levelname)-8s %(message)s\'\n+ log.basicConfig(level=log.DEBUG, format=log_format)\n+\n+\n+if __name__ == "__main__":\n+ main()\n' |
| b |
| diff -r 000000000000 -r bbaa89f070f4 macros.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/macros.xml Mon Mar 04 19:56:16 2024 +0000 |
| [ |
| @@ -0,0 +1,25 @@ +<macros> + <xml name="requirements"> + <requirements> + <requirement type="package" version="1.83">biopython</requirement> + <requirement type="package" version="3.1.3">ete3</requirement> + <requirement type="package" version="1.2.4">clustalo</requirement> + <requirement type="package" version="8.5.0">curl</requirement> + <requirement type="package" version="4.3.2">r-base</requirement> + <requirement type="package" version="23.12.0">pyaml</requirement> + <requirement type="package" version="3.1.2">openpyxl</requirement> + <requirement type="package" version="3.1.9">xlsxwriter</requirement> + <requirement type="package" version="2.0.1">xlrd</requirement> + <requirement type="package" version="2.2.0">pandas</requirement> + <requirement type="package" version="2.8.1">krona</requirement> + <requirement type="package" version="3.0">zip</requirement> + <yield /> + </requirements> + </xml> + <token name="@HEADLESS@"><![CDATA[export QT_QPA_PLATFORM='offscreen' &&]]></token> + <xml name="citations"> + <citations> + <citation type="doi">10.1094/PBIOMES-07-19-0037-A</citation> + </citations> + </xml> +</macros> |
| b |
| diff -r 000000000000 -r bbaa89f070f4 otu.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/otu.py Mon Mar 04 19:56:16 2024 +0000 |
| [ |
| b'@@ -0,0 +1,442 @@\n+#!/usr/bin/env python3\n+\n+\n+# Name: virAnnot_otu\n+# Author: Marie Lefebvre - INRAE\n+# Reuirements: Ete3 toolkit and external apps\n+# Aims: Create viral OTUs based on RPS and Blast annotations\n+\n+\n+import argparse\n+import csv\n+import logging as log\n+import os\n+import random\n+import re\n+\n+import pandas as pd\n+import xlsxwriter\n+from Bio import SeqIO\n+from Bio.Align.Applications import ClustalOmegaCommandline\n+from ete3 import NodeStyle, SeqGroup, SeqMotifFace, Tree, TreeStyle\n+\n+\n+def main():\n+ """\n+ 1 - retrieve info (sequence, query_id, taxo) from RPS file\n+ 2 - align protein sequences of the same domain, calculate\n+ matrix of distances, generate trees\n+ 3 - get statistics (read number) per otu\n+ 4 - create HTML report\n+ """\n+ options = _set_options()\n+ _set_log_level(options.verbosity)\n+ hits_collection = _cut_sequence(options)\n+ _align_sequences(options, hits_collection)\n+ _get_stats(options, hits_collection)\n+ _create_html(options, hits_collection)\n+\n+\n+def _cut_sequence(options):\n+ """\n+ Retrieve viral hits and sequences from RPS files\n+ """\n+ log.info("Cut sequences")\n+ i = 0 # keep track of iterations over rps files to use the corresponding fasta file\n+ collection = {}\n+ options.rps.sort()\n+ for rps_file in options.rps:\n+ log.debug("Reading rps file " + str(rps_file))\n+ with open(rps_file[0], \'r\') as rps_current_file:\n+ rps_reader = csv.reader(rps_current_file, delimiter=\'\\t\')\n+ headers = 0\n+ for row in rps_reader:\n+ if headers == 0:\n+ # headers\n+ headers += 1\n+ else:\n+ if row[1] == "no_hit":\n+ pass\n+ else:\n+ query_id = row[0]\n+ cdd_id = row[2]\n+ startQ = int(row[5])\n+ endQ = int(row[6])\n+ frame = float(row[7])\n+ description = row[8]\n+ superkingdom = row[9]\n+ match = re.search("Viruses", superkingdom)\n+ # if contig is viral then retrieve sequence\n+ if match:\n+ options.fasta.sort()\n+ seq = _retrieve_fasta_seq(options.fasta[i][0], query_id)\n+ seq_length = 
len(seq)\n+ if endQ < seq_length:\n+ seq = seq[startQ - 1:endQ]\n+ else:\n+ seq = seq[startQ - 1:seq_length]\n+ if frame < 0:\n+ seq = seq.reverse_complement()\n+ prot = seq.translate()\n+ if len(prot) >= options.min_protein_length:\n+ log.debug("Add " + query_id + " to collection")\n+ if cdd_id not in collection:\n+ collection[cdd_id] = {}\n+ collection[cdd_id][query_id] = {}\n+ collection[cdd_id][query_id]["nuccleotide"] = seq\n+ collection[cdd_id][query_id]["protein"] = prot\n+ collection[cdd_id][query_id]["full_description"] = description\n+ if options.blast is not None:\n+ options.blast.sort()\n+ with open(options.blast[i][0], \'r\') as blast_current_file:\n+ blast_reader = csv.reader(blast_current_file, delimiter=\'\\t\')\n+ for b_query in blast_reader:\n+ if b_query[1] == query_id:\n+ collection[cdd_id][query_id]["nb"] = b_quer'..b'(samples_list) + 2, ",".join(otu_collection[otu][\'contigs_list\']))\n+ row += 1\n+ workbook.close()\n+ read_file = pd.ExcelFile(file_xlsx)\n+ for sheet in read_file.sheet_names:\n+ cluster_nb_reads_file = options.output + "/" + sheet.replace(" ", "_") + "/cluster_nb_reads_files.tab"\n+ data_xls = pd.read_excel(file_xlsx, sheet, dtype=str, index_col=None)\n+ data_xls.to_csv(cluster_nb_reads_file, encoding=\'utf-8\', index=False, sep=\'\\t\')\n+\n+\n+def _create_html(options, hits_collection):\n+ """\n+ Create HTML file with all results\n+ """\n+ # create mapping file with all informations to use to create HTML report\n+ map_file_path = options.output + "/map.txt"\n+ if os.path.exists(map_file_path):\n+ os.remove(map_file_path)\n+\n+ map_file = open(map_file_path, "w+")\n+ headers = [\'#cdd_id\', \'align_files\', \'tree_files\', \'cluster_files\', \'cluster_nb_reads_files\', \'pairwise_files\', \'description\', \'full_description\\n\']\n+ map_file.write("\\t".join(headers))\n+ for cdd_id in hits_collection:\n+ cdd_output = hits_collection[cdd_id]["short_description"].replace(" ", "_")\n+ short_description = cdd_output\n+ 
file_seq_aligned = cdd_output + \'/seq_aligned.final_tree.fa\'\n+ tree_file = cdd_output + \'/tree.dnd.png\'\n+ file_cluster = cdd_output + \'/otu_cluster.csv\'\n+ file_matrix = cdd_output + "/identity_matrix.csv"\n+ cluster_nb_reads_files = cdd_output + "/cluster_nb_reads_files.tab"\n+ map_file.write(cdd_id + "\\t" + file_seq_aligned + "\\t" + tree_file + "\\t")\n+ map_file.write(file_cluster + "\\t" + cluster_nb_reads_files + "\\t" + file_matrix + "\\t")\n+ map_file.write(short_description + "\\t" + hits_collection[cdd_id]["full_description"] + "\\n")\n+ map_file.close()\n+ log.info("Writing HTML report")\n+ html_cmd = os.path.join(options.tool_path, \'rps2tree_html.py\') + \' -m \' + map_file_path + \' -o \' + options.output\n+ log.debug(html_cmd)\n+ os.system(html_cmd)\n+\n+\n+def _set_options():\n+ parser = argparse.ArgumentParser()\n+ parser.add_argument(\'-b\', \'--blast\', help=\'TAB blast file from blast2ecsv module.\', action=\'append\', required=False, dest=\'blast\', nargs=\'+\')\n+ parser.add_argument(\'-r\', \'--rps\', help=\'TAB rpsblast file from rps2ecsv module.\', action=\'append\', required=True, dest=\'rps\', nargs=\'+\')\n+ parser.add_argument(\'-f\', \'--fasta\', help=\'FASTA file with contigs\', action=\'append\', required=True, dest=\'fasta\', nargs=\'+\')\n+ parser.add_argument(\'-p\', \'--percentage\', help=\'Percentage similarity threshold for OTUs cutoff.\', action=\'store\', type=int, default=90, dest=\'perc\')\n+ parser.add_argument(\'-vp\', \'--viral_portion\', help=\'Minimun portion of viral sequences in RPS domain to be included.\', action=\'store\', type=float, default=0.3, dest=\'viral_portion\')\n+ parser.add_argument(\'-mpl\', \'--min_protein_length\', help=\'Minimum query protein length.\', action=\'store\', type=int, default=100, dest=\'min_protein_length\')\n+ parser.add_argument(\'-tp\', \'--tool_path\', help=\'Path to otu_seek.R\', action=\'store\', type=str, default=\'./\', dest=\'tool_path\')\n+ 
parser.add_argument(\'-o\', \'--out\', help=\'The output directory\', action=\'store\', type=str, default=\'./Rps2tree_OTU\', dest=\'output\')\n+ parser.add_argument(\'-rgb\', \'--rgb-conf\', help=\'Color palette for contigs coloration\', action=\'store\', type=str, default=\'rgb.txt\', dest=\'file_rgb\')\n+ parser.add_argument(\'-v\', \'--verbosity\', help=\'Verbose level\', action=\'store\', type=int, choices=[1, 2, 3, 4], default=1)\n+ args = parser.parse_args()\n+ return args\n+\n+\n+def _set_log_level(verbosity):\n+ if verbosity == 1:\n+ log_format = \'%(asctime)s %(levelname)-8s %(message)s\'\n+ log.basicConfig(level=log.INFO, format=log_format)\n+ elif verbosity == 3:\n+ log_format = \'%(filename)s:%(lineno)s - %(asctime)s %(levelname)-8s %(message)s\'\n+ log.basicConfig(level=log.DEBUG, format=log_format)\n+\n+\n+if __name__ == "__main__":\n+ main()\n' |
| b |
| diff -r 000000000000 -r bbaa89f070f4 rps2tree_html.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rps2tree_html.py Mon Mar 04 19:56:16 2024 +0000 |
| [ |
#!/usr/bin/env python3
"""Build the rps2tree ``index.html`` report from a tab-separated map file.

Every data row of the map file describes one CDD domain (alignment, tree,
cluster, read-count and pairwise-matrix files plus descriptions). For each
row a Google Charts table is generated from the row's read-count file and
links to the other files are written into the page.
"""
import argparse
import csv
import logging
import sys


logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Headers of the read-count table that are declared as 'string' columns in
# the Google table; every other header is declared as 'number'.
_STRING_COLUMNS = ('#OTU_name', 'taxonomy', 'contigs_list', 'seq_list')


def main():
    """Read the map file, render the report and write ``<out>/index.html``."""
    options = _set_options()
    try:
        data, headers = _read_map_file(options.map)
    finally:
        # argparse.FileType opened this handle for us; release it explicitly.
        options.map.close()
    html = _print_html(data, headers, options.out)
    index_file = options.out + '/index.html'
    with open(index_file, mode='w') as fh:
        fh.write(html)


def _get_google_script_headers(data, headers, out_dir):
    """Return the ``<script>`` elements that load and draw every table.

    One JavaScript function is emitted per map row; its name is the chart
    name with '-' replaced by '_' so that it is a valid identifier.
    """
    html = '<script type="text/javascript" src="https://www.gstatic.com/charts/loader.js"></script>' + "\n"
    html += '<script type="text/javascript">' + "\n"
    html += 'google.charts.load(\'current\', {\'packages\':[\'table\']});' + "\n"
    chart_names, java_scripts = _get_google_js(data, headers, out_dir)
    for chart_name, java_script in zip(chart_names, java_scripts):
        function_name = chart_name.replace('-', '_')
        html += 'google.charts.setOnLoadCallback(' + function_name + ');' + "\n"
        html += 'function ' + function_name + '() {' + "\n"
        html += java_script + "\n"
        html += '}' + "\n"
    html += '</script>' + "\n"
    return html


def _js_string(value):
    """Return *value* as a single-quoted JavaScript string, or ``null``.

    ``None`` (an empty or 'null' cell) becomes the JavaScript literal
    ``null``; backslashes and single quotes are escaped so that free text
    cannot terminate the literal early.
    """
    if value is None:
        return 'null'
    return "'" + value.replace('\\', '\\\\').replace("'", "\\'") + "'"


def _js_number(value):
    """Return *value* verbatim as a JavaScript number, or ``null`` for ``None``."""
    return 'null' if value is None else value


def _get_google_js(data, headers, out_dir):
    """Build the JavaScript body that fills and draws one table per map row.

    Parameters:
        data: list of dicts, one per map row (see ``_read_map_file``).
        headers: map-file headers (unused, kept for interface stability).
        out_dir: directory the ``cluster_nb_reads_files`` paths are relative to.

    Returns:
        ``(chart_names, java_scripts)``: two parallel lists.
    """
    java_scripts = []
    chart_names = []
    for cdd in data:
        chart_name = cdd['cdd_id'] + '_' + cdd['description']
        chart_names.append(chart_name)
        mat, head = _parse_csv(out_dir + '/' + cdd['cluster_nb_reads_files'])
        js = 'var data = new google.visualization.DataTable();' + "\n"
        for el in head:
            column_type = 'string' if el in _STRING_COLUMNS else 'number'
            js += 'data.addColumn(\'' + column_type + '\', \'' + el + '\');' + "\n"
        js += 'data.addRows([' + "\n"
        last_row = len(mat) - 1
        for j, row in enumerate(mat):
            # Cells are rendered by position: the first and the last two
            # columns are quoted text, everything in between is numeric.
            # Missing or empty cells are emitted as null instead of crashing
            # on string concatenation with None.
            cells = [_js_string(row.get(head[0]))]
            cells.extend(_js_number(row.get(name)) for name in head[1:len(head) - 2])
            cells.append(_js_string(row.get(head[len(head) - 2])))
            cells.append(_js_string(row.get(head[len(head) - 1])))
            js += '[' + ','.join(cells) + ']'
            if j != last_row:
                js += ','
            js += "\n"
        js += ']);' + "\n"
        js += 'var table = new google.visualization.Table(document.getElementById(\'' + chart_name.replace('-', '_') + '_div' + '\'));' + "\n"
        js += 'table.draw(data, {showRowNumber: false, width: \'70%\', height: \'70%\'});' + "\n"
        java_scripts.append(js)
    return chart_names, java_scripts


def _parse_csv(file):
    """Read a tab-separated table with a header line.

    Returns:
        ``(matrix, headers)`` where ``matrix`` is a list of dicts keyed by
        header. Empty cells and the literal text 'null' are stored as ``None``.
    """
    with open(file) as fh:
        rows = list(csv.reader(fh, delimiter="\t"))
    headers = rows[0]
    matrix = []
    for row in rows[1:]:
        record = {}
        for j, cell in enumerate(row):
            record[headers[j]] = None if cell in ('', 'null') else cell
        matrix.append(record)
    return matrix, headers


def _print_html(data, headers, out_dir):
    """Return the complete HTML page as one string."""
    # NOTE(review): the wrapping <div> is opened before <body> and closed
    # after </body>; the element order is kept as-is so the generated page
    # stays identical to the one the tool has always produced.
    html = '<html>' + "\n"
    html += '<head>' + "\n"
    html += '<title>' + 'rps2tree' + '</title>'
    html += _get_google_script_headers(data, headers, out_dir)
    html += '</head>' + "\n"
    html += '<div style="text-align:center">' + "\n"
    html += '<h1 align=center>rps2tree</h1>' + "\n"
    html += '<body>' + "\n"
    html += _print_data(data, headers)
    html += '</body>' + "\n"
    html += '</div>' + "\n"
    html += '</html>' + "\n"
    return html


def _print_data(data, headers):
    """Return the HTML section (title, table placeholder, tree, links) of every map row."""
    html = ''
    for cdd in data:
        div_id = (cdd['cdd_id'] + '_' + cdd['description']).replace('-', '_') + '_div'
        html += '<h2>' + cdd['cdd_id'] + ' ' + cdd['description'] + '</h2>' + "\n"
        html += '<p>' + cdd['full_description'] + '</br>' + '</p>' + "\n"
        # Placeholder filled by the matching Google Charts callback.
        html += '<div id="' + div_id + '"></div>' + "\n"
        html += '</br>' + "\n"
        html += '</br>' + "\n"
        html += '<img src=' + cdd['tree_files'] + ' href="' + cdd['tree_files'] + '">' + "\n"
        for key in ('align_files', 'cluster_files', 'cluster_nb_reads_files', 'pairwise_files'):
            html += '</br>' + "\n"
            html += '<a href="' + cdd[key] + '">' + cdd[key] + '</a>' + "\n"
        html += '</br>' + "\n"
        html += '</br>' + "\n"
        html += '<hr>' + "\n"
    return html


def _read_map_file(file):
    """Parse the open tab-separated map file.

    The first line holds the headers; the leading character of the first
    header (the '#') is dropped. Exits the program when the file is empty or
    when a line does not have as many fields as the header line.

    Returns:
        ``(map_obj, headers)`` where ``map_obj`` is a list of dicts keyed by
        header.
    """
    rows = list(csv.reader(file, delimiter="\t"))
    if not rows:
        sys.exit('map file is empty.')
    headers = rows[0]
    headers[0] = headers[0][1:]
    map_obj = []
    for row in rows[1:]:
        if len(row) != len(headers):
            sys.exit('line and headers not the same length.')
        map_obj.append({header: row[j] for j, header in enumerate(headers)})
    return map_obj, headers


def _set_options():
    """Parse the command line (``-m`` map file, ``-o`` output directory)."""
    parser = argparse.ArgumentParser()
    parser.add_argument('-m', '--map', help='The map file produced by otu.py.', action='store', type=argparse.FileType('r'), required=True)
    parser.add_argument('-o', '--out', help='The output directory in which index.html is written.', action='store', type=str, default='./')
    args = parser.parse_args()
    return args


if __name__ == "__main__":
    main()
| b |
| diff -r 000000000000 -r bbaa89f070f4 rps2tsv.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rps2tsv.py Mon Mar 04 19:56:16 2024 +0000 |
| [ |
#!/usr/bin/env python3


# Name: rps2ecsv
# Author: Marie Lefebvre - INRAE
# Aims: Convert rpsblast xml output to csv and add taxonomy


import argparse
import json
import logging as log
from collections import Counter
from functools import lru_cache
from urllib import request
from urllib.error import HTTPError, URLError

from Bio.Blast import NCBIXML
from ete3 import NCBITaxa

ncbi = NCBITaxa()


def main():
    """Parse the RPS-BLAST XML file and write the annotated table."""
    options = _set_options()
    _set_log_level(options.verbosity)
    hits = _read_xml(options)
    _write_tsv(options, hits)


@lru_cache(maxsize=None)
def _get_kingdom_frequencies(pfam_id):
    """Return the taxonomy summary InterPro reports for a Pfam entry.

    The result is a string such as ``"Viruses(9);cellular organisms(4)"``:
    the first lineage name after root of every ``taxonomy_subset`` item,
    counted and sorted by decreasing count. ``None`` is returned when the
    HTTP request fails. Results are cached so that each Pfam id is requested
    only once per run instead of once per HSP.
    """
    log.info("Requeting Interpro for " + pfam_id)
    url = "https://www.ebi.ac.uk/interpro/api/entry/pfam/" + pfam_id + "/taxonomy/uniprot/"
    req = request.Request(url)
    try:
        response = request.urlopen(req)
    except HTTPError as e:
        # HTTPError is a subclass of URLError, so it must be handled first.
        log.debug('Http error for interpro: %s', e.code)
        return None
    except URLError as e:
        log.debug('Url error for interpro: %s', e.reason)
        return None
    with response:
        payload = json.loads(response.read().decode())
    kingdoms = []
    for item in payload["taxonomy_subset"]:
        lineage = [int(i) for i in item["lineage"]]
        translation = ncbi.get_taxid_translator(lineage)
        names = list(translation.values())
        taxonomy = names[1:]  # remove 'root' at the beginning
        if taxonomy:
            kingdoms.append(taxonomy[0])
    # most_common() sorts by decreasing count and keeps first-seen order for ties.
    return ";".join("{}({})".format(k, v) for k, v in Counter(kingdoms).most_common())


def _read_xml(options):
    """
    Parse XML RPSblast results file

    Returns a dict mapping each query id to one hit description. HSPs whose
    e-value exceeds ``options.max_evalue`` are ignored; when several HSPs of
    the same query pass the filter, the last one read is kept.
    """
    log.info("Read XML file " + options.xml_file)
    xml_results = {}
    with open(options.xml_file, 'r') as xml:
        # NCBIXML.parse is lazy: iterate while the file is still open.
        for blast_record in NCBIXML.parse(xml):
            for aln in blast_record.alignments:
                for hit in aln.hsps:
                    if hit.expect > options.max_evalue:
                        continue
                    hsp = {}
                    hsp["frame"] = hit.frame[0]  # frame
                    hsp["evalue"] = hit.expect  # evalue
                    hsp["startQ"] = hit.query_start
                    hsp["endQ"] = hit.query_end
                    hsp["query_id"] = blast_record.query_id
                    hsp["cdd_id"] = aln.hit_def.split(",")[0]
                    hsp["hit_id"] = aln.hit_id
                    hsp["query_length"] = blast_record.query_length  # length of the query
                    hsp["description"] = aln.hit_def
                    hsp["accession"] = aln.accession
                    hsp["pfam_id"] = hsp["description"].split(",")[0].replace("pfam", "PF")
                    taxonomy = _get_kingdom_frequencies(hsp["pfam_id"])
                    if taxonomy is None:
                        # NOTE(review): a hit without taxonomy cannot be written
                        # by _write_tsv (it reads hits[h]["taxonomy"]), so such a
                        # hit is skipped here - confirm this is the wanted
                        # behaviour when InterPro is unreachable.
                        log.warning("No taxonomy retrieved for %s, hit of %s skipped", hsp["pfam_id"], hsp["query_id"])
                        continue
                    hsp["taxonomy"] = taxonomy
                    xml_results[hsp["query_id"]] = hsp
    return xml_results


def _write_tsv(options, hits):
    """
    Write output

    One tab-separated line per query id, preceded by a header line.
    """
    log.info("Write output file " + options.output)
    headers = "#query_id\tquery_length\tcdd_id\thit_id\tevalue\tstartQ\tendQ\tframe\tdescription\tsuperkingdom\n"
    with open(options.output, "w+") as f:
        f.write(headers)
        for query_id, hit in hits.items():
            fields = [
                query_id,
                str(hit["query_length"]),
                hit["cdd_id"],
                hit["hit_id"],
                str(hit["evalue"]),
                str(hit["startQ"]),
                str(hit["endQ"]),
                str(hit["frame"]),
                hit["description"],
                hit["taxonomy"],
            ]
            f.write("\t".join(fields) + "\n")


def _set_options():
    """Parse the command line options."""
    parser = argparse.ArgumentParser()
    parser.add_argument('-x', '--xml', help='XML files with results of blast', action='store', required=True, dest='xml_file')
    parser.add_argument('-e', '--max_evalue', help='Max evalue', action='store', type=float, default=0.0001, dest='max_evalue')
    parser.add_argument('-o', '--out', help='The output file (.tab).', action='store', type=str, default='./rps2tsv_output.tab', dest='output')
    parser.add_argument('-v', '--verbosity', help='Verbose level', action='store', type=int, choices=[1, 2, 3, 4], default=1)
    args = parser.parse_args()
    return args


def _set_log_level(verbosity):
    """Configure logging: 1 selects INFO, 3 selects DEBUG with file and line.

    NOTE(review): levels 2 and 4 are accepted on the command line but leave
    logging unconfigured - confirm whether that is intended.
    """
    if verbosity == 1:
        log_format = '%(asctime)s %(levelname)-8s %(message)s'
        log.basicConfig(level=log.INFO, format=log_format)
    elif verbosity == 3:
        log_format = '%(filename)s:%(lineno)s - %(asctime)s %(levelname)-8s %(message)s'
        log.basicConfig(level=log.DEBUG, format=log_format)


if __name__ == "__main__":
    main()
| b |
| diff -r 000000000000 -r bbaa89f070f4 seek_otu.R --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/seek_otu.R Mon Mar 04 19:56:16 2024 +0000 |
| [ |
#!/usr/bin/env Rscript

## Group contigs into OTUs by single-linkage clustering of an identity matrix.
## Usage: seek_otu.R <identity_matrix.csv> <output_file> <identity_threshold>
## One line per OTU is appended to <output_file>:
##   OTU_<number>,<number of contigs>,<contig 1>,<contig 2>,...,

## Redirect R error handling to stderr.
options(show.error.messages = FALSE, error = function() {
  cat(geterrmessage(), file = stderr())
  q("no", 1, FALSE)
})

## Avoid crashing Galaxy with a UTF8 error on German LC settings
loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")

args <- commandArgs(trailingOnly = TRUE)
# all three positional arguments are read below, so require all of them
if (length(args) < 3) {
  stop(
    "Arguments missing for Rscript: <matrix.csv> <output> <threshold>",
    call. = FALSE
  )
}

# percentage of identity
id_threshold <- as.numeric(args[3])
if (is.na(id_threshold)) {
  stop("Identity threshold must be numeric", call. = FALSE)
}
# get input data (matrix); the first column holds the contig names
data <- read.csv(args[1], header = FALSE, sep = ",", row.names = 1)
# create matrix from all remaining columns
mat <- as.matrix(data)
# convert identities to distances
d <- as.dist(1 - mat)
# create tree
hc <- hclust(d, method = "single")
# assign otu based on identity value
# NOTE(review): the tree is cut at height -id_threshold, which only matches
# merges if the matrix holds percentages (1 - identity is then negative);
# confirm against the caller that writes the matrix.
otu <- cutree(hc, h = -id_threshold)
# Print results to output file
output <- args[2]
# unique is used to know the number of different otu
for (i in unique(otu)) {
  # retrieve contigs belonging to the same otu
  clust <- which(otu == i)
  # write otu number and number of contigs in this otu
  cat(
    paste("OTU_", i, ",", length(clust), ",", sep = ""),
    file = output, append = TRUE
  )
  for (n in names(clust)) {
    # write contigs name (spaces removed)
    cat(paste(gsub(" ", "", n), ",", sep = ""), file = output, append = TRUE)
  }
  cat("\n", sep = "", file = output, append = TRUE)
}
| b |
| diff -r 000000000000 -r bbaa89f070f4 test-data/blast2tsv_contigs.fa --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/blast2tsv_contigs.fa Mon Mar 04 19:56:16 2024 +0000 |
| b |
| @@ -0,0 +1,16 @@ +>ds2020-482-EDGG-1-Q4_42600 +TCGGTGGGGGGACCTTGCGGACATGGGCGGCGGACCGTAAGATGTATAGAGGTGGGGGTA +GTAGTTTTGATGCCCTTTTGCTTTTGTGCCAAGCCA +>ds2020-482-EDGG-1-Q4_107243 +TATATCTGTGCTTTGGAACACAATGATTCTCAAAGTCTATGTCGAGACTGGAAACTCTCT +>ds2020-482-EDGG-1-Q4_2681 +CCTTCCTAGCGACCACGCACACGTCAAGACCGGCATCATCAATGTCGCGACAATCGTGAA +CCACTTTAGTATAGTCCACATCAAGATCATCATAAGGTAGATAAAAGGAATCAATTTCCC +TAGGAAAAAGTCCAGAATCATCTTCCTCATAAAAATCTGGTATCGAGGGATCAATGGTTC +GCACCACCATCTCGAATGTATCAAAGATCGTCGCGAAATCAAACTTTGCGGTATGCTTAA +CGACAAACTCGAAAAGGAAAAGTTTTACCCATTCGTCGTAGTTGTCATCTTTATGCACAC +CGAACGTCGAGAAAAACCCAAAGAACGTGTGCGTGGTCGCTAGGAAGG +>ds2020-482-EDGG-1-Q4_107857 +TCAAGATTGTCGAAAGTGCCACACAGATATTGGTTGCAGCTGTGATTACTGCAATTGGC +>ds2020-482-EDGG-1-Q4_63163 +AAGTTCATGGACTTCATCCGAGGAGTTGCCGTCATTGGGGAAGGGCAGTGGGGGATTGAG \ No newline at end of file |
| b |
| diff -r 000000000000 -r bbaa89f070f4 test-data/blast2tsv_contigs.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/blast2tsv_contigs.txt Mon Mar 04 19:56:16 2024 +0000 |
| b |
| @@ -0,0 +1,1 @@ +1 Viruses Riboviria Orthornavirae Kitrinoviricota Alsuviricetes Martellivirales Bromoviridae Ilarvirus Blackberry chlorotic ringspot virus |
| b |
| diff -r 000000000000 -r bbaa89f070f4 test-data/blast2tsv_input.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/blast2tsv_input.xml Mon Mar 04 19:56:16 2024 +0000 |
| b |
| b'@@ -0,0 +1,593 @@\n+<?xml version="1.0"?>\n+<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd">\n+<BlastOutput>\n+ <BlastOutput_program>tblastx</BlastOutput_program>\n+ <BlastOutput_version>TBLASTX 2.10.1+</BlastOutput_version>\n+ <BlastOutput_reference>Stephen F. Altschul, Thomas L. Madden, Alejandro A. Sch&auml;ffer, Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman (1997), "Gapped BLAST and PSI-BLAST: a new generation of protein database search programs", Nucleic Acids Res. 25:3389-3402.</BlastOutput_reference>\n+ <BlastOutput_db>/save/tcandresse/refseq/refseq.short.fa</BlastOutput_db>\n+ <BlastOutput_query-ID>ds2020-482-EDGG-1-Q4_42600</BlastOutput_query-ID>\n+ <BlastOutput_query-def>No definition line</BlastOutput_query-def>\n+ <BlastOutput_query-len>96</BlastOutput_query-len>\n+ <BlastOutput_param>\n+ <Parameters>\n+ <Parameters_matrix>BLOSUM62</Parameters_matrix>\n+ <Parameters_expect>0.001</Parameters_expect>\n+ <Parameters_gap-open>11</Parameters_gap-open>\n+ <Parameters_gap-extend>1</Parameters_gap-extend>\n+ <Parameters_filter>L;</Parameters_filter>\n+ </Parameters>\n+ </BlastOutput_param>\n+<BlastOutput_iterations>\n+<Iteration>\n+ <Iteration_iter-num>1</Iteration_iter-num>\n+ <Iteration_query-ID>ds2020-482-EDGG-1-Q4_42600</Iteration_query-ID>\n+ <Iteration_query-def>No definition line</Iteration_query-def>\n+ <Iteration_query-len>96</Iteration_query-len>\n+<Iteration_hits>\n+<Hit>\n+ <Hit_num>1</Hit_num>\n+ <Hit_id>ref|NC_035070.1|</Hit_id>\n+ <Hit_def>Spinach amalgavirus 1 isolate SRP059420 fusion protein and putative coat protein genes, complete cds</Hit_def>\n+ <Hit_accession>NC_035070</Hit_accession>\n+ <Hit_len>3420</Hit_len>\n+ <Hit_hsps>\n+ <Hsp>\n+ <Hsp_num>1</Hsp_num>\n+ <Hsp_bit-score>51.4703</Hsp_bit-score>\n+ <Hsp_score>106</Hsp_score>\n+ <Hsp_evalue>6.20873e-08</Hsp_evalue>\n+ <Hsp_query-from>3</Hsp_query-from>\n+ <Hsp_query-to>95</Hsp_query-to>\n+ 
<Hsp_hit-from>1338</Hsp_hit-from>\n+ <Hsp_hit-to>1430</Hsp_hit-to>\n+ <Hsp_query-frame>3</Hsp_query-frame>\n+ <Hsp_hit-frame>3</Hsp_hit-frame>\n+ <Hsp_identity>20</Hsp_identity>\n+ <Hsp_positive>24</Hsp_positive>\n+ <Hsp_gaps>0</Hsp_gaps>\n+ <Hsp_align-len>31</Hsp_align-len>\n+ <Hsp_qseq>GGGTLRTWAADRKMYRGGGSSFDALLLLCQA</Hsp_qseq>\n+ <Hsp_hseq>GGGAMRSWEVDSQMYRGGGNSADALRLLGQA</Hsp_hseq>\n+ <Hsp_midline>GGG +R+W D +MYRGGG+S DAL LL QA</Hsp_midline>\n+ </Hsp>\n+ </Hit_hsps>\n+</Hit>\n+</Iteration_hits>\n+ <Iteration_stat>\n+ <Statistics>\n+ <Statistics_db-num>7073</Statistics_db-num>\n+ <Statistics_db-len>36804204</Statistics_db-len>\n+ <Statistics_hsp-len>24</Statistics_hsp-len>\n+ <Statistics_eff-space>96786528</Statistics_eff-space>\n+ <Statistics_kappa>0.133956144488482</Statistics_kappa>\n+ <Statistics_lambda>0.317605957635731</Statistics_lambda>\n+ <Statistics_entropy>0.401214524497119</Statistics_entropy>\n+ </Statistics>\n+ </Iteration_stat>\n+</Iteration>\n+<Iteration>\n+ <Iteration_iter-num>2</Iteration_iter-num>\n+ <Iteration_query-ID>ds2020-482-EDGG-1-Q4_60894</Iteration_query-ID>\n+ <Iteration_query-def>No definition line</Iteration_query-def>\n+ <Iteration_query-len>82</Iteration_query-len>\n+<Iteration_hits>\n+</Iteration_hits>\n+ <Iteration_stat>\n+ <Statistics>\n+ <Statistics_db-num>7073</Statistics_db-num>\n+ <Statistics_db-len>36804204</Statistics_db-len>\n+ <Statistics_hsp-len>19</Statistics_hsp-len>\n+ <Statistics_eff-space>97069448</Statistics_eff-space>\n+ <Statistics_kappa>0.139224951877679</Statistics_kappa>\n+ <Statistics_lambda>0.315124495232289</Statistics_lambda>\n+ <Statistics_entropy>0.441609275168242</Statistics_entropy>\n+ </Statistics>\n+ </Iteration_stat>\n+ <Iteration_message>No hits found</Iteration_message>\n+</Iteration>\n+<Iteration>\n+ <Iteration_iter-num>3</Iteration_iter-num>\n+ <Iteratio'..b' <Statistics_db-num>7073</Statistics_db-num>\n+ <Statistics_db-len>36804204</Statistics_db-len>\n+ 
<Statistics_hsp-len>14</Statistics_hsp-len>\n+ <Statistics_eff-space>97352368</Statistics_eff-space>\n+ <Statistics_kappa>0.133956144488482</Statistics_kappa>\n+ <Statistics_lambda>0.317605957635731</Statistics_lambda>\n+ <Statistics_entropy>0.401214524497119</Statistics_entropy>\n+ </Statistics>\n+ </Iteration_stat>\n+ <Iteration_message>No hits found</Iteration_message>\n+</Iteration>\n+<Iteration>\n+ <Iteration_iter-num>22</Iteration_iter-num>\n+ <Iteration_query-ID>ds2020-482-EDGG-1-Q4_31663</Iteration_query-ID>\n+ <Iteration_query-def>No definition line</Iteration_query-def>\n+ <Iteration_query-len>111</Iteration_query-len>\n+<Iteration_hits>\n+</Iteration_hits>\n+ <Iteration_stat>\n+ <Statistics>\n+ <Statistics_db-num>7073</Statistics_db-num>\n+ <Statistics_db-len>36804204</Statistics_db-len>\n+ <Statistics_hsp-len>29</Statistics_hsp-len>\n+ <Statistics_eff-space>96503608</Statistics_eff-space>\n+ <Statistics_kappa>0.133956144488482</Statistics_kappa>\n+ <Statistics_lambda>0.317605957635731</Statistics_lambda>\n+ <Statistics_entropy>0.401214524497119</Statistics_entropy>\n+ </Statistics>\n+ </Iteration_stat>\n+ <Iteration_message>No hits found</Iteration_message>\n+</Iteration>\n+<Iteration>\n+ <Iteration_iter-num>23</Iteration_iter-num>\n+ <Iteration_query-ID>ds2020-482-EDGG-1-Q4_63163</Iteration_query-ID>\n+ <Iteration_query-def>No definition line</Iteration_query-def>\n+ <Iteration_query-len>81</Iteration_query-len>\n+<Iteration_hits>\n+<Hit>\n+ <Hit_num>1</Hit_num>\n+ <Hit_id>ref|NC_011591.1|</Hit_id>\n+ <Hit_def>Southern tomato virus, complete genome</Hit_def>\n+ <Hit_accession>NC_011591</Hit_accession>\n+ <Hit_len>3437</Hit_len>\n+ <Hit_hsps>\n+ <Hsp>\n+ <Hsp_num>1</Hsp_num>\n+ <Hsp_bit-score>44.1389</Hsp_bit-score>\n+ <Hsp_score>90</Hsp_score>\n+ <Hsp_evalue>1.00223e-05</Hsp_evalue>\n+ <Hsp_query-from>1</Hsp_query-from>\n+ <Hsp_query-to>81</Hsp_query-to>\n+ <Hsp_hit-from>3034</Hsp_hit-from>\n+ <Hsp_hit-to>3114</Hsp_hit-to>\n+ 
<Hsp_query-frame>1</Hsp_query-frame>\n+ <Hsp_hit-frame>1</Hsp_hit-frame>\n+ <Hsp_identity>17</Hsp_identity>\n+ <Hsp_positive>19</Hsp_positive>\n+ <Hsp_gaps>0</Hsp_gaps>\n+ <Hsp_align-len>27</Hsp_align-len>\n+ <Hsp_qseq>KFMDFIRGVAVIGEGQWGIEMDRWIRF</Hsp_qseq>\n+ <Hsp_hseq>KVMDLIRGNATIGRGQWGNDVMDWIRF</Hsp_hseq>\n+ <Hsp_midline>K MD IRG A IG GQWG ++ WIRF</Hsp_midline>\n+ </Hsp>\n+ </Hit_hsps>\n+</Hit>\n+</Iteration_hits>\n+ <Iteration_stat>\n+ <Statistics>\n+ <Statistics_db-num>7073</Statistics_db-num>\n+ <Statistics_db-len>36804204</Statistics_db-len>\n+ <Statistics_hsp-len>19</Statistics_hsp-len>\n+ <Statistics_eff-space>97069448</Statistics_eff-space>\n+ <Statistics_kappa>0.133956144488482</Statistics_kappa>\n+ <Statistics_lambda>0.317605957635731</Statistics_lambda>\n+ <Statistics_entropy>0.401214524497119</Statistics_entropy>\n+ </Statistics>\n+ </Iteration_stat>\n+</Iteration>\n+<Iteration>\n+ <Iteration_iter-num>24</Iteration_iter-num>\n+ <Iteration_query-ID>ds2020-482-EDGG-1-Q4_91422</Iteration_query-ID>\n+ <Iteration_query-def>No definition line</Iteration_query-def>\n+ <Iteration_query-len>67</Iteration_query-len>\n+<Iteration_hits>\n+</Iteration_hits>\n+ <Iteration_stat>\n+ <Statistics>\n+ <Statistics_db-num>7073</Statistics_db-num>\n+ <Statistics_db-len>36804204</Statistics_db-len>\n+ <Statistics_hsp-len>14</Statistics_hsp-len>\n+ <Statistics_eff-space>97352368</Statistics_eff-space>\n+ <Statistics_kappa>0.133956144488482</Statistics_kappa>\n+ <Statistics_lambda>0.317605957635731</Statistics_lambda>\n+ <Statistics_entropy>0.401214524497119</Statistics_entropy>\n+ </Statistics>\n+ </Iteration_stat>\n+ <Iteration_message>No hits found</Iteration_message>\n+</Iteration>\n+</BlastOutput_iterations>\n+</BlastOutput>\n' |
| b |
| diff -r 000000000000 -r bbaa89f070f4 test-data/blast2tsv_output.tab --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/blast2tsv_output.tab Mon Mar 04 19:56:16 2024 +0000 |
| b |
| @@ -0,0 +1,6 @@ +#algo query_id nb_reads query_length accession description organism percentIdentity nb_hsps queryOverlap hitOverlap evalue score tax_id taxonomy sequence +TBLASTX ds2020-482-EDGG-1-Q4_42600 96 +TBLASTX ds2020-482-EDGG-1-Q4_107243 60 +TBLASTX ds2020-482-EDGG-1-Q4_2681 348 NC_011554 Blackberry chlorotic ringspot virus RNA2, complete genome Blackberry chlorotic ringspot virus 56.3 2 100 9.0 1.04985e-23 128.1421 339420 Viruses;Riboviria;Orthornavirae;Kitrinoviricota;Alsuviricetes;Martellivirales;Bromoviridae;Ilarvirus;Blackberry chlorotic ringspot virus CCTTCCTAGCGACCACGCACACGTCAAGACCGGCATCATCAATGTCGCGACAATCGTGAACCACTTTAGTATAGTCCACATCAAGATCATCATAAGGTAGATAAAAGGAATCAATTTCCCTAGGAAAAAGTCCAGAATCATCTTCCTCATAAAAATCTGGTATCGAGGGATCAATGGTTCGCACCACCATCTCGAATGTATCAAAGATCGTCGCGAAATCAAACTTTGCGGTATGCTTAACGACAAACTCGAAAAGGAAAAGTTTTACCCATTCGTCGTAGTTGTCATCTTTATGCACACCGAACGTCGAGAAAAACCCAAAGAACGTGTGCGTGGTCGCTAGGAAGG +TBLASTX ds2020-482-EDGG-1-Q4_107857 59 +TBLASTX ds2020-482-EDGG-1-Q4_63163 81 |
| b |
| diff -r 000000000000 -r bbaa89f070f4 test-data/blast2tsv_output_with_rn.tab --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/blast2tsv_output_with_rn.tab Mon Mar 04 19:56:16 2024 +0000 |
| b |
| @@ -0,0 +1,6 @@ +#algo query_id nb_reads query_length accession description organism percentIdentity nb_hsps queryOverlap hitOverlap evalue score tax_id taxonomy sequence +TBLASTX ds2020-482-EDGG-1-Q4_42600 12 96 +TBLASTX ds2020-482-EDGG-1-Q4_107243 63 60 +TBLASTX ds2020-482-EDGG-1-Q4_2681 8 348 NC_011554 Blackberry chlorotic ringspot virus RNA2, complete genome Blackberry chlorotic ringspot virus 56.3 2 100 9.0 1.04985e-23 128.1421 339420 Viruses;Riboviria;Orthornavirae;Kitrinoviricota;Alsuviricetes;Martellivirales;Bromoviridae;Ilarvirus;Blackberry chlorotic ringspot virus CCTTCCTAGCGACCACGCACACGTCAAGACCGGCATCATCAATGTCGCGACAATCGTGAACCACTTTAGTATAGTCCACATCAAGATCATCATAAGGTAGATAAAAGGAATCAATTTCCCTAGGAAAAAGTCCAGAATCATCTTCCTCATAAAAATCTGGTATCGAGGGATCAATGGTTCGCACCACCATCTCGAATGTATCAAAGATCGTCGCGAAATCAAACTTTGCGGTATGCTTAACGACAAACTCGAAAAGGAAAAGTTTTACCCATTCGTCGTAGTTGTCATCTTTATGCACACCGAACGTCGAGAAAAACCCAAAGAACGTGTGCGTGGTCGCTAGGAAGG +TBLASTX ds2020-482-EDGG-1-Q4_107857 402 59 +TBLASTX ds2020-482-EDGG-1-Q4_63163 88 81 |
| b |
| diff -r 000000000000 -r bbaa89f070f4 test-data/blast2tsv_read_nb.tab --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/blast2tsv_read_nb.tab Mon Mar 04 19:56:16 2024 +0000 |
| b |
| @@ -0,0 +1,6 @@ +ds2020-482-EDGG-1-Q4_42600 12 +ds2020-482-EDGG-1-Q4_107243 63 +ds2020-482-EDGG-1-Q4_2681 8 +ds2020-482-EDGG-1-Q4_107857 402 +ds2020-482-EDGG-1-Q4_63163 88 +ds2020-482-EDGG-1-Q4_47667 1 |
| b |
| diff -r 000000000000 -r bbaa89f070f4 test-data/blast2tsv_reads.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/blast2tsv_reads.txt Mon Mar 04 19:56:16 2024 +0000 |
| b |
| @@ -0,0 +1,1 @@ +0 Viruses Riboviria Orthornavirae Kitrinoviricota Alsuviricetes Martellivirales Bromoviridae Ilarvirus Blackberry chlorotic ringspot virus |
| b |
| diff -r 000000000000 -r bbaa89f070f4 test-data/blast2tsv_reads_with_rn.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/blast2tsv_reads_with_rn.txt Mon Mar 04 19:56:16 2024 +0000 |
| b |
| @@ -0,0 +1,6 @@ +#algo query_id nb_reads query_length accession description organism percentIdentity nb_hsps queryOverlap hitOverlap evalue score tax_id taxonomy sequence +TBLASTX ds2020-482-EDGG-1-Q4_42600 12 96 +TBLASTX ds2020-482-EDGG-1-Q4_107243 63 60 +TBLASTX ds2020-482-EDGG-1-Q4_2681 8 348 NC_011554 Blackberry chlorotic ringspot virus RNA2, complete genome Blackberry chlorotic ringspot virus 56.3 2 100 9.0 1.04985e-23 128.1421 339420 Viruses;Riboviria;Orthornavirae;Kitrinoviricota;Alsuviricetes;Martellivirales;Bromoviridae;Ilarvirus;Blackberry chlorotic ringspot virus CCTTCCTAGCGACCACGCACACGTCAAGACCGGCATCATCAATGTCGCGACAATCGTGAACCACTTTAGTATAGTCCACATCAAGATCATCATAAGGTAGATAAAAGGAATCAATTTCCCTAGGAAAAAGTCCAGAATCATCTTCCTCATAAAAATCTGGTATCGAGGGATCAATGGTTCGCACCACCATCTCGAATGTATCAAAGATCGTCGCGAAATCAAACTTTGCGGTATGCTTAACGACAAACTCGAAAAGGAAAAGTTTTACCCATTCGTCGTAGTTGTCATCTTTATGCACACCGAACGTCGAGAAAAACCCAAAGAACGTGTGCGTGGTCGCTAGGAAGG +TBLASTX ds2020-482-EDGG-1-Q4_107857 402 59 +TBLASTX ds2020-482-EDGG-1-Q4_63163 88 81 |
| b |
| diff -r 000000000000 -r bbaa89f070f4 test-data/index.html --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/index.html Mon Mar 04 19:56:16 2024 +0000 |
| [ |
| b'@@ -0,0 +1,235 @@\n+<html>\n+<head>\n+<title>rps2tree</title><script type="text/javascript" src="https://www.gstatic.com/charts/loader.js"></script>\n+<script type="text/javascript">\n+google.charts.load(\'current\', {\'packages\':[\'table\']});\n+google.charts.setOnLoadCallback(pfam02123_pfam02123_RdRP_4);\n+function pfam02123_pfam02123_RdRP_4() {\n+var data = new google.visualization.DataTable();\n+data.addColumn(\'string\', \'#OTU_name\');\n+data.addColumn(\'number\', \'ds2020-267\');\n+data.addColumn(\'number\', \'ds2020-328\');\n+data.addColumn(\'string\', \'taxonomy\');\n+data.addColumn(\'string\', \'contigs_list\');\n+data.addRows([\n+[\'OTU_1\',2258,77,\'Viruses Wuhan insect virus 27\',\'ds2020-267_16,ds2020-267_4,ds2020-328_97,ds2020-328_98\'],\n+[\'OTU_2\',7516,0,\'Viruses Orthornavirae Duplornaviricota Chrymotiviricetes Ghabrivirales Quadriviridae Quadrivirus Rosellinia necatrix quadrivirus 1\',\'ds2020-267_2\'],\n+[\'OTU_3\',1161,0,\'Viruses Orthornavirae Duplornaviricota Chrymotiviricetes Ghabrivirales Quadriviridae Quadrivirus Rosellinia necatrix quadrivirus 1\',\'ds2020-267_6\'],\n+[\'OTU_4\',0,77,\'Viruses Orthornavirae Duplornaviricota Chrymotiviricetes Ghabrivirales Totiviridae Totivirus Scheffersomyces segobiensis virus L\',\'ds2020-328_109\']\n+]);\n+var table = new google.visualization.Table(document.getElementById(\'pfam02123_pfam02123_RdRP_4_div\'));\n+table.draw(data, {showRowNumber: false, width: \'70%\', height: \'70%\'});\n+\n+}\n+google.charts.setOnLoadCallback(pfam00680_pfam00680_RdRP_1);\n+function pfam00680_pfam00680_RdRP_1() {\n+var data = new google.visualization.DataTable();\n+data.addColumn(\'string\', \'#OTU_name\');\n+data.addColumn(\'number\', \'ds2020-267\');\n+data.addColumn(\'string\', \'taxonomy\');\n+data.addColumn(\'string\', \'contigs_list\');\n+data.addRows([\n+[\'OTU_1\',912,\'Viruses Cryphonectria parasitica bipartite mycovirus 1\',\'ds2020-267_21\'],\n+[\'OTU_2\',3373,\'Viruses Penicillium aurantiogriseum 
partiti-like virus\',\'ds2020-267_8\']\n+]);\n+var table = new google.visualization.Table(document.getElementById(\'pfam00680_pfam00680_RdRP_1_div\'));\n+table.draw(data, {showRowNumber: false, width: \'70%\', height: \'70%\'});\n+\n+}\n+google.charts.setOnLoadCallback(pfam00665_pfam00665_rve);\n+function pfam00665_pfam00665_rve() {\n+var data = new google.visualization.DataTable();\n+data.addColumn(\'string\', \'#OTU_name\');\n+data.addColumn(\'number\', \'ds2020-328\');\n+data.addColumn(\'string\', \'taxonomy\');\n+data.addColumn(\'string\', \'contigs_list\');\n+data.addRows([\n+[\'OTU_1\',104,\'Unknown\',\'ds2020-328_118\']\n+]);\n+var table = new google.visualization.Table(document.getElementById(\'pfam00665_pfam00665_rve_div\'));\n+table.draw(data, {showRowNumber: false, width: \'70%\', height: \'70%\'});\n+\n+}\n+google.charts.setOnLoadCallback(pfam01443_pfam01443_Viral_helicase1);\n+function pfam01443_pfam01443_Viral_helicase1() {\n+var data = new google.visualization.DataTable();\n+data.addColumn(\'string\', \'#OTU_name\');\n+data.addColumn(\'number\', \'ds2020-328\');\n+data.addColumn(\'string\', \'taxonomy\');\n+data.addColumn(\'string\', \'contigs_list\');\n+data.addRows([\n+[\'OTU_1\',164,\'Viruses Orthornavirae Kitrinoviricota Alsuviricetes Martellivirales Bromoviridae Ilarvirus Prune dwarf virus\',\'ds2020-328_26\']\n+]);\n+var table = new google.visualization.Table(document.getElementById(\'pfam01443_pfam01443_Viral_helicase1_div\'));\n+table.draw(data, {showRowNumber: false, width: \'70%\', height: \'70%\'});\n+\n+}\n+google.charts.setOnLoadCallback(pfam00078_pfam00078_RVT_1);\n+function pfam00078_pfam00078_RVT_1() {\n+var data = new google.visualization.DataTable();\n+data.addColumn(\'string\', \'#OTU_name\');\n+data.addColumn(\'number\', \'ds2020-328\');\n+data.addColumn(\'string\', \'taxonomy\');\n+data.addColumn(\'string\', \'contigs_list\');\n+data.addRows([\n+[\'OTU_1\',0,\'Unknown\',\'ds2020-328_43\']\n+]);\n+var table = new 
google.visualization.Table(document.getElementById(\'pfam00078_pfam00078_RVT_1_div\'));\n+table.draw(data, {showRowNumber: false, width: \'70%\', height: \'70%\'});\n+\n+}\n+google.charts.setOnLoadCallback(pfam01787_pfam01787_Ilar_coat);\n+function pfam01787_pfam01787_Ilar_coat() {\n+v'..b'/br>\n+<a href="pfam00665_rve/otu_cluster.csv">pfam00665_rve/otu_cluster.csv</a>\n+</br>\n+<a href="pfam00665_rve/cluster_nb_reads_files.tab">pfam00665_rve/cluster_nb_reads_files.tab</a>\n+</br>\n+<a href="pfam00665_rve/identity_matrix.csv">pfam00665_rve/identity_matrix.csv</a>\n+</br>\n+</br>\n+<hr>\n+<h2>pfam01443 pfam01443_Viral_helicase1</h2>\n+<p>pfam01443, Viral_helicase1, Viral (Superfamily 1) RNA helicase. Helicase activity for this family has been demonstrated and NTPase activity. This helicase has multiple roles at different stages of viral RNA replication, as dissected by mutational analysis.</br></p>\n+<div id="pfam01443_pfam01443_Viral_helicase1_div"></div>\n+</br>\n+</br>\n+<img src=pfam01443_Viral_helicase1/tree.dnd.png href="pfam01443_Viral_helicase1/tree.dnd.png">\n+</br>\n+<a href="pfam01443_Viral_helicase1/seq_aligned.final_tree.fa">pfam01443_Viral_helicase1/seq_aligned.final_tree.fa</a>\n+</br>\n+<a href="pfam01443_Viral_helicase1/otu_cluster.csv">pfam01443_Viral_helicase1/otu_cluster.csv</a>\n+</br>\n+<a href="pfam01443_Viral_helicase1/cluster_nb_reads_files.tab">pfam01443_Viral_helicase1/cluster_nb_reads_files.tab</a>\n+</br>\n+<a href="pfam01443_Viral_helicase1/identity_matrix.csv">pfam01443_Viral_helicase1/identity_matrix.csv</a>\n+</br>\n+</br>\n+<hr>\n+<h2>pfam00078 pfam00078_RVT_1</h2>\n+<p>pfam00078, RVT_1, Reverse transcriptase (RNA-dependent DNA polymerase). A reverse transcriptase gene is usually indicative of a mobile element such as a retrotransposon or retrovirus. 
Reverse transcriptases occur in a variety of mobile elements, including retrotransposons, retroviruses, group II introns, bacterial msDNAs, hepadnaviruses, and caulimoviruses.</br></p>\n+<div id="pfam00078_pfam00078_RVT_1_div"></div>\n+</br>\n+</br>\n+<img src=pfam00078_RVT_1/tree.dnd.png href="pfam00078_RVT_1/tree.dnd.png">\n+</br>\n+<a href="pfam00078_RVT_1/seq_aligned.final_tree.fa">pfam00078_RVT_1/seq_aligned.final_tree.fa</a>\n+</br>\n+<a href="pfam00078_RVT_1/otu_cluster.csv">pfam00078_RVT_1/otu_cluster.csv</a>\n+</br>\n+<a href="pfam00078_RVT_1/cluster_nb_reads_files.tab">pfam00078_RVT_1/cluster_nb_reads_files.tab</a>\n+</br>\n+<a href="pfam00078_RVT_1/identity_matrix.csv">pfam00078_RVT_1/identity_matrix.csv</a>\n+</br>\n+</br>\n+<hr>\n+<h2>pfam01787 pfam01787_Ilar_coat</h2>\n+<p>pfam01787, Ilar_coat, Ilarvirus coat protein. This family consists of various coat proteins from the ilarviruses part of the Bromoviridae, members include apple mosaic virus and prune dwarf virus. The ilarvirus coat protein is required to initiate replication of the viral genome in host plants. Members of the Bromoviridae have a positive stand ssRNA genome with no DNA stage in there replication.</br></p>\n+<div id="pfam01787_pfam01787_Ilar_coat_div"></div>\n+</br>\n+</br>\n+<img src=pfam01787_Ilar_coat/tree.dnd.png href="pfam01787_Ilar_coat/tree.dnd.png">\n+</br>\n+<a href="pfam01787_Ilar_coat/seq_aligned.final_tree.fa">pfam01787_Ilar_coat/seq_aligned.final_tree.fa</a>\n+</br>\n+<a href="pfam01787_Ilar_coat/otu_cluster.csv">pfam01787_Ilar_coat/otu_cluster.csv</a>\n+</br>\n+<a href="pfam01787_Ilar_coat/cluster_nb_reads_files.tab">pfam01787_Ilar_coat/cluster_nb_reads_files.tab</a>\n+</br>\n+<a href="pfam01787_Ilar_coat/identity_matrix.csv">pfam01787_Ilar_coat/identity_matrix.csv</a>\n+</br>\n+</br>\n+<hr>\n+<h2>pfam01573 pfam01573_Bromo_MP</h2>\n+<p>pfam01573, Bromo_MP, Bromovirus movement protein. 
</br></p>\n+<div id="pfam01573_pfam01573_Bromo_MP_div"></div>\n+</br>\n+</br>\n+<img src=pfam01573_Bromo_MP/tree.dnd.png href="pfam01573_Bromo_MP/tree.dnd.png">\n+</br>\n+<a href="pfam01573_Bromo_MP/seq_aligned.final_tree.fa">pfam01573_Bromo_MP/seq_aligned.final_tree.fa</a>\n+</br>\n+<a href="pfam01573_Bromo_MP/otu_cluster.csv">pfam01573_Bromo_MP/otu_cluster.csv</a>\n+</br>\n+<a href="pfam01573_Bromo_MP/cluster_nb_reads_files.tab">pfam01573_Bromo_MP/cluster_nb_reads_files.tab</a>\n+</br>\n+<a href="pfam01573_Bromo_MP/identity_matrix.csv">pfam01573_Bromo_MP/identity_matrix.csv</a>\n+</br>\n+</br>\n+<hr>\n+</body>\n+</div>\n+</html>\n' |
| b |
| diff -r 000000000000 -r bbaa89f070f4 test-data/otu_s1.fa --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/otu_s1.fa Mon Mar 04 19:56:16 2024 +0000 |
| b |
| b'@@ -0,0 +1,6856 @@\n+>ds2020-267_1\n+CAGCACGTCCGCAAAAGGTCCTCGCTCCAAAACTGTCTCATCATCTCTATCGTGCCCTGA\n+CCCGTTTTCATCTTTGTACCTAGCTCTACTTCTTCTATCCTTGTTGGCACTTCTATCGCC\n+ATTACCCTCGCTATGTTGCCACGTTGTCCGTTGACCATAGGCCGCCACATCCTATAATGC\n+CTCATGTCTCCAACTACTGCCCCACTCATTGTTATCTCTGTGTATACTGGACATGTTTGC\n+CCTCTGTCTGGCCACAGCAGATTGGCTGCGGCTATAAACTTGTACAGTTTCACCTCGTCG\n+TCATGTGGTGTCAGCACTATGTTCTTATCTATCACGACTCTAGCTTCATGTGGAAAGCAG\n+TTCTCCTGATCTACCGGTATTCTGTGTAGTGAACCATACCCTAGGTGGCAACAGGGCAAC\n+GACATGTGCACCAAATTCACGTGCGTGACTAGAGGCCTCTTGAAGTACATCTTCGCACGG\n+TATTCAAACTCCGTTTGCATCATCCTTCCTTTCACACCTGCCACAATTGGGTTCCTGATG\n+TGAAATTCTCCATATTCTAGTGACATTTCTAGGTTCACTGGCGTCACCAGGAACCCCAGC\n+ATGTTTGACAGCCACGGCACCACTGCGTTGAATGTTGATAGCTCTCCAAGGTGTTGGTGC\n+AGCTCACCAAGTCTGCAGTTCCTGAAGGAGTGTGGCAGTCTAATATCCAACTCCCTCACT\n+TCAGACAGCTCCCAATGCCATATAGGAAAGTCTATGAAACTGCCTCTAATTGTGTTCTTA\n+TCTTTCGTCTGCTTGAAGGAAGTACATCCCAGGCACACACAGTACCTCGTCACATCTCTG\n+AAAGTTGCCTGACCAGGTGCAAACGTGGGTGGAGGCTGTGGTATCTTTGGTTCAATCAAC\n+CTTATTATTTCAGCAGGTGCTATCCTGGTGAACATGTCTGCCTGTTTTGCCAGTTTCAGC\n+AAAGAGTGCTCATCACCCTGCTTCTTGAATAGGCAGAAATGCGGCCCTGTGACATTGTGT\n+TTGTGCCCACAGCTCAGCTTCAGCTGGCTATTCTGTGTAGAGTTGCCAAACCAGTTTATC\n+TTTTCGCCGTAGTACATCCACCTACTTGGGTGCATTTCTTCTTCGTTAAGTGGCATGCCT\n+TTCACCATCCCTTGCATAACCTGGGCCTCGGTATTGATGCCGTGTTCGTGGTCAAAAATA\n+GGGCACTTGCCCTCTGCACCGCATGACATCAACGGCATGCCTTGCCATTTCCCGGTCAGT\n+TCCTGGCAATTTTCAGGTGTCGGTGGTACCAGTGAGGTTCCTCCGCGCTTGAAAGTGTCA\n+TCGTCATCACTCTGCTCTTCCATTATGTGACGTCTTATCCTTCCTAGCTCCTCTCTCGCT\n+CGCTCAGTTCTGTAAGCCTTTTCTTCTTTTCTGTTCATCCATGGCACGTCCCATGATCGT\n+GCGTCTTTTTCCCTTTGTTCCGAACTCGTGTCTGTTTGTGAATCGACAAACTCTTTCGCC\n+TGCCTCAATTTTTCAGCCACATCATCACACTCGATTCTCACTTCTGGCTCTAGCACGACA\n+TGCCCCTCACCTATAAGCAGGTTCTTCATGGTGTCCGCCCCCTCTGTATCTGACTCTGAC\n+GCTTCGTCCTGCGTCTTAACAGGGCTTGGTGTAGGGCCACCATCGAACGTGACGTCAGTT\n+TCTTGCTCTACATCCACCTCAACACTCAGCTTCTTAGAGTGAGCGCTTTTCCGTTCAGTG\n+AACTTGTAGACCAACATGTTTGCGTTGTACATGAGCTGTCCCCTTATGTCTTCCGATTTG\n+TCGCTTGGCCTACGCTGTTTCTGATCGAGCGCCTCGTCAACGTATTCAATGTCTGCTACG\n+TGCTCA
GTGCTCATCATCTTGGCGTCTATTACGTCCTCGATTGACGGTGTCCTCTTTAGA\n+GTCTTCACCTCTAGTGTTCCGTCAACCACGACACCTGCCTCTTCGATTGCCTGGTTGCAC\n+AGCGCGCACGACACAAGCTGCTTGCACTTCCTGCACACATGCTGCATTTTTATGTATCCA\n+CAGCAACTTGTTTGGCACGGTGAAGACATCCTGACACACTTCTTGTTCTTCACCCTCGTC\n+TTGGGGGTGCTCATCACTTCATGGACAGACTTGATATTGCGTTGCCCTTTTTGACCGGTT\n+TCAGCCACACATACTTGATACGTGTGCACCAGGCTCAAGTAGTTTGGCCAGTTTGCATCA\n+GCTGCCACCACCAGTGACGCAACATACCTCACACGCGGTGAGTAGTTAGCAAACAAAGTG\n+TGTACCTGCTCCAATCTAGCAGCAGGTGTCATCTCGGTGCTGTTGAGTTTCTCATCCTTG\n+TATTTTAAACCCCTTTTGGTCAAAGTGAGGGCGTAACCGGTGCAGGCGTGCCATACTAGT\n+CTTGAAAGTGACCGTTTTGCGTGCTTGGTCAGTGGCAAGTAATAAACTTGCTTCGCATGT\n+TCGAAAGCCCTCTTACTATGCCAGGTGTACTCGAGTTCACTTTCTTCTGCCAGCAGTGCT\n+GCTGCATCTAGCCATTCTTACATGTCAGACTGATCAACGTACTGGCCTCCTGTTGCGGAC\n+GTGCTGNNNNNNNNNNATCTAAGGAGCAGCACGTCCGCAATGATGGCCCCATACAGTAGT\n+GATTTCTTGTCGAAACCACCAAATTCGAACACTGGCCTGTTGATACTTGAGCTAGTGATC\n+AACATCTTAGCGTTTTCTGTCTCTGCAGTGAAAACAGACCTGTACGTTGCAGCTCCAATA\n+ATGAGCTGTTTCAAATCCATTTGCTTGTCGTGCCATTTGGTCGGTGACACACATATTCCT\n+CTCTTGGTTCTGATGCACATTAGCGCTCTGTACAAGTTTACGACTGACGCTGCTCCTGTC\n+ACGTCGTTGATTGCTGACGTCAGTGACGCAAGGAACATCGATTTTGACGCCTGAGTTCCT\n+ATTGGCCTACCGCTTGACGTCACCGGAACTGCAATCCTGCAGTTGCTATTTGTGGTAACA\n+GCNNNNNNNNNNCAGCACGTCCGCAATTGCCTTCCATTGGGTTGCCAACTCCCAGTACCT\n+TTTCGTCACTCACCCTCAGTTCGAAGCCATATCGCCTGAGTTCTGGTGCGATAGCCATCA\n+CTCTCGCTTCTGCTCCCATTAACTGACCATTTATGATGTTGACATCAGATCTGTCGTTGC\n+CATCAGCAACGCAAGCGTCATCAACCCATGCCATCAGCGAGTTCAATGCTCCTGCCGTTG\n+CCCTCGCTGCTGATACATTAGTCAGCAGCACGCCTGCCTCATTTTGAGGGCCTTGGACAT\n+ACTCATCATGTTGCGATGTCACCGAACAAGCGTAGCTACCTTCAGACTGAACGTCAGTAG\n+TGTACCTGGTGCGCACTGCGCTACCTAGCACACTATCTGAAGTCAAAGCGCTTCTCGAGC\n+TCATTATGCCCGCCCGCATGAAAGGTTCATCTTTGTCAACCACTTCATAAACACCATGAC\n+CCGTCGGAGTTACAACGCTCTCGTCATGCTGCTTGTGTTTGCCACCAAAACCAGCGTATC\n+TGTTGGTGGCGATTATTGGCGTGACTCTTGCTGCGACTGATTTTTCAGCAGAGTTGTATT\n+GAAAAACTGGTACGCCCATCTTGTACAGTTGGCTGCAATGTCCCAA\n+>ds2020-267_2\n+CAGCACGTCCGCAACAGTTGGTCCTTGCTAACAGTAATGGCGACTTGTGGACACGCATGG\n+GCGCCTTGCATGGACAAAGTTCTGCAGTGGCCTGACATAACGAACACGTTT
ATGTCATCA\n+CTACTACTTGCCATGGCTGCACTACCACCAGAGTTATATGTACTAATGGTGGAGTGG'..b'TGCGGACGTGCTGCTCCTTAGATAGATCGGAAG\n+A\n+>ds2020-267_1209\n+CTTGCGGACGTGCTGAGTACAATATCTTTGCGGACGTGCTGCTCCTTAGATAGATCGGAA\n+G\n+>ds2020-267_1210\n+TTGCGGACGTGCTGACCGGATCTAAGTTGCGGACGTGCTGCTCCTTAGATAGATCGTAAG\n+A\n+>ds2020-267_1211\n+TTGCGGACGTGCTGAGGGGATTCGCTTTGCGGACGTGCTGCTCCTTAGATAGATCGGAAG\n+A\n+>ds2020-267_1212\n+CAGCACGTCCGCAAACGATAGGCGTTAGCACGTCCGCAAGTTTATACGCCTCAGCACGTC\n+>ds2020-267_1213\n+CAGCACGTCCGCAATGCAGACCCTTTAGCACGTCCGCAATTCGGCACTCTCAGCACGTCC\n+>ds2020-267_1214\n+TTGCGGACGCGCTGGACGTGCTGATCAAGGCGCATTTGCGGACGTGCTGCTCCTTAGATA\n+>ds2020-267_1215\n+TCTCCGATCTATCTAAGGAGCAGCACGTCCGCAATTACACCCACCTCAGCACGTCCGCAA\n+>ds2020-267_1216\n+CTTCCGATCTATCTAAGGAGCAGCACGTCCGCAAGTGATGATCTCTCAGCACGTCCGCAA\n+>ds2020-267_1217\n+GCAGCCGTCCGCAACCAACTGCTGCTCAGCACGTCGCAAGTCATATGGCCTCAGCACGTC\n+>ds2020-267_1218\n+GGACGTGCTGAGGACTGCTACATTGGGACGTGCTAAGCGAGCATGGTTGCGGACGTGCTG\n+>ds2020-267_1219\n+TTGCGGACGTGCTGAGGTATGGTAGATTGCGGACTGCTGCTCCTTAGATAGATCGGAAGA\n+>ds2020-267_1220\n+TTGCGGACGTGCTGAGGGATCCGCAGTTGCGGACGTGCTGCTCCTTAGATAGATCGGAAG\n+>ds2020-267_1221\n+TTGCGGACGTGCTGAGGGGCGTGCTATTGCGGACGTGCTGCTCCTTAGATAGATCGGAAG\n+>ds2020-267_1222\n+TTGCGGACGTGCTGAGCCATGCACACTTGCGGACTGCTGCTCCTTAGATAGATCGGAAGA\n+>ds2020-267_1223\n+TTGCGGACGTGCTGAGTCCCGACCACTTGCGGACGTGCTGCTCCTTAGATAGATCGGAAG\n+>ds2020-267_1224\n+CTTCCGATCTATCTAAGGAGCAGCACGTCCGCAAATCAAAGTACCTCAGCACGTCCGCAA\n+>ds2020-267_1225\n+TCTCCGATCTATCTAAGGAGCAGCACGTCCGCAACGTAGGATCTGTCAGCACGTCCGCAA\n+>ds2020-267_1226\n+TTGCGGACGTGCTGAGGATTCCACATTTGCGGACGTGCTGCTCCTTAGATAGATCGGAAG\n+>ds2020-267_1227\n+TTGCGGACGTGCTGAGGTAAGGAGCATTGCGGACGTGCTGCTCCTTAGATAGATCGGGAG\n+>ds2020-267_1228\n+CTCTCCGATCTATCTAAGGAGCAGCACGTCCGCAAGCGTTATCCCTCAGCACGTCCGCAA\n+>ds2020-267_1229\n+TTGCGGACGTGCTGCCCGGATAACATTGCGGACGTGCTGCTCCTTAGATAGATCGGAAGA\n+>ds2020-267_1230\n+TTGCGGACGTGCTGCCGAGATAACATTGCGGACGTGCTGCTCCTTAGATAGATCGGAAGA\n+>ds2020-267_1231\n+TTGCGGACGTGCTGAGAGAATCAAGTTGCGGACGTGCTGCTCCTTAGATAG
ATCGGGAAG\n+>ds2020-267_1232\n+TTGCGGACGTGCTACCCCGTGATACTTGCGGACGTGCTGCTCCTTAGATAGATCGAAAGA\n+>ds2020-267_1233\n+TTGCGGACGTGCTGAGGGGATCCTTATTGCGGACGTGCTGCTCCTTAGATAGATCGGAAG\n+>ds2020-267_1234\n+TCCGATCTATCTAAGGAGCAGCACGTCCGCAACCTGCTATCCGTCAGCACGTCCGCAACT\n+>ds2020-267_1235\n+TGAGTGAGTGAGTGAGTGAGTGAGTGAGTGAGTGAGTGAGTGAGTGAGTGAGTGAGTGA\n+>ds2020-267_1236\n+CCACTGCCACTGCCACTGTTGAGACTATCCCCCAAGCCAAAGGTATTGCGGACGTGCTG\n+>ds2020-267_1237\n+TTGCGGACGTGCTGACTGAGAGGGCATTGCGGAAGTGATCACGTATTGCGGACGTGCTG\n+>ds2020-267_1238\n+TTCCGATCTATCTAAGGAGCAGCACGTCCGCAAATTCATTCTGGTCAGCACGTCCGCAA\n+>ds2020-267_1239\n+ATTGCGGACGTGCTGAGGCACTGTTCGTTGCGGACGTGCTGCTCCTTAGATAGATCGGA\n+>ds2020-267_1240\n+TTGCGGACGTGCTGATGGGTTTCGTCTTGCGGACGTGCTGCTCCTTAGATAGATCGGAA\n+>ds2020-267_1241\n+CTTCCGATCTATCTAAGGAGCAGCACGTCCGCAACGTGCTATGCCTCACACGTCCGCAA\n+>ds2020-267_1242\n+TTGCGGACGTGCTGAGGGAACCGGCTTTGCGGACGTGCTGCTCCTTAGATAGATCGGAA\n+>ds2020-267_1243\n+TTCCGATCTATCTAAGGAGCAGCACGTCCGCAATCCGATTGCCCTCAGCACGTCCGCAA\n+>ds2020-267_1244\n+CTTCGATCTATCTAAGGAGCAGCACGTCCGCAAGAGATTACTCCTCAGCACGTCCGCAA\n+>ds2020-267_1245\n+TTGCGGACGTGCTGGGGAGTATTGCTTGCGGACGTGCTGCTCCTTAGATAGATCGGGAG\n+>ds2020-267_1246\n+TTAGAGGGACTATCGGCTCAAGCCGATGGAAGTTTGAGGCAATAACAGGTCTGTGCTG\n+>ds2020-267_1247\n+TTGCGGACGTGCTGAGGCGATACCTCTTGCGGACGTGCTGCTCCTTAGATAGATCGGA\n+>ds2020-267_1248\n+GTAAAAGCTCACTGGTAACCGGTCCAAAACGAAACTCTTAAAACAGTGGATACCCTCC\n+>ds2020-267_1249\n+CGGACGTGCTGAGACAATGGCGCTTGCGGACGTGCTGCTCCTTAGATAGATCGGAAGA\n+>ds2020-267_1250\n+CCCGATCTATCTAAGGAGCAGCACGCCCGCAAATGTACACCGGTCAGCACGTCCGCAA\n+>ds2020-267_1251\n+TCCGATCTATCTAAGGAGCAGCACGTCCGCAAGGTAGACGCCCTCAGCACGTCCGCAA\n+>ds2020-267_1252\n+GTGCTGAGCCAGACTACTTGCGGACGTGCTGAGGGAGCCTAAATTGCGGACGTGCTG\n+>ds2020-267_1253\n+TTGCGGACGTGCTGAGTGTTTACAATTTGCGGACGTGCTGCTCCTTAGATAGATCGG\n+>ds2020-267_1254\n+TGCTGAGCGACTATAAATTGCGGACGTGCTGAGGGATTCACCGTTGCGGACGTGCTG\n+>ds2020-267_1255\n+TTGCGGACGTGCTGACGGACGACTATTTGCGGACGTGCTGCTCCTTAGATAGATCGG\n+>ds2020-267_1256\n+GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG
GGGGGGGGGGGGGGGGGGGG\n+>ds2020-267_1257\n+GCTGAGGGCAGTGGGCTTGCGGACGTGCTGACGGATACGTCATTGCGGGCGTGCTG\n+>ds2020-267_1258\n+CTCACTCCTCAGCACGTCCGCAAACTGCTTCGGGTTGGGACGTGCTGAGGAGTGAC\n' |
| b |
| diff -r 000000000000 -r bbaa89f070f4 test-data/otu_s1_rps.tab --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/otu_s1_rps.tab Mon Mar 04 19:56:16 2024 +0000 |
| b |
| b'@@ -0,0 +1,109 @@\n+#query_id\tquery_length\tcdd_id\thit_id\tevalue\tstartQ\tendQ\tframe\tdescription\tsuperkingdom\tno rank\tfamily\tgenus\n+"ds2020-267_100"\t"376"\t"pfam02823"\t"gnl|CDD|376940"\t"3.06167e-09"\t"228"\t"347"\t"-3"\t"pfam02823, ATP-synt_DE_N, ATP synthase, Delta/Epsilon chain, beta-sandwich domain. Part of the ATP synthase CF(1). These subunits are part of the head unit of the ATP synthase. The subunit is called epsilon in bacteria and delta in mitochondria. In bacteria the delta (D) subunit is equivalent to the mitochondrial Oligomycin sensitive subunit, OSCP (pfam00213)."\t"Bacteria(0.97);Eukaryota(0.03);"\t"(1.00);"\t"Lactobacillaceae(0.05);Rhodobacteraceae(0.04);Streptococcaceae(0.03);Bacillaceae(0.03);Burkholderiaceae(0.02);"\t"Lactobacillus(0.04);Streptococcus(0.03);Bacillus(0.02);Mycoplasma(0.02);Synechococcus(0.01);"\n+"ds2020-267_100"\t"376"\t"pfam00401"\t"gnl|CDD|366077"\t"8.90041e-05"\t"87"\t"218"\t"-3"\t"pfam00401, ATP-synt_DE, ATP synthase, Delta/Epsilon chain, long alpha-helix domain. Part of the ATP synthase CF(1). These subunits are part of the head unit of the ATP synthase. This subunit is called epsilon in bacteria and delta in mitochondria. In bacteria the delta (D) subunit is equivalent to the mitochondrial Oligomycin sensitive subunit, OSCP (pfam00213)."\t"Bacteria(0.97);Eukaryota(0.03);"\t"(1.00);"\t"(0.06);Clostridiaceae(0.05);Lachnospiraceae(0.05);Bacillaceae(0.04);Peptococcaceae(0.04);"\t"(0.06);Clostridium(0.05);Lactobacillus(0.03);Bacillus(0.03);Eubacterium(0.02);"\n+"ds2020-267_114"\t"347"\t"pfam00471"\t"gnl|CDD|376336"\t"8.05888e-12"\t"132"\t"302"\t"3"\t"pfam00471, Ribosomal_L33, Ribosomal protein L33. 
"\t"Bacteria(0.86);Eukaryota(0.14);"\t"(1.00);"\t"(0.07);Mycoplasmataceae(0.07);Clostridiaceae(0.06);Bacillaceae(0.03);Lactobacillaceae(0.03);"\t"Mycoplasma(0.06);Clostridium(0.05);(0.04);Lactobacillus(0.02);Bacillus(0.02);"\n+"ds2020-267_117"\t"344"\t"pfam00252"\t"gnl|CDD|376306"\t"7.27175e-23"\t"107"\t"295"\t"2"\t"pfam00252, Ribosomal_L16, Ribosomal protein L16p/L10e. "\t"Bacteria(0.58);Eukaryota(0.29);Archaea(0.13);"\t"(1.00);"\t"(0.08);Clostridiaceae(0.03);Mycoplasmataceae(0.03);Spirochaetaceae(0.02);"\t"(0.04);Clostridium(0.03);Mycoplasma(0.02);"\n+"ds2020-267_118"\t"343"\t"pfam00421"\t"gnl|CDD|366090"\t"7.68219e-41"\t"92"\t"337"\t"-1"\t"pfam00421, PSII, Photosystem II protein. "\t"Bacteria(0.79);Eukaryota(0.21);"\t"(1.00);"\t"Gloeobacteraceae(0.14);Synechococcaceae(0.14);Prochloraceae(0.14);Acaryochloridaceae(0.14);Nostocaceae(0.07);"\t"Acaryochloris(0.14);Gloeobacter(0.14);Prochlorococcus(0.14);Synechococcus(0.14);Nostoc(0.07);"\n+"ds2020-267_120"\t"339"\t"pfam16639"\t"gnl|CDD|374695"\t"2.20279e-25"\t"197"\t"325"\t"-3"\t"pfam16639, Apocytochr_F_N, Apocytochrome F, N-terminal. This is the N-terminal domain of cytochrome f. It is a soluble lumen-side domain."\t"Bacteria(0.75);Eukaryota(0.25);"\t"(1.00);"\t"Synechococcaceae(0.25);Gloeobacteraceae(0.07);Prochloraceae(0.07);Aphanothecaceae(0.07);(0.07);"\t"Synechococcus(0.21);Prochlorococcus(0.07);Gloeobacter(0.07);Oscillatoria(0.04);Aureococcus(0.04);"\n+"ds2020-267_130"\t"330"\t"pfam00680"\t"gnl|CDD|366242"\t"7.64962e-05"\t"124"\t"282"\t"1"\t"pfam00680, RdRP_1, RNA dependent RNA polymerase. "\t"Viruses(1.00);"\t"Riboviria(1.00);"\t"Caliciviridae(0.30);Picornaviridae(0.30);Secoviridae(0.20);Potyviridae(0.20);"\t"Vesivirus(0.20);Aphthovirus(0.10);Sequivirus(0.10);Bymovirus(0.10);Potyvirus(0.10);"\n+"ds2020-267_139"\t"320"\t"pfam05860"\t"gnl|CDD|368641"\t"1.34887e-13"\t"167"\t"298"\t"2"\t"pfam05860, Haemagg_act, haemagglutination activity domain. 
This domain is suggested to be a carbohydrate- dependent haemagglutination activity site. It is found in a range of haemagglutinins and haemolysins."\t"Bacteria(1.00);"\t"(1.00);"\t"Nostocaceae(0.36);Burkholderiaceae(0.14);Pasteurellaceae(0.14);Pseudomonadaceae(0.12);Neisseriaceae(0.07);"\t"Nostoc(0.36);Ralstonia(0.14);Pseudomonas(0.12);Haemophilus(0.10);Neisseria(0.07);"\n+"ds2020-267_145"\t"315"\t"pfam02626"\t"gnl|CDD|376868"\t"3.97676e-05"\t"140"\t"256"\t"-3"\t"pfam02626, CT_A_B, Carboxyltransferase do'..b' in SelR proteins and fused with the peptide methionine sulfoxide reductase enzymatic domain pfam01625. The domain has two conserved cysteine and histidines. The domain binds both selenium and zinc. The final cysteine is found to be replaced by the rare amino acid selenocysteine in some members of the family. This family has methionine-R-sulfoxide reductase activity."\t"Bacteria(0.79);Eukaryota(0.16);Archaea(0.05);"\t"(1.00);"\t"Flavobacteriaceae(0.05);(0.04);Saccharomycetaceae(0.02);Vibrionaceae(0.01);Spirochaetaceae(0.01);"\t"(0.02);Mycoplasma(0.01);Vibrio(0.01);Corynebacterium(0.01);"\n+"ds2020-267_8"\t"1703"\t"pfam00680"\t"gnl|CDD|366242"\t"2.85682e-13"\t"685"\t"1458"\t"-3"\t"pfam00680, RdRP_1, RNA dependent RNA polymerase. "\t"Viruses(1.00);"\t"Riboviria(1.00);"\t"Caliciviridae(0.30);Picornaviridae(0.30);Secoviridae(0.20);Potyviridae(0.20);"\t"Vesivirus(0.20);Aphthovirus(0.10);Sequivirus(0.10);Bymovirus(0.10);Potyvirus(0.10);"\n+"ds2020-267_811"\t"208"\t"pfam07991"\t"gnl|CDD|285265"\t"1.80927e-08"\t"20"\t"190"\t"-1"\t"pfam07991, IlvN, Acetohydroxy acid isomeroreductase, NADPH-binding domain. Acetohydroxy acid isomeroreductase catalyzes the conversion of acetohydroxy acids into dihydroxy valerates. This reaction is the second in the synthetic pathway of the essential branched side chain amino acids valine and isoleucine. This N-terminal region of the enzyme carries the binding-site for NADPH. 
The active-site for enzymatic activity lies in the C-terminal part, IlvC, pfam01450."\t"Bacteria(0.76);Archaea(0.24);"\t"(1.00);"\t"Bacillaceae(0.07);Helicobacteraceae(0.05);Sulfolobaceae(0.05);Bartonellaceae(0.02);Leptospiraceae(0.02);"\t"Bacillus(0.07);Thermus(0.02);Tropheryma(0.02);Corynebacterium(0.02);Pyrococcus(0.02);"\n+"ds2020-267_817"\t"208"\t"pfam05656"\t"gnl|CDD|377540"\t"3.45664e-06"\t"86"\t"190"\t"-1"\t"pfam05656, DUF805, Protein of unknown function (DUF805). This family consists of several bacterial proteins of unknown function."\t"Bacteria(1.00);"\t"(1.00);"\t"Veillonellaceae(0.07);Sutterellaceae(0.06);Sphingomonadaceae(0.05);Rhodobacteraceae(0.04);Caulobacteraceae(0.04);"\t"Veillonella(0.04);Sphingomonas(0.04);Asticcacaulis(0.03);Dakarella(0.03);Prevotella(0.03);"\n+"ds2020-267_837"\t"207"\t"pfam04061"\t"gnl|CDD|367791"\t"2.43363e-18"\t"1"\t"159"\t"1"\t"pfam04061, ORMDL, ORMDL family. Evidence form suggests that ORMDLs are involved in protein folding in the ER. Orm proteins have been identified as negative regulators of sphingolipid synthesis that form a conserved complex with serine palmitoyltransferase, the first and rate-limiting enzyme in sphingolipid production. This novel and conserved protein complex, has been termed the SPOTS complex (serine palmitoyltransferase, Orm1/2, Tsc3, and Sac1)."\t"Eukaryota(1.00);"\t"(1.00);"\t"Saccharomycetaceae(0.15);Nosematidae(0.04);(0.04);Phaffomycetaceae(0.03);Salpingoecidae(0.03);"\t"Kazachstania(0.04);Thalassiosira(0.03);Trichomonas(0.03);Nosema(0.03);Nakaseomyces(0.03);"\n+"ds2020-267_94"\tno_hit\n+"ds2020-267_97"\t"380"\t"pfam04879"\t"gnl|CDD|368171"\t"1.9903e-08"\t"125"\t"274"\t"-2"\t"pfam04879, Molybdop_Fe4S4, Molybdopterin oxidoreductase Fe4S4 domain. This domain is found in formate dehydrogenase H for which the structure is known. 
This first domain (residues 1 to 60) of Structure 1aa6 is an Fe4S4 cluster just below the protein surface."\t"Bacteria(0.75);Archaea(0.25);"\t"(1.00);"\t"Enterobacteriaceae(0.11);Bacillaceae(0.09);Pseudomonadaceae(0.08);Methanobacteriaceae(0.06);Phyllobacteriaceae(0.06);"\t"Bacillus(0.09);Escherichia(0.09);Pseudomonas(0.08);Mesorhizobium(0.06);Synechococcus(0.06);"\n+"ds2020-267_98"\t"379"\t"pfam16203"\t"gnl|CDD|374428"\t"1.33948e-30"\t"131"\t"280"\t"-1"\t"pfam16203, ERCC3_RAD25_C, ERCC3/RAD25/XPB C-terminal helicase. This is the C-terminal helicase domain of ERCC3, RAD25 and XPB helicases."\t"Eukaryota(1.00);"\t"(1.00);"\t"Cryptosporidiidae(0.06);Vahlkampfiidae(0.06);(0.03);Opisthorchiidae(0.03);Chaetomiaceae(0.03);"\t"Naegleria(0.06);Cryptosporidium(0.06);Micromonas(0.03);Batrachochytrium(0.03);Caenorhabditis(0.03);"\n' |
| b |
| diff -r 000000000000 -r bbaa89f070f4 test-data/otu_s1_tblastx.tab --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/otu_s1_tblastx.tab Mon Mar 04 19:56:16 2024 +0000 |
| b |
| b'@@ -0,0 +1,57 @@\n+#algo\tquery_id\tnb_reads\tquery_length\taccession\tdescription\torganism\tpercentIdentity\tnb_hsps\tqueryOverlap\thitOverlap\tevalue\tscore\ttax_id\ttaxonomy\tsequence\n+TBLASTX\tds2020-267_392\t26\t240\tNC_005979\tHelminthosporium victoriae 145S virus\tHelminthosporium victoriae 145S virus\t40.0\t1\t100\t6.0\t1.12512e-11\t66.1329\t164750\tViruses;Orthornavirae;Duplornaviricota;Chrymotiviricetes;Ghabrivirales;Chrysoviridae;Chrysovirus;Helminthosporium victoriae 145S virus\tCAGCACGTCCGCAAGTTGTCCGGCTTAAATCTTTAGCCCCTAACTTAAGTGCCGCTACAGCTCCATTTTCTATTACTTTTTTTGTTCTATCACATAACCACATTCCTTCGAAGACTGATAGTTGTGAAATTTCATATATGTTGTCTTCATCTAAGTAATATAGAAATTTGAAACTTGGTGTTGCGTCCGTTAAACGTAGATCAGTGAAGTACGCACCCATTCGTAGTTGCGGACGTGCTG\n+TBLASTX\tds2020-267_268\t14\t259\tNC_001963\tSphaeropsis sapinea RNA virus 1, complete genome\tSphaeropsis sapinea RNA virus 1\t62.0\t1\t100\t5.0\t1.78772e-31\t132.115\t73497\tViruses;Orthornavirae;Duplornaviricota;Chrymotiviricetes;Ghabrivirales;Totiviridae;Victorivirus;Sphaeropsis sapinea RNA virus 1\tCAGCACGTCCGCAATGCATGGCCTCTGAGTTTGTGGAAACGAACCCTCTGCCAACCTGGGACGGCACGACCCACGTCTCTAAGTCTGCCAAGTTAGAACACGGGAAGACCCGCGCAATATTCGCCTGCGACACCCGGTCGTATTTTGGGTTTTCGTGGATCCTAGATGCGACCCAACAGGCCTGGAAGAACGAACGGGTTGTCATGGATCCGGGCAAAGGTGGGAAGTGTGGGATGACGCAACGATTGCGGACGTGCTG\n+TBLASTX\tds2020-267_4\t1434\t2297\tNC_038699\tXanthophyllomyces dendrorhous virus L1b capsid protein (CP) and RNA-dependent RNA polymerase (RdRp) genes, complete cds\tXanthophyllomyces dendrorhous virus L1B\t44.3\t8\t100\t64.0\t1.9240409540575e-07\t928.6219\t1167691\tViruses;Orthornavirae;Duplornaviricota;Chrymotiviricetes;Ghabrivirales;Totiviridae;Totivirus;Xanthophyllomyces dendrorhous virus 
L1B\tCTTCCGATCTATCTAAGGAGCAGCACGTCCGCAATTCAGCTACTCTCAGCACGTCCGCAATACTAACAGCTCAGCGCGTCCGCAACACGACTGGGGCACACGTCCGCAACGTCCACGACTTATCACTTGCGGACGTGCTGCTCCTTAGATTCTGGAAGTAAGGACGTTCGGGCTTCCTATTCCGCTTTATTAAGGTACACTAAGTAACTATTCCTATTAACTCCATCGGGTCCTTAGATCCTTTGAGTAGGTGTAGTAGTGGGCCGCTTTTATTGGTCCCATTCAACACGTCCATCAGGAAGCCGGTCATTTTGGCTTTTCCGTAATTGACTATGTCTGTTTCTTCTTTGTGTACCTTGAATAACGCTCGCAAAATCTCGTACCGCTTGATGTTACCATTCCTTGAAACGTTTATGTTCCTTTCTTTTGGTATCACCGCGTCGTATGTTGCGCGCATTATTCGTGAAATGAAATCCTGCAAAGGTCTTTCAAGTTGTAGTGATGCTTTCACCATCCTGGAGTAGTCCACCACTCCCGGTAACACGCCTATCTGCGTGGCACCCTTCCTGAAGCCCGACGACCGTATCATCCACTTTACGTCCGATCTCTTGTCTTCACTTATGCCCCCCACACACCTGTGGGAGGTCTTTATTTTATAACAGTCTGAAACTGTCATGTGCAATCGCTCACATTGTCTCGAGTAGTATTTGTTCCTCAATGACGCCGCTAACCACATTGGCATTCCCCTAGACACTGAATCGTCCAGACGCGACTCCAGCGCTTCCAAAAGGTCCCTCATATCGCTTGAAGGTTTTGACTCTATCCTCGAGTGTACCAAGGTTGCCATAGCTCTAGATAGATACTGTCCCTTAGACCCACGTTTGTGATCTACGCGTAGAAACTCTGCTATGGCCCCATACGCACATTTGCTCATCTGCAGGCGTATGTTGTGCTTCTTGGCATTTTTGCCAGCTAGCAACACGTCCTCGAGCGAATTACTTCCCAGTAGCACGTCGTCACCGTTGTGGAGGCTGTTTTGCGATTGTACCACGTCAGGTACTATCAGTTGAGTGTAAATGTAGTTAAGCACGCTGTTCATGAACGTAGTGAGTCGCCACCCCGATAACAGGGTCCCCTTAGCGTTGTACTCCATTTTCAAGCCTTGATTGTCGTGTACTATTACCCTATCCAGTGAAAGCCGAGTCCACTCCACAGCTGCTAGTTGCTCCTGAGTCAGGAAGTGTCCGAAAGTATCTCTGTACGCATCTATTACTGCTTTCATAGATTGTACACTGTGTTGACTGTTGAAATCCTCGAAATCTACACAATACTGAGTCCTGCCTTCTAAGACTGACCTTACTCTACTGCGGACGTTCTCATCGTTGGCTGCTTTTCCCACCGGGAACGGCGAGGGCAATACGTCCTCGCAGTTATAGAAGGCGAAATGTGCCAATACGTAACTAGTGACATCTGTCCCGTAGATAGCGCGGAGTTTGCTCCATTCATACTTCGTGGATGACCATGCGTGAAGTTCAGGATCTCTTTCGCGCCACGAGTCCATATTCATATCCGGCATGGCCAGTATTGATATGAACTTGTTCTTGAGGTATATGTCTTTGAATATGTATTTATCGTCTTCTGAATATTGCGAGTGTATGCTGCCGGCCGCACTCCACTGCCACCTACTCTGCCAGTACTCCCGCCAATCAAACTTCCTCGGTCTCTTACCCGCTGAGATCGATCTACTGAAGAGCTGCGAGGCCCTTTCATAAACCAATCCCTCCGGCATCTCGGCCAAGTTAGGGGACACCCTGTTCTTGTGCTCCTCCTCCCAGTTGACCAGTCTTGATTTGCGGACGTGCTGCTCCTTAGATANNNNNNNNNNGGACGTTCGGGCTTTGCGCTGGCAATGGAGAACAGTCCTGACCCTCTAGCGAGCTGCATCTCCTCGGGGGTGAGACCAGCTGCCCACAGTGCCACGCCCGTAAGGAATGA
GTTAGTAGCTTCTCTGGTTATTGATAAGGCCAGAGCTACGCTGTCAGAGTTGACTCCCAAAATGTCTACCACCTCCTTGAACGAAAAGTGAACATGATGCGACGCCGTTATCTTGGTGTGTTTTGCCGACATTGCTTCATGTAACTGCCACCCTCTGCCTTGCTGTCCATTTACTTTCCTCAATAATCGCTTCGGAGACACAGGGTCCTCAAAGTCGATAGAATCGTAAAGACCTGAGGTGTGCCTGGTCATTTGAGAAAGTATTTCTTTGCGTATACCCCAAGATCTTTGCGGACGTGCTG\n+TBLASTX\tds2020-267_18\t840\t1037\tNC_016760\tRosellinia ne'..b'a;Chrymotiviricetes;Ghabrivirales;Quadriviridae;Quadrivirus;Rosellinia necatrix quadrivirus 1\tCAGCACGTCCGCAACTACGACACGTTTTTGTAGCGCTCCCTGGCCGTGTCTGCGTTCTGCATTGCTCGCGGCAAACACAGCAGGTTGTGTGCATGATCAAGTTGCAGACGCGTAACGTCCTCATCTTTTAGACACATTTGTGACCGCTGTGCCACGTGCACCTCTGCCATCGCTCTACCGCTGCCCAACATCTGGATGCTCGAGCAGATCATGCTGACCATCTCATCACCGTTGATTGACGAATTCTCACTGATCTTGTTAAACTGAGGGCTGTTAGATGATGAAAACAGCCTGTCAACCGTTGGCTCGCAAAAGGGTTGTACCACTTTGTACACCCCTGCGTAAAACACGTCAGCATAGTCGTGGTGGGACAAGGCTGCAGGCTGGTAGGTAGACGCCACAGCTAGCGACGCCATTAACATCACGTTTTTAGAAGCCATGAAGTGCCTGGTCGTGACGTGTCCAATGAGGTCCAACATGTCGTTGATCGAGAAACCAACACTGTTGAACCAGTCTCTGTCGAGGTGGGGCGCTGACGACCCTACACCTGCAATGAAGGACACATTGCACTCACTTGCCACCGCCATCAACTTCTGATGCCCCACAGCAGCCTCAATGGAAGGCGAGAACCCCGTATTTGCGGACGTGCTG\n+TBLASTX\tds2020-267_5\t37987\t2029\tNC_023684\tRhizoctonia solani dsRNA virus 2 segment 1, complete sequence\tRhizoctonia solani dsRNA virus 2\t47.6\t8\t100\t100\t3.1306275000000004e-37\t1379.9565000000002\t1411681\tViruses;Orthornavirae;Pisuviricota;Duplopiviricetes;Durnavirales;Partitiviridae;Rhizoctonia solani dsRNA virus 
2\tATCGCACATGATAAAGCCCGATATCTAAGGAGCAGCACGTCCGCAACCCTCTGCCTCCAACAATAAAGCAGATTTCTTTGCTCTTCTAACAGCTATTACTTACCACAATGGACCACCTCACTTCCCTTTTCGAGCTTTTTGCTATCACACCGAAAACACAAAACAATCTACAGTTTGTTGGGATCTACCACAGACCTCCACACTCCGTTCGAGCAAACCTCCGCAACGTTGAAAAACACAAAATCACAGTCGCTCACGCCATGCACAAGTACCTTTACCCGCATGAAATCGACTTTGTTATCAACCAAATGCGACGCTCAGACGTCACTGAAGATGCCATACTTGCTGACTTTTTCGACAACAACGTCGAACCACTTGAACCTGTTCTTGACGAACACTTCGAACGTGGACTCTCCGCAATGCTGGACGCTTTTCGCCCTCCGCAGAAATGCCTACCTGCCCACATCTATGATGTGCAGCACCACTACCCATATAAATGGCAAGTGAACGCTGAAGCCCCCTTCTCCACCGATTCCTATTTCTTAGCGAATCGACCAACCTTCCGCGCAGTGTTTGAACGACTCGAATCGCTCTACACACACCTCGCAACCGATTGGCACCGCCGATACGGAAACAAAACCGACAATGATGATTTTATGAATGATCATGTCCCTGCGAAATTTGGCCCTATGAAAGAAACAGTCTTCTCATGGACTCACCGATGGCACCACGTCATCAAATCCAACTTCACCGACACAGCTGGATTGTCTAAAGACTATTACTTCAAAAACCGATACATCTTCCCAATGCTACTTCACACGAAGACAGCGATTGTCAAGAAAGACGACCCGAATAAGATGCGAACCATCTGGGGCTGTTCAAAGCCTTGGATCATCGCAGACACCATGCTATGGTGGGAATACGTCGCGTACGCTAAGTTACAACCTGGAGCCACACCAATGCTCTGGAGTTACGAAACCTTCACAGGTGGCTGGCTTAGACTCAACCACGCACTTTTCTCTTCATACATACGGCACTCGTACATCACACTCGACTGGAAACGCTTCGACAAGAAAGCGTATTTCTGCATCATCGACAAAATTTTCGATGGCGTTGAAACATTCCTCGACTTTGACAACGGCTATTTGCCTACGAAAGATTATCCCGATACCAAATCGACTTGGACACAAGAACGTTCCACCCGCCTCAAACGCCTGTTTGACTGGACAAAAGAGAACTTCTACCATGCACCAATTGTCCTACCCAATGGGCACATGTACGTCCGAAAATTCGCTGGAATACCCTCTGGCCTATTTATCACTCAACTGATCGATTCCTGGTACAACTACACCATGCTCGCAACCATCCTATCCGCGATGGGCTTCGACCCTCGGTCCTGTATTATTAAAGTCCAAGGTGATGACTCAATCATCCGCCTCAGTGCACTCATCCCTCCGGATGCTCACGATTCTTTTTTAACTAAGGTCCAAGAACTCGCCGACTACTACTTTCAATCAGTAGTCTCCGTGAACAAGTCTGAAGTACGCAACGAGCTCAACGGATGCGAAGTTTTATCGTACCGACACAGACACGGTTTACCATACCGCGATGAACTAGCTATGCTAGCTCAACTGTATCACACGAAAGCACGCAACCCAAGTCCCGAAATCACAATGGCACAATCCATCGGCTTCGCCTACGCTTCCTTCGGAAATCATGAAAGAGTACGTCTCGTACTACATGATATCTACGAATATTACAAGCATCAAGGCTACACACCCAACCGAGCCGGACTCAGCCTCGTCTTCGGAAACTCTCCTGACCTCATGATCCCGCACTACACACTTGATCACTTTCCCTCAATCAGGGAAATAAAAATGTTCCTGACTAATGCAAAATATGCCAATGAAGAAACCAACTCACGAACGTGGCCTTTAACCCACTTTCTCCATCTTCCTTGTCATCGCACTTAGTATTTGAGCAATTGCAATTACAAC
ATAATTACAAAAAAAGGATTGCGGACGTGCTG\n+TBLASTX\tds2020-267_43\t465\t563\tNC_021222\tCryphonectria parasitica bipartite mycovirus 1 strain 09269 segment RNA1, complete sequence\tCryphonectria parasitica bipartite mycovirus 1\t57.9\t2\t100\t21.0\t5.04473e-43\t192.2912\t1329781\tViruses;Cryphonectria parasitica bipartite mycovirus 1\tCAGCACGTCCGCAACTACTCTCCCTTGCATCAGGCGGTGAAATGCGCTCTCTGCGTCGGCTTGTGCCAGGGGATTCGCCTCTTCCCTGGTCTTAAGTCCCATTGTAGCGTACTCAATGCCGGCGAACTTCTTGGGGTAGAACTTAACGGTGGCGAGATCTTCCACTGCCGGGAAGAAGAGGAAGTCGGGAAGGTCGAGTAACTTTGCCACTACATAGGCGGCCTTCTCTATGGAGTCGGTGTAGCGGGGGTCAGGGGTGTGATTTGGCCGATCGAAGTGGCGAAGGTGTGCCATCTCAACTGGGGTGCAGGCCTTGACGAATGTGAATTCCTCCGGCTTGAAATTGTTGATGGGGGGGGGATGGAGGAGGGTGTAGGCGACAACTTCGGGGTCGGGTGGCGCAATGTCGAGTTCCTTGTTTAGCTTCCCATAGGCTTCAACCCTACCGAGGATTCGTATATGTTGGAACATGTTGGTGAGTTCCATCACTTCGCGGGTCGCGGCCTCTTTGGACCTCATGGATCGCATACGCGTCCTACGCTTGATAGATTGCGGACGTGCTG\n+TBLASTX\tds2020-267_453\t17\t232\t\n+TBLASTX\tds2020-267_352\t4\t245\t\n' |
| b |
| diff -r 000000000000 -r bbaa89f070f4 test-data/otu_s2.fa --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/otu_s2.fa Mon Mar 04 19:56:16 2024 +0000 |
| b |
| b'@@ -0,0 +1,6446 @@\n+>ds2020-328_1\n+GGTCGGGATAGACGTTGGAGCGCGGTCAGCCGAGACCCCTGACAGAGGAAAGAGTCTTGA\n+GGAGTCCAACGTTCGGCCAGGCATAATAATTCGTGCCCACTAATCGAATCGGTTTACTCG\n+CCCACCATGTCAGCGCCTTCGGGTTAGTTCTTTATGGAGTTATTTCTTGTTCTTTCTGTC\n+ATAACAATTCTCCTTATGGAAGTCCCCCACAAGATTAAGAATGCTTGTTGCCGCGGCTGT\n+CTTGGTTCTTCGTTTAGCTTTCTGGCATATTCTACGGAGCTTTCCACTCTGGAGCTGGCT\n+TTCAGAAAAACCTTCATTCACTAGATGTGAAGCACCCCTATGGGGGAACCCTCTTACCCG\n+CTTTCCTCCCCCCCCGATATGGGGGGCCTCGCTGTCGCCTTTGGCTCGAGCTACTTTTTC\n+TCCTGGAACGGATAGCTTTCTGTCCAAGTCTATCTCCCAAAAGTCAGCCATGTAATTGAC\n+TTCTAACGTCTAATTTTCTTTTTCACCGGGGGTCCCTGATCCCCGTTGAATATTCCTTCC\n+TTCTGAAAAAGCTGTGACTCCTAAATTCTTTGATTGAATGAAATGTGGACTTGGTCACGG\n+GGCAATCTTCTTTTTTAGACCCCGCTTCTCTCGGTGTTACCTTTTTCGCCTTCTCGTCTC\n+GCTTCGCGTCGTCNNNNNNNNNNGGTCGGGATAGACGCGCCGCTTCCGTCTGTTTTACCT\n+TGTTAGAATTCTCGGCGCGCTTGGCGTCGTCTTTAGAGTCAATTCTATTGGAATCTCCTT\n+TCCCCTTCTTTTCTTCCCCCACGATGAAAAAAATAAATATTGCAAAAAGAACAATATTTC\n+CCCCCGTGGTCATCATAGTGGTTCCTTCTGATCCTAGAAAACGATTAAAAAGAAGGCAAA\n+AAAAACAAAGGAGGATCTGCTTTTTCATATTAAAGCGCTTTCTTTCTTTTAAAAATACAT\n+CATAGTAAAGCGCTTCCTTTTAAAAAAGCATCTGGTTCCATCTTTCTTTCGTTAGTTAAC\n+CCACCTTTTTCTAAAAGGGATTGTAGTAATTCTGGTTTTACACTATTTAGAATGGCTCTC\n+TCATATTGAGAGATTCTGTCTAGTGGCATTCGATCACAGAATCCATTGACAGCTGCATAA\n+ATGACTAGAATTTGTTTTTCAATTGGAAGTGGTGCATATTGTGGTTGTTTCAGTACTTCT\n+GTAAGCCTTGCACCTCTATTGAGTAATGCCTGAGTCGCAGCATCAAGGTCTGACCCAAAT\n+TGAGCAAAGGCCGCCACTTCGCGATACTGTGCCAATTCCAGTTTTAAACTACCGCAGACC\n+TGTTTCATAATTTTCAACTGAGCGGCAGACCCGACGCGACTGACAGATAAGCCGACGTTA\n+ATAGCAGGTCTAATTCCGCGATAAAAGAGCTCTGTTTCCAAACAGATTTGTCCATCAGTA\n+ATGGAGATTACATTGGTTGGAATATAGGCCGATACGTCTCCAGCTTGTGTTTCAATGACG\n+GGTAAGGCGGTCAAGCTACCTGCACCTGTCTGGTCCGATCGTTTAGCGGCTCTTTCTAAG\n+AGACGGGAATGTAAATAGAAAACATCGCCTGGGAAAGCCTCACGGCCTGGTGGTCGGCGT\n+AACAATAATGACATTTGTCGATATGCCACCGCCTGTTTACTAAGATCATCATAGATTATT\n+AATGCGTGCATTCCATTATCGCGGAAATATTCCCCCATGGCACACCCAGAATATGGGGCC\n+AGAAATTGCAGAGGAGCTGGATCCGAAGCGGTGGCTGCTACAAGAATGGAATATTCCAAA\n+GCATTCGCTTCTGAAAGAATTTGAACTAATTGTGCCACAGTCGAGCGTTTCTGTCCAATT\n+GCTACA
TAGACACAATACAATGTCTCACTCTCAGAGGTGGCCCTTGAGTTCAGTTGCTTT\n+TGGTTTAATATGGTATCGATAGCAATAGCTGTTTTTCCAGTTTGTCGGTCCCCGATTATA\n+AGTTCTCGTTGACCACGGCCTATAGGAACCAGGCTATCTACCGCTTTTAACCCTGTTTGC\n+ATAGGCTCGTGCACAGATTTACGTTCAATAATCCCAGGGGCTTTCACTTCGACACGTCTT\n+CGCTCGTGATCGCTTAGAGCCCCTCTTCCATCAATAGGAACTCCCAACCCGTCGACCACG\n+CGCCCTAGCATAGCCTTTCCCGCAGGAACATCCACAATGGATCCAGTGCGCTTGACAAGA\n+TCTCCTTCTTTAATAGCGGTATCACTACCAAAGACAACAATCCCTACATTCTCATTCTCA\n+AGATTCAACGCTATTCCTTTCACACCGCTGGCAAATTCAACCATTTCCCCAGCTTGAATC\n+TCGTTCAATCCATAAACACGTGCAATCCCATCTCCAACTGAGACCACTCGACCGATCTCA\n+TCCACTTGAAAATTCGTGTAAAAGTTGGTAATTCTACTTTCTAATAGAGTTGTTAGTTCC\n+GCAGCTCTGGTAGAGAATTCCATAATTTTTTCTTTTAAAGAAAGTCAAGGGAGAATTCCG\n+CTTATTGTTTTTGGCTCGAAATAAAGCTAGGGTCCTGATCGAGCAACTAGTAGTCCTATC\n+TATCCACCTCTCCAGAAGGGCTATTTGGGGTCTAATTTTCTTTCTATCTGACAGGACAAA\n+CAAAGAGGAAGGGGTGGTTCTTTCATTGCATTGATAGAAGTCTAACTAGAAAAAGATCTC\n+TCTATTACTTTGAGAAGAGAATCGTTGGTTTGACCGACGAACTACGTGGGAAATATGAGT\n+TGAGAGGACAAGAGGATTCGATCTCCACGAAAGGCTAAAGGAACATAAAAAAAGCTAGAA\n+TTTGTTGCAAACAGTGACCGAGATGCCAGGGAAAAACTGTTGTTTCACATTTCCGGAAAG\n+ACCACCTATTTGTTCGTTTACCAGGTTCGGTACGAAATCATAAATAAGCTCTACCCCGGG\n+CCATCGCCTTATGGCCTAGGGGCGTCTATCCCGCC\n+>ds2020-328_2\n+CCCCCCCTTTCGCCCTTTTTTATGCAGACGATTCCCCGATCGGGGAATCGTCTGCTTCCC\n+TACGTATTAATCTTCTTCTTTTCTCCTTTTTCGCGTTTTCCTCTTATTCCTCTTTCGTTT\n+TCCTCTTATTCTTTTTATACGCAATTTCTTTTTTAATTTCTTACTGGTCTAAGTCCCACT\n+CCTCTTTCTCCCCGTTTTGCGTTAAGAATATTTCACATGGCATCGGTTTATAGCCTTTTT\n+CCCTTGTCATCTCCTCTACAATCTTTTCTATTTTTTCATATTTCTTCTTATAAAATTCTT\n+CCTCTTCTCTTCTCTTGCTTACTTGTATGGTTGCCGGAAATACTCTTGTTTCGCCTATTT\n+GTATTTGTAGAGGCCATGTAGCATAGTCGTTTCCTTGTTGAGCCCCCTTACTTCTTTTAA\n+CTTCCATGTAGCTTCTTGCTGTCCAATCCCTTTTATCGTAGAATATCCTTTTAATTTTCT\n+TTGTTTCTGAGTTTTCGTCTTGCTCTCTCTCTCCTTCATTTTCCTCGTCGCTTCCCTCCT\n+GGTTTTCCTCCTCATATTCTTCCTTACTCTTAAATAGCTGCAAGAATCTCCTTCTTTTTT\n+TCTCCTCTTTTTCTTCTTTCTTTTTTACGGGTATACACGCAAAATCTAACAGTGCCATTT\n+CCTTCTCTTGTTCGCCCCATGTGTAGCATTCCTCACCATCGATTGTTTGTAATTCAAACA\n+TCAAATAACTTCCTCCCGCTGTGGTTTTAATCTTTTTTATTTTTACTTCATATATTTTTC\n
+CTTTTTTTGTTATAAAGATAACTTCTTTCTTTAGAATGTATTCTGCGGCCTCTGGGTTCA\n+ACTCCC'..b'AGTTCCTTTCCACCTGCCGTGGACCTAGTAGAGTTGCCCCTGCCG\n+TGGACCTAGTTCCTTTCCACCTGCCGTGGACCTAGTAGAGTTGCCCC\n+>ds2020-328_950\n+TTCTTCGATGATGCGCAAATTGAAGCTTCGGTACCCTTGTTTTTTCCAATCGCCCAATTT\n+CTCTTCTTTCTTTTTTATTTCGGGTTTCTTTATTTTTTCAGGGGGG\n+>ds2020-328_951\n+AACCAAGAGCGCTTTTTCTTTCCATTCGCCTGGGACAAGGCCTCCCATCACGCTTCCATT\n+GAAGAGTTAATCCTTCAAGTAGCGGTGGTGCACCCTGCCTGTACT\n+>ds2020-328_952\n+TGGAGTGATGGTGGCTTGTATGGTTTCTTAGGTGGTAGGGGTGGTGAGTAAACTGGTGGC\n+TTGTAGACTGGTGTGGGTGGTGGGGGAGACTTGTAGTGGTA\n+>ds2020-328_953\n+AGCTTGGAGTGTGGTGAGGAACAGGCAGGAGAGAAGCTTGGAGTGTGGTGAGGAACAGGC\n+AGGAGAGAAGCTTGGAGTGTGGTGAGGAA\n+>ds2020-328_954\n+GCTTAGGTGGTGAAGGTGATGGTGGTGGTGGAGACTTGTAGTGGTATGGGTGCTTAGGTG\n+GTGAAGGTGATGGTGGTGGTGGAG\n+>ds2020-328_955\n+GATTGATTTCTTGTTTATTGGCGTCAGTGGTGAGGTTTGACACCATGTGTTTGAAGGAGG\n+GATCGTGTGCGTCTATCCCGACC\n+>ds2020-328_956\n+CGTCTATCCCGACCAGCGATTGTGACCGTCTATCCCGACCAGGGCACTGCCACGTCTATC\n+CCGACCGCCGTAATTCAGATC\n+>ds2020-328_957\n+TCGAAGAGGGGCTTGCTAAAGAGGCTCGAAGAGGGGCTTGCTAAAGAGGCTCGAAGAGGG\n+GCTTGCTAAAGAGGCTCGAA\n+>ds2020-328_958\n+TGAATTACGGCGGTCGGGATAGACGAGCTAAACGCCTGGTCGGGATAGACGAACGTATTC\n+TCTGGTCGGGATAGACG\n+>ds2020-328_959\n+GGAACACCAGTGGCGAAGGAAGGAACACCAGTGGCGAAGGAAGGAACACCAGTGGCGAAG\n+GAAGGAACACCAGTGG\n+>ds2020-328_960\n+GAATTACGGCGGTCGGGATAGACGCGAAGCACCGGTGGTCGGGATAGACGGCCGACCCCC\n+TTGGTCGGGATAGACG\n+>ds2020-328_961\n+TGGATTACGGCGGTCGGGATAGACGTTGACCGTGCCTGGTCGGGATAGACGAGTGAACCG\n+CTGGTCGGGATAGACG\n+>ds2020-328_962\n+TGAATTACGGCGGTCGGGATAGACGCCAATCGCCCCTGGTCGGGATAGACGCAATACCAC\n+CCTTCGGGATAGACG\n+>ds2020-328_963\n+CGTCTATCCCGACCAGGTCCCATTTTCGTCTATCCCGACCAAGGCTAAATGACGTCTATC\n+CGCCGCCGTAATTCA\n+>ds2020-328_964\n+GAATTCGGCGGTCGGGATAGACGAAAGGAGCGGGGGTCGGGATAGACGATTGCTTGCCTT\n+GGTCGGGATAGACG\n+>ds2020-328_965\n+TTAGACGTCTAGTGTCCCTGGTCGGGATAAACGGGGCAAATCGTCTATCCCGACCAGGGA\n+CACTAGACGTCTAT\n+>ds2020-328_966\n+TCTATCCCGACCAGGGAAATACACCGTCTATCCCGACCAGGAGCGGGGTTCGTCTATCCG\n+ACCGCCGAATTCA\n+>ds2020-328_96
7\n+AGGTAGTTTACTTGCTTACTTGTTAGAGTAAGGAAGAGAGGAAAAGGGTGCTGTCGTCTA\n+TCCCGACCC\n+>ds2020-328_968\n+GTGATGGTGGTGGTGGAGACTTGTAGTGGTAGGGGTGCTTGGGTGGTGAAGGTGATGGTG\n+GTGGTGGAG\n+>ds2020-328_969\n+CGTCTATCCCGACCAACGTCTATCCCGACCAGTGGTATTAATCGTCTATCCCGACCGCCG\n+TAATTCA\n+>ds2020-328_970\n+GGTCGGGATAGACGCCGCGGTATCCTGGTCGGGATAGACGGTAATATTGCCTGGTCGGGA\n+TAGACG\n+>ds2020-328_971\n+GGTCGGGATAGACGAGATAATTACCTGGTCGGGATAGACGTCATGAGTCCCTGGTCGGGA\n+TAGACG\n+>ds2020-328_972\n+CGTCTATCCCGACCAGGCGGTGTATACGCCAATCCCGACCAGGGGGTTAGGGCGTCTATC\n+CCGACC\n+>ds2020-328_973\n+GGTCGGGATAGACGGAACATGACGCTGGTCGGGATAGACGACATAGACCCCTGGTCGGGA\n+TAGAC\n+>ds2020-328_974\n+TTCGAGCCTCTTTAGCAAGCCCCTCTTCGAGCCTCTTTAGCAAGCCCCTCTTCGAAAGAT\n+TCTTT\n+>ds2020-328_975\n+GGTCGGGATAGACGATCACACCCTGGTCGGGATAGACGCACGAACAGGCTGGTCGGGATA\n+GACG\n+>ds2020-328_976\n+AGTGATGTCAGTGATGTCAGTGATGTCAGTGATGTCAGTGATGTCAGTGATGTCAGTGAT\n+GTCA\n+>ds2020-328_977\n+CGTCTATCCCGACCAGGGGCAAATTGCGTCTATCCCGACCAAATCCTTCGTCTATCCCGA\n+CCG\n+>ds2020-328_978\n+GTAGGTCCGTAGGTCCGTAGGTCCGTAGGTCCGTAGGTCCGTAGGTCCGTAGGTCCGTAG\n+GTC\n+>ds2020-328_979\n+GGTCGGGATAGACGGATGGACTCGTGGTCGTGATAGACGTTTCCCAGTCTGTCGGGATAG\n+ACG\n+>ds2020-328_980\n+TCACTTTTCACTTTTCACTTTTCACTTTTCACTTTTCACTTTTCACTTTTCACTTTTCAC\n+TT\n+>ds2020-328_981\n+CATACGTCATACGTCATACGTCATACGTCATACGTCATACGTCATACGTCATACGTCATA\n+CG\n+>ds2020-328_982\n+GGTCGGGATAGACGAACAAAGACACTGGTCGGGATAGACGCTCGATACTACTGGTCGGGA\n+TA\n+>ds2020-328_983\n+TTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGG\n+T\n+>ds2020-328_984\n+GTGGTGGAGACTTGTAGTGGTATGGGTGCTTGGGTGGTGAAGGTGATGGTGGTGGGGGAG\n+>ds2020-328_985\n+CATCCATCCATCCATCCATCCATCCATCCATCCATCCATCCATCCATCCATCCATCCAT\n+>ds2020-328_986\n+TTCCGATCTGAATTACGGCGGTCGGGATAGACGATAGCCACCACTGGTCGGGATAGACG\n+>ds2020-328_987\n+GTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTGTG\n+>ds2020-328_988\n+GGTCGGGATAGACGCATGCATTGTCTGGTCGGGATAGACGGCGCGCTGCACTGTCGG\n+>ds2020-328_989\n+GGTCGGGATAGACGTTAGCACCCACTGGTCGGGATAGACGAGACAGTTAGCTGGTCG\n+>ds2020-328_990\n+GGGGGGGGGGGGGGGG
GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG\n+>ds2020-328_991\n+GGTCGGGATAGACGTAGACAGCCCCCGGTCGGGATAGACGTTAATTTCTGCTGGTC\n+>ds2020-328_992\n+GGTCGGGATAGACGATTATGCTCTCTGGTCGGGATAGACGTGTGACTCCCCTGGTC\n' |
| b |
| diff -r 000000000000 -r bbaa89f070f4 test-data/otu_s2_rps.tab --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/otu_s2_rps.tab Mon Mar 04 19:56:16 2024 +0000 |
| b |
| b'@@ -0,0 +1,149 @@\n+#query_id\tquery_length\tcdd_id\thit_id\tevalue\tstartQ\tendQ\tframe\tdescription\tsuperkingdom\tno rank\tfamily\tgenus\n+"ds2020-328_1"\t"2975"\t"pfam00006"\t"gnl|CDD|376291"\t"6.25354e-106"\t"1359"\t"2033"\t"-1"\t"pfam00006, ATP-synt_ab, ATP synthase alpha/beta family, nucleotide-binding domain. This entry includes the ATP synthase alpha and beta subunits, the ATP synthase associated with flagella and the termination factor Rho."\t"Bacteria(0.88);Archaea(0.07);Eukaryota(0.05);"\t"(1.00);"\t"(0.07);Mycoplasmataceae(0.06);Clostridiaceae(0.04);Spirochaetaceae(0.03);Rhodobacteraceae(0.02);"\t"Mycoplasma(0.06);(0.03);Clostridium(0.03);Treponema(0.01);Persephonella(0.01);"\n+"ds2020-328_1"\t"2975"\t"pfam00306"\t"gnl|CDD|366015"\t"1.33353e-53"\t"1008"\t"1340"\t"-1"\t"pfam00306, ATP-synt_ab_C, ATP synthase alpha/beta chain, C terminal domain. "\t"Bacteria(0.94);Eukaryota(0.06);"\t"(1.00);"\t"(0.08);Mycoplasmataceae(0.08);Clostridiaceae(0.06);Ruminococcaceae(0.03);Eubacteriaceae(0.02);"\t"Mycoplasma(0.07);(0.07);Clostridium(0.05);Eubacterium(0.02);Faecalibacterium(0.01);"\n+"ds2020-328_1"\t"2975"\t"pfam02874"\t"gnl|CDD|367225"\t"8.80807e-19"\t"2202"\t"2405"\t"-1"\t"pfam02874, ATP-synt_ab_N, ATP synthase alpha/beta family, beta-barrel domain. This family includes the ATP synthase alpha and beta subunits the ATP synthase associated with flagella."\t"Bacteria(0.60);Eukaryota(0.28);Archaea(0.13);"\t"(1.00);"\t"Spirochaetaceae(0.04);Bacillaceae(0.04);Schizosaccharomycetaceae(0.03);Chlamydomonadaceae(0.03);Sulfolobaceae(0.03);"\t"Treponema(0.04);Schizosaccharomyces(0.03);Chlamydomonas(0.03);Bacillus(0.03);Thermotoga(0.02);"\n+"ds2020-328_10"\t"1434"\t"pfam17917"\t"gnl|CDD|375428"\t"1.68574e-20"\t"187"\t"453"\t"-1"\t"pfam17917, RT_RNaseH, RNase H-like domain found in reverse transcriptase. 
DNA polymerase and ribonuclease H (RNase H) activities allow reverse transcriptases to convert the single-stranded retroviral RNA genome into double-stranded DNA, which is integrated into the host chromosome during infection. This entry represents the RNase H like domain."\t"unknown"\t"unknown"\t"unknown"\t"unknown"\n+"ds2020-328_10"\t"1434"\t"pfam00078"\t"gnl|CDD|365856"\t"1.48081e-05"\t"920"\t"1051"\t"-3"\t"pfam00078, RVT_1, Reverse transcriptase (RNA-dependent DNA polymerase). A reverse transcriptase gene is usually indicative of a mobile element such as a retrotransposon or retrovirus. Reverse transcriptases occur in a variety of mobile elements, including retrotransposons, retroviruses, group II introns, bacterial msDNAs, hepadnaviruses, and caulimoviruses."\t"Eukaryota(0.62);Viruses(0.23);Bacteria(0.15);"\t"(0.77);Ortervirales(0.21);Poxviridae(0.02);"\t"Retroviridae(0.15);Drosophilidae(0.15);Brassicaceae(0.12);Enterobacteriaceae(0.09);Caulimoviridae(0.06);"\t"Drosophila(0.15);Arabidopsis(0.12);Lentivirus(0.08);Escherichia(0.08);Bombyx(0.05);"\n+"ds2020-328_101"\t"454"\t"pfam14111"\t"gnl|CDD|372914"\t"8.33283e-09"\t"213"\t"353"\t"3"\t"pfam14111, DUF4283, Domain of unknown function (DUF4283). This domain family is found in plants, and is approximately 100 amino acids in length. Considering the very diverse range of other domains it is associated with it is possible that this domain is a binding/guiding region. There are two highly conserved tryptophan residues."\t"Eukaryota(1.00);"\t"(1.00);"\t"Salicaceae(0.35);Brassicaceae(0.27);Poaceae(0.13);Vitaceae(0.08);Solanaceae(0.06);"\t"Populus(0.35);Brassica(0.13);Arabidopsis(0.11);Brachypodium(0.10);Vitis(0.08);"\n+"ds2020-328_106"\t"446"\t"pfam01348"\t"gnl|CDD|279664"\t"1.08017e-09"\t"40"\t"303"\t"-3"\t"pfam01348, Intron_maturas2, Type II intron maturase. Group II introns use intron-encoded reverse transcriptase, maturase and DNA endonuclease activities for site-specific insertion into DNA. 
Although this type of intron is self splicing in vitro they require a maturase protein for splicing in vivo. It has been shown that a specific region of the aI2 intron is needed for the maturase function. This region was found to be conserved in group II introns and called domain X."\t"Eukar'..b'ns from NADH to ubiquinone in a reaction that is associated with proton translocation across the membrane, and in this context is a combination predominantly of subunits 2, 4, 5, 14, L, M and N. In many bacterial species these proteins are probable stand-alone transporters not coupled with oxidoreduction. The family in total represents homologs across the phyla."\t"Eukaryota(0.54);Bacteria(0.46);"\t"(1.00);"\t"Culicidae(0.23);Rhodobacteraceae(0.23);Poaceae(0.15);Enterobacteriaceae(0.15);Phasianidae(0.08);"\t"Paracoccus(0.23);Anopheles(0.23);Escherichia(0.15);Zea(0.15);Aspergillus(0.08);"\n+"ds2020-328_90"\t"476"\t"pfam01578"\t"gnl|CDD|307628"\t"6.70073e-11"\t"145"\t"399"\t"-3"\t"pfam01578, Cytochrom_C_asm, Cytochrome C assembly protein. This family consists of various proteins involved in cytochrome c assembly from mitochondria and bacteria; CycK from Rhizobium, CcmC from E. coli and Paracoccus denitrificans and orf240 from wheat mitochondria. The members of this family are probably integral membrane proteins with six predicted transmembrane helices. It has been proposed that members of this family comprise a membrane component of an ABC (ATP binding cassette) transporter complex. It is also proposed that this transporter is necessary for transport of some component needed for cytochrome c assembly. One member CycK contains a putative heme-binding motif, orf240 also contains a putative heme-binding motif and is a proposed ABC transporter with c-type heme as its proposed substrate. However it seems unlikely that all members of this family transport heme nor c-type apocytochromes because CcmC in the putative CcmABC transporter transports neither. 
CcmF forms a working module with CcmH and CcmI, CcmFHI, and itself is unlikely to bind haem directly."\t"Bacteria(0.56);Eukaryota(0.38);Archaea(0.05);"\t"(1.00);"\t"Enterobacteriaceae(0.10);Pasteurellaceae(0.08);Histionidae(0.05);Marchantiaceae(0.05);Archaeoglobaceae(0.05);"\t"Escherichia(0.08);Reclinomonas(0.05);Archaeoglobus(0.05);Marchantia(0.05);Bradyrhizobium(0.05);"\n+"ds2020-328_904"\tno_hit\n+"ds2020-328_908"\t"207"\t"pfam02123"\t"gnl|CDD|280316"\t"5.3529e-06"\t"4"\t"207"\t"-1"\t"pfam02123, RdRP_4, Viral RNA-directed RNA-polymerase. This family includes RNA-dependent RNA polymerase proteins (RdRPs) from Luteovirus, Totivirus and Rotavirus."\t"Viruses(1.00);"\t"Riboviria(1.00);"\t"Totiviridae(0.44);Solemoviridae(0.19);Luteoviridae(0.15);Reoviridae(0.15);Chrysoviridae(0.07);"\t"Sobemovirus(0.19);Rotavirus(0.15);Victorivirus(0.15);Polerovirus(0.11);Totivirus(0.11);"\n+"ds2020-328_921"\tno_hit\n+"ds2020-328_97"\t"461"\t"pfam02123"\t"gnl|CDD|280316"\t"5.16988e-30"\t"39"\t"461"\t"-1"\t"pfam02123, RdRP_4, Viral RNA-directed RNA-polymerase. This family includes RNA-dependent RNA polymerase proteins (RdRPs) from Luteovirus, Totivirus and Rotavirus."\t"Viruses(1.00);"\t"Riboviria(1.00);"\t"Totiviridae(0.44);Solemoviridae(0.19);Luteoviridae(0.15);Reoviridae(0.15);Chrysoviridae(0.07);"\t"Sobemovirus(0.19);Rotavirus(0.15);Victorivirus(0.15);Polerovirus(0.11);Totivirus(0.11);"\n+"ds2020-328_98"\t"458"\t"pfam02123"\t"gnl|CDD|280316"\t"1.94825e-26"\t"27"\t"443"\t"-1"\t"pfam02123, RdRP_4, Viral RNA-directed RNA-polymerase. 
This family includes RNA-dependent RNA polymerase proteins (RdRPs) from Luteovirus, Totivirus and Rotavirus."\t"Viruses(1.00);"\t"Riboviria(1.00);"\t"Totiviridae(0.44);Solemoviridae(0.19);Luteoviridae(0.15);Reoviridae(0.15);Chrysoviridae(0.07);"\t"Sobemovirus(0.19);Rotavirus(0.15);Victorivirus(0.15);Polerovirus(0.11);Totivirus(0.11);"\n+"ds2020-328_99"\t"458"\t"pfam03732"\t"gnl|CDD|367628"\t"7.72961e-07"\t"256"\t"441"\t"1"\t"pfam03732, Retrotrans_gag, Retrotransposon gag protein. Gag or Capsid-like proteins from LTR retrotransposons. There is a central motif QGXXEXXXXXFXXLXXH that is common to Retroviridae gag-proteins, but is poorly conserved."\t"Eukaryota(1.00);"\t"(1.00);"\t"Brassicaceae(0.58);Poaceae(0.29);Tetraodontidae(0.04);Solanaceae(0.02);Plantaginaceae(0.02);"\t"Arabidopsis(0.58);Oryza(0.18);Sorghum(0.05);Takifugu(0.04);Zea(0.04);"\n' |
| b |
| diff -r 000000000000 -r bbaa89f070f4 test-data/otu_s2_tblastx.tab --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/otu_s2_tblastx.tab Mon Mar 04 19:56:16 2024 +0000 |
| b |
| b'@@ -0,0 +1,139 @@\n+#algo\tquery_id\tnb_reads\tquery_length\taccession\tdescription\torganism\tpercentIdentity\tnb_hsps\tqueryOverlap\thitOverlap\tevalue\tscore\ttax_id\ttaxonomy\tsequence\n+TBLASTX\tds2020-328_275\t16\t279\t\n+TBLASTX\tds2020-328_625\t13\t226\tNC_008039\tPrune dwarf virus RNA 1, complete sequence\tPrune dwarf virus\t90.7\t3\t100\t17.0\t1.9242059400026399e-29\t420.172\t33760\tViruses;Orthornavirae;Kitrinoviricota;Alsuviricetes;Martellivirales;Bromoviridae;Ilarvirus;Prune dwarf virus\tGGTCGGGATAGACGGCCGCCCGCCATACTGAGCGTCTCATGCAGTACACTACCTACAAGACTAGTAGACCTGATGAAGTTCATGAACCGAACTTTTGTGAAAACACATTCCAGGACTGCTCCTTGCAAGGTAAGTATGCCATGGCAATCCATTCCACTTCGGATTTACCCTTAGGTGAGCTCTGTGAGAGCTTAAGGAAGAAGGGAGTGATTCGTCTATCCCGACC\n+TBLASTX\tds2020-328_196\t14\t318\tNC_033468\tWuhan insect virus 27 strain WHZM10130 hypothetical protein 1 and hypothetical protein 2 genes, complete cds\tWuhan insect virus 27\t45.3\t1\t100\t6.0\t2.90542e-27\t118.644\t1923731\tViruses;Wuhan insect virus 27\tGGTCGGGATAGACGTTTAGCAGCCTTAAACTCTTCATCTTCAGGGTATTGTGAGTGATATGCTCCCGTAGGTGCCCACTGCCACCTCTTATTCCAATAACTTTTCCACTTTATGTTATCTGGTTTACTCCCGAGGTTTTTCAATCTAGTGAACATTTCTCCACTAGCCCTGTATATCTGCTCTCTAGTGAAAGAAGCGACGTTGGGTTTGGTTCTGTGCTCTTTCTCGGCTTTCCAATCCACTTCTCCTATACCTCTGTTGACTAAAACTTCCATTTCAAAGAATGGTTTTAAGTTGAGCGGTGCGTCTATCCCGACC\n+TBLASTX\tds2020-328_638\t4\t225\tNC_030890\tArabidopsis halleri partitivirus 1 gene for capsid protein, complete cds\tArabidopsis halleri partitivirus 1\t48.2\t1\t100\t10.0\t1.45353e-08\t55.5941\t1849335\tViruses;Orthornavirae;Pisuviricota;Duplopiviricetes;Durnavirales;Partitiviridae;Arabidopsis halleri partitivirus 
1\tGTCGGGATAGACGTACCAGTCAACTTTTTGCAATTTTCTTCATACATCACATGGTATGACTTGGTTTGCCCAAGTCCGCGATGTAGCCGCAGCCGAAGCCTCGTCTTTCGAAGGCTCAGGCACCCTGGCTGATTGTCCCCCATTCGGGATAACGTCAAACCAGGTAGTTGTTAACTACTTGGCCCCAGCCACACTGCCAACGTCCCCTATTCGTCTATCCCGACC\n+TBLASTX\tds2020-328_858\t2\t210\t\n+TBLASTX\tds2020-328_761\t8\t215\t\n+TBLASTX\tds2020-328_553\t2\t234\t\n+TBLASTX\tds2020-328_845\t11\t210\tNC_003689\tCherry virus A, complete genome\tCherry virus A\t75.0\t2\t100\t5.0\t5.449000000000025e-19\t225.74040000000002\t42882\tViruses;Orthornavirae;Kitrinoviricota;Alsuviricetes;Tymovirales;Betaflexiviridae;Capillovirus;Cherry virus A\tGCCGTGGACCTAGTTAGGAGAGTAATTTCGGATGGTGTTCTTTACTGCTCTCTGATTTTCTGATAAGATTATCGCCGGCGTGGCTGCTACTCCCTCTGAAGCCTCTACTGCTGGCTTCGCAGTCAGGTGCAATAGATCCCTTGGATCTATCATATTCTCCCAGATATAATTTACAAGGCCCCTTCTTATCACGTTGTAATTCTTATACAC\n+TBLASTX\tds2020-328_483\t8\t242\tNC_033495\tBotryosphaeria dothidea virus 1 strain YZN115 segment RNA4 hypothetical protein gene, complete cds\tBotryosphaeria dothidea virus 1\t50.0\t1\t100\t13.0\t1.155e-07\t52.8449\t1516075\tViruses;Orthornavirae;Pisuviricota;Duplopiviricetes;Durnavirales;Partitiviridae;Botryosphaeria dothidea virus 1\tGGTCGGGATAGACGGAATTAACCCATGTAATGGCACAATCGAAGGCTCTAGCCGTGGCCGTTGCACGGATAACGAGAGGGAAGGGGCGGCACGACGCATCTGTAGAAGATTTCATCTTCTACATAGCCTCAAACGGGCGTGCCGTCTCCGCTTCGGATATACCATCCGGAGTCCATAGCTTGGCTTGCGGTCCTCGTGACCATGTGGAACCCTCTGAGCGTTTACCAGCGTCTATCCCGACC\n+TBLASTX\tds2020-328_910\t2\t207\t\n+TBLASTX\tds2020-328_531\t10\t236\t\n+TBLASTX\tds2020-328_893\t4\t208\t\n+TBLASTX\tds2020-328_507\t26\t238\t\n+TBLASTX\tds2020-328_594\t8\t229\t\n+TBLASTX\tds2020-328_600\t4\t229\tNC_014823\tTolypocladium cylindrosporum virus 1, complete genome\tTolypocladium cylindrosporum virus 1\t55.8\t2\t100\t7.0\t8.34655e-06\t145.07119999999998\t939923\tViruses;Orthornavirae;Duplornaviricota;Chrymotiviricetes;Ghabrivirales;Totiviridae;Victorivirus;Tolypocladium cylindrosporum virus 
1\tGGTGGGGATAGACGTTTTGGTTCCTTCTAAAGGCCGCGCCCTCTCCCGCCCAATTGATGGTTCCCTGCTTGTTGAAGCGGGATACCCTACTGCTCATGCCTTGGCTGAGGATTTTGTTGGACTTTCTAAGAAGTACACTAATTTCTATGCCACGTCCGAGTACGCGTCCCTGGCTGACCTGGTTGAACACCTCATCCATGGTTTAGCTCCAACCTCCGTATATCCCGAC\n+TBLASTX\tds2020-328_765\t8\t215\t\n+TBLASTX\tds2020-328_575\t4\t231\tNC_003710\tDiscula destructiva virus 2 segment 1, complete genome\tDiscula destructiva virus 2\t69.0\t4\t100\t20.0\t0.00012443\t208.6303\t160484\tViruses;Orthornavirae;Pisuviricota;Duplopiviricetes;Durnavirales;Partitiviridae;Gammapartitivirus;Discula destructiva virus 2\tGGTCGGGATAGACGTACGTCTGGCATGAGTATGGGTGTATTAATGAAA'..b'nd RNA-dependent RNA polymerase (RdRp) genes, complete cds\tXanthophyllomyces dendrorhous virus L1B\t47.3\t1\t100\t6.0\t3.23633e-27\t118.369\t1167691\tViruses;Orthornavirae;Duplornaviricota;Chrymotiviricetes;Ghabrivirales;Totiviridae;Totivirus;Xanthophyllomyces dendrorhous virus L1B\tGGTCGGGATAGACGCTATCGAGATAAATTTATTCTTCAACCTTATATCTGATTTATGTATAAATTGTTCGTCAGTTTTATATTGTGAATGGATACTCCCAGAAGCACTCCACTGCCATCTGTTGGTCCAAAATTTTTCCCAAGACATCACCATAGGAGTTCTCCCGCTAACTGCTGCACTCGCGAAGATTTTAAGTGCTTCATTCTGGATATAATCAGCACTTAGCTTGACTGTTGACGCTTTTGTTCTATGTGTGAACTCAGCTTCCCAATCGAGCGCGCCGTAACGTCTATCCCGACC\n+TBLASTX\tds2020-328_159\t47\t348\tNC_008039\tPrune dwarf virus RNA 1, complete sequence\tPrune dwarf virus\t97.0\t3\t100\t27.0\t2.5280226430549335e-55\t668.0630000000001\t33760\tViruses;Orthornavirae;Kitrinoviricota;Alsuviricetes;Martellivirales;Bromoviridae;Ilarvirus;Prune dwarf virus\tGGGTCGGGATAGACGGCAGATACCACTCGAACGTGGTTGTTCGTATTTTAAATCAATCATGACTTCTTCCGAGATCACTGCTGCCAATGTCCATGAACTTTTGGTTAAAGTTCTGGAAAAGCAATGCGCTGACGAGACCACTACCGTCGGTAAGGCTTTCTCTGAGAAAGCAAAACAGTCTTTGAATAAGACATTCGGACTAAATGACGAGTCCAAGCAACTGAAGATTTCTTTTGATTTGACGGCTGAACAGCAGGCGTTACTCAAGAGACATTTTCCGGGTCGATCGGTGATTTTTTCAAATTCATCGAGTTCCTCACACAGTTATGCAGCGCGTCTATCCCGACC\n+TBLASTX\tds2020-328_166\t24\t346\tNC_005980\tHelminthosporium victoriae 145S virus\tHelminthosporium victoriae 145S 
virus\t26.8\t1\t100\t8.0\t1.12841e-07\t53.7613\t164750\tViruses;Orthornavirae;Duplornaviricota;Chrymotiviricetes;Ghabrivirales;Chrysoviridae;Chrysovirus;Helminthosporium victoriae 145S virus\tGGGTCGGGATAGACGTTAACTGCCCATACCAACCATGTTTGCTAGCCTTGCAACGTCAGCTATTTCCTGCCAAGTGTCGACAGCTGCCTCATCTCGCAGCCTATTGTATATCACTGCCTCATCTGCGGTTGTGCAATAACCACGTATACGTAAGTTCTGCACTAATGCATGTATTCCACAAGTACCATCTCCTTTTGTGTTAACTTCTGTCACTATTATTTTGTCCTCCTCTACTGGCATTGGCTCATCTTTGACCAACTTTGACTCTGTGCCTTTGTCTTTCAGTTCATATGAACTACCTAGCATTTCCTCACCATTTACGCCTTTGACATAAACTGGCGATGGT\n+TBLASTX\tds2020-328_97\t31\t461\tNC_020903\tXanthophyllomyces dendrorhous virus L1A capsid protein (CP) and RNA-dependent RNA polymerase (RdRp) genes, complete cds\tXanthophyllomyces dendrorhous virus L1A\t52.1\t2\t100\t14.0\t1.888695e-05\t241.7778\t1167690\tViruses;Orthornavirae;Duplornaviricota;Chrymotiviricetes;Ghabrivirales;Totiviridae;Totivirus;Xanthophyllomyces dendrorhous virus L1A\tAACGGCCGCCACTAAATTGTCCCCTTAGATCCGCGCTTGTGATCCACTCGTAGGAACTCCGCTATAGCTCCGTACGCGCACTTGCTCATTTGTAGTCGTATATTGTGCTTCTTCGCGTTTCGGCCGGCTAGTAACACGTCCCCGAGCGAGTTACTGCCCAGCAGCACGTCGTCGCCGTTATGAAGACTGTTTTGTGACTTCACGACATCTGGCACTATTAATTGAGTGTAAATGTAGTTCAGGACGCTATTCATGAATGTGGTGAGCCTCCACCCCGACAGTAAAGTTCCTTTAGCACTATACTCCATCTTCAAGCCTTGGTTGTCATGTACTATCACTCTGTTCAGCGACAGTCGAGTCCATTCCACCGCCGCCAGCTGCTCTTGCGTTAGGAAGGGCCTGAAGGTGTCTCTGTACGCATCTATCACTGTTTTCATTGACTGTACACTATGTTGACTGTT\n+TBLASTX\tds2020-328_750\t10\t216\t\n+TBLASTX\tds2020-328_92\t33\t472\tNC_033465\tWuhan insect virus 26 strain WHZM10161 hypothetical protein 1 and hypothetical protein 2 genes, complete cds\tWuhan insect virus 26\t38.6\t1\t100\t10.0\t6.24209e-27\t118.369\t1923730\tViruses;Wuhan insect virus 
26\tCGTCATTGAGGAACAAATACTACTCGAGACAATGCGAGCGATTGCACATGACAGTTTCGGACTGTTATAAAATAAAGACCTCCCACAGGTGTGTGGGGGGCATAAGTGAAGACAAGAGATCGGACGTAAAGTGGATGATACGGTCGTCGGGCTTCAGGAAGGGTGCCACGCAGATAGGCGTGTTACCGGGAGTGGTGGACTACTCCAGGATGGTGAAAGCATCACTACAACTTGAAAGACCTTTGCAGGATTTCATTTCACGAATAATGCGCGCAACATACGACGCGGTGATACCAAAAGAAAGGAACATAAACGTTTCAAGGAATGGTAACATCAAGCGGTACGAGATTTTGCGAGCGTTATTCAAGGTACACAAAGAAGAAACAGACATAGTCAATTACGGAAAAGCCAAAATGACCGGCTTCCTGATGGACGTGTTGAATGGGACCAATTATTCATAGTGGCGGCCGTT\n+TBLASTX\tds2020-328_825\t7\t211\t\n+TBLASTX\tds2020-328_896\t2\t208\tNC_033476\tBotryosphaeria dothidea virus 1 strain YZN115 RNA-dependent RNA polymerase (RdRp) gene, complete cds\tBotryosphaeria dothidea virus 1\t41.8\t1\t100\t7.0\t8.29721e-08\t52.8449\t1516075\tViruses;Orthornavirae;Pisuviricota;Duplopiviricetes;Durnavirales;Partitiviridae;Botryosphaeria dothidea virus 1\tGGTCGGGATAGACGGGGGGCCTTCTTTCACTCCACACCCCCCATGCACAGGCAACAAAGAAAAAGATGAGACTGGGAACATCGACGACCATCGGTGGAACTCTTTTCGGAGGTCCACTTACCCCGGCGGCGGCTTGGGCCCACGACCAGACAAGGTCTACCTGGGCAAAGTACCTCAAAGAAACGGACGCCAGTCGTCTATCCCGACC\n' |
| b |
| diff -r 000000000000 -r bbaa89f070f4 test-data/rps_test.tab --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/rps_test.tab Mon Mar 04 19:56:16 2024 +0000 |
| b |
| b'@@ -0,0 +1,105 @@\n+#query_id\tquery_length\tcdd_id\thit_id\tevalue\tstartQ\tendQ\tframe\tdescription\tsuperkingdom\n+ds2020-267_120\t339\tpfam01333\tgnl|CDD|366578\t0.000848733\t197\t325\t-3\tpfam01333, Apocytochr_F_C, Apocytochrome F, C-terminal. This is a sub-family of cytochrome C. See pfam00034.\tEukaryota(19);Bacteria(1)\n+ds2020-267_374\t242\tpfam00124\tgnl|CDD|365890\t5.09126e-07\t21\t125\t3\tpfam00124, Photo_RC, Photosynthetic reaction centre protein. \tBacteria(9);Eukaryota(6);Viruses(4);unclassified sequences(1)\n+ds2020-267_471\t230\tpfam00201\tgnl|CDD|278624\t3.12575e-07\t46\t210\t1\tpfam00201, UDPGT, UDP-glucoronosyl and UDP-glucosyl transferase. \tEukaryota(20)\n+ds2020-267_710\t213\tpfam01127\tgnl|CDD|366480\t0.000723904\t46\t210\t1\tpfam01127, Sdh_cyt, Succinate dehydrogenase/Fumarate reductase transmembrane subunit. This family includes a transmembrane protein from both the Succinate dehydrogenase and Fumarate reductase complexes.\tBacteria(20)\n+ds2020-267_692\t214\tpfam00680\tgnl|CDD|366242\t4.79875e-05\t70\t180\t1\tpfam00680, RdRP_1, RNA dependent RNA polymerase. \tViruses(20)\n+ds2020-267_817\t208\tpfam05656\tgnl|CDD|377540\t3.45664e-06\t86\t190\t-1\tpfam05656, DUF805, Protein of unknown function (DUF805). This family consists of several bacterial proteins of unknown function.\tBacteria(17);unclassified sequences(2);Archaea(1)\n+ds2020-267_98\t379\tpfam16203\tgnl|CDD|374428\t1.33948e-30\t131\t280\t-1\tpfam16203, ERCC3_RAD25_C, ERCC3/RAD25/XPB C-terminal helicase. This is the C-terminal helicase domain of ERCC3, RAD25 and XPB helicases.\tBacteria(11);Eukaryota(6);Archaea(2);unclassified sequences(1)\n+ds2020-267_21\t858\tpfam00680\tgnl|CDD|366242\t8.36679e-11\t295\t729\t-1\tpfam00680, RdRP_1, RNA dependent RNA polymerase. \tViruses(20)\n+ds2020-267_261\t260\tpfam01051\tgnl|CDD|376444\t1.77523e-19\t26\t217\t-2\tpfam01051, Rep_3, Initiator Replication protein. This protein is an initiator of plasmid replication. 
RepB possesses nicking-closing (topoisomerase I) like activity. It is also able to perform a strand transfer reaction on ssDNA that contains its target. This family also includes RepA which is an E.coli protein involved in plasmid replication. The RepA protein binds to DNA repeats that flank the repA gene.\tBacteria(19);unclassified sequences(1)\n+ds2020-267_773\t210\tpfam01641\tgnl|CDD|376583\t5.23903e-34\t16\t174\t1\tpfam01641, SelR, SelR domain. Methionine sulfoxide reduction is an important process, by which cells regulate biological processes and cope with oxidative stress. MsrA, a protein involved in the reduction of methionine sulfoxides in proteins, has been known for four decades and has been extensively characterized with respect to structure and function. However, recent studies revealed that MsrA is only specific for methionine-S-sulfoxides. Because oxidized methionines occur in a mixture of R and S isomers in vivo, it was unclear how stereo-specific MsrA could be responsible for the reduction of all protein methionine sulfoxides. It appears that a second methionine sulfoxide reductase, SelR, evolved that is specific for methionine-R-sulfoxides, the activity that is different but complementary to that of MsrA. Thus, these proteins, working together, could reduce both stereoisomers of methionine sulfoxide. This domain is found both in SelR proteins and fused with the peptide methionine sulfoxide reductase enzymatic domain pfam01625. The domain has two conserved cysteine and histidines. The domain binds both selenium and zinc. The final cysteine is found to be replaced by the rare amino acid selenocysteine in some members of the family. This family has methionine-R-sulfoxide reductase activity.\tBacteria(18);Archaea(1);unclassified sequences(1)\n+ds2020-267_287\t256\tpfam00115\tgnl|CDD|376293\t2.8946e-26\t13\t237\t1\tpfam00115, COX1, Cytochrome C and Quinol oxidase polypeptide I. 
\tEukaryota(18);Bacteria(2)\n+ds2020-267_139\t320\tpfam05860\tgnl|CDD|368641\t1.34887e-13\t167\t298\t2\tpfam05860, Haemagg_act, haemagglutination activity domain. This domain is suggested to be a carbohydrate- dependent haemaggl'..b'e-27\t22\t207\t1\tpfam00421, PSII, Photosystem II protein. \tEukaryota(13);Bacteria(7)\n+ds2020-267_268\t259\tpfam02123\tgnl|CDD|280316\t3.22949e-21\t18\t251\t3\tpfam02123, RdRP_4, Viral RNA-directed RNA-polymerase. This family includes RNA-dependent RNA polymerase proteins (RdRPs) from Luteovirus, Totivirus and Rotavirus.\tViruses(19);unclassified sequences(1)\n+ds2020-267_719\t213\tpfam00072\tgnl|CDD|333815\t9.1657e-13\t45\t185\t-2\tpfam00072, Response_reg, Response regulator receiver domain. This domain receives the signal from the sensor partner in bacterial two-component systems. It is usually found N-terminal to a DNA binding effector domain.\tBacteria(20)\n+ds2020-267_811\t208\tpfam07991\tgnl|CDD|285265\t1.80927e-08\t20\t190\t-1\tpfam07991, IlvN, Acetohydroxy acid isomeroreductase, NADPH-binding domain. Acetohydroxy acid isomeroreductase catalyzes the conversion of acetohydroxy acids into dihydroxy valerates. This reaction is the second in the synthetic pathway of the essential branched side chain amino acids valine and isoleucine. This N-terminal region of the enzyme carries the binding-site for NADPH. The active-site for enzymatic activity lies in the C-terminal part, IlvC, pfam01450.\tBacteria(16);Archaea(2);unclassified sequences(2)\n+ds2020-267_642\t216\tpfam02874\tgnl|CDD|367225\t0.000376273\t20\t190\t-1\tpfam02874, ATP-synt_ab_N, ATP synthase alpha/beta family, beta-barrel domain. This family includes the ATP synthase alpha and beta subunits the ATP synthase associated with flagella.\tBacteria(17);Eukaryota(2);Archaea(1)\n+ds2020-267_504\t226\tpfam01578\tgnl|CDD|307628\t0.000112784\t20\t190\t-1\tpfam01578, Cytochrom_C_asm, Cytochrome C assembly protein. 
This family consists of various proteins involved in cytochrome c assembly from mitochondria and bacteria; CycK from Rhizobium, CcmC from E. coli and Paracoccus denitrificans and orf240 from wheat mitochondria. The members of this family are probably integral membrane proteins with six predicted transmembrane helices. It has been proposed that members of this family comprise a membrane component of an ABC (ATP binding cassette) transporter complex. It is also proposed that this transporter is necessary for transport of some component needed for cytochrome c assembly. One member CycK contains a putative heme-binding motif, orf240 also contains a putative heme-binding motif and is a proposed ABC transporter with c-type heme as its proposed substrate. However it seems unlikely that all members of this family transport heme nor c-type apocytochromes because CcmC in the putative CcmABC transporter transports neither. CcmF forms a working module with CcmH and CcmI, CcmFHI, and itself is unlikely to bind haem directly.\tBacteria(19);Archaea(1)\n+ds2020-267_274\t258\tpfam03713\tgnl|CDD|367619\t9.45376e-09\t24\t173\t-2\tpfam03713, DUF305, Domain of unknown function (DUF305). Domain found in small family of bacterial secreted proteins with no known function. Also found in Paramecium bursaria chlorella virus 1. This domain is short and found in one or two copies. The domain has a conserved HH motif that may be functionally important. This domain belongs to the ferritin superfamily. It contains two sequence similar repeats each of which is composed of two alpha helices.\tBacteria(18);unclassified sequences(2)\n+ds2020-267_42\t575\tpfam00283\tgnl|CDD|365999\t2.95472e-07\t325\t411\t1\tpfam00283, Cytochrom_B559, Cytochrome b559, alpha (gene psbE) and beta (gene psbF)subunits. \tEukaryota(18);Bacteria(2)\n+ds2020-267_283\t257\tpfam13041\tgnl|CDD|372443\t3.148e-06\t13\t114\t1\tpfam13041, PPR_2, PPR repeat family. This repeat has no known function. 
It is about 35 amino acids long and is found in up to 18 copies in some proteins. The family appears to be greatly expanded in plants and fungi. The repeat has been called PPR.\tEukaryota(20)\n+ds2020-267_685\t214\tpfam09334\tgnl|CDD|370442\t1.80219e-14\t16\t117\t-2\tpfam09334, tRNA-synt_1g, tRNA synthetases class I (M). This family includes methionyl tRNA synthetases.\tBacteria(17);Archaea(2);unclassified sequences(1)\n' |
| b |
| diff -r 000000000000 -r bbaa89f070f4 test-data/rps_test.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/rps_test.xml Mon Mar 04 19:56:16 2024 +0000 |
| b |
| b'@@ -0,0 +1,21944 @@\n+<?xml version="1.0"?>\n+<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd">\n+<BlastOutput>\n+ <BlastOutput_program>rpstblastn</BlastOutput_program>\n+ <BlastOutput_version>RPSTBLASTN 2.10.1+</BlastOutput_version>\n+ <BlastOutput_reference>Stephen F. Altschul, Thomas L. Madden, Alejandro A. Sch&auml;ffer, Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman (1997), "Gapped BLAST and PSI-BLAST: a new generation of protein database search programs", Nucleic Acids Res. 25:3389-3402.</BlastOutput_reference>\n+ <BlastOutput_db>/home/tcandresse/work/pfam/Pfam</BlastOutput_db>\n+ <BlastOutput_query-ID>ds2020-267_269</BlastOutput_query-ID>\n+ <BlastOutput_query-def>No definition line</BlastOutput_query-def>\n+ <BlastOutput_query-len>259</BlastOutput_query-len>\n+ <BlastOutput_param>\n+ <Parameters>\n+ <Parameters_matrix>BLOSUM62</Parameters_matrix>\n+ <Parameters_expect>0.001</Parameters_expect>\n+ <Parameters_gap-open>11</Parameters_gap-open>\n+ <Parameters_gap-extend>1</Parameters_gap-extend>\n+ <Parameters_filter>F</Parameters_filter>\n+ </Parameters>\n+ </BlastOutput_param>\n+<BlastOutput_iterations>\n+<Iteration>\n+ <Iteration_iter-num>1</Iteration_iter-num>\n+ <Iteration_query-ID>ds2020-267_269</Iteration_query-ID>\n+ <Iteration_query-def>No definition line</Iteration_query-def>\n+ <Iteration_query-len>259</Iteration_query-len>\n+<Iteration_hits>\n+</Iteration_hits>\n+ <Iteration_stat>\n+ <Statistics>\n+ <Statistics_db-num>17919</Statistics_db-num>\n+ <Statistics_db-len>3004588</Statistics_db-len>\n+ <Statistics_hsp-len>50</Statistics_hsp-len>\n+ <Statistics_eff-space>75910968</Statistics_eff-space>\n+ <Statistics_kappa>0.050055168</Statistics_kappa>\n+ <Statistics_lambda>0.267</Statistics_lambda>\n+ <Statistics_entropy>0.14</Statistics_entropy>\n+ </Statistics>\n+ </Iteration_stat>\n+ <Iteration_message>No hits found</Iteration_message>\n+</Iteration>\n+<Iteration>\n+ 
<Iteration_iter-num>2</Iteration_iter-num>\n+ <Iteration_query-ID>ds2020-267_1242</Iteration_query-ID>\n+ <Iteration_query-def>No definition line</Iteration_query-def>\n+ <Iteration_query-len>59</Iteration_query-len>\n+<Iteration_hits>\n+</Iteration_hits>\n+ <Iteration_stat>\n+ <Statistics>\n+ <Statistics_db-num>17919</Statistics_db-num>\n+ <Statistics_db-len>3004588</Statistics_db-len>\n+ <Statistics_hsp-len>0</Statistics_hsp-len>\n+ <Statistics_eff-space>57087172</Statistics_eff-space>\n+ <Statistics_kappa>0.041</Statistics_kappa>\n+ <Statistics_lambda>0.267</Statistics_lambda>\n+ <Statistics_entropy>0.14</Statistics_entropy>\n+ </Statistics>\n+ </Iteration_stat>\n+ <Iteration_message>No hits found</Iteration_message>\n+</Iteration>\n+<Iteration>\n+ <Iteration_iter-num>3</Iteration_iter-num>\n+ <Iteration_query-ID>ds2020-267_333</Iteration_query-ID>\n+ <Iteration_query-def>No definition line</Iteration_query-def>\n+ <Iteration_query-len>248</Iteration_query-len>\n+<Iteration_hits>\n+</Iteration_hits>\n+ <Iteration_stat>\n+ <Statistics>\n+ <Statistics_db-num>17919</Statistics_db-num>\n+ <Statistics_db-len>3004588</Statistics_db-len>\n+ <Statistics_hsp-len>47</Statistics_hsp-len>\n+ <Statistics_eff-space>75683825</Statistics_eff-space>\n+ <Statistics_kappa>0.041</Statistics_kappa>\n+ <Statistics_lambda>0.267</Statistics_lambda>\n+ <Statistics_entropy>0.14</Statistics_entropy>\n+ </Statistics>\n+ </Iteration_stat>\n+ <Iteration_message>No hits found</Iteration_message>\n+</Iteration>\n+<Iteration>\n+ <Iteration_iter-num>4</Iteration_iter-num>\n+ <Iteration_query-ID>ds2020-267_1111</Iteration_query-ID>\n+ <Iteration_query-def>No definition line</Iteration_query-def>\n+ <Iteration_query-len>70</Iteration_query-len>\n+<Iteration_hits>\n+</Iteration_hits>\n+ <Iteration_stat>\n+ <Statistics>\n+ <Statistics_db-num>17919</Statistics_db-num>\n+ <Statistics_'..b'n_query-len>\n+<Iteration_hits>\n+</Iteration_hits>\n+ <Iteration_stat>\n+ <Statistics>\n+ 
<Statistics_db-num>17919</Statistics_db-num>\n+ <Statistics_db-len>3004588</Statistics_db-len>\n+ <Statistics_hsp-len>46</Statistics_hsp-len>\n+ <Statistics_eff-space>74130676</Statistics_eff-space>\n+ <Statistics_kappa>0.041</Statistics_kappa>\n+ <Statistics_lambda>0.267</Statistics_lambda>\n+ <Statistics_entropy>0.14</Statistics_entropy>\n+ </Statistics>\n+ </Iteration_stat>\n+ <Iteration_message>No hits found</Iteration_message>\n+</Iteration>\n+<Iteration>\n+ <Iteration_iter-num>487</Iteration_iter-num>\n+ <Iteration_query-ID>ds2020-267_805</Iteration_query-ID>\n+ <Iteration_query-def>No definition line</Iteration_query-def>\n+ <Iteration_query-len>209</Iteration_query-len>\n+<Iteration_hits>\n+</Iteration_hits>\n+ <Iteration_stat>\n+ <Statistics>\n+ <Statistics_db-num>17919</Statistics_db-num>\n+ <Statistics_db-len>3004588</Statistics_db-len>\n+ <Statistics_hsp-len>37</Statistics_hsp-len>\n+ <Statistics_eff-space>74930720</Statistics_eff-space>\n+ <Statistics_kappa>0.041</Statistics_kappa>\n+ <Statistics_lambda>0.267</Statistics_lambda>\n+ <Statistics_entropy>0.14</Statistics_entropy>\n+ </Statistics>\n+ </Iteration_stat>\n+ <Iteration_message>No hits found</Iteration_message>\n+</Iteration>\n+<Iteration>\n+ <Iteration_iter-num>488</Iteration_iter-num>\n+ <Iteration_query-ID>ds2020-267_685</Iteration_query-ID>\n+ <Iteration_query-def>No definition line</Iteration_query-def>\n+ <Iteration_query-len>214</Iteration_query-len>\n+<Iteration_hits>\n+<Hit>\n+ <Hit_num>1</Hit_num>\n+ <Hit_id>gnl|CDD|370442</Hit_id>\n+ <Hit_def>pfam09334, tRNA-synt_1g, tRNA synthetases class I (M). 
This family includes methionyl tRNA synthetases.</Hit_def>\n+ <Hit_accession>370442</Hit_accession>\n+ <Hit_len>391</Hit_len>\n+ <Hit_hsps>\n+ <Hsp>\n+ <Hsp_num>1</Hsp_num>\n+ <Hsp_bit-score>63.8538</Hsp_bit-score>\n+ <Hsp_score>156</Hsp_score>\n+ <Hsp_evalue>1.80219e-14</Hsp_evalue>\n+ <Hsp_query-from>16</Hsp_query-from>\n+ <Hsp_query-to>117</Hsp_query-to>\n+ <Hsp_hit-from>124</Hsp_hit-from>\n+ <Hsp_hit-to>157</Hsp_hit-to>\n+ <Hsp_query-frame>-2</Hsp_query-frame>\n+ <Hsp_hit-frame>0</Hsp_hit-frame>\n+ <Hsp_identity>18</Hsp_identity>\n+ <Hsp_positive>22</Hsp_positive>\n+ <Hsp_gaps>0</Hsp_gaps>\n+ <Hsp_align-len>34</Hsp_align-len>\n+ <Hsp_qseq>PKKGMFLSDRFIKGTCPKCKSEDQYGDSCEDIGT</Hsp_qseq>\n+ <Hsp_hseq>PSDERFLPDRYVEGTCPHCGSEDARGDQCENCGR</Hsp_hseq>\n+ <Hsp_midline>P FL DR+++GTCP C SED GD CE+ G </Hsp_midline>\n+ </Hsp>\n+ </Hit_hsps>\n+</Hit>\n+</Iteration_hits>\n+ <Iteration_stat>\n+ <Statistics>\n+ <Statistics_db-num>17919</Statistics_db-num>\n+ <Statistics_db-len>3004588</Statistics_db-len>\n+ <Statistics_hsp-len>39</Statistics_hsp-len>\n+ <Statistics_eff-space>73783904</Statistics_eff-space>\n+ <Statistics_kappa>0.041</Statistics_kappa>\n+ <Statistics_lambda>0.267</Statistics_lambda>\n+ <Statistics_entropy>0.14</Statistics_entropy>\n+ </Statistics>\n+ </Iteration_stat>\n+</Iteration>\n+<Iteration>\n+ <Iteration_iter-num>489</Iteration_iter-num>\n+ <Iteration_query-ID>ds2020-267_60</Iteration_query-ID>\n+ <Iteration_query-def>No definition line</Iteration_query-def>\n+ <Iteration_query-len>471</Iteration_query-len>\n+<Iteration_hits>\n+</Iteration_hits>\n+ <Iteration_stat>\n+ <Statistics>\n+ <Statistics_db-num>17919</Statistics_db-num>\n+ <Statistics_db-len>3004588</Statistics_db-len>\n+ <Statistics_hsp-len>79</Statistics_hsp-len>\n+ <Statistics_eff-space>123940986</Statistics_eff-space>\n+ <Statistics_kappa>0.041</Statistics_kappa>\n+ <Statistics_lambda>0.267</Statistics_lambda>\n+ <Statistics_entropy>0.14</Statistics_entropy>\n+ </Statistics>\n+ 
</Iteration_stat>\n+ <Iteration_message>No hits found</Iteration_message>\n+</Iteration>\n+ </BlastOutput_iterations>\n+</BlastOutput>\n' |
| b |
| diff -r 000000000000 -r bbaa89f070f4 virAnnot_rps2tsv.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/virAnnot_rps2tsv.xml Mon Mar 04 19:56:16 2024 +0000 |
| [ |
| @@ -0,0 +1,57 @@ +<tool id="virAnnot_rps2tsv" name="virAnnot Rps2tsv" version="1.0.0+galaxy0" profile="21.05"> + <description>Convert xml rpstblast results to tab file with taxonomic informations</description> + <macros> + <import>macros.xml</import> + </macros> + <expand macro="requirements" /> + <command detect_errors="aggressive"><![CDATA[ + python '$__tool_directory__/rps2tsv.py' + -x '$rps_xml' + -e '$max_evalue' + -o '$output' + ]]></command> + <inputs> + <param type="data" name="rps_xml" format="xml" label="RPSTBlast results file" /> + <param type="select" name="max_evalue" label="Maximum evalue"> + <option value="0">0</option> + <option value="0.1">0.1</option> + <option value="0.01">0.01</option> + <option value="0.001" selected="true">0.001</option> + <option value="0.0001">0.0001</option> + </param> + </inputs> + <outputs> + <data name="output" format="tabular" from_work_dir="rps_result.tab" /> + </outputs> + <tests> + <test> + <param name="rps_xml" value="rps_test.xml"/> + <param name="max_evalue" value="0.0001"/> + <output name="output" file="rps_test.tab"> + <assert_contents> + <has_n_columns n="10" /> + <has_n_lines n="105" /> + </assert_contents> + </output> + </test> + <test> + <param name="rps_xml" value="rps_test.xml"/> + <param name="max_evalue" value="0.0001"/> + <output name="output" file="rps_test.tab"> + <assert_contents> + <has_text text="pfam01333, Apocytochr_F_C, Apocytochrome F" /> + </assert_contents> + </output> + </test> + </tests> + <help><![CDATA[ + +This module takes as input rps XML file from rps motives annotation. +The standard maximum evalue is 0.0001 [default value]. +The expected result is a tabular file. See example: +#query_id query_length cdd_id hit_id evalue startQ endQ frame description superkingdom +ds2020-267_120 339 pfam01333 gnl|CDD|366578 0.000848733 197 325 -3 pfam01333, Apocytochr_F_C, Apocytochrome F, C-terminal. This is a sub-family of cytochrome C. See pfam00034. 
Eukaryota(227);Bacteria(73); + + ]]></help> + <expand macro="citations" /> +</tool> |