Repository 'diamond'
hg clone https://toolshed.g2.bx.psu.edu/repos/bgruening/diamond

Changeset 16:1faba1aa14c1 (2025-12-12)
Previous changeset 15:0cdcf7e99b62 (2025-11-10) Next changeset 17:9553180705b7 (2025-12-12)
Commit message:
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/main/tools/diamond commit 3b8d4b833ee2bd2a99b23b7389def84cd3de84cb
modified:
diamond.xml
diamond_makedb.xml
macros.xml
test-data/db-wtax.dmnd
test-data/db.dmnd
test-data/db.fasta
test-data/db.fasta.gz
test-data/diamond_results.pairwise
test-data/diamond_results.tabular
test-data/diamond_results.wtax.tabular
test-data/diamond_results_algorithm.tabular
test-data/diamond_results_freq_masking.tabular
test-data/diamond_results_global_ranking.tabular
test-data/diamond_results_iterate.tabular
test-data/diamond_results_log_test.tabular
test-data/diamond_results_max_hsps.tabular
test-data/diamond_results_motif_masking.tabular
test-data/diamond_results_soft_masking.tabular
test-data/names.dmp
test-data/nodes.dmp
test-data/nucleotide.fasta
test-data/prot.accession2taxid
test-data/protein.fasta
test-data/protein.fasta.gz
tool_data_table_conf.xml.sample
tool_data_table_conf.xml.test
added:
test-data/blastdb/README
test-data/blastdb/db.fasta
test-data/blastdb/db.fasta.pdb
test-data/blastdb/db.fasta.phr
test-data/blastdb/db.fasta.pin
test-data/blastdb/db.fasta.pjs
test-data/blastdb/db.fasta.pnd
test-data/blastdb/db.fasta.pni
test-data/blastdb/db.fasta.pog
test-data/blastdb/db.fasta.pos
test-data/blastdb/db.fasta.pot
test-data/blastdb/db.fasta.psq
test-data/blastdb/db.fasta.ptf
test-data/blastdb/db.fasta.pto
test-data/blastdb/filter_and_map_ids.py
test-data/blastdb/gen.sh
test-data/blastdb/map.txt
test-data/blastdb/prot.accession2taxid
test-data/blastdb/taxdb.btd
test-data/blastdb/taxdb.bti
test-data/blastdb/taxdb.py
test-data/blastdb/taxonomy4blast.sqlite3
test-data/blastdb_p.loc
test-data/ncbi_taxonomy.loc
test-data/ncbi_taxonomy/README.md
test-data/ncbi_taxonomy/names.dmp
test-data/ncbi_taxonomy/nodes.dmp
test-data/ncbi_taxonomy/prot.accession2taxid
test-data/taxon.tsv
tool-data/blastdb_p.loc
tool-data/ncbi_taxonomy.loc.sample
b
diff -r 0cdcf7e99b62 -r 1faba1aa14c1 diamond.xml
--- a/diamond.xml Mon Nov 10 15:12:53 2025 +0000
+++ b/diamond.xml Fri Dec 12 11:14:34 2025 +0000
[
b'@@ -13,18 +13,34 @@\n <![CDATA[\n \n     #if $ref_db_source.db_source == "history":\n-        ln -s \'$ref_db_source.reference_database\' ./database.dmnd\n+        ln -s \'$ref_db_source.reference_database\' ./database.dmnd &&\n+        #set database="database.dmnd"\n+    #else if $ref_db_source.db_source == "blast":\n+        #import os.path\n+        #set basename =  os.path.basename($ref_db_source.reference_database.fields.path)\n+        #set dirname =  os.path.dirname($ref_db_source.reference_database.fields.path)\n+        #set database="./db/" + basename\n+        mkdir ./db &&\n+        ## symlink all files in the directory containing the BLAST DB\n+        ## in newer BLAST DBs there is a file taxonomy4blast.sqlite3\n+        ## that is needed\n+        ln -s \'$dirname/\'* ./db/ &&\n+        #if $ref_db_source.ncbi_taxonomy\n+            ## symlink names and nodes dump files from NCBI taxonomy\n+            ## need to be in the same dir as the BLAST DB (which is the\n+            ## reason why we need to create .,/db/)\n+            ln -s \'$ref_db_source.ncbi_taxonomy.fields.path/nodes.dmp\' ./db/nodes.dmp &&\n+            ln -s \'$ref_db_source.ncbi_taxonomy.fields.path/names.dmp\' ./db/names.dmp &&\n+        #end if\n     #else:\n-        ln -s \'${ref_db_source.index.fields.db_path}\' ./database.dmnd\n+        ln -s \'${ref_db_source.reference_database.fields.db_path}\' ./database.dmnd &&\n+        #set database="database.dmnd"\n     #end if\n \n-    &&\n-\n     diamond\n         $method_cond.method_select\n-        --quiet\n         --threads "\\${GALAXY_SLOTS:-12}"\n-        --db ./database\n+        --db \'$database\'\n         --query \'$query\'\n         #if $method_cond.method_select == "blastx"\n           --query-gencode \'$method_cond.query_gencode\'\n@@ -39,17 +55,11 @@\n         #end if\n \n         @OUTPUT_ARGS@\n-\n         #if $output_section.output.outfmt != \'100\'\n             --compress \'0\'\n         #end if\n+\n         
$sens_cond.sensitivity\n-        $iterate\n-        $swipe\n-        --algo $algo\n-        #if $global_ranking\n-            --global-ranking $global_ranking\n-        #end if\n         #if str($gapopen) != "":\n           --gapopen \'$gapopen\'\n         #end if\n@@ -69,6 +79,7 @@\n         #end if\n \n         --id \'$id\'\n+        --approx-id \'$approx_id\'    \n         --query-cover \'$query_cover\'\n         --subject-cover \'$subject_cover\'\n         --block-size \'$sens_cond.block_size\'\n@@ -94,22 +105,32 @@\n             --max-hsps $output_section.max_hsps\n         #end if\n         #if $tax_cond.tax_select == \'file\':\n-            --taxonlist `cat \'$tax_cond.taxonlistfile\' | grep -v "^#" | grep -v "^$" | tr "\\n" "," | sed \'s/,$//\'`\n+            --taxonlist \\$(cat \'$tax_cond.taxonlist\' | grep -v "^#" | grep -v "^$" | tr "\\n" "," | sed \'s/,$//\')\n         #else if  $tax_cond.tax_select == \'list\':\n             --taxonlist \'$tax_cond.taxonlist\'\n         #end if\n+        #if $tax_exclude_cond.tax_select == \'file\':\n+            --taxon_exclude \\$(cat \'$tax_exclude_cond.taxon_exclude\' | grep -v "^#" | grep -v "^$" | tr "\\n" "," | sed \'s/,$//\')\n+        #else if  $tax_exclude_cond.tax_select == \'list\':\n+            --taxon_exclude \'$tax_exclude_cond.taxon_exclude\'\n+        #end if\n+\n         #if $advanced_section.seed_cut\n             --seed-cut $advanced_section.seed_cut\n         #end if\n         $advanced_section.freq_masking\n         --motif-masking $advanced_section.motif_masking\n         --soft-masking $advanced_section.soft_masking\n+        $advanced_section.iterate\n+        $advanced_section.swipe\n+        --algo $advanced_section.algo\n+        #if $advanced_section.global_ranking\n+            --global-ranking $advanced_section.global_ranking\n+        #end if\n         --index-chunks "\\${DIAMOND_INDEX_CHUNKS:-4}"\n         --file-buffer-size "\\${DIAMOND_FILE_BUFFER_SIZE:-67108864}"\n         $log\n-  
      \n-]]>\n-    </command>\n+    ]]></command>\n     <inputs>\n         <conditional name="method_cond">\n             <param name="method_select" type="select" label="Alignment '..b't,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore,scovhsp,staxids,cigar"/>\n+                </conditional>\n+            </section>\n+            <conditional name="sens_cond">\n+                <param name="sensitivity" value=""/>\n+            </conditional>\n+            <param name="matrix" value="BLOSUM62"/>\n+            <param name="masking" value="seg"/>\n+            <conditional name="hit_filter">\n+                <param name="hit_filter_select" value="max"/>\n+                <param name="max_target_seqs" value="25"/>\n+            </conditional>\n+            <conditional name="filter_score">\n+                <param name="filter_score_select" value="evalue"/>\n+                <param name="evalue" value="0.001"/>\n+            </conditional>\n+            <param name="id" value="0"/>\n+            <param name="query_cover" value="0"/>\n+            <conditional name="sens_cond">\n+                <param name="block_size" value="2"/>\n+            </conditional>\n+            <output name="blast_tabular">\n+                <assert_contents>\n+                    <has_n_columns n="15"/>\n+                    <has_n_lines n="5"/>\n+                </assert_contents>\n+            </output>\n+            <assert_command>\n+                <!-- ensure that NCBI taxonomy is really not used-->\n+                <has_text text="nodes.dmp" negate="true"/>\n+                <has_text text="names.dmp" negate="true"/>\n+            </assert_command>\n+        </test>\n+\n+        <!--Test 17 test blastx against cached BLAST DB  + tax columns in output + tax filtering file (tetrapoda and ray finned fished should result in mouse, human, zebra fish) -->\n+        <test expect_num_outputs="1">\n+            <conditional name="method_cond">\n+                <param 
name="method_select" value="blastp"/>\n+                <param name="comp_based_stats" value="1"/>\n+            </conditional>\n+            <param name="query" value="protein.fasta.gz" ftype="fasta.gz"/>\n+            <conditional name="ref_db_source">\n+                <param name="db_source" value="blast"/>\n+                <param name="reference_database" value="test"/>\n+                <param name="ncbi_taxonomy" value="test"/>\n+            </conditional>\n+            <conditional name="tax_cond">\n+                <param name="tax_select" value="file"/>\n+                <param name="taxonlist" value="taxon.tsv"/>\n+            </conditional>\n+            <section name="output_section">\n+                <conditional name="output">\n+                    <param name="outfmt" value="6"/>\n+                    <param name="fields" value="qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore,scovhsp,sskingdoms,skingdoms,sphylums,cigar"/>\n+                </conditional>\n+            </section>\n+            <conditional name="sens_cond">\n+                <param name="sensitivity" value=""/>\n+            </conditional>\n+            <param name="matrix" value="BLOSUM62"/>\n+            <param name="masking" value="seg"/>\n+            <conditional name="hit_filter">\n+                <param name="hit_filter_select" value="max"/>\n+                <param name="max_target_seqs" value="25"/>\n+            </conditional>\n+            <conditional name="filter_score">\n+                <param name="filter_score_select" value="evalue"/>\n+                <param name="evalue" value="0.001"/>\n+            </conditional>\n+            <param name="id" value="0"/>\n+            <param name="query_cover" value="0"/>\n+            <conditional name="sens_cond">\n+                <param name="block_size" value="2"/>\n+            </conditional>\n+            <output name="blast_tabular">\n+                <assert_contents>\n+           
         <has_n_columns n="17"/>\n+                    <has_n_lines n="3"/>\n+                    <has_text text="Metazoa" n="3"/>\n+                    <has_text text="Viridiplantae" n="0"/>\n                 </assert_contents>\n             </output>\n         </test>\n'
b
diff -r 0cdcf7e99b62 -r 1faba1aa14c1 diamond_makedb.xml
--- a/diamond_makedb.xml Mon Nov 10 15:12:53 2025 +0000
+++ b/diamond_makedb.xml Fri Dec 12 11:14:34 2025 +0000
[
@@ -9,15 +9,21 @@
     <command detect_errors="aggressive">
         <!-- DB has two files, *.dmnd and *.tx -->
     <![CDATA[
+    ln -s '$infile' database.$infile.ext &&
+
     diamond makedb
         --threads \${GALAXY_SLOTS:-12}
-        --in '$infile'
+        --in database.$infile.ext
         --db ./database
 
-      #if str($tax_cond.tax_select) == 'yes':
+      #if $tax_cond.tax_select == 'yes':
         --taxonmap '$tax_cond.taxonmap'
         --taxonnodes '$tax_cond.taxonnodes'
         --taxonnames '$tax_cond.taxonnames'
+      #else if $tax_cond.tax_select == 'yes_cached':
+        --taxonmap '$tax_cond.ncbi_taxonomy.fields.path'/prot.accession2taxid
+        --taxonnodes '$tax_cond.ncbi_taxonomy.fields.path'/nodes.dmp
+        --taxonnames '$tax_cond.ncbi_taxonomy.fields.path'/names.dmp
       #end if
     ]]>
     </command>
@@ -25,7 +31,8 @@
         <param name="infile" type="data" format="fasta,fasta.gz" label="Input reference file in FASTA format"/>
         <conditional name="tax_cond">
             <param name="tax_select" type="select" label="Add taxonomic data?" help="Needs to be supplied in order to provide taxonomy features of the aligner">
-                <option value="yes">Yes</option>
+                <option value="yes_cached">Using built in NCBI taxonomy</option>
+                <option value="yes">Yes using datasets from history</option>
                 <option value="no" selected="true">No</option>
             </param>
             <when value="yes">
@@ -33,6 +40,13 @@
                 <param argument="--taxonnodes" type="data" format="tabular" label="Taxonomy nodes.dmp from NCBI" help="This parameter is optional and needs to be supplied in order to provide taxonomy features"/>
                 <param argument="--taxonnames" type="data" format="tabular" label="Taxonomy names.dmp from NCBI" help="This parameter is optional and needs to be supplied in order to provide taxonomy features"/>
             </when>
+            <when value="yes_cached">
+                <param name="ncbi_taxonomy" type="select" optional="true" label="NCBI taxonomy database" help="Needed for output of taxonomy columns in tabular output">
+                    <options from_data_table="ncbi_taxonomy">
+                        <validator message="No NCBI database is available. Ask your Galaxy admin" type="no_options"/>
+                    </options>
+                </param>
+            </when>
             <when value="no"/>
         </conditional>
     </inputs>
@@ -43,11 +57,18 @@
         <test>
             <param name="infile" value="db.fasta" ftype="fasta"/>
             <output name="outfile" value="db.dmnd" compare="sim_size" delta="2"/>
+            <assert_stderr>
+                <has_text_matching expression="Database sequences +5"/>
+                <has_text_matching expression="Database letters +2578"/>
+            </assert_stderr>
         </test>
-
         <test>
             <param name="infile" value="db.fasta.gz" ftype="fasta.gz"/>
             <output name="outfile" value="db.dmnd" compare="sim_size" delta="2"/>
+            <assert_stderr>
+                <has_text_matching expression="Database sequences +5"/>
+                <has_text_matching expression="Database letters +2578"/>
+            </assert_stderr>
         </test>
         <test>
             <param name="infile" value="db.fasta" ftype="fasta"/>
@@ -57,7 +78,32 @@
                 <param name="taxonnodes" ftype="tabular" value="nodes.dmp"/>
                 <param name="taxonnames" ftype="tabular" value="names.dmp"/>
             </conditional>
+            <!-- this test uses a taxdb with consecutive taxIDs which creates the small dmnd test file -->
             <output name="outfile" value="db-wtax.dmnd" compare="sim_size" delta="2"/>
+            <assert_stderr>
+                <has_text_matching expression="Entries in accession to taxid file +5"/>
+                <has_text_matching expression="Database accessions mapped to taxid +5"/>
+                <has_text_matching expression="Database sequences mapped to taxid +5"/>
+            </assert_stderr>
+        </test>
+        <test>
+            <param name="infile" value="db.fasta" ftype="fasta"/>
+            <conditional name="tax_cond">
+                <param name="tax_select" value="yes_cached"/>
+                <param name="ncbi_taxonomy" value="test"/>
+            </conditional>
+            <!-- note that this test uses a different taxDB (original taxIDs - not consecutive)
+                 and therefore we get a larger dmnd file -->
+            <output name="outfile">
+                <assert_contents>
+                    <has_size size="20279226"/>
+                </assert_contents>
+            </output>
+            <assert_stderr>
+                <has_text_matching expression="Entries in accession to taxid file +5"/>
+                <has_text_matching expression="Database accessions mapped to taxid +5"/>
+                <has_text_matching expression="Database sequences mapped to taxid +5"/>
+            </assert_stderr>
         </test>
     </tests>
     <help>
b
diff -r 0cdcf7e99b62 -r 1faba1aa14c1 macros.xml
--- a/macros.xml Mon Nov 10 15:12:53 2025 +0000
+++ b/macros.xml Fri Dec 12 11:14:34 2025 +0000
[
b'@@ -1,6 +1,6 @@\n <macros>\n-    <token name="@TOOL_VERSION@">2.1.13</token>\n-    <token name="@VERSION_SUFFIX@">1</token>\n+    <token name="@TOOL_VERSION@">2.1.16</token>\n+    <token name="@VERSION_SUFFIX@">0</token>\n     <xml name="requirements">\n         <requirements>\n             <requirement type="package" version="@TOOL_VERSION@">diamond</requirement>\n@@ -28,45 +28,55 @@\n             <when value="0"/>\n             <when value="5"/>\n             <when value="6">\n-                <param name="fields" type="select" label="Tabular fields" help="" multiple="true">\n+                <param argument="--fields" type="select" label="Tabular fields" help="" multiple="true">\n                     <option value="qseqid" selected="true">Query Seq - id</option>\n+                    <option value="qlen">Query sequence length</option>\n                     <option value="sseqid" selected="true">Subject Seq - id</option>\n                     <option value="sallseqid">All subject Seq - id(s)</option>\n-                    <option value="qlen">Query sequence length</option>\n                     <option value="slen">Subject sequence length</option>\n+                    <option value="qstart" selected="true">Start of alignment in query</option>\n+                    <option value="qend" selected="true">End of alignment in query</option>\n+                    <option value="sstart" selected="true">Start of alignment in subject</option>\n+                    <option value="send" selected="true">End of alignment in subject</option>\n+                    <option value="qseq">Aligned part of query sequence</option>\n+                    <option value="qseq_gapped">Aligned part of query sequence (with gaps)</option>\n+                    <option value="qseq_translated">Translation of the aligned part of query sequence</option>\n+                    <option value="full_qseq">Query sequence</option>\n+                    <option value="full_qseq_mate">Query sequence of 
the mate</option>\n+                    <option value="sseq">Aligned part of subject sequence</option>\n+                    <option value="sseq_gapped">Aligned part of subject sequence (with gaps)</option>\n+                    <option value="full_sseq">Subject sequence</option>\n+                    <option value="evalue" selected="true">Expect value</option>\n+                    <option value="bitscore" selected="true">Bit score</option>\n+                    <option value="corrected_bitscore" selected="true">Bit score corrected for edge effects</option>\n+                    <option value="score">Raw score</option>\n+                    <option value="length" selected="true">Alignment length</option>\n                     <option value="pident" selected="true">Percentage of identical matches</option>\n-                    <option value="length" selected="true">Alignment length</option>\n+                    <option value="approx_pident">Approximate percentage of identical matches</option>\n                     <option value="nident">Number of identical matches</option>\n                     <option value="mismatch" selected="true">Number of mismatches</option>\n                     <option value="positive">Number of positive - scoring matches</option>\n                     <option value="gapopen" selected="true">Number of gap openings</option>\n                     <option value="gaps">Total number of gaps</option>\n                     <option value="ppos">Percentage of positive - scoring matches</option>\n-                    <option value="qstart" selected="true">Start of alignment in query</option>\n-                    <option value="qend" selected="true">End of alignment in query</option>\n-                    <option value="sstart" selected="true">Start of alignment in subject</option>\n-                    <option value="send" selected="true">End of alignment in subject</option>\n-                    <option value="qseq">Aligned part of query 
sequence</option>\n-                    <option value="sseq">Ali'..b'gnments within the given percentage range of the top alignment score for a query" help="For example, setting this to 10 will report all alignments whose score is at most 10% lower than the best alignment score for a query."/>\n             </when>\n         </conditional>\n+        <param argument="--id" type="float" value="0" min="0" max="100" label="Minimum identity percentage to report an alignment" help="Report only alignments above the given percentage of sequence identity"/>\n+        <param argument="--approx-id" type="float" value="0" min="0" max="100" label="Minimum approx. identity% to report an alignment"/>\n+        <param argument="--query-cover" type="float" value="0" min="0" max="100" label="Minimum query cover percentage to report an alignment" help="Report only alignments above the given percentage of query cover"/>\n+        <param argument="--subject-cover" type="float" value="0" min="0" max="100" label="Minimum subject cover percentage to report an alignment" help="Report only alignments above the given percentage of subject cover"/>\n     </xml>\n-    <xml name="block_size_low_sens">\n-        <param argument="--block-size" type="float" value="2" label="Block size in billions of sequence letters to be processed at a time"\n-            help="This is the main parameter for controlling the program\xe2\x80\x99s memory and disk space usage. Bigger numbers will increase the use of memory and temporary                  disk space, but also improve performance"/>\n-    </xml>\n-    <xml name="block_size_hi_sens">\n-        <param argument="--block-size" type="float" value="0.4" label="Block size in billions of sequence letters to be processed at a time"\n-            help="This is the main parameter for controlling the program\xe2\x80\x99s memory and disk space usage. 
Bigger numbers will increase the use of memory and temporary                  disk space, but also improve performance"/>\n+    <xml name="block_size" tokens="value">\n+        <param argument="--block-size" type="float" value="@VALUE@" min="0" label="Block size in billions of sequence letters to be processed at a time"\n+            help="This is the main parameter for controlling the program\xe2\x80\x99s memory and disk space usage. Bigger numbers will increase the use of memory and temporary disk space, but also improve performance"/>\n     </xml>\n     <xml name="citations">\n         <citations>\n@@ -138,6 +161,7 @@\n             --out \'$blast_xml\'\n         #else if $output_section.output.outfmt == "6"\n             --outfmt \'6\' #echo \' \'.join(str($output_section.output.fields).split(\',\'))\n+            --header $output_section.output.header\n             --out \'$blast_tabular\'\n         #else if $output_section.output.outfmt == "100"\n             --outfmt \'100\'\n@@ -158,4 +182,23 @@\n             --top \'$hit_filter.top\'\n         #end if\n     </token>\n+\n+    <xml name="taxon_cond_macro" tokens="cond_name,label,help,argument">\n+        <conditional name="@COND_NAME@">\n+            <param name="tax_select" type="select" label="@LABEL@" help="Any taxonomic rank can be used, and only reference sequences matching one of the specified taxon ids will be searched against.">\n+                <option value="no" selected="True">No</option>\n+                <option value="list">List of taxids entered manually</option>\n+                <option value="file">List of taxids from single column tabular file</option>\n+            </param>\n+            <when value="no"/>\n+            <when value="list">\n+                <param argument="@ARGUMENT@" type="text" value="" label="Taxon IDss" help="Comma separated list">\n+                    <validator type="regex" message="Taxonlist needs to be a comma separated list of integers">[0-9,]*</validator>\n+   
             </param>\n+            </when>\n+            <when value="file">\n+                <param argument="@ARGUMENT@" type="data" format="tabular" label="Taxon id file" help="One taxon ID per line"/>\n+            </when>\n+        </conditional>\n+    </xml>\n </macros>\n'
b
diff -r 0cdcf7e99b62 -r 1faba1aa14c1 test-data/blastdb/README
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/blastdb/README Fri Dec 12 11:14:34 2025 +0000
b
@@ -0,0 +1,10 @@
+BLAST DBs come with additional taxonomic data (taxdb.btd, taxdb.bti, taxonomy4blast.sqlite3),
+which are quite large. This folder contains small test data covering a few species:
+
+Oryza sativa    4530
+Drosophila      7215
+Danio rerio     7955
+Homo sapiens    9606
+Mus musculus    10090
+
+The files have been provided to @bernt-matthias by the NCBI help desk (ticket help #247163).
b
diff -r 0cdcf7e99b62 -r 1faba1aa14c1 test-data/blastdb/db.fasta
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/blastdb/db.fasta Fri Dec 12 11:14:34 2025 +0000
[
@@ -0,0 +1,45 @@
+>gi|3950761|gb|YP_514675.1|cytochrome c oxidase subunit 1 (mitochondrion) [Oryza sativa Indica Group]
+MTNLVRWLFSTNHKDIGTLYFIFGAIAGVMGTCFSVLIRMELARPGDQILGGNHQLYNVLITAHAFLMIF
+FMVMPAMIGGFGNWFVPILIGAPDMAFPRLNNISFWLLPPSLLLLLSSALVEVGSGTGWTVYPPLSGITS
+HSGGAVDLAIFSLHLSGVSSILGSINFITTIFNMRGPGMTMHRLPLFVWSVLVTAFLLLLSLPVLAGAIT
+MLLTDRNFNTTFFDPAGGGDPILYQHLFWFFGHPEVYILILPGFGIISHIVSTFSRKPVFGYLGMVYAMI
+SIGVLGFLVWAHHMFTVGLDVDTRAYFTAATMIIAVPTGIKIFSWIATMWGGSIQYKTPMLFAVGFIFLF
+TIGGLTGIVLANSGLDIALHDTYYVVAHFHYVLSMGAVFALFAGFYYWVGKIFGRTYPETLGQIHFWITF
+FGVNLTFFPMHFLGLSGMPRRIPDYPDAYAGWNALSSFGSYISVVGIRRFFVVVAITSSSGKNKRCAESP
+WAVEQNPTTLEWLVQSPPAFHTFGELPAIKETKS
+>gi|19893533|gb|YP_009047267.1|cytochrome c oxidase subunit I, partial (mitochondrion) [Drosophila melanogaster]
+SRQWLFSTNHKDIGTLYFIFGAWAGMVGTSLSILIRAELGHPGALIGDDQIYNVIVTAHAFIMIFFMVMP
+IMIGGFGNWLVPLMLGAPDMAFPRMNNMSFWLLPPALSLLLVSSMVENGAGTGWTVYPPLSAGIAHGGAS
+VDLAIFSLHLAGISSILGAVNFITTVINMRSTGISLDRMPLFVWSVVITALLLLLSLPVLAGAITMLLTD
+RNLNTSFFDPAGGGDPILYQHLFWFFGHPEVYILILPGFGMISHIISQESGKKETFGSLGMIYAMLAIGL
+LGFIVWAHHMFTVGMDVDTRAYFTSATMIIAVPTGIKIFSWLATLHGTQLSYSPAILWALGFVFLFTVGG
+LTGVVLANSSVDIILHDTYYVVAHFHYVLSMGAVFAIMAGFIHWYPLFTGLTLNNKWLKSHFIIMFIGVN
+LTFFPQHFLGLAGMPRRYSDYPDAYTTWNIVSTIGSTISLLGILFFFFIIWESLVSQRQVIYPIQLNSSI
+EWYQNTPPAEHSYSELPLLTN
+>gi|140539|gb|NP_059333.1|cytochrome c oxidase subunit I (mitochondrion) [Danio rerio]
+MTITRWFFSTNHKDIGTLYLVFGAWAGMVGTALSLLIRAELSQPGALLGDDQIYNVIVTAHAFVMIFFMV
+MPILIGGFGNWLVPLMIGAPDMAFPRMNNMSFWLLPPSFLLLLASSGVEAGAGTGWTVYPPLAGNLAHAG
+ASVDLTIFSLHLAGVSSILGAINFITTTINMKPPTISQYQTPLFVWAVLVTAVLLLLSLPVLAAGITMLL
+TDRNLNTTFFDPAGGGDPILYQHLFWFFGHPEVYILILPGFGIISHVVAYYAGKKEPFGYMGMVWAMMAI
+GLLGFIVWAHHMFTVGMDVDTRAYFTSATMIIAIPTGVKVFSWLATLHGGAIKWETPMLWALGFIFLFTV
+GGLTGIVLANSSLDIVLHDTYYVVAHFHYVLSMGAVFAIMAGFVHWFPLFTGYTLNSVWTKIHFGVMFIG
+VNLTFFPQHFLGLAGMPRRYSDYPDAYALWNTVSSIGSLISLVAVIMFLFILWEAFTAKREVLSVELTAT
+NVEWLHGCPPPYHTFEEPAFVQIQSN
+>gi|4512|gb|YP_003024028.1|cytochrome c oxidase subunit I (mitochondrion) [Homo sapiens]
+MFADRWLFSTNHKDIGTLYLLFGAWAGVLGTALSLLIRAELGQPGNLLGNDHIYNVIVTAHAFVMIFFMV
+MPIMIGGFGNWLVPLMIGAPDMAFPRMNNMSFWLLPPSLLLLLASAMVEAGAGTGWTVYPPLAGNYSHPG
+ASVDLTIFSLHLAGVSSILGAINFITTIINMKPPAMTQYQTPLFVWSVLITAVLLLLSLPVLAAGITMLL
+TDRNLNTTFFDPAGGGDPILYQHLFWFFGHPEVYILILPGFGMISHIVTYYSGKKEPFGYMGMVWAMMSI
+GFLGFIVWAHHMFTVGMDVDTRAYFTSATMIIAIPTGVKVFSWLATLHGSNMKWSAAVLWALGFIFLFTV
+GGLTGIVLANSSLDIVLHDTYYVVAHFHYVLSMGAVFAIMGGFIHWFPLFSGYTLDQTYAKIHFTIMFIG
+VNLTFFPQHFLGLSGMPRRYSDYPDAYTTWNILSSVGSFISLTAVMLMIFMIWEAFASKRKVLMVEEPSM
+NLEWLYGCPPPYHTFEEPVYMKS
+>gi|17708|gb|NP_904330.1|cytochrome c oxidase subunit I (mitochondrion) [Mus musculus]
+MFINRWLFSTNHKDIGTLYLLFGAWAGMVGTALSILIRAELGQPGALLGDDQIYNVIVTAHAFVMIFFMV
+MPMMIGGFGNWLVPLMIGAPDMAFPRMNNMSFWLLPPSFLLLLASSMVEAGAGTGWTVYPPLAGNLAHAG
+ASVDLTIFSLHLAGVSSILGAINFITTIINMKPPAMTQYQTPLFVWSVLITAVLLLLSLPVLAAGITMLL
+TDRNLNTTFFDPAGGGDPILYQHLFWFFGHPEVYILILPGFGIISHVVTYYSGKKEPFGYMGMVWAMMSI
+GFLGFIVWAHHMFTVGLDVDTRAYFTSATMIIAIPTGVKVFSWLATLHGGNIKWSPAMLWALGFIFLFTV
+GGLTGIVLSNSSLDIVLHDTYYVVAHFHYVLSMGAVFAIMAGFVHWFPLFSGFTLDDTWAKAHFAIMFVG
+VNMTFFPQHFLGLSGMPRRYSDYPDAYTTWNTVSSMGSFISLTAVLIMIFMIWEAFASKREVMSVSYAST
+NLEWLHGCPPPYHTFEEPTYVKVK
b
diff -r 0cdcf7e99b62 -r 1faba1aa14c1 test-data/blastdb/db.fasta.pdb
b
Binary file test-data/blastdb/db.fasta.pdb has changed
b
diff -r 0cdcf7e99b62 -r 1faba1aa14c1 test-data/blastdb/db.fasta.phr
b
Binary file test-data/blastdb/db.fasta.phr has changed
b
diff -r 0cdcf7e99b62 -r 1faba1aa14c1 test-data/blastdb/db.fasta.pin
b
Binary file test-data/blastdb/db.fasta.pin has changed
b
diff -r 0cdcf7e99b62 -r 1faba1aa14c1 test-data/blastdb/db.fasta.pjs
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/blastdb/db.fasta.pjs Fri Dec 12 11:14:34 2025 +0000
[
@@ -0,0 +1,27 @@
+{
+  "version": "1.2",
+  "dbname": "db.fasta",
+  "dbtype": "Protein",
+  "db-version": 5,
+  "description": "cox1 blastp DB",
+  "number-of-letters": 2578,
+  "number-of-sequences": 5,
+  "last-updated": "2025-12-09T18:15:00",
+  "number-of-volumes": 1,
+  "number-of-taxids": 5,
+  "bytes-total": 52950,
+  "bytes-to-cache": 2720,
+  "files": [
+    "db.fasta.pdb",
+    "db.fasta.phr",
+    "db.fasta.pin",
+    "db.fasta.pnd",
+    "db.fasta.pni",
+    "db.fasta.pog",
+    "db.fasta.pos",
+    "db.fasta.pot",
+    "db.fasta.psq",
+    "db.fasta.ptf",
+    "db.fasta.pto"
+  ]
+}
b
diff -r 0cdcf7e99b62 -r 1faba1aa14c1 test-data/blastdb/db.fasta.pnd
b
Binary file test-data/blastdb/db.fasta.pnd has changed
b
diff -r 0cdcf7e99b62 -r 1faba1aa14c1 test-data/blastdb/db.fasta.pni
b
Binary file test-data/blastdb/db.fasta.pni has changed
b
diff -r 0cdcf7e99b62 -r 1faba1aa14c1 test-data/blastdb/db.fasta.pog
b
Binary file test-data/blastdb/db.fasta.pog has changed
b
diff -r 0cdcf7e99b62 -r 1faba1aa14c1 test-data/blastdb/db.fasta.pos
b
Binary file test-data/blastdb/db.fasta.pos has changed
b
diff -r 0cdcf7e99b62 -r 1faba1aa14c1 test-data/blastdb/db.fasta.pot
b
Binary file test-data/blastdb/db.fasta.pot has changed
b
diff -r 0cdcf7e99b62 -r 1faba1aa14c1 test-data/blastdb/db.fasta.psq
b
Binary file test-data/blastdb/db.fasta.psq has changed
b
diff -r 0cdcf7e99b62 -r 1faba1aa14c1 test-data/blastdb/db.fasta.ptf
b
Binary file test-data/blastdb/db.fasta.ptf has changed
b
diff -r 0cdcf7e99b62 -r 1faba1aa14c1 test-data/blastdb/db.fasta.pto
b
Binary file test-data/blastdb/db.fasta.pto has changed
b
diff -r 0cdcf7e99b62 -r 1faba1aa14c1 test-data/blastdb/filter_and_map_ids.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/blastdb/filter_and_map_ids.py Fri Dec 12 11:14:34 2025 +0000
[
@@ -0,0 +1,73 @@
+#!/usr/bin/env python
+
+# Filter the names and nodes dmp files by the taxIDs found in a
+# prot.accession2taxid file; parent node IDs are added if needed.
+#
+# IDs are renamed to a consecutive set of IDs (1,2,...), otherwise dmnd
+# databases including taxonomy will be huge. New IDs are handed out in order
+# of first appearance in the names file so the taxID order is not changed.
+
+from sys import argv
+
+names_file_name = argv[1]
+nodes_file_name = argv[2]
+prot2ids_file_name = argv[3]
+names_file_out_name = argv[4]
+nodes_file_out_name = argv[5]
+prot2ids_file_out_name = argv[6]
+
+# taxID -> parent taxID, taken from the first two "|"-separated columns
+parent = dict()
+with open(nodes_file_name) as nodes_file:
+    for line in nodes_file:
+        fields = line.strip().split("|")
+        parent[fields[0].strip()] = fields[1].strip()
+
+# taxIDs from the third column of the accession2taxid file (header line skipped)
+initial_ids = set()
+with open(prot2ids_file_name) as prot2ids_file:
+    for i, line in enumerate(prot2ids_file):
+        if i == 0:
+            continue
+        initial_ids.add(line.strip().split()[2].strip())
+
+# add all ancestors; a node that is its own parent ends the walk
+ids = set()
+while initial_ids:
+    taxid = initial_ids.pop()
+    ids.add(taxid)
+    if parent[taxid] != taxid:
+        initial_ids.add(parent[taxid])
+
+# old taxID -> new consecutive ID, assigned the first time a taxID is seen
+id_map = dict()
+with open(names_file_name) as names_file, open(names_file_out_name, "w") as names_file_out:
+    for line in names_file:
+        fields = line.strip().split("|")
+        taxid = fields[0].strip()
+        if taxid not in ids:
+            continue
+        if taxid not in id_map:
+            id_map[taxid] = len(id_map) + 1
+        names_file_out.write(f'{id_map[taxid]}\t|{"|".join(fields[1:])}\n')
+
+print(f'taxonlist for test 2 needs to be {id_map["33090"]}')
+
+with open(nodes_file_name) as nodes_file, open(nodes_file_out_name, "w") as nodes_file_out:
+    for line in nodes_file:
+        fields = line.strip().split("|")
+        node = fields[0].strip()
+        # deliberately not named "parent": that would rebind the dict built above
+        parent_id = fields[1].strip()
+        if node not in ids or parent_id not in ids:
+            continue
+        nodes_file_out.write(f'{id_map[node]}\t|\t{id_map[parent_id]}\t|{"|".join(fields[2:])}\n')
+
+with open(prot2ids_file_name) as prot2ids_file, open(prot2ids_file_out_name, "w") as prot2ids_file_out:
+    for i, line in enumerate(prot2ids_file):
+        if i == 0:
+            prot2ids_file_out.write(line)
+            continue
+        fields = line.strip().split()
+        fields[2] = str(id_map[fields[2].strip()])
+        prot2ids_file_out.write("\t".join(fields) + "\n")
b
diff -r 0cdcf7e99b62 -r 1faba1aa14c1 test-data/blastdb/gen.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/blastdb/gen.sh Fri Dec 12 11:14:34 2025 +0000
b
@@ -0,0 +1,37 @@
+#!/bin/bash
+# Regenerates the taxonomy test data (BLAST DB here, ../ncbi_taxonomy/*.dmp, ../db-wtax.dmnd); all paths are relative to this directory.
+set -e
+
+wget https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz
+tar -xzf taxdump.tar.gz
+
+
+# create blast DB
+# diamond expects 1234 in the tax data: https://github.com/bbuchfink/diamond/blob/56214dfcb4278f08e935147e8dbea7672997386e/src/data/blastdb/blastdb.cpp#L170
+# more precisely in the taxdb.bt* files (which are here constructed from the dmp files)
+# we also add the path to the root (probably not strictly needed)
+# ideally 1234 should also be in the sqlite DB, but since the taxon is not in the fasta it should be fine
+sqlite3 taxonomy4blast.sqlite3 "SELECT * FROM TaxidInfo;" | sed 's/|/\n/g' | sort -n -u | sed 's/^/^/; s/$/\\s/' > grep.txt
+echo "^1234\\s" >> grep.txt
+echo "^189779\\s" >> grep.txt
+echo "^189778\\s" >> grep.txt
+echo "^203693\\s" >> grep.txt
+echo "^40117\\s" >> grep.txt
+echo "^3379134\\s" >> grep.txt
+echo "^2\\s" >> grep.txt
+
+grep -f grep.txt names.dmp > ../ncbi_taxonomy/names.dmp
+grep -f grep.txt nodes.dmp > ../ncbi_taxonomy/nodes.dmp
+
+python taxdb.py
+makeblastdb -in db.fasta -parse_seqids -blastdb_version 5 -taxid_map map.txt -title "cox1 blastp DB" -dbtype prot
+
+# create small dmnd data base with taxonomy
+# the important thing to get a small DB is to have consecutive taxIDs
+# NOTE: filter_and_map_ids modifies taxIDs (to get a small file), i.e. taxIDs will be different from tests using BLAST DB from above
+python filter_and_map_ids.py names.dmp nodes.dmp prot.accession2taxid ../names.dmp ../nodes.dmp ../prot.accession2taxid
+diamond makedb --in db.fasta --db ./database --taxonmap ../prot.accession2taxid --taxonnodes ../nodes.dmp --taxonnames ../names.dmp
+mv database.dmnd ../db-wtax.dmnd
+
+rm *.dmp readme.txt taxdump.tar.gz gc.prt
+
b
diff -r 0cdcf7e99b62 -r 1faba1aa14c1 test-data/blastdb/map.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/blastdb/map.txt Fri Dec 12 11:14:34 2025 +0000
b
@@ -0,0 +1,11 @@
+# file used to store protein IDs to taxids
+# diamond expects 1234 in the tax data: https://github.com/bbuchfink/diamond/blob/56214dfcb4278f08e935147e8dbea7672997386e/src/data/blastdb/blastdb.cpp#L170
+# more precisely in the taxdb.bt* files (which are here constructed from the dmp files)
+# we also add the path to the root (probably not strictly needed)
+# ideally 1234 should also be in the sqlite DB, but since the taxon is not in the fasta it should be fine
+X 1234
+YP_514675.1 4530
+YP_009047267.1 7215
+NP_059333.1 7955
+YP_003024028.1 9606
+NP_904330.1 10090
b
diff -r 0cdcf7e99b62 -r 1faba1aa14c1 test-data/blastdb/prot.accession2taxid
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/blastdb/prot.accession2taxid Fri Dec 12 11:14:34 2025 +0000
b
@@ -0,0 +1,6 @@
+accession accession.version taxid gi
+YP_514675 YP_514675.1 4530 3950761
+YP_009047267 YP_009047267.1 7215 19893533
+NP_059333 NP_059333.1 7955 140539
+YP_003024028 YP_003024028.1 9606 4512
+NP_904330 NP_904330.1 10090 17708
b
diff -r 0cdcf7e99b62 -r 1faba1aa14c1 test-data/blastdb/taxdb.btd
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/blastdb/taxdb.btd Fri Dec 12 11:14:34 2025 +0000
b
@@ -0,0 +1,1 @@
+Bacteria eubacteria bacteria BacteriaNitrospira bacteria BacteriaEukaryota eukaryotes EukaryotaEmbryophyta plants land plants EukaryotaMagnoliopsida angiosperms flowering plants EukaryotaLaurales flowering plants EukaryotaLauraceae laurel family flowering plants EukaryotaPersea flowering plants EukaryotaPersea americana flowering plants EukaryotaLiliopsida monocotyledons monocots EukaryotaPoaceae grass family monocots EukaryotaOryza monocots EukaryotaOryza sativa rice monocots Eukaryotacommelinids monocots EukaryotaEumetazoa animals EukaryotaArthropoda arthropods EukaryotaHexapoda hexapods EukaryotaDiptera flies EukaryotaBrachycera flies EukaryotaDrosophilidae flies EukaryotaDrosophila fruit fly flies EukaryotaPterygota insects EukaryotaChordata chordates EukaryotaVertebrata vertebrates EukaryotaGnathostomata vertebrates EukaryotaActinopterygii fish ray-finned fishes EukaryotaCypriniformes ray-finned fishes EukaryotaDanio ray-finned fishes EukaryotaDanio rerio zebra fish ray-finned fishes EukaryotaSarcopterygii vertebrates EukaryotaEutheria placental mammals placentals EukaryotaPrimates primates EukaryotaCatarrhini primates EukaryotaHominidae primates EukaryotaHomo humans primates EukaryotaHomo sapiens primates EukaryotaRodentia rodent rodents EukaryotaMuridae rodents EukaryotaMus mouse rodents EukaryotaMus musculus mouse rodents EukaryotaCyprinoidei ray-finned fishes EukaryotaTeleostei ray-finned fishes EukaryotaOstariophysi ray-finned fishes EukaryotaTetrapoda vertebrates EukaryotaAmniota vertebrates EukaryotaTheria mammals EukaryotaViridiplantae green plants green plants EukaryotaOpisthokonta eukaryotes EukaryotaMetazoa multicellular animals animals EukaryotaBilateria animals EukaryotaProtostomia animals EukaryotaNeoptera insects EukaryotaEndopterygota insects EukaryotaDeuterostomia deuterostomes animals EukaryotaStreptophyta green plants EukaryotaPoales monocots EukaryotaMurinae rodents EukaryotaNitrospirota bacteria BacteriaMammalia mammals 
EukaryotaNeopterygii ray-finned fishes EukaryotaMuscomorpha flies EukaryotaSchizophora flies EukaryotaAcalyptratae flies EukaryotaEphydroidea flies EukaryotaDrosophilinae flies EukaryotaDrosophilini flies EukaryotaInsecta true insects insects EukaryotaTracheophyta vascular plants vascular plants EukaryotaSpermatophyta seed plants seed plants EukaryotaEuphyllophyta vascular plants EukaryotaDicondylia insects EukaryotaPanarthropoda animals EukaryotaCraniata chordates EukaryotaTeleostomi vertebrates EukaryotaEuteleostomi vertebrates EukaryotaStreptophytina green plants EukaryotaOryzoideae monocots EukaryotaOryzeae monocots EukaryotaActinopteri ray-finned fishes EukaryotaClupeocephala ray-finned fishes EukaryotaOtophysi ray-finned fishes EukaryotaCypriniphysae ray-finned fishes EukaryotaOtomorpha ray-finned fishes EukaryotaNitrospirales bacteria BacteriaNitrospiraceae bacteria BacteriaPancrustacea arthropods EukaryotaMandibulata mandibulates arthropods EukaryotaNitrospiria bacteria BacteriaHomininae primates EukaryotaMagnoliidae flowering plants EukaryotaEuarchontoglires placentals EukaryotaGlires placentals EukaryotaSimiiformes primates EukaryotaHominoidea ape primates EukaryotaMuroidea rodents EukaryotaBOP clade monocots EukaryotaHaplorrhini primates EukaryotaCyclorrhapha flies EukaryotaEremoneura flies EukaryotaMus rodents EukaryotaEcdysozoa animals EukaryotaDipnotetrapodomorpha vertebrates EukaryotaBoreoeutheria placentals EukaryotaMesangiospermae flowering plants EukaryotaPetrosaviidae monocots EukaryotaOsteoglossocephalai ray-finned fishes EukaryotaOryzinae monocots EukaryotaMyomorpha rodents EukaryotaDanionidae ray-finned fishes EukaryotaDanioninae ray-finned fishes EukaryotaPseudomonadati bacteria Bacteria
\ No newline at end of file
b
diff -r 0cdcf7e99b62 -r 1faba1aa14c1 test-data/blastdb/taxdb.bti
b
Binary file test-data/blastdb/taxdb.bti has changed
b
diff -r 0cdcf7e99b62 -r 1faba1aa14c1 test-data/blastdb/taxdb.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/blastdb/taxdb.py Fri Dec 12 11:14:34 2025 +0000
[
@@ -0,0 +1,211 @@
+#!/usr/bin/env python3
+"""
+taxdb.py
+
+Create taxdb.btd and taxdb.bti (NCBI/BLAST/ISAM format) from a pruned nodes.dmp
+and optional names.dmp read from ../ncbi_taxonomy/ (see NODES_FILE / NAMES_FILE).
+
+Usage:
+    python3 taxdb.py
+
+Output:
+    taxdb.btd
+    taxdb.bti
+
+Notes:
+- Writes integers in BIG-ENDIAN (network order) as required by the ISAM/NCBI format.
+- The btd records are written as:
+    scientific_name<TAB>common_name<TAB>blast_name<TAB>superkingdom_name
+  with no newline between records (the offsets in taxdb.bti mark where each record starts).
+"""
+import struct
+import sys
+from collections import defaultdict
+
+NODES_FILE = "../ncbi_taxonomy/nodes.dmp"
+NAMES_FILE = "../ncbi_taxonomy/names.dmp"   # optional
+OUT_BTD = "taxdb.btd"
+OUT_BTI = "taxdb.bti"
+
+TAXDB_MAGIC = 0x8739
+
+
+# -------------------------
+# Helpers
+# -------------------------
+def read_nodes(nodes_path):
+    """Parse nodes.dmp into two dicts keyed by int taxid: parent[taxid]=parent_taxid, rank[taxid]=rank"""
+    parent = {}
+    rank = {}
+    with open(nodes_path, encoding="utf-8") as fh:
+        for line in fh:
+            parts = [p.strip() for p in line.split("|")]  # '|'-separated fields, padding trimmed
+            if len(parts) < 3:  # need taxid, parent taxid and rank
+                continue
+            try:
+                taxid = int(parts[0])
+                parent_tax = int(parts[1])
+            except ValueError:  # non-numeric ids: the line is skipped silently
+                continue
+            parent[taxid] = parent_tax
+            rank[taxid] = parts[2]
+    return parent, rank
+
+
+def read_names(names_path):
+    """Return dict: names[taxid] = {'scientific':..., 'common':..., 'blast':...}; other name classes are ignored"""
+    names = defaultdict(lambda: {"scientific": "", "common": "", "blast": ""})  # item access on an unseen taxid creates an all-empty record
+    with open(names_path, encoding="utf-8") as fh:
+        for line in fh:
+            parts = [p.strip() for p in line.split("|")]
+            if len(parts) < 4:  # the name class is read from index 3
+                continue
+            try:
+                taxid = int(parts[0])
+            except ValueError:  # non-numeric taxid: the line is skipped silently
+                continue
+            name_txt = parts[1]
+            name_class = parts[3]
+            if name_class == "scientific name":  # for every class, the last matching line wins
+                names[taxid]["scientific"] = name_txt
+            elif name_class == "common name":
+                names[taxid]["common"] = name_txt
+            elif name_class == "blast name":
+                names[taxid]["blast"] = name_txt
+    return names
+
+
+def infer_superkingdom_code(taxid, parent, rank, sci_name_lookup):
+    """
+    Walk ancestors until rank == 'domain', then map that node's scientific name to one of
+    "Bacteria", "Archaea", "Eukaryota", "Viruses"; return "Unknown" otherwise.
+    """
+    seen = set()
+    cur = taxid
+    while True:
+        if cur in seen:  # cycle guard (e.g. a node that is its own parent)
+            return "Unknown"
+        seen.add(cur)
+        r = rank.get(cur, "")
+        if r == "domain":
+            name = sci_name_lookup.get(cur, "").lower()
+            if "bacteria" in name:  # substring match, so "eubacteria" is covered too
+                return "Bacteria"
+            if "archaea" in name:
+                return "Archaea"
+            if "eukary" in name:  # covers "eukaryota"
+                return "Eukaryota"
+            if "virus" in name:  # covers "viruses"
+                return "Viruses"
+            return "Unknown"
+        if cur not in parent:  # walked out of the (pruned) tree
+            return "Unknown"
+        cur = parent[cur]
+
+
+def infer_blast_name(taxid, parent, lookup):
+    """Return the lower-cased `lookup` entry of taxid or, if that is empty, of its nearest ancestor
+    that has one; return "Unknown" if the walk leaves `parent` or revisits a node."""
+    seen = set()
+    cur = taxid
+    while True:
+        if cur in seen:  # cycle guard (e.g. a node that is its own parent)
+            return "Unknown"
+        seen.add(cur)
+        name = lookup.get(cur, "").lower()
+
+        if name:
+            return name
+        if cur not in parent:  # no further ancestor known
+            return "Unknown"
+        cur = parent[cur]
+
+# -------------------------
+# Main
+# -------------------------
+def main():
+    # Entry point: build OUT_BTD / OUT_BTI. nodes.dmp is required (exit status 2 if missing)
+    try:
+        parent, rank = read_nodes(NODES_FILE)
+    except FileNotFoundError:
+        print(f"Error: {NODES_FILE} not found in current directory.", file=sys.stderr)
+        sys.exit(2)
+
+    # Read names.dmp if present; otherwise continue with empty name records
+    try:
+        names = read_names(NAMES_FILE)
+    except FileNotFoundError:
+        names = defaultdict(lambda: {"scientific": "", "common": "", "blast": ""})
+        print("Warning: names.dmp not found. scientific_name will be set to the taxid.", file=sys.stderr)
+
+    # Determine the taxids to write:
+    # use taxids present in nodes.dmp (pruned set), in ascending order
+    taxids = sorted(parent.keys())
+
+    if len(taxids) == 0:
+        print("No taxids found in nodes.dmp; nothing to do.", file=sys.stderr)
+        sys.exit(0)
+
+    # Build scientific-name lookup for superkingdom inference
+    sci_lookup = {}
+    for tid, rec in names.items():
+        sci_lookup[tid] = rec.get("scientific", "")
+
+    # Build blast-name lookup for blast name inference
+    bla_lookup = {}
+    for tid, rec in names.items():
+        bla_lookup[tid] = rec.get("blast", "")
+
+    # Build btd records and offsets (each offset is the byte position where a record starts)
+    offsets = []
+    btd_buf = bytearray()
+    for tid in taxids:
+        offsets.append(len(btd_buf))
+        rec = names.get(tid, {"scientific": "", "common": "", "blast": ""})  # .get() does not insert into the defaultdict
+        sci = rec.get("scientific", "")
+        com = rec.get("common", "")
+
+        if not sci:
+            # fallback: use numeric taxid as scientific name (ensures non-empty)
+            sci = str(tid)
+
+        # infer superkingdom name and blast name by walking up the tree from nodes.dmp
+        sk = infer_superkingdom_code(tid, parent, rank, sci_lookup)
+        bla = infer_blast_name(tid, parent, bla_lookup)
+
+        # exactly 4 fields, tab-separated; records are concatenated without a newline
+        record = f"{sci}\t{com}\t{bla}\t{sk}"
+        btd_buf.extend(record.encode("utf-8"))
+
+    end_offset = len(btd_buf)
+
+    # Write taxdb.btd
+    with open(OUT_BTD, "wb") as fh:
+        fh.write(btd_buf)
+
+    # Write taxdb.bti
+    with open(OUT_BTI, "wb") as fh:
+        # header: magic, count (number of real taxids), reserved[4]
+        # IMPORTANT: write all integers BIG-ENDIAN (>I)
+        fh.write(struct.pack(">I", TAXDB_MAGIC))
+        fh.write(struct.pack(">I", len(taxids)))     # n (real entries only)
+        fh.write(struct.pack(">IIII", 0, 0, 0, 0))   # reserved
+
+        # index entries: (taxid, offset) pairs, in the same ascending taxid order as the records
+        for tid, off in zip(taxids, offsets):
+            fh.write(struct.pack(">I", int(tid)))
+            fh.write(struct.pack(">I", int(off)))
+
+        # # sentinel entry (disabled, not written): taxid=0, offset=end_of_btd
+        # fh.write(struct.pack(">I", 0))
+        # fh.write(struct.pack(">I", end_offset))
+
+    # Summary
+    print(f"Wrote {OUT_BTD} ({end_offset} bytes)")
+    print(f"Wrote {OUT_BTI} (header + {len(taxids)} entries)")
+    print(f"Taxids written: {len(taxids)}")
+
+
+if __name__ == "__main__":
+    main()
b
diff -r 0cdcf7e99b62 -r 1faba1aa14c1 test-data/blastdb/taxonomy4blast.sqlite3
b
Binary file test-data/blastdb/taxonomy4blast.sqlite3 has changed
b
diff -r 0cdcf7e99b62 -r 1faba1aa14c1 test-data/blastdb_p.loc
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/blastdb_p.loc Fri Dec 12 11:14:34 2025 +0000
b
@@ -0,0 +1,1 @@
+test testDB ${__HERE__}/blastdb/db.fasta
\ No newline at end of file
b
diff -r 0cdcf7e99b62 -r 1faba1aa14c1 test-data/db-wtax.dmnd
b
Binary file test-data/db-wtax.dmnd has changed
b
diff -r 0cdcf7e99b62 -r 1faba1aa14c1 test-data/db.dmnd
b
Binary file test-data/db.dmnd has changed
b
diff -r 0cdcf7e99b62 -r 1faba1aa14c1 test-data/db.fasta
--- a/test-data/db.fasta Mon Nov 10 15:12:53 2025 +0000
+++ b/test-data/db.fasta Fri Dec 12 11:14:34 2025 +0000
[
@@ -1,12 +1,45 @@
->gi|5524211|gb|AAD44166.1| cytochrome b [Elephas maximus maximus]
-LCLYTHIGRNIYYGSYLYSETWNTGIMLLLITMATAFMGYVLPWGQMSFWGATVITNLFSAIPYIGTNLV
-EWIWGGFSVDKATLNRFFAFHFILPFTMVALAGVHLTFLHETGSNNPLGLTSDSDKIPFHPYYTIKDFLG
-LLILILLLLLLALLSPDMLGDPDNHMPADPLNTPLHIKPEWYFLFAYAILRSVPNKLGGVLALFLSIVIL
-GLMPFLHTSKHRSMMLRPLSQALFWTLTMDLLTLTWIGSQPVEYPYTIIGQMASILYFSIILAFLPIAGX
-IENY
->gi|5524212|gb|AAD44167.1| cytochrome c [Elephas minimus minimus]
-LCLYTHIGRNIYYGSYLYSETWNTGIMLLLITMATAGGGGGGGWGQMSFWGATVITNLFSAIPYIGTNLV
-EWIWGGFSVDKAAAAAAAAAAAAAAAAAAAAAAAAATFLHETGSNNPLGLTSDSDKIPFHPYYTIKDFLG
-LLILILLLLLLALLSPDMLGDPDNHMPADPLNTPLHIKPEWYFLFAYAILRSVPNKLGGVLALFLSIVIL
-GLMPFLHTSKHRSMMLRPLSQALAAAAAAAAAAAAAAAAAAAAAAATIIGQMASILYFSIILAFLPIAGX
-IENY
+>gi|3950761|gb|YP_514675.1|cytochrome c oxidase subunit 1 (mitochondrion) [Oryza sativa Indica Group]
+MTNLVRWLFSTNHKDIGTLYFIFGAIAGVMGTCFSVLIRMELARPGDQILGGNHQLYNVLITAHAFLMIF
+FMVMPAMIGGFGNWFVPILIGAPDMAFPRLNNISFWLLPPSLLLLLSSALVEVGSGTGWTVYPPLSGITS
+HSGGAVDLAIFSLHLSGVSSILGSINFITTIFNMRGPGMTMHRLPLFVWSVLVTAFLLLLSLPVLAGAIT
+MLLTDRNFNTTFFDPAGGGDPILYQHLFWFFGHPEVYILILPGFGIISHIVSTFSRKPVFGYLGMVYAMI
+SIGVLGFLVWAHHMFTVGLDVDTRAYFTAATMIIAVPTGIKIFSWIATMWGGSIQYKTPMLFAVGFIFLF
+TIGGLTGIVLANSGLDIALHDTYYVVAHFHYVLSMGAVFALFAGFYYWVGKIFGRTYPETLGQIHFWITF
+FGVNLTFFPMHFLGLSGMPRRIPDYPDAYAGWNALSSFGSYISVVGIRRFFVVVAITSSSGKNKRCAESP
+WAVEQNPTTLEWLVQSPPAFHTFGELPAIKETKS
+>gi|19893533|gb|YP_009047267.1|cytochrome c oxidase subunit I, partial (mitochondrion) [Drosophila melanogaster]
+SRQWLFSTNHKDIGTLYFIFGAWAGMVGTSLSILIRAELGHPGALIGDDQIYNVIVTAHAFIMIFFMVMP
+IMIGGFGNWLVPLMLGAPDMAFPRMNNMSFWLLPPALSLLLVSSMVENGAGTGWTVYPPLSAGIAHGGAS
+VDLAIFSLHLAGISSILGAVNFITTVINMRSTGISLDRMPLFVWSVVITALLLLLSLPVLAGAITMLLTD
+RNLNTSFFDPAGGGDPILYQHLFWFFGHPEVYILILPGFGMISHIISQESGKKETFGSLGMIYAMLAIGL
+LGFIVWAHHMFTVGMDVDTRAYFTSATMIIAVPTGIKIFSWLATLHGTQLSYSPAILWALGFVFLFTVGG
+LTGVVLANSSVDIILHDTYYVVAHFHYVLSMGAVFAIMAGFIHWYPLFTGLTLNNKWLKSHFIIMFIGVN
+LTFFPQHFLGLAGMPRRYSDYPDAYTTWNIVSTIGSTISLLGILFFFFIIWESLVSQRQVIYPIQLNSSI
+EWYQNTPPAEHSYSELPLLTN
+>gi|140539|gb|NP_059333.1|cytochrome c oxidase subunit I (mitochondrion) [Danio rerio]
+MTITRWFFSTNHKDIGTLYLVFGAWAGMVGTALSLLIRAELSQPGALLGDDQIYNVIVTAHAFVMIFFMV
+MPILIGGFGNWLVPLMIGAPDMAFPRMNNMSFWLLPPSFLLLLASSGVEAGAGTGWTVYPPLAGNLAHAG
+ASVDLTIFSLHLAGVSSILGAINFITTTINMKPPTISQYQTPLFVWAVLVTAVLLLLSLPVLAAGITMLL
+TDRNLNTTFFDPAGGGDPILYQHLFWFFGHPEVYILILPGFGIISHVVAYYAGKKEPFGYMGMVWAMMAI
+GLLGFIVWAHHMFTVGMDVDTRAYFTSATMIIAIPTGVKVFSWLATLHGGAIKWETPMLWALGFIFLFTV
+GGLTGIVLANSSLDIVLHDTYYVVAHFHYVLSMGAVFAIMAGFVHWFPLFTGYTLNSVWTKIHFGVMFIG
+VNLTFFPQHFLGLAGMPRRYSDYPDAYALWNTVSSIGSLISLVAVIMFLFILWEAFTAKREVLSVELTAT
+NVEWLHGCPPPYHTFEEPAFVQIQSN
+>gi|4512|gb|YP_003024028.1|cytochrome c oxidase subunit I (mitochondrion) [Homo sapiens]
+MFADRWLFSTNHKDIGTLYLLFGAWAGVLGTALSLLIRAELGQPGNLLGNDHIYNVIVTAHAFVMIFFMV
+MPIMIGGFGNWLVPLMIGAPDMAFPRMNNMSFWLLPPSLLLLLASAMVEAGAGTGWTVYPPLAGNYSHPG
+ASVDLTIFSLHLAGVSSILGAINFITTIINMKPPAMTQYQTPLFVWSVLITAVLLLLSLPVLAAGITMLL
+TDRNLNTTFFDPAGGGDPILYQHLFWFFGHPEVYILILPGFGMISHIVTYYSGKKEPFGYMGMVWAMMSI
+GFLGFIVWAHHMFTVGMDVDTRAYFTSATMIIAIPTGVKVFSWLATLHGSNMKWSAAVLWALGFIFLFTV
+GGLTGIVLANSSLDIVLHDTYYVVAHFHYVLSMGAVFAIMGGFIHWFPLFSGYTLDQTYAKIHFTIMFIG
+VNLTFFPQHFLGLSGMPRRYSDYPDAYTTWNILSSVGSFISLTAVMLMIFMIWEAFASKRKVLMVEEPSM
+NLEWLYGCPPPYHTFEEPVYMKS
+>gi|17708|gb|NP_904330.1|cytochrome c oxidase subunit I (mitochondrion) [Mus musculus]
+MFINRWLFSTNHKDIGTLYLLFGAWAGMVGTALSILIRAELGQPGALLGDDQIYNVIVTAHAFVMIFFMV
+MPMMIGGFGNWLVPLMIGAPDMAFPRMNNMSFWLLPPSFLLLLASSMVEAGAGTGWTVYPPLAGNLAHAG
+ASVDLTIFSLHLAGVSSILGAINFITTIINMKPPAMTQYQTPLFVWSVLITAVLLLLSLPVLAAGITMLL
+TDRNLNTTFFDPAGGGDPILYQHLFWFFGHPEVYILILPGFGIISHVVTYYSGKKEPFGYMGMVWAMMSI
+GFLGFIVWAHHMFTVGLDVDTRAYFTSATMIIAIPTGVKVFSWLATLHGGNIKWSPAMLWALGFIFLFTV
+GGLTGIVLSNSSLDIVLHDTYYVVAHFHYVLSMGAVFAIMAGFVHWFPLFSGFTLDDTWAKAHFAIMFVG
+VNMTFFPQHFLGLSGMPRRYSDYPDAYTTWNTVSSMGSFISLTAVLIMIFMIWEAFASKREVMSVSYAST
+NLEWLHGCPPPYHTFEEPTYVKVK
b
diff -r 0cdcf7e99b62 -r 1faba1aa14c1 test-data/db.fasta.gz
b
Binary file test-data/db.fasta.gz has changed
b
diff -r 0cdcf7e99b62 -r 1faba1aa14c1 test-data/diamond_results.pairwise
--- a/test-data/diamond_results.pairwise Mon Nov 10 15:12:53 2025 +0000
+++ b/test-data/diamond_results.pairwise Fri Dec 12 11:14:34 2025 +0000
[
b'@@ -1,34 +1,136 @@\n BLASTP 2.3.0+\n \n \n-Query= sequence more text\n+Query= NC_001646.1:5332-6871 Pongo pygmaeus mitochondrion, complete genome\n+\n+Length=1540\n+\n+>gi|4512|gb|YP_003024028.1|cytochrome c oxidase subunit I (mitochondrion) [Homo sapiens]\n+Length=513\n+\n+ Score = 897 bits (2318),  Expect = 0.0\n+ Identities = 455/512 (88%), Positives = 490/512 (95%), Gaps = 0/512 (0%)\n+ Frame = 1\n \n-Length=849\n+Query     1  MFADRWLFSTNHKDIGTLYLLFGA*AGVLGTALSLLIRAELGQPGNLLGNDHIYNVIVTA 180\n+             MFADRWLFSTNHKDIGTLYLLFGA AGVLGTALSLLIRAELGQPGNLLGNDHIYNVIVTA\n+Sbjct     1  MFADRWLFSTNHKDIGTLYLLFGAWAGVLGTALSLLIRAELGQPGNLLGNDHIYNVIVTA 60\n+\n+Query   181  HAFVIIFFMVMPIIIGGFGN*LVPLIIGAPDMAFPRINNISF*LLLPSFLLLLASATVEA 360\n+             HAFV+IFFMVMPI+IGGFGN LVPL+IGAPDMAFPR+NN+SF LL PS LLLLASA VEA\n+Sbjct    61  HAFVMIFFMVMPIMIGGFGNWLVPLMIGAPDMAFPRMNNMSFWLLPPSLLLLLASAMVEA 120\n+\n+Query   361  GAGTG*TVYPPLAGNYSHPGASVDLTIFSLHLAGISSILGAINFITTIINIKPPAISQYQ 540\n+             GAGTG TVYPPLAGNYSHPGASVDLTIFSLHLAG+SSILGAINFITTIIN+KPPA++QYQ\n+Sbjct   121  GAGTGWTVYPPLAGNYSHPGASVDLTIFSLHLAGVSSILGAINFITTIINMKPPAMTQYQ 180\n \n->gi|5524211|gb|AAD44166.1| cytochrome b [Elephas maximus maximus]\n-Length=284\n+Query   541  TPLFV*SILITAVLLLLSLPVLAAGITILLTDRNLNTTFFDPAGGGDPILYQHLF*FFGH 720\n+             TPLFV S+LITAVLLLLSLPVLAAGIT+LLTDRNLNTTFFDPAGGGDPILYQHLF FFGH\n+Sbjct   181  TPLFVWSVLITAVLLLLSLPVLAAGITMLLTDRNLNTTFFDPAGGGDPILYQHLFWFFGH 240\n+\n+Query   721  PEVYILILPGFGIISHIVTHYSGKKEPFGYIGIV*AIVSIGFLGFIV*AHHIFTVGIDVD 900\n+             PEVYILILPGFG+ISHIVT+YSGKKEPFGY+G+V A++SIGFLGFIV AHH+FTVG+DVD\n+Sbjct   241  PEVYILILPGFGMISHIVTYYSGKKEPFGYMGMVWAMMSIGFLGFIVWAHHMFTVGMDVD 300\n+\n+Query   901  TRAYFTSATIIIAIPTGVKVFS*LATLHGSNTK*SAAIL*ALGFIFLFTVGGLTGIVLAN 1080\n+             TRAYFTSAT+IIAIPTGVKVFS LATLHGSN K SAA+L ALGFIFLFTVGGLTGIVLAN\n+Sbjct   301  TRAYFTSATMIIAIPTGVKVFSWLATLHGSNMKWSAAVLWALGFIFLFTVGGLTGIVLAN 360\n \n- Score = 550 bits (1417),  Expect = 1.44e-205\n- Identities 
= 283/284 (99%), Positives = 283/284 (99%), Gaps = 1/284 (0%)\n+Query  1081  SSLDIVLHDTYYVVAHFHYVLSIGAVFAIIGGFIHWFPLFSGYTLNQTYAKIHFITIFVG 1260\n+             SSLDIVLHDTYYVVAHFHYVLS+GAVFAI+GGFIHWFPLFSGYTL+QTYAKIHF  +F+G\n+Sbjct   361  SSLDIVLHDTYYVVAHFHYVLSMGAVFAIMGGFIHWFPLFSGYTLDQTYAKIHFTIMFIG 420\n+\n+Query  1261  VNLTFFPQHFLGLSGIPRRYSDYPDAYTT*NILSSAGSFISLTAVILIIFII*EAFASKR 1440\n+             VNLTFFPQHFLGLSG+PRRYSDYPDAYTT NILSS GSFISLTAV+L+IF+I EAFASKR\n+Sbjct   421  VNLTFFPQHFLGLSGMPRRYSDYPDAYTTWNILSSVGSFISLTAVMLMIFMIWEAFASKR 480\n+\n+Query  1441  KVPIIEQPSTSLEWLYGCPPPYHTFEEPVYIK 1536\n+             KV ++E+PS +LEWLYGCPPPYHTFEEPVY+K\n+Sbjct   481  KVLMVEEPSMNLEWLYGCPPPYHTFEEPVYMK 512\n+\n+>gi|17708|gb|NP_904330.1|cytochrome c oxidase subunit I (mitochondrion) [Mus musculus]\n+Length=514\n+\n+ Score = 847 bits (2189),  Expect = 8.27e-315\n+ Identities = 427/512 (83%), Positives = 476/512 (92%), Gaps = 0/512 (0%)\n  Frame = 1\n \n-Query    1  LCLYTHIGRNIYYGSYLYSETWNTGIMLLLITMATAFMGYVLPWGQMSFWGATVITNLFS 180\n-            LCLYTHIGRNIYYGSYLYSETWNTGIMLLLITMATAFMGYVLPWGQMSFWGATVITNLFS\n-Sbjct    1  LCLYTHIGRNIYYGSYLYSETWNTGIMLLLITMATAFMGYVLPWGQMSFWGATVITNLFS 60\n+Query     1  MFADRWLFSTNHKDIGTLYLLFGA*AGVLGTALSLLIRAELGQPGNLLGNDHIYNVIVTA 180\n+             MF +RWLFSTNHKDIGTLYLLFGA AG++GTALS+LIRAELGQPG LLG+D IYNVIVTA\n+Sbjct     1  MFINRWLFSTNHKDIGTLYLLFGAWAGMVGTALSILIRAELGQPGALLGDDQIYNVIVTA 60\n+\n+Query   181  HAFVIIFFMVMPIIIGGFGN*LVPLIIGAPDMAFPRINNISF*LLLPSFLLLLASATVEA 360\n+             HAFV+IFFMVMP++IGGFGN LVPL+IGAPDMAFPR+NN+SF LL PSFLLLLAS+ VEA\n+Sbjct    61  HAFVMIFFMVMPMMIGGFGNWLVPLMIGAPDMAFPRMNNMSFWLLPPSFLLLLASSMVEA 120\n+\n+Query   361  GAGTG*TVYPPLAGNYSHPGASVDLTIFSLHLAGISSILGAINFITTIINIKPPAISQYQ 540\n+             GAGTG TVYPPLAGN +H GASVDLTIFSLHLAG+SSILGAINFITTIIN+KPPA++QYQ\n+Sbjct   121  GAGTGWTVYPPLAGNLAHAGASVDLTIFSLHLAGVSSILGAINFITTIINMKPPAMTQYQ 180\n+\n+Query   541  TPLFV*SILITAVLLLLSLPVLAAGITILLTDRNLNTTFFDPAGGGDPILYQHLF*FFGH 720\n+             TPLFV 
S+LITAVLLLLSLPVLAAGIT+LLTDRNLNTTFFDPAGGGDPILYQHLF FFGH\n+Sbjct   181  TPLFVWSVLITAVLLLLSLPV'..b'21  PEVYILILPGFGIISHIVTHYSGKKEPFGYIGIV*AIVSIGFLGFIV*AHHIFTVGIDVD 900\n+             PEVYILILPGFGIISH+VT+YSGKKEPFGY+G+V A++SIGFLGFIV AHH+FTVG+DVD\n+Sbjct   241  PEVYILILPGFGIISHVVTYYSGKKEPFGYMGMVWAMMSIGFLGFIVWAHHMFTVGLDVD 300\n+\n+Query   901  TRAYFTSATIIIAIPTGVKVFS*LATLHGSNTK*SAAIL*ALGFIFLFTVGGLTGIVLAN 1080\n+             TRAYFTSAT+IIAIPTGVKVFS LATLHG N K S A+L ALGFIFLFTVGGLTGIVL+N\n+Sbjct   301  TRAYFTSATMIIAIPTGVKVFSWLATLHGGNIKWSPAMLWALGFIFLFTVGGLTGIVLSN 360\n+\n+Query  1081  SSLDIVLHDTYYVVAHFHYVLSIGAVFAIIGGFIHWFPLFSGYTLNQTYAKIHFITIFVG 1260\n+             SSLDIVLHDTYYVVAHFHYVLS+GAVFAI+ GF+HWFPLFSG+TL+ T+AK HF  +FVG\n+Sbjct   361  SSLDIVLHDTYYVVAHFHYVLSMGAVFAIMAGFVHWFPLFSGFTLDDTWAKAHFAIMFVG 420\n+\n+Query  1261  VNLTFFPQHFLGLSGIPRRYSDYPDAYTT*NILSSAGSFISLTAVILIIFII*EAFASKR 1440\n+             VN+TFFPQHFLGLSG+PRRYSDYPDAYTT N +SS GSFISLTAV+++IF+I EAFASKR\n+Sbjct   421  VNMTFFPQHFLGLSGMPRRYSDYPDAYTTWNTVSSMGSFISLTAVLIMIFMIWEAFASKR 480\n+\n+Query  1441  KVPIIEQPSTSLEWLYGCPPPYHTFEEPVYIK 1536\n+             +V  +   ST+LEWL+GCPPPYHTFEEP Y+K\n+Sbjct   481  EVMSVSYASTNLEWLHGCPPPYHTFEEPTYVK 512\n+\n+>gi|140539|gb|NP_059333.1|cytochrome c oxidase subunit I (mitochondrion) [Danio rerio]\n+Length=516\n \n-Query  358  TSDSDKIPFHPYYTIKDFLGLLILXXXXXXXALLSPDMLGDPDNHMPADPLNTPLHIKPE 537\n-            TSDSDKIPFHPYYTIKDFLGLLILXXXXXXXALLSPDMLGDPDNHMPADPLNTPLHIKPE\n-Sbjct  121  TSDSDKIPFHPYYTIKDFLGLLILXXXXXXXALLSPDMLGDPDNHMPADPLNTPLHIKPE 180\n+ Score = 810 bits (2091),  Expect = 7.42e-300\n+ Identities = 407/512 (79%), Positives = 459/512 (89%), Gaps = 0/512 (0%)\n+ Frame = 1\n+\n+Query     1  MFADRWLFSTNHKDIGTLYLLFGA*AGVLGTALSLLIRAELGQPGNLLGNDHIYNVIVTA 180\n+             M   RW FSTNHKDIGTLYL+FGA AG++GTALSLLIRAEL QPG LLG+D IYNVIVTA\n+Sbjct     1  MTITRWFFSTNHKDIGTLYLVFGAWAGMVGTALSLLIRAELSQPGALLGDDQIYNVIVTA 60\n+\n+Query   181  HAFVIIFFMVMPIIIGGFGN*LVPLIIGAPDMAFPRINNISF*LLLPSFLLLLASATVEA 360\n+  
           HAFV+IFFMVMPI+IGGFGN LVPL+IGAPDMAFPR+NN+SF LL PSFLLLLAS+ VEA\n+Sbjct    61  HAFVMIFFMVMPILIGGFGNWLVPLMIGAPDMAFPRMNNMSFWLLPPSFLLLLASSGVEA 120\n+\n+Query   361  GAGTG*TVYPPLAGNYSHPGASVDLTIFSLHLAGISSILGAINFITTIINIKPPAISQYQ 540\n+             GAGTG TVYPPLAGN +H GASVDLTIFSLHLAG+SSILGAINFITT IN+KPP ISQYQ\n+Sbjct   121  GAGTGWTVYPPLAGNLAHAGASVDLTIFSLHLAGVSSILGAINFITTTINMKPPTISQYQ 180\n+\n+Query   541  TPLFV*SILITAVLLLLSLPVLAAGITILLTDRNLNTTFFDPAGGGDPILYQHLF*FFGH 720\n+             TPLFV ++L+TAVLLLLSLPVLAAGIT+LLTDRNLNTTFFDPAGGGDPILYQHLF FFGH\n+Sbjct   181  TPLFVWAVLVTAVLLLLSLPVLAAGITMLLTDRNLNTTFFDPAGGGDPILYQHLFWFFGH 240\n \n-Query  538  WYFLFAYAILRSVPNKLGGVLALFLSIVILGLMPFLHTSKHRSMMLRPLSQALFWTLTMD 717\n-            WYFLFAYAILRSVPNKLGGVLALFLSIVILGLMPFLHTSKHRSMMLRPLSQALFWTLTMD\n-Sbjct  181  WYFLFAYAILRSVPNKLGGVLALFLSIVILGLMPFLHTSKHRSMMLRPLSQALFWTLTMD 240\n+Query   721  PEVYILILPGFGIISHIVTHYSGKKEPFGYIGIV*AIVSIGFLGFIV*AHHIFTVGIDVD 900\n+             PEVYILILPGFGIISH+V +Y+GKKEPFGY+G+V A+++IG LGFIV AHH+FTVG+DVD\n+Sbjct   241  PEVYILILPGFGIISHVVAYYAGKKEPFGYMGMVWAMMAIGLLGFIVWAHHMFTVGMDVD 300\n+\n+Query   901  TRAYFTSATIIIAIPTGVKVFS*LATLHGSNTK*SAAIL*ALGFIFLFTVGGLTGIVLAN 1080\n+             TRAYFTSAT+IIAIPTGVKVFS LATLHG   K    +L ALGFIFLFTVGGLTGIVLAN\n+Sbjct   301  TRAYFTSATMIIAIPTGVKVFSWLATLHGGAIKWETPMLWALGFIFLFTVGGLTGIVLAN 360\n \n-Query  718  LLTLTWIGSQPVEYPYTIIGQMASILYFSIILAFLPIAGXIENY 849\n-            LLTLTWIGSQPVEYPYTIIGQMASILYFSIILAFLPIAGXIENY\n-Sbjct  241  LLTLTWIGSQPVEYPYTIIGQMASILYFSIILAFLPIAGXIENY 284\n+Query  1081  SSLDIVLHDTYYVVAHFHYVLSIGAVFAIIGGFIHWFPLFSGYTLNQTYAKIHFITIFVG 1260\n+             SSLDIVLHDTYYVVAHFHYVLS+GAVFAI+ GF+HWFPLF+GYTLN  + KIHF  +F+G\n+Sbjct   361  SSLDIVLHDTYYVVAHFHYVLSMGAVFAIMAGFVHWFPLFTGYTLNSVWTKIHFGVMFIG 420\n \n+Query  1261  VNLTFFPQHFLGLSGIPRRYSDYPDAYTT*NILSSAGSFISLTAVILIIFII*EAFASKR 1440\n+             VNLTFFPQHFLGL+G+PRRYSDYPDAY   N +SS GS ISL AVI+ +FI+ EAF +KR\n+Sbjct   421  
VNLTFFPQHFLGLAGMPRRYSDYPDAYALWNTVSSIGSLISLVAVIMFLFILWEAFTAKR 480\n+\n+Query  1441  KVPIIEQPSTSLEWLYGCPPPYHTFEEPVYIK 1536\n+             +V  +E  +T++EWL+GCPPPYHTFEEP +++\n+Sbjct   481  EVLSVELTATNVEWLHGCPPPYHTFEEPAFVQ 512\n+\n'
b
diff -r 0cdcf7e99b62 -r 1faba1aa14c1 test-data/diamond_results.tabular
--- a/test-data/diamond_results.tabular Mon Nov 10 15:12:53 2025 +0000
+++ b/test-data/diamond_results.tabular Fri Dec 12 11:14:34 2025 +0000
b
@@ -1,2 +1,5 @@
-sequence gi|5524211|gb|AAD44166.1| 99.6 284 0 1 1 283 1 284 1.44e-205 550 100 0 0 0 94M1D189M
-sequence gi|5524212|gb|AAD44167.1| 79.6 284 57 1 1 283 1 284 5.77e-150 409 100 0 0 0 105M1D178M
+NP_008227.1 gi|4512|gb|YP_003024028.1|cytochrome 95.9 512 21 0 1 512 1 512 0.0 999 99.8 0 Metazoa Chordata 512M
+NP_008227.1 gi|17708|gb|NP_904330.1|cytochrome 89.6 512 53 0 1 512 1 512 0.0 942 99.6 0 Metazoa Chordata 512M
+NP_008227.1 gi|140539|gb|NP_059333.1|cytochrome 84.2 512 81 0 1 512 1 512 0.0 894 99.2 0 Metazoa Chordata 512M
+NP_008227.1 gi|19893533|gb|YP_009047267.1|cytochrome 76.2 505 120 0 3 507 1 505 1.13e-295 799 98.8 0 Metazoa Arthropoda 505M
+NP_008227.1 gi|3950761|gb|YP_514675.1|cytochrome 68.7 511 151 4 5 507 6 515 2.93e-259 707 97.3 0 Viridiplantae Streptophyta 44M2D214M1I202M2D18M4D24M
b
diff -r 0cdcf7e99b62 -r 1faba1aa14c1 test-data/diamond_results.wtax.tabular
--- a/test-data/diamond_results.wtax.tabular Mon Nov 10 15:12:53 2025 +0000
+++ b/test-data/diamond_results.wtax.tabular Fri Dec 12 11:14:34 2025 +0000
b
@@ -1,1 +1,2 @@
-sequence gi|5524211|gb|AAD44166.1| 99.6 284 0 1 1 283 1 284 1.44e-205 550
+qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore
+NP_008227.1 gi|3950761|gb|YP_514675.1|cytochrome 68.7 511 151 4 5 507 6 515 2.93e-259 707
b
diff -r 0cdcf7e99b62 -r 1faba1aa14c1 test-data/diamond_results_algorithm.tabular
--- a/test-data/diamond_results_algorithm.tabular Mon Nov 10 15:12:53 2025 +0000
+++ b/test-data/diamond_results_algorithm.tabular Fri Dec 12 11:14:34 2025 +0000
b
@@ -1,2 +1,5 @@
-sequence gi|5524211|gb|AAD44166.1| 99.6 284 0 1 1 849 1 284 1.44e-205 550
-sequence gi|5524212|gb|AAD44167.1| 79.6 284 57 1 1 849 1 284 5.77e-150 409
+NC_001646.1:5332-6871 gi|4512|gb|YP_003024028.1|cytochrome 88.9 512 57 0 1 1536 1 512 0.0 897
+NC_001646.1:5332-6871 gi|17708|gb|NP_904330.1|cytochrome 83.4 512 85 0 1 1536 1 512 8.27e-315 847
+NC_001646.1:5332-6871 gi|140539|gb|NP_059333.1|cytochrome 79.5 512 105 0 1 1536 1 512 7.42e-300 810
+NC_001646.1:5332-6871 gi|19893533|gb|YP_009047267.1|cytochrome 71.3 505 145 0 7 1521 1 505 2.88e-263 717
+NC_001646.1:5332-6871 gi|3950761|gb|YP_514675.1|cytochrome 65.5 516 169 4 13 1536 6 520 4.52e-237 651
b
diff -r 0cdcf7e99b62 -r 1faba1aa14c1 test-data/diamond_results_freq_masking.tabular
--- a/test-data/diamond_results_freq_masking.tabular Mon Nov 10 15:12:53 2025 +0000
+++ b/test-data/diamond_results_freq_masking.tabular Fri Dec 12 11:14:34 2025 +0000
b
@@ -1,2 +1,5 @@
-sequence gi|5524211|gb|AAD44166.1| 99.6 284 0 1 1 849 1 284 1.44e-205 550
-sequence gi|5524212|gb|AAD44167.1| 79.6 284 57 1 1 849 1 284 5.77e-150 409
+NC_001646.1:5332-6871 gi|4512|gb|YP_003024028.1|cytochrome 88.9 512 57 0 1 1536 1 512 0.0 897
+NC_001646.1:5332-6871 gi|17708|gb|NP_904330.1|cytochrome 83.4 512 85 0 1 1536 1 512 8.27e-315 847
+NC_001646.1:5332-6871 gi|140539|gb|NP_059333.1|cytochrome 79.5 512 105 0 1 1536 1 512 7.42e-300 810
+NC_001646.1:5332-6871 gi|19893533|gb|YP_009047267.1|cytochrome 71.3 505 145 0 7 1521 1 505 2.88e-263 717
+NC_001646.1:5332-6871 gi|3950761|gb|YP_514675.1|cytochrome 65.5 516 169 4 13 1536 6 520 4.52e-237 651
b
diff -r 0cdcf7e99b62 -r 1faba1aa14c1 test-data/diamond_results_global_ranking.tabular
--- a/test-data/diamond_results_global_ranking.tabular Mon Nov 10 15:12:53 2025 +0000
+++ b/test-data/diamond_results_global_ranking.tabular Fri Dec 12 11:14:34 2025 +0000
b
@@ -1,2 +1,5 @@
-sequence gi|5524211|gb|AAD44166.1| 99.6 284 0 1 1 849 1 284 1.44e-205 550
-sequence gi|5524212|gb|AAD44167.1| 79.6 284 57 1 1 849 1 284 5.77e-150 409
+NC_001646.1:5332-6871 gi|4512|gb|YP_003024028.1|cytochrome 88.9 512 57 0 1 1536 1 512 0.0 897
+NC_001646.1:5332-6871 gi|17708|gb|NP_904330.1|cytochrome 83.4 512 85 0 1 1536 1 512 8.27e-315 847
+NC_001646.1:5332-6871 gi|140539|gb|NP_059333.1|cytochrome 79.5 512 105 0 1 1536 1 512 7.42e-300 810
+NC_001646.1:5332-6871 gi|19893533|gb|YP_009047267.1|cytochrome 71.3 505 145 0 7 1521 1 505 2.88e-263 717
+NC_001646.1:5332-6871 gi|3950761|gb|YP_514675.1|cytochrome 65.5 516 169 4 13 1536 6 520 4.52e-237 651
b
diff -r 0cdcf7e99b62 -r 1faba1aa14c1 test-data/diamond_results_iterate.tabular
--- a/test-data/diamond_results_iterate.tabular Mon Nov 10 15:12:53 2025 +0000
+++ b/test-data/diamond_results_iterate.tabular Fri Dec 12 11:14:34 2025 +0000
b
@@ -1,2 +1,7 @@
-sequence gi|5524211|gb|AAD44166.1| 99.6 284 0 1 1 849 1 284 1.44e-205 550
-sequence gi|5524212|gb|AAD44167.1| 79.6 284 57 1 1 849 1 284 5.77e-150 409
+# DIAMOND v2.1.16. http://github.com/bbuchfink/diamond
+# Invocation: diamond blastx --threads 1 --db database.dmnd --query /tmp/tmpn1890frb/files/5/2/9/dataset_529e1e94-1186-4385-a242-298cfe957f6a.dat --query-gencode 1 --strand both --min-orf 1 --outfmt 6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore --header verbose --out /tmp/tmpn1890frb/job_working_directory/000/16/outputs/dataset_d67e7f18-9e9b-4100-8fff-c094b2a162ab.dat --compress 0 --iterate --algo 0 --matrix BLOSUM62 --comp-based-stats 1 --masking 1 --max-target-seqs 25 --evalue 0.001 --id 0 --query-cover 0 --subject-cover 0 --block-size 2.0 --motif-masking 0 --soft-masking 0 --index-chunks 4 --file-buffer-size 67108864
+# Fields: Query ID, Subject ID, Percentage of identical matches, Alignment length, Number of mismatches, Number of gap openings, Start of alignment in query, End of alignment in query, Start of alignment in subject, End of alignment in subject, Expected value, Bit score
+NC_001646.1:5332-6871 gi|4512|gb|YP_003024028.1|cytochrome 88.9 512 57 0 1 1536 1 512 0.0 897
+NC_001646.1:5332-6871 gi|17708|gb|NP_904330.1|cytochrome 83.4 512 85 0 1 1536 1 512 8.27e-315 847
+NC_001646.1:5332-6871 gi|140539|gb|NP_059333.1|cytochrome 79.5 512 105 0 1 1536 1 512 7.42e-300 810
+NC_001646.1:5332-6871 gi|3950761|gb|YP_514675.1|cytochrome 65.5 516 169 4 13 1536 6 520 4.52e-237 651
b
diff -r 0cdcf7e99b62 -r 1faba1aa14c1 test-data/diamond_results_log_test.tabular
--- a/test-data/diamond_results_log_test.tabular Mon Nov 10 15:12:53 2025 +0000
+++ b/test-data/diamond_results_log_test.tabular Fri Dec 12 11:14:34 2025 +0000
b
@@ -1,2 +1,5 @@
-sequence gi|5524211|gb|AAD44166.1| 99.6 284 0 1 1 849 1 284 1.44e-205 550
-sequence gi|5524212|gb|AAD44167.1| 79.6 284 57 1 1 849 1 284 5.77e-150 409
+NC_001646.1:5332-6871 gi|4512|gb|YP_003024028.1|cytochrome 88.9 512 57 0 1 1536 1 512 0.0 897
+NC_001646.1:5332-6871 gi|17708|gb|NP_904330.1|cytochrome 83.4 512 85 0 1 1536 1 512 8.27e-315 847
+NC_001646.1:5332-6871 gi|140539|gb|NP_059333.1|cytochrome 79.5 512 105 0 1 1536 1 512 7.42e-300 810
+NC_001646.1:5332-6871 gi|19893533|gb|YP_009047267.1|cytochrome 71.3 505 145 0 7 1521 1 505 2.88e-263 717
+NC_001646.1:5332-6871 gi|3950761|gb|YP_514675.1|cytochrome 65.5 516 169 4 13 1536 6 520 4.52e-237 651
b
diff -r 0cdcf7e99b62 -r 1faba1aa14c1 test-data/diamond_results_max_hsps.tabular
--- a/test-data/diamond_results_max_hsps.tabular Mon Nov 10 15:12:53 2025 +0000
+++ b/test-data/diamond_results_max_hsps.tabular Fri Dec 12 11:14:34 2025 +0000
b
@@ -1,2 +1,5 @@
-sequence gi|5524211|gb|AAD44166.1| 99.6 284 0 1 1 849 1 284 1.44e-205 550
-sequence gi|5524212|gb|AAD44167.1| 79.6 284 57 1 1 849 1 284 5.77e-150 409
+NC_001646.1:5332-6871 gi|4512|gb|YP_003024028.1|cytochrome 88.9 512 57 0 1 1536 1 512 0.0 897
+NC_001646.1:5332-6871 gi|17708|gb|NP_904330.1|cytochrome 83.4 512 85 0 1 1536 1 512 8.27e-315 847
+NC_001646.1:5332-6871 gi|140539|gb|NP_059333.1|cytochrome 79.5 512 105 0 1 1536 1 512 7.42e-300 810
+NC_001646.1:5332-6871 gi|19893533|gb|YP_009047267.1|cytochrome 71.3 505 145 0 7 1521 1 505 2.88e-263 717
+NC_001646.1:5332-6871 gi|3950761|gb|YP_514675.1|cytochrome 65.5 516 169 4 13 1536 6 520 4.52e-237 651
b
diff -r 0cdcf7e99b62 -r 1faba1aa14c1 test-data/diamond_results_motif_masking.tabular
--- a/test-data/diamond_results_motif_masking.tabular Mon Nov 10 15:12:53 2025 +0000
+++ b/test-data/diamond_results_motif_masking.tabular Fri Dec 12 11:14:34 2025 +0000
b
@@ -1,2 +1,5 @@
-sequence gi|5524211|gb|AAD44166.1| 99.6 284 0 1 1 849 1 284 1.44e-205 550
-sequence gi|5524212|gb|AAD44167.1| 79.6 284 57 1 1 849 1 284 5.77e-150 409
+NC_001646.1:5332-6871 gi|4512|gb|YP_003024028.1|cytochrome 88.9 512 57 0 1 1536 1 512 0.0 897
+NC_001646.1:5332-6871 gi|17708|gb|NP_904330.1|cytochrome 83.4 512 85 0 1 1536 1 512 8.27e-315 847
+NC_001646.1:5332-6871 gi|140539|gb|NP_059333.1|cytochrome 79.5 512 105 0 1 1536 1 512 7.42e-300 810
+NC_001646.1:5332-6871 gi|19893533|gb|YP_009047267.1|cytochrome 71.3 505 145 0 7 1521 1 505 2.88e-263 717
+NC_001646.1:5332-6871 gi|3950761|gb|YP_514675.1|cytochrome 65.5 516 169 4 13 1536 6 520 4.52e-237 651
b
diff -r 0cdcf7e99b62 -r 1faba1aa14c1 test-data/diamond_results_soft_masking.tabular
--- a/test-data/diamond_results_soft_masking.tabular Mon Nov 10 15:12:53 2025 +0000
+++ b/test-data/diamond_results_soft_masking.tabular Fri Dec 12 11:14:34 2025 +0000
b
@@ -1,2 +1,5 @@
-sequence gi|5524211|gb|AAD44166.1| 99.6 284 0 1 1 849 1 284 1.44e-205 550
-sequence gi|5524212|gb|AAD44167.1| 79.6 284 57 1 1 849 1 284 5.77e-150 409
+NC_001646.1:5332-6871 gi|4512|gb|YP_003024028.1|cytochrome 88.9 512 57 0 1 1536 1 512 0.0 897
+NC_001646.1:5332-6871 gi|17708|gb|NP_904330.1|cytochrome 83.4 512 85 0 1 1536 1 512 8.27e-315 847
+NC_001646.1:5332-6871 gi|140539|gb|NP_059333.1|cytochrome 79.5 512 105 0 1 1536 1 512 7.42e-300 810
+NC_001646.1:5332-6871 gi|19893533|gb|YP_009047267.1|cytochrome 71.3 505 145 0 7 1521 1 505 2.88e-263 717
+NC_001646.1:5332-6871 gi|3950761|gb|YP_514675.1|cytochrome 65.5 516 169 4 13 1536 6 520 4.52e-237 651
b
diff -r 0cdcf7e99b62 -r 1faba1aa14c1 test-data/names.dmp
--- a/test-data/names.dmp Mon Nov 10 15:12:53 2025 +0000
+++ b/test-data/names.dmp Fri Dec 12 11:14:34 2025 +0000
b
b'@@ -1,11 +1,270 @@\n 1\t|\tall\t|\t\t|\tsynonym\t|\n 1\t|\troot\t|\t\t|\tscientific name\t|\n-2\t|\tBacteria\t|\tBacteria <bacteria>\t|\tscientific name\t|\n-2\t|\tbacteria\t|\t\t|\tblast name\t|\n-2\t|\teubacteria\t|\t\t|\tgenbank common name\t|\n-2\t|\tMonera\t|\tMonera <bacteria>\t|\tin-part\t|\n-3\t|\tProcaryotae\t|\tProcaryotae <bacteria>\t|\tin-part\t|\n-3\t|\tProkaryotae\t|\tProkaryotae <bacteria>\t|\tin-part\t|\n-3\t|\tProkaryota\t|\tProkaryota <bacteria>\t|\tin-part\t|\n-3\t|\tprokaryote\t|\tprokaryote <bacteria>\t|\tin-part\t|\n-3\t|\tprokaryotes\t|\tprokaryotes <bacteria>\t|\tin-part\t|\n+2\t|\tEucarya\t|\t\t|\tsynonym\t|\n+2\t|\tEucaryotae\t|\t\t|\tsynonym\t|\n+2\t|\tEukarya\t|\t\t|\tsynonym\t|\n+2\t|\tEukaryotae\t|\t\t|\tsynonym\t|\n+2\t|\tEukaryota\t|\t\t|\tscientific name\t|\n+2\t|\teukaryotes\t|\teukaryotes <blast name>\t|\tblast name\t|\n+2\t|\teukaryotes\t|\teukaryotes <genbank common name>\t|\tgenbank common name\t|\n+3\t|\tEmbryophyta\t|\t\t|\tscientific name\t|\n+3\t|\thigher plants\t|\t\t|\tcommon name\t|\n+3\t|\tland plants\t|\tland plants <blast name>\t|\tblast name\t|\n+3\t|\tland plants\t|\tland plants <genbank common name>\t|\tgenbank common name\t|\n+3\t|\tplants\t|\t\t|\tcommon name\t|\n+4\t|\tAngiospermae\t|\t\t|\tsynonym\t|\n+4\t|\tangiosperms\t|\t\t|\tcommon name\t|\n+4\t|\tflowering plants\t|\tflowering plants <blast name>\t|\tblast name\t|\n+4\t|\tflowering plants\t|\tflowering plants <genbank common name>\t|\tgenbank common name\t|\n+4\t|\tMagnoliophyta\t|\t\t|\tsynonym\t|\n+4\t|\tMagnoliopsida\t|\t\t|\tscientific name\t|\n+5\t|\tLiliopsida\t|\t\t|\tscientific name\t|\n+5\t|\tmonocots\t|\tmonocots <blast name>\t|\tblast name\t|\n+5\t|\tmonocots\t|\tmonocots <genbank common name>\t|\tgenbank common name\t|\n+5\t|\tMonocotyledoneae\t|\t\t|\tsynonym\t|\n+5\t|\tmonocotyledons\t|\t\t|\tcommon name\t|\n+6\t|\tBambusaceae Nakai, 
1943\t|\t\t|\tauthority\t|\n+6\t|\tBambusaceae\t|\t\t|\tsynonym\t|\n+6\t|\tGramineae\t|\t\t|\tsynonym\t|\n+6\t|\tgrass family\t|\t\t|\tcommon name\t|\n+6\t|\tPoaceae Barnhart, 1895\t|\t\t|\tauthority\t|\n+6\t|\tPoaceae\t|\t\t|\tscientific name\t|\n+7\t|\tOryza L., 1753\t|\t\t|\tauthority\t|\n+7\t|\tOryza\t|\t\t|\tscientific name\t|\n+7\t|\tPorteresia\t|\t\t|\tincludes\t|\n+8\t|\tAsian cultivated rice\t|\t\t|\tgenbank common name\t|\n+8\t|\tOryza sativa L., 1753\t|\t\t|\tauthority\t|\n+8\t|\tOryza sativa\t|\t\t|\tscientific name\t|\n+8\t|\tred rice\t|\tred rice <Oryza sativa>\t|\tcommon name\t|\n+8\t|\trice\t|\t\t|\tcommon name\t|\n+9\t|\tCommelinidae\t|\t\t|\tsynonym\t|\n+9\t|\tcommelinids\t|\t\t|\tscientific name\t|\n+9\t|\tCommeliniflorae\t|\t\t|\tsynonym\t|\n+10\t|\tEumetazoa\t|\t\t|\tscientific name\t|\n+11\t|\tArthropoda\t|\t\t|\tscientific name\t|\n+11\t|\tarthropods\t|\tarthropods <blast name>\t|\tblast name\t|\n+11\t|\tarthropods\t|\tarthropods <genbank common name>\t|\tgenbank common name\t|\n+12\t|\tAtelocerata\t|\tAtelocerata <hexapods>\t|\tin-part\t|\n+12\t|\tHexapoda\t|\t\t|\tscientific name\t|\n+12\t|\thexapods\t|\thexapods <blast name>\t|\tblast name\t|\n+12\t|\thexapods\t|\thexapods <genbank common name>\t|\tgenbank common name\t|\n+12\t|\tTracheata\t|\tTracheata <hexapods>\t|\tin-part\t|\n+12\t|\tUniramia\t|\tUniramia <hexapods>\t|\tin-part\t|\n+13\t|\tDiptera\t|\t\t|\tscientific name\t|\n+13\t|\tflies\t|\tflies <blast name>\t|\tblast name\t|\n+13\t|\tflies\t|\tflies <genbank common name>\t|\tgenbank common name\t|\n+14\t|\tBrachycera\t|\t\t|\tscientific name\t|\n+15\t|\tDrosophilidae\t|\t\t|\tscientific name\t|\n+15\t|\tpomace flies\t|\t\t|\tgenbank common name\t|\n+16\t|\tDrosophila\t|\tDrosophila <flies,genus>\t|\tscientific name\t|\n+16\t|\tDrosophila Fallen, 1823\t|\t\t|\tauthority\t|\n+16\t|\tfruit flies\t|\tfruit flies <Drosophila>\t|\tgenbank common name\t|\n+16\t|\tfruit fly\t|\tfruit fly <Drosophila>\t|\tcommon 
name\t|\n+17\t|\tPterygota\t|\tPterygota <insects>\t|\tscientific name\t|\n+17\t|\twinged insects\t|\t\t|\tgenbank common name\t|\n+18\t|\tChordata\t|\t\t|\tscientific name\t|\n+18\t|\tchordates\t|\tchordates <blast name>\t|\tblast name\t|\n+18\t|\tchordates\t|\tchordates <genbank common name>\t|\tgenbank common name\t|\n+19\t|\tVertebrata Cuvier, 1812\t|\t\t|\tauthority\t|\n+19\t|\tVertebrata\t|\tVertebrata <vertebrates>\t|\tscientific name\t|\n+19\t|\tvertebrates\t|\tvertebrates <blast name>\t|\tblast name\t|\n+19\t|\tvertebrates\t|\tvertebrates <genbank common name>\t|\tgenbank common name\t|\n+20\t|\tGnathostomata\t|\tGnathostomata <vertebrates>\t|\tscientific name\t|\n+20\t|\tjawed vertebrates\t|\t\t|\tgenbank common name\t|\n+21\t|\tActinopterygii\t|\t\t|\tscientific name\t|\n+21\t|\tActi'..b'\t\t|\tincludes\t|\n+52\t|\tMurinae\t|\t\t|\tscientific name\t|\n+52\t|\tOtomyinae\t|\t\t|\tincludes\t|\n+53\t|\tMammalia\t|\t\t|\tscientific name\t|\n+53\t|\tmammals\t|\tmammals <blast name>\t|\tblast name\t|\n+53\t|\tmammals\t|\tmammals <genbank common name>\t|\tgenbank common name\t|\n+54\t|\tNeopterygii\t|\t\t|\tscientific name\t|\n+54\t|\tNeopterygi\t|\t\t|\tsynonym\t|\n+55\t|\tAsilomorpha\t|\t\t|\tsynonym\t|\n+55\t|\tMuscomorpha\t|\t\t|\tscientific name\t|\n+56\t|\tSchizophora\t|\t\t|\tscientific name\t|\n+57\t|\tAcalyptratae\t|\t\t|\tscientific name\t|\n+58\t|\tEphydroidea\t|\t\t|\tscientific name\t|\n+59\t|\tDrosophilinae\t|\t\t|\tscientific name\t|\n+60\t|\tDrosophilini\t|\t\t|\tscientific name\t|\n+61\t|\tInsecta\t|\t\t|\tscientific name\t|\n+61\t|\tinsects\t|\tinsects <blast name>\t|\tblast name\t|\n+61\t|\tinsects\t|\tinsects <genbank common name>\t|\tgenbank common name\t|\n+61\t|\ttrue insects\t|\t\t|\tcommon name\t|\n+62\t|\tTracheophyta\t|\t\t|\tscientific name\t|\n+62\t|\tTracheophyta Sinnott ex Cavalier-Smith, 1998\t|\t\t|\tauthority\t|\n+62\t|\tvascular plants\t|\tvascular plants <blast name>\t|\tblast name\t|\n+62\t|\tvascular plants\t|\tvascular 
plants <common name>\t|\tcommon name\t|\n+63\t|\tseed plants\t|\tseed plants <blast name>\t|\tblast name\t|\n+63\t|\tseed plants\t|\tseed plants <common name>\t|\tcommon name\t|\n+63\t|\tSpermatophyta\t|\t\t|\tscientific name\t|\n+64\t|\tEuphyllophyta\t|\t\t|\tscientific name\t|\n+64\t|\teuphyllophytes\t|\t\t|\tequivalent name\t|\n+65\t|\tDicondylia\t|\t\t|\tscientific name\t|\n+66\t|\tPanarthropoda\t|\t\t|\tscientific name\t|\n+67\t|\tCraniata\t|\tCraniata <chordates>\t|\tscientific name\t|\n+68\t|\tTeleostomi\t|\t\t|\tscientific name\t|\n+69\t|\tbony vertebrates\t|\t\t|\tgenbank common name\t|\n+69\t|\tEuteleostomi\t|\t\t|\tscientific name\t|\n+70\t|\tCharophyta/Embryophyta group\t|\t\t|\tsynonym\t|\n+70\t|\tcharophyte/embryophyte group\t|\t\t|\tequivalent name\t|\n+70\t|\tStreptophytina\t|\t\t|\tscientific name\t|\n+71\t|\tbiota\t|\t\t|\tsynonym\t|\n+71\t|\tcellular organisms\t|\t\t|\tscientific name\t|\n+72\t|\tEhrhartoideae Jacq.-Fel. ex Caro, 1982\t|\t\t|\tauthority\t|\n+72\t|\tEhrhartoideae\t|\t\t|\tsynonym\t|\n+72\t|\tOryzoideae Kunth ex Beilschm., 1833\t|\t\t|\tauthority\t|\n+72\t|\tOryzoideae\t|\t\t|\tscientific name\t|\n+73\t|\tOryzeae Dumort., 1824\t|\t\t|\tauthority\t|\n+73\t|\tOryzeae\t|\t\t|\tscientific name\t|\n+74\t|\tActinopteri\t|\t\t|\tscientific name\t|\n+75\t|\tClupeocephala\t|\t\t|\tscientific name\t|\n+76\t|\tOtophysa\t|\t\t|\tsynonym\t|\n+76\t|\tOtophysi\t|\t\t|\tscientific name\t|\n+77\t|\tCypriniphysae\t|\t\t|\tscientific name\t|\n+77\t|\tCypriniphysi\t|\t\t|\tsynonym\t|\n+78\t|\tOstarioclupeomorpha\t|\t\t|\tsynonym\t|\n+78\t|\tOtocephala\t|\t\t|\tsynonym\t|\n+78\t|\tOtomorpha\t|\t\t|\tscientific name\t|\n+79\t|\tPancrustacea\t|\t\t|\tscientific name\t|\n+80\t|\tMandibulata\t|\t\t|\tscientific name\t|\n+80\t|\tmandibulates\t|\t\t|\tcommon name\t|\n+81\t|\tHomininae\t|\t\t|\tscientific name\t|\n+81\t|\tHomo/Pan/Gorilla group\t|\t\t|\tsynonym\t|\n+82\t|\tEuarchontoglires\t|\t\t|\tscientific name\t|\n+83\t|\tGlires\t|\t\t|\tscientific 
name\t|\n+83\t|\tRodents and rabbits\t|\t\t|\tgenbank common name\t|\n+84\t|\tAnthropoidea\t|\t\t|\tsynonym\t|\n+84\t|\tSimiiformes\t|\t\t|\tscientific name\t|\n+85\t|\tape\t|\tape <primates>\t|\tcommon name\t|\n+85\t|\tapes\t|\t\t|\tgenbank common name\t|\n+85\t|\tHominoidea\t|\t\t|\tscientific name\t|\n+86\t|\tMuroidea\t|\t\t|\tscientific name\t|\n+87\t|\tBEP clade\t|\t\t|\tequivalent name\t|\n+87\t|\tBOP clade\t|\t\t|\tscientific name\t|\n+88\t|\tHaplorrhini\t|\t\t|\tscientific name\t|\n+89\t|\tCyclorrhapha\t|\t\t|\tscientific name\t|\n+90\t|\tEremoneura\t|\t\t|\tscientific name\t|\n+91\t|\tMus\t|\tMus <subgenus>\t|\tscientific name\t|\n+92\t|\tEcdysozoa\t|\t\t|\tscientific name\t|\n+93\t|\tDipnotetrapodomorpha\t|\t\t|\tscientific name\t|\n+94\t|\tBoreoeutheria\t|\t\t|\tscientific name\t|\n+94\t|\tBoreotheria\t|\t\t|\tsynonym\t|\n+95\t|\tMesangiospermae M.J.Donoghue, J.A.Doyle & P.D.Cantino, 2007\t|\t\t|\tauthority\t|\n+95\t|\tMesangiospermae\t|\t\t|\tscientific name\t|\n+96\t|\tPetrosaviidae\t|\t\t|\tscientific name\t|\n+96\t|\tPetrosaviidae S.W.Graham & W.S.Judd, 2007\t|\t\t|\tauthority\t|\n+97\t|\tOsteoglossocephalai\t|\t\t|\tscientific name\t|\n+98\t|\tOryzinae Griseb., 1853\t|\t\t|\tauthority\t|\n+98\t|\tOryzinae\t|\t\t|\tscientific name\t|\n+99\t|\tmice and others\t|\t\t|\tgenbank common name\t|\n+99\t|\tMyomorpha\t|\t\t|\tscientific name\t|\n+99\t|\tSciurognathi\t|\tSciurognathi <Myomorpha>\t|\tin-part\t|\n+100\t|\tDanionidae\t|\t\t|\tscientific name\t|\n+101\t|\tDanioninae\t|\t\t|\tscientific name\t|\n'
b
diff -r 0cdcf7e99b62 -r 1faba1aa14c1 test-data/ncbi_taxonomy.loc
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/ncbi_taxonomy.loc Fri Dec 12 11:14:34 2025 +0000
b
@@ -0,0 +1,1 @@
+test testDB ${__HERE__}/ncbi_taxonomy/
\ No newline at end of file
b
diff -r 0cdcf7e99b62 -r 1faba1aa14c1 test-data/ncbi_taxonomy/README.md
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/ncbi_taxonomy/README.md Fri Dec 12 11:14:34 2025 +0000
b
@@ -0,0 +1,2 @@
+The `*.dmp` files are automatically created by gen.sh (in the blastdb folder).
+`prot.accession2taxid` has been manually curated.
\ No newline at end of file
b
diff -r 0cdcf7e99b62 -r 1faba1aa14c1 test-data/ncbi_taxonomy/names.dmp
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/ncbi_taxonomy/names.dmp Fri Dec 12 11:14:34 2025 +0000
b
b'@@ -0,0 +1,327 @@\n+2\t|\tBacteria\t|\tBacteria <bacteria>\t|\tscientific name\t|\n+2\t|\tbacteria\t|\tbacteria <blast name>\t|\tblast name\t|\n+2\t|\tbacteria\t|\tbacteria <genbank common name>\t|\tgenbank common name\t|\n+2\t|\t"Bacteria" Cavalier-Smith 1987\t|\t\t|\tauthority\t|\n+2\t|\tBacteria (ex Cavalier-Smith 1987)\t|\t\t|\tsynonym\t|\n+2\t|\tBacteria Woese et al. 2024\t|\t\t|\tsynonym\t|\n+2\t|\t"Bacteriobiota" Luketa 2012\t|\t\t|\tauthority\t|\n+2\t|\tBacteriobiota\t|\t\t|\tsynonym\t|\n+2\t|\teubacteria\t|\t\t|\tcommon name\t|\n+2\t|\tMonera\t|\tMonera <bacteria>\t|\tin-part\t|\n+2\t|\tProcaryotae\t|\tProcaryotae <bacteria>\t|\tin-part\t|\n+2\t|\tProkaryotae\t|\tProkaryotae <bacteria>\t|\tin-part\t|\n+2\t|\tProkaryota\t|\tProkaryota <bacteria>\t|\tin-part\t|\n+2\t|\tprokaryote\t|\tprokaryote <bacteria>\t|\tin-part\t|\n+2\t|\tprokaryotes\t|\tprokaryotes <bacteria>\t|\tin-part\t|\n+1234\t|\tNitrospira\t|\tNitrospira <Nitrospira>\t|\tscientific name\t|\n+1234\t|\tNitrospira Watson et al. 
1986\t|\t\t|\tauthority\t|\n+2759\t|\tEucarya\t|\t\t|\tsynonym\t|\n+2759\t|\tEucaryotae\t|\t\t|\tsynonym\t|\n+2759\t|\tEukarya\t|\t\t|\tsynonym\t|\n+2759\t|\tEukaryotae\t|\t\t|\tsynonym\t|\n+2759\t|\tEukaryota\t|\t\t|\tscientific name\t|\n+2759\t|\teukaryotes\t|\teukaryotes <blast name>\t|\tblast name\t|\n+2759\t|\teukaryotes\t|\teukaryotes <genbank common name>\t|\tgenbank common name\t|\n+3193\t|\tEmbryophyta\t|\t\t|\tscientific name\t|\n+3193\t|\thigher plants\t|\t\t|\tcommon name\t|\n+3193\t|\tland plants\t|\tland plants <blast name>\t|\tblast name\t|\n+3193\t|\tland plants\t|\tland plants <genbank common name>\t|\tgenbank common name\t|\n+3193\t|\tplants\t|\t\t|\tcommon name\t|\n+3398\t|\tAngiospermae\t|\t\t|\tsynonym\t|\n+3398\t|\tangiosperms\t|\t\t|\tcommon name\t|\n+3398\t|\tflowering plants\t|\tflowering plants <blast name>\t|\tblast name\t|\n+3398\t|\tflowering plants\t|\tflowering plants <genbank common name>\t|\tgenbank common name\t|\n+3398\t|\tMagnoliophyta\t|\t\t|\tsynonym\t|\n+3398\t|\tMagnoliopsida\t|\t\t|\tscientific name\t|\n+3432\t|\tLaurales Juss. ex Bercht. & J.Presl, 1820\t|\t\t|\tauthority\t|\n+3432\t|\tLaurales\t|\t\t|\tscientific name\t|\n+3432\t|\tLaurineae\t|\t\t|\tincludes\t|\n+3433\t|\tLauraceae Juss., 1789\t|\t\t|\tauthority\t|\n+3433\t|\tLauraceae\t|\t\t|\tscientific name\t|\n+3433\t|\tlaurel family\t|\t\t|\tcommon name\t|\n+3434\t|\tPersea Mill., 1754\t|\t\t|\tauthority\t|\n+3434\t|\tPersea\t|\t\t|\tscientific name\t|\n+3435\t|\tavocado\t|\t\t|\tgenbank common name\t|\n+3435\t|\tLaurus persea L., 1753\t|\t\t|\tauthority\t|\n+3435\t|\tLaurus persea\t|\t\t|\tsynonym\t|\n+3435\t|\tPersea americana Mill., 1768\t|\t\t|\tauthority\t|\n+3435\t|\tPersea americana\t|\t\t|\tscientific name\t|\n+3435\t|\tPersea americana var. tolimanensis\t|\t\t|\tsynonym\t|\n+3435\t|\tPersea americana var. 
tolimanensis (Zentmyer & Schieber) Scora, 2002\t|\t\t|\tauthority\t|\n+3435\t|\tPersea gratissima C.F.Gaertn., 1807\t|\t\t|\tauthority\t|\n+3435\t|\tPersea gratissima\t|\t\t|\tsynonym\t|\n+3435\t|\tPersea tolimanensis\t|\t\t|\tsynonym\t|\n+3435\t|\tPersea tolimanensis Zentmyer & Schieber, 1990\t|\t\t|\tauthority\t|\n+4447\t|\tLiliopsida\t|\t\t|\tscientific name\t|\n+4447\t|\tmonocots\t|\tmonocots <blast name>\t|\tblast name\t|\n+4447\t|\tmonocots\t|\tmonocots <genbank common name>\t|\tgenbank common name\t|\n+4447\t|\tMonocotyledoneae\t|\t\t|\tsynonym\t|\n+4447\t|\tmonocotyledons\t|\t\t|\tcommon name\t|\n+4479\t|\tBambusaceae Nakai, 1943\t|\t\t|\tauthority\t|\n+4479\t|\tBambusaceae\t|\t\t|\tsynonym\t|\n+4479\t|\tGramineae\t|\t\t|\tsynonym\t|\n+4479\t|\tgrass family\t|\t\t|\tcommon name\t|\n+4479\t|\tPoaceae Barnhart, 1895\t|\t\t|\tauthority\t|\n+4479\t|\tPoaceae\t|\t\t|\tscientific name\t|\n+4527\t|\tOryza L., 1753\t|\t\t|\tauthority\t|\n+4527\t|\tOryza\t|\t\t|\tscientific name\t|\n+4527\t|\tPorteresia\t|\t\t|\tincludes\t|\n+4530\t|\tAsian cultivated rice\t|\t\t|\tgenbank common name\t|\n+4530\t|\tOryza sativa L., 1753\t|\t\t|\tauthority\t|\n+4530\t|\tOryza sativa\t|\t\t|\tscientific name\t|\n+4530\t|\tred rice\t|\tred rice <Oryza sativa>\t|\tcommon name\t|\n+4530\t|\trice\t|\t\t|\tcommon name\t|\n+4734\t|\tCommelinidae\t|\t\t|\tsynonym\t|\n+4734\t|\tcommelinids\t|\t\t|\tscientific name\t|\n+4734\t|\tCommeliniflorae\t|\t\t|\tsynonym\t|\n+6072\t|\tEumetazoa\t|\t\t|\tscientific name\t|\n+6656\t|\tArthropoda\t|\t\t|\tscientific name\t|\n+6656\t|\tarthropods\t|\tarthropods <blast name>\t|\tblast name\t|\n+6656\t|\tarthropods\t|\tarthropods <genbank common name>\t|\tgenbank common name\t|\n+6960\t|\tAtelocerata\t|\tAtelocerata <hexapods>\t|\tin-part\t|\n+6960\t|\tHexapod'..b'\t|\tSpermatophyta\t|\t\t|\tscientific name\t|\n+78536\t|\tEuphyllophyta\t|\t\t|\tscientific name\t|\n+78536\t|\teuphyllophytes\t|\t\t|\tequivalent name\t|\n+85512\t|\tDicondylia\t|\t\t|\tscientific 
name\t|\n+88770\t|\tPanarthropoda\t|\t\t|\tscientific name\t|\n+89593\t|\tCraniata\t|\tCraniata <chordates>\t|\tscientific name\t|\n+117570\t|\tTeleostomi\t|\t\t|\tscientific name\t|\n+117571\t|\tbony vertebrates\t|\t\t|\tgenbank common name\t|\n+117571\t|\tEuteleostomi\t|\t\t|\tscientific name\t|\n+131221\t|\tCharophyta/Embryophyta group\t|\t\t|\tsynonym\t|\n+131221\t|\tcharophyte/embryophyte group\t|\t\t|\tequivalent name\t|\n+131221\t|\tStreptophytina\t|\t\t|\tscientific name\t|\n+147367\t|\tEhrhartoideae Jacq.-Fel. ex Caro, 1982\t|\t\t|\tauthority\t|\n+147367\t|\tEhrhartoideae\t|\t\t|\tsynonym\t|\n+147367\t|\tOryzoideae Kunth ex Beilschm., 1833\t|\t\t|\tauthority\t|\n+147367\t|\tOryzoideae\t|\t\t|\tscientific name\t|\n+147380\t|\tOryzeae Dumort., 1824\t|\t\t|\tauthority\t|\n+147380\t|\tOryzeae\t|\t\t|\tscientific name\t|\n+186623\t|\tActinopteri\t|\t\t|\tscientific name\t|\n+186625\t|\tClupeocephala\t|\t\t|\tscientific name\t|\n+186626\t|\tOtophysa\t|\t\t|\tsynonym\t|\n+186626\t|\tOtophysi\t|\t\t|\tscientific name\t|\n+186627\t|\tCypriniphysae\t|\t\t|\tscientific name\t|\n+186627\t|\tCypriniphysi\t|\t\t|\tsynonym\t|\n+186634\t|\tOstarioclupeomorpha\t|\t\t|\tsynonym\t|\n+186634\t|\tOtocephala\t|\t\t|\tsynonym\t|\n+186634\t|\tOtomorpha\t|\t\t|\tscientific name\t|\n+189778\t|\t"Nitrospirales" Garrity and Holt 2001\t|\t\t|\tauthority\t|\n+189778\t|\tNitrospirales Garrity and Holt 2022\t|\t\t|\tauthority\t|\n+189778\t|\tNitrospirales\t|\t\t|\tscientific name\t|\n+189779\t|\t"Nitrospiraceae" Garrity and Holt 2001\t|\t\t|\tauthority\t|\n+189779\t|\tNitrospiraceae Garrity and Holt 2022\t|\t\t|\tauthority\t|\n+189779\t|\tNitrospiraceae\t|\t\t|\tscientific name\t|\n+197562\t|\tPancrustacea\t|\t\t|\tscientific name\t|\n+197563\t|\tMandibulata\t|\t\t|\tscientific name\t|\n+197563\t|\tmandibulates\t|\t\t|\tcommon name\t|\n+203693\t|\t"Nitrospira" Garrity and Holt 2001\t|\t\t|\tauthority\t|\n+203693\t|\tNitrospira\t|\tNitrospira 
<Nitrospiria>\t|\tsynonym\t|\n+203693\t|\t"Nitrospiria" Cavalier-Smith 2020\t|\t\t|\tauthority\t|\n+203693\t|\tNitrospiria Garrity and Holt 2022\t|\t\t|\tauthority\t|\n+203693\t|\t"Nitrospiria" Oren et al. 2015\t|\t\t|\tauthority\t|\n+203693\t|\tNitrospiria\t|\t\t|\tscientific name\t|\n+207598\t|\tHomininae\t|\t\t|\tscientific name\t|\n+207598\t|\tHomo/Pan/Gorilla group\t|\t\t|\tsynonym\t|\n+232347\t|\tMagnoliidae Novak ex Takht., 1967\t|\t\t|\tauthority\t|\n+232347\t|\tMagnoliidae\t|\t\t|\tscientific name\t|\n+232347\t|\tmagnoliids\t|\t\t|\tequivalent name\t|\n+314146\t|\tEuarchontoglires\t|\t\t|\tscientific name\t|\n+314147\t|\tGlires\t|\t\t|\tscientific name\t|\n+314147\t|\tRodents and rabbits\t|\t\t|\tgenbank common name\t|\n+314293\t|\tAnthropoidea\t|\t\t|\tsynonym\t|\n+314293\t|\tSimiiformes\t|\t\t|\tscientific name\t|\n+314295\t|\tape\t|\tape <primates>\t|\tcommon name\t|\n+314295\t|\tapes\t|\t\t|\tgenbank common name\t|\n+314295\t|\tHominoidea\t|\t\t|\tscientific name\t|\n+337687\t|\tMuroidea\t|\t\t|\tscientific name\t|\n+359160\t|\tBEP clade\t|\t\t|\tequivalent name\t|\n+359160\t|\tBOP clade\t|\t\t|\tscientific name\t|\n+376913\t|\tHaplorrhini\t|\t\t|\tscientific name\t|\n+480117\t|\tCyclorrhapha\t|\t\t|\tscientific name\t|\n+480118\t|\tEremoneura\t|\t\t|\tscientific name\t|\n+862507\t|\tMus\t|\tMus <subgenus>\t|\tscientific name\t|\n+1206794\t|\tEcdysozoa\t|\t\t|\tscientific name\t|\n+1338369\t|\tDipnotetrapodomorpha\t|\t\t|\tscientific name\t|\n+1437010\t|\tBoreoeutheria\t|\t\t|\tscientific name\t|\n+1437010\t|\tBoreotheria\t|\t\t|\tsynonym\t|\n+1437183\t|\tMesangiospermae M.J.Donoghue, J.A.Doyle & P.D.Cantino, 2007\t|\t\t|\tauthority\t|\n+1437183\t|\tMesangiospermae\t|\t\t|\tscientific name\t|\n+1437197\t|\tPetrosaviidae\t|\t\t|\tscientific name\t|\n+1437197\t|\tPetrosaviidae S.W.Graham & W.S.Judd, 2007\t|\t\t|\tauthority\t|\n+1489341\t|\tOsteoglossocephalai\t|\t\t|\tscientific name\t|\n+1648021\t|\tOryzinae Griseb., 
1853\t|\t\t|\tauthority\t|\n+1648021\t|\tOryzinae\t|\t\t|\tscientific name\t|\n+1963758\t|\tmice and others\t|\t\t|\tgenbank common name\t|\n+1963758\t|\tMyomorpha\t|\t\t|\tscientific name\t|\n+1963758\t|\tSciurognathi\t|\tSciurognathi <Myomorpha>\t|\tin-part\t|\n+2743709\t|\tDanionidae\t|\t\t|\tscientific name\t|\n+2743711\t|\tDanioninae\t|\t\t|\tscientific name\t|\n+3379134\t|\tPseudomonadati (Gibbons and Murray 1978) Oren and Goker 2024\t|\t\t|\tauthority\t|\n+3379134\t|\tPseudomonadati\t|\t\t|\tscientific name\t|\n'
b
diff -r 0cdcf7e99b62 -r 1faba1aa14c1 test-data/ncbi_taxonomy/nodes.dmp
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/ncbi_taxonomy/nodes.dmp Fri Dec 12 11:14:34 2025 +0000
b
b'@@ -0,0 +1,111 @@\n+2\t|\t131567\t|\tdomain\t|\t\t|\t0\t|\t0\t|\t11\t|\t0\t|\t0\t|\t0\t|\t0\t|\t0\t|\t\t|\n+1234\t|\t189779\t|\tgenus\t|\t\t|\t0\t|\t1\t|\t11\t|\t1\t|\t0\t|\t1\t|\t0\t|\t0\t|\tcode compliant\t|\n+2759\t|\t131567\t|\tdomain\t|\t\t|\t1\t|\t0\t|\t1\t|\t0\t|\t1\t|\t0\t|\t0\t|\t0\t|\t\t|\n+3193\t|\t131221\t|\tclade\t|\t\t|\t4\t|\t1\t|\t1\t|\t1\t|\t1\t|\t0\t|\t0\t|\t0\t|\t\t|\n+3398\t|\t58024\t|\tclass\t|\t\t|\t4\t|\t1\t|\t1\t|\t1\t|\t1\t|\t1\t|\t0\t|\t0\t|\tcode compliant\t|\n+3432\t|\t232347\t|\torder\t|\t\t|\t4\t|\t1\t|\t1\t|\t1\t|\t1\t|\t1\t|\t0\t|\t0\t|\tcode compliant\t|\n+3433\t|\t3432\t|\tfamily\t|\t\t|\t4\t|\t1\t|\t1\t|\t1\t|\t1\t|\t1\t|\t0\t|\t0\t|\tcode compliant\t|\n+3434\t|\t3433\t|\tgenus\t|\t\t|\t4\t|\t1\t|\t1\t|\t1\t|\t1\t|\t1\t|\t0\t|\t0\t|\tcode compliant\t|\n+3435\t|\t3434\t|\tspecies\t|\tPA\t|\t4\t|\t1\t|\t1\t|\t1\t|\t1\t|\t1\t|\t1\t|\t0\t|\tcode compliant; specified\t|\n+4447\t|\t1437183\t|\tclade\t|\t\t|\t4\t|\t1\t|\t1\t|\t1\t|\t1\t|\t1\t|\t0\t|\t0\t|\t\t|\n+4479\t|\t38820\t|\tfamily\t|\t\t|\t4\t|\t1\t|\t1\t|\t1\t|\t1\t|\t1\t|\t0\t|\t0\t|\tcode compliant\t|\n+4527\t|\t1648021\t|\tgenus\t|\t\t|\t4\t|\t1\t|\t1\t|\t1\t|\t1\t|\t1\t|\t0\t|\t0\t|\tcode compliant\t|\n+4530\t|\t4527\t|\tspecies\t|\tOS\t|\t4\t|\t1\t|\t1\t|\t1\t|\t1\t|\t1\t|\t0\t|\t0\t|\tcode compliant; specified\t|\n+4734\t|\t1437197\t|\tclade\t|\t\t|\t4\t|\t1\t|\t1\t|\t1\t|\t1\t|\t1\t|\t1\t|\t0\t|\t\t|\n+6072\t|\t33208\t|\tclade\t|\t\t|\t1\t|\t1\t|\t1\t|\t1\t|\t5\t|\t0\t|\t1\t|\t0\t|\t\t|\n+6656\t|\t88770\t|\tphylum\t|\t\t|\t1\t|\t1\t|\t1\t|\t1\t|\t5\t|\t1\t|\t0\t|\t0\t|\tcode compliant\t|\n+6960\t|\t197562\t|\tsubphylum\t|\t\t|\t1\t|\t1\t|\t1\t|\t1\t|\t5\t|\t1\t|\t0\t|\t0\t|\tcode compliant\t|\n+7147\t|\t33392\t|\torder\t|\t\t|\t1\t|\t1\t|\t1\t|\t1\t|\t5\t|\t1\t|\t0\t|\t0\t|\tcode compliant\t|\n+7203\t|\t7147\t|\tsuborder\t|\t\t|\t1\t|\t1\t|\t1\t|\t1\t|\t5\t|\t1\t|\t0\t|\t0\t|\tcode 
compliant\t|\n+7214\t|\t43746\t|\tfamily\t|\t\t|\t1\t|\t1\t|\t1\t|\t1\t|\t5\t|\t1\t|\t0\t|\t0\t|\tcode compliant\t|\n+7215\t|\t46877\t|\tgenus\t|\t\t|\t1\t|\t1\t|\t1\t|\t1\t|\t5\t|\t1\t|\t0\t|\t0\t|\tcode compliant\t|\n+7496\t|\t85512\t|\tsubclass\t|\t\t|\t1\t|\t1\t|\t1\t|\t1\t|\t5\t|\t1\t|\t0\t|\t0\t|\tcode compliant\t|\n+7711\t|\t33511\t|\tphylum\t|\t\t|\t1\t|\t1\t|\t1\t|\t1\t|\t5\t|\t1\t|\t0\t|\t0\t|\tcode compliant\t|\n+7742\t|\t89593\t|\tclade\t|\t\t|\t10\t|\t0\t|\t1\t|\t1\t|\t2\t|\t1\t|\t0\t|\t0\t|\t\t|\n+7776\t|\t7742\t|\tclade\t|\t\t|\t10\t|\t1\t|\t1\t|\t1\t|\t2\t|\t1\t|\t1\t|\t0\t|\t\t|\n+7898\t|\t117571\t|\tsuperclass\t|\t\t|\t10\t|\t1\t|\t1\t|\t1\t|\t2\t|\t1\t|\t0\t|\t0\t|\tcode compliant\t|\n+7952\t|\t186627\t|\torder\t|\t\t|\t10\t|\t1\t|\t1\t|\t1\t|\t2\t|\t1\t|\t0\t|\t0\t|\tcode compliant\t|\n+7954\t|\t2743711\t|\tgenus\t|\t\t|\t10\t|\t1\t|\t1\t|\t1\t|\t2\t|\t1\t|\t0\t|\t0\t|\tcode compliant\t|\n+7955\t|\t7954\t|\tspecies\t|\tDR\t|\t10\t|\t1\t|\t1\t|\t1\t|\t2\t|\t1\t|\t1\t|\t0\t|\tcode compliant; specified\t|\n+8287\t|\t117571\t|\tsuperclass\t|\t\t|\t10\t|\t1\t|\t1\t|\t1\t|\t2\t|\t1\t|\t1\t|\t0\t|\tcode compliant\t|\n+9347\t|\t32525\t|\tclade\t|\t\t|\t2\t|\t1\t|\t1\t|\t1\t|\t2\t|\t1\t|\t0\t|\t0\t|\t\t|\n+9443\t|\t314146\t|\torder\t|\t\t|\t5\t|\t0\t|\t1\t|\t1\t|\t2\t|\t1\t|\t0\t|\t0\t|\tcode compliant\t|\n+9526\t|\t314293\t|\tparvorder\t|\t\t|\t5\t|\t1\t|\t1\t|\t1\t|\t2\t|\t1\t|\t0\t|\t0\t|\tcode compliant\t|\n+9604\t|\t314295\t|\tfamily\t|\t\t|\t5\t|\t1\t|\t1\t|\t1\t|\t2\t|\t1\t|\t0\t|\t0\t|\tcode compliant\t|\n+9605\t|\t207598\t|\tgenus\t|\t\t|\t5\t|\t1\t|\t1\t|\t1\t|\t2\t|\t1\t|\t0\t|\t0\t|\tcode compliant\t|\n+9606\t|\t9605\t|\tspecies\t|\tHS\t|\t5\t|\t1\t|\t1\t|\t1\t|\t2\t|\t1\t|\t1\t|\t0\t|\tcode compliant; specified\t|\n+9989\t|\t314147\t|\torder\t|\t\t|\t6\t|\t0\t|\t1\t|\t1\t|\t2\t|\t1\t|\t0\t|\t0\t|\tcode compliant\t|\n+10066\t|\t337687\t|\tfamily\t|\t\t|\t6\t|\t1\t|\t1\t|\t1\t|\t2\t|\t1\t|\t0\t|\t0\t|\tcode 
compliant\t|\n+10088\t|\t39107\t|\tgenus\t|\t\t|\t6\t|\t1\t|\t1\t|\t1\t|\t2\t|\t1\t|\t0\t|\t0\t|\tcode compliant\t|\n+10090\t|\t862507\t|\tspecies\t|\tMM\t|\t6\t|\t1\t|\t1\t|\t1\t|\t2\t|\t1\t|\t1\t|\t0\t|\tcode compliant; specified\t|\n+30727\t|\t7952\t|\tsuborder\t|\t\t|\t10\t|\t1\t|\t1\t|\t1\t|\t2\t|\t1\t|\t1\t|\t0\t|\tcode compliant\t|\n+32443\t|\t41665\t|\tinfraclass\t|\t\t|\t10\t|\t1\t|\t1\t|\t1\t|\t2\t|\t1\t|\t0\t|\t0\t|\tcode compliant\t|\n+32519\t|\t186634\t|\tsubcohort\t|\t\t|\t10\t|\t1\t|\t1\t|\t1\t|\t2\t|\t1\t|\t0\t|\t0\t|\t\t|\n+32523\t|\t1338369\t|\tclade\t|\t\t|\t10\t|\t1\t|\t1\t|\t1\t|\t2\t|\t1\t|\t1\t|\t0\t|\t\t|\n+32524\t|\t32523\t|\tclade\t|\t\t|\t10\t|\t1\t|\t1\t|\t1\t|\t2\t|\t1\t|\t1\t|\t0\t|\t\t|\n+32525\t|\t40674\t|\tclade\t|\t\t|\t2\t|\t1\t|\t1\t|\t1\t|\t2\t|\t1\t|\t1\t|\t0\t|\t\t|\n+33090\t|\t2759\t|\tkingdom\t|\t\t|\t4\t|\t0\t|\t1\t|\t1\t|\t1\t|\t1\t|\t0\t|\t0\t|\tcode compliant\t|\n+33154\t|\t2759\t|\tclade\t|\t\t|\t4\t|\t0\t|\t1\t|\t1\t|\t1\t|\t1\t|\t1\t|\t0\t|\t\t|\n+33208\t|\t33154\t|\tkingdom\t|\t\t|\t1\t|\t0\t|\t1\t|\t1\t|\t1\t|\t1\t|\t0\t|\t0\t|\tcode compliant\t|\n+33213\t|\t6072\t|\tclade\t|\t\t|\t1\t|\t1\t|\t1\t|\t1\t|\t5\t|\t1\t|\t1\t|\t0\t|\t\t|\n+33317\t|\t33213\t|\tclade\t|\t\t|\t1\t|\t1\t|\t1\t|\t1\t|\t5\t|\t1\t|\t1\t|\t0\t|\t\t|\n+33340\t|\t7496\t|\tinfraclass\t|\t\t|\t1\t|\t1\t|\t1\t|\t1\t|\t5\t|\t1\t|\t0\t|\t0\t|\tcode compliant\t|\n+33392\t|\t33340\t|\tcohort\t|\t\t|\t1\t|\t1\t|\t1\t|\t1\t|\t5\t|\t1\t|\t0\t|\t0\t|\tcode compliant\t|\n+33511'..b'|\t1\t|\t1\t|\t1\t|\t2\t|\t1\t|\t0\t|\t0\t|\tcode compliant\t|\n+40117\t|\t3379134\t|\tphylum\t|\t\t|\t0\t|\t1\t|\t11\t|\t1\t|\t0\t|\t1\t|\t0\t|\t0\t|\t\t|\n+40674\t|\t32524\t|\tclass\t|\t\t|\t2\t|\t0\t|\t1\t|\t1\t|\t2\t|\t1\t|\t0\t|\t0\t|\tcode compliant\t|\n+41665\t|\t186623\t|\tsubclass\t|\t\t|\t10\t|\t1\t|\t1\t|\t1\t|\t2\t|\t1\t|\t0\t|\t0\t|\tcode compliant\t|\n+43733\t|\t7203\t|\tinfraorder\t|\t\t|\t1\t|\t1\t|\t1\t|\t1\t|\t5\t|\t1\t|\t0\t|\t0\t|\tcode 
compliant\t|\n+43738\t|\t480117\t|\tno rank\t|\t\t|\t1\t|\t1\t|\t1\t|\t1\t|\t5\t|\t1\t|\t1\t|\t0\t|\t\t|\n+43741\t|\t43738\t|\tno rank\t|\t\t|\t1\t|\t1\t|\t1\t|\t1\t|\t5\t|\t1\t|\t1\t|\t0\t|\t\t|\n+43746\t|\t43741\t|\tsuperfamily\t|\t\t|\t1\t|\t1\t|\t1\t|\t1\t|\t5\t|\t1\t|\t0\t|\t0\t|\tcode compliant\t|\n+43845\t|\t7214\t|\tsubfamily\t|\t\t|\t1\t|\t1\t|\t1\t|\t1\t|\t5\t|\t1\t|\t1\t|\t0\t|\tcode compliant\t|\n+46877\t|\t43845\t|\ttribe\t|\t\t|\t1\t|\t1\t|\t1\t|\t1\t|\t5\t|\t1\t|\t1\t|\t0\t|\tcode compliant\t|\n+50557\t|\t6960\t|\tclass\t|\t\t|\t1\t|\t1\t|\t1\t|\t1\t|\t5\t|\t1\t|\t0\t|\t0\t|\tcode compliant\t|\n+58023\t|\t3193\t|\tclade\t|\t\t|\t4\t|\t1\t|\t1\t|\t1\t|\t1\t|\t1\t|\t0\t|\t0\t|\t\t|\n+58024\t|\t78536\t|\tclade\t|\t\t|\t4\t|\t1\t|\t1\t|\t1\t|\t1\t|\t1\t|\t0\t|\t0\t|\t\t|\n+78536\t|\t58023\t|\tclade\t|\t\t|\t4\t|\t1\t|\t1\t|\t1\t|\t1\t|\t1\t|\t1\t|\t0\t|\t\t|\n+85512\t|\t50557\t|\tclade\t|\t\t|\t1\t|\t1\t|\t1\t|\t1\t|\t5\t|\t1\t|\t1\t|\t0\t|\t\t|\n+88770\t|\t1206794\t|\tclade\t|\t\t|\t1\t|\t1\t|\t1\t|\t1\t|\t5\t|\t1\t|\t1\t|\t0\t|\t\t|\n+89593\t|\t7711\t|\tsubphylum\t|\t\t|\t10\t|\t0\t|\t1\t|\t1\t|\t2\t|\t0\t|\t0\t|\t0\t|\tcode compliant\t|\n+117570\t|\t7776\t|\tclade\t|\t\t|\t10\t|\t1\t|\t1\t|\t1\t|\t2\t|\t1\t|\t1\t|\t0\t|\t\t|\n+117571\t|\t117570\t|\tclade\t|\t\t|\t10\t|\t1\t|\t1\t|\t1\t|\t2\t|\t1\t|\t0\t|\t0\t|\t\t|\n+131221\t|\t35493\t|\tsubphylum\t|\t\t|\t4\t|\t1\t|\t1\t|\t1\t|\t1\t|\t1\t|\t1\t|\t0\t|\tcode compliant\t|\n+147367\t|\t359160\t|\tsubfamily\t|\t\t|\t4\t|\t1\t|\t1\t|\t1\t|\t1\t|\t1\t|\t0\t|\t0\t|\tcode compliant\t|\n+147380\t|\t147367\t|\ttribe\t|\t\t|\t4\t|\t1\t|\t1\t|\t1\t|\t1\t|\t1\t|\t0\t|\t0\t|\tcode compliant\t|\n+186623\t|\t7898\t|\tclass\t|\t\t|\t10\t|\t1\t|\t1\t|\t1\t|\t2\t|\t1\t|\t1\t|\t0\t|\tcode 
compliant\t|\n+186625\t|\t1489341\t|\tclade\t|\t\t|\t10\t|\t1\t|\t1\t|\t1\t|\t2\t|\t1\t|\t1\t|\t0\t|\t\t|\n+186626\t|\t32519\t|\tclade\t|\t\t|\t10\t|\t1\t|\t1\t|\t1\t|\t2\t|\t1\t|\t1\t|\t0\t|\t\t|\n+186627\t|\t186626\t|\tsuperorder\t|\t\t|\t10\t|\t1\t|\t1\t|\t1\t|\t2\t|\t1\t|\t1\t|\t0\t|\tcode compliant\t|\n+186634\t|\t186625\t|\tcohort\t|\t\t|\t10\t|\t1\t|\t1\t|\t1\t|\t2\t|\t1\t|\t1\t|\t0\t|\tcode compliant\t|\n+189778\t|\t203693\t|\torder\t|\t\t|\t0\t|\t1\t|\t11\t|\t1\t|\t0\t|\t1\t|\t0\t|\t0\t|\tcode compliant\t|\n+189779\t|\t189778\t|\tfamily\t|\t\t|\t0\t|\t1\t|\t11\t|\t1\t|\t0\t|\t1\t|\t0\t|\t0\t|\tcode compliant\t|\n+197562\t|\t197563\t|\tclade\t|\t\t|\t1\t|\t1\t|\t1\t|\t1\t|\t5\t|\t1\t|\t1\t|\t0\t|\t\t|\n+197563\t|\t6656\t|\tclade\t|\t\t|\t1\t|\t1\t|\t1\t|\t1\t|\t5\t|\t1\t|\t1\t|\t0\t|\t\t|\n+203693\t|\t40117\t|\tclass\t|\t\t|\t0\t|\t1\t|\t11\t|\t1\t|\t0\t|\t1\t|\t0\t|\t0\t|\tcode compliant\t|\n+207598\t|\t9604\t|\tsubfamily\t|\t\t|\t5\t|\t1\t|\t1\t|\t1\t|\t2\t|\t1\t|\t1\t|\t0\t|\tcode compliant\t|\n+232347\t|\t1437183\t|\tclade\t|\t\t|\t4\t|\t1\t|\t1\t|\t1\t|\t1\t|\t1\t|\t0\t|\t0\t|\t\t|\n+314146\t|\t1437010\t|\tsuperorder\t|\t\t|\t2\t|\t1\t|\t1\t|\t1\t|\t2\t|\t1\t|\t0\t|\t0\t|\tcode compliant\t|\n+314147\t|\t314146\t|\tclade\t|\t\t|\t2\t|\t1\t|\t1\t|\t1\t|\t2\t|\t1\t|\t0\t|\t0\t|\t\t|\n+314293\t|\t376913\t|\tinfraorder\t|\t\t|\t5\t|\t1\t|\t1\t|\t1\t|\t2\t|\t1\t|\t1\t|\t0\t|\tcode compliant\t|\n+314295\t|\t9526\t|\tsuperfamily\t|\t\t|\t5\t|\t1\t|\t1\t|\t1\t|\t2\t|\t1\t|\t1\t|\t0\t|\tcode compliant\t|\n+337687\t|\t1963758\t|\tclade\t|\t\t|\t6\t|\t1\t|\t1\t|\t1\t|\t2\t|\t1\t|\t0\t|\t0\t|\t\t|\n+359160\t|\t4479\t|\tclade\t|\t\t|\t4\t|\t1\t|\t1\t|\t1\t|\t1\t|\t1\t|\t0\t|\t0\t|\t\t|\n+376913\t|\t9443\t|\tsuborder\t|\t\t|\t5\t|\t1\t|\t1\t|\t1\t|\t2\t|\t1\t|\t0\t|\t0\t|\tcode 
compliant\t|\n+480117\t|\t480118\t|\tclade\t|\t\t|\t1\t|\t1\t|\t1\t|\t1\t|\t5\t|\t1\t|\t1\t|\t0\t|\t\t|\n+480118\t|\t43733\t|\tclade\t|\t\t|\t1\t|\t1\t|\t1\t|\t1\t|\t5\t|\t1\t|\t1\t|\t0\t|\t\t|\n+862507\t|\t10088\t|\tsubgenus\t|\t\t|\t6\t|\t1\t|\t1\t|\t1\t|\t2\t|\t1\t|\t0\t|\t0\t|\tcode compliant\t|\n+1206794\t|\t33317\t|\tclade\t|\t\t|\t1\t|\t1\t|\t1\t|\t1\t|\t5\t|\t1\t|\t0\t|\t0\t|\t\t|\n+1338369\t|\t8287\t|\tclade\t|\t\t|\t10\t|\t1\t|\t1\t|\t1\t|\t2\t|\t1\t|\t1\t|\t0\t|\t\t|\n+1437010\t|\t9347\t|\tclade\t|\t\t|\t2\t|\t1\t|\t1\t|\t1\t|\t2\t|\t1\t|\t1\t|\t0\t|\t\t|\n+1437183\t|\t3398\t|\tclade\t|\t\t|\t4\t|\t1\t|\t1\t|\t1\t|\t1\t|\t1\t|\t1\t|\t0\t|\t\t|\n+1437197\t|\t4447\t|\tsubclass\t|\t\t|\t4\t|\t1\t|\t1\t|\t1\t|\t1\t|\t1\t|\t1\t|\t0\t|\tcode compliant\t|\n+1489341\t|\t32443\t|\tclade\t|\t\t|\t10\t|\t1\t|\t1\t|\t1\t|\t2\t|\t1\t|\t1\t|\t0\t|\t\t|\n+1648021\t|\t147380\t|\tsubtribe\t|\t\t|\t4\t|\t1\t|\t1\t|\t1\t|\t1\t|\t1\t|\t0\t|\t0\t|\tcode compliant\t|\n+1963758\t|\t9989\t|\tsuborder\t|\t\t|\t6\t|\t1\t|\t1\t|\t1\t|\t2\t|\t1\t|\t0\t|\t0\t|\tcode compliant\t|\n+2743709\t|\t30727\t|\tfamily\t|\t\t|\t10\t|\t1\t|\t1\t|\t1\t|\t2\t|\t1\t|\t0\t|\t0\t|\tcode compliant\t|\n+2743711\t|\t2743709\t|\tsubfamily\t|\t\t|\t10\t|\t1\t|\t1\t|\t1\t|\t2\t|\t1\t|\t0\t|\t0\t|\tcode compliant\t|\n+3379134\t|\t2\t|\tkingdom\t|\t\t|\t0\t|\t1\t|\t11\t|\t1\t|\t0\t|\t1\t|\t0\t|\t0\t|\t\t|\n'
b
diff -r 0cdcf7e99b62 -r 1faba1aa14c1 test-data/ncbi_taxonomy/prot.accession2taxid
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/ncbi_taxonomy/prot.accession2taxid Fri Dec 12 11:14:34 2025 +0000
b
@@ -0,0 +1,6 @@
+accession accession.version taxid gi
+YP_514675 YP_514675.1 12 3950761
+YP_009047267 YP_009047267.1 20 19893533
+NP_059333 NP_059333.1 28 140539
+YP_003024028 YP_003024028.1 35 4512
+NP_904330 NP_904330.1 39 17708
b
diff -r 0cdcf7e99b62 -r 1faba1aa14c1 test-data/nodes.dmp
--- a/test-data/nodes.dmp Mon Nov 10 15:12:53 2025 +0000
+++ b/test-data/nodes.dmp Fri Dec 12 11:14:34 2025 +0000
b
@@ -1,3 +1,101 @@
 1 | 1 | no rank | | 8 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | |
-2 | 1 | species | AC | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | |
-3 | 1 | species | AC | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | |
+2 | 71 | domain | | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | |
+3 | 70 | clade | | 4 | 1 | 1 | 1 | 1 | 0 | 0 | 0 | |
+4 | 63 | class | | 4 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | code compliant |
+5 | 95 | clade | | 4 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | |
+6 | 51 | family | | 4 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | code compliant |
+7 | 98 | genus | | 4 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | code compliant |
+8 | 7 | species | OS | 4 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | code compliant; specified |
+9 | 96 | clade | | 4 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | |
+10 | 44 | clade | | 1 | 1 | 1 | 1 | 5 | 0 | 1 | 0 | |
+11 | 66 | phylum | | 1 | 1 | 1 | 1 | 5 | 1 | 0 | 0 | code compliant |
+12 | 79 | subphylum | | 1 | 1 | 1 | 1 | 5 | 1 | 0 | 0 | code compliant |
+13 | 48 | order | | 1 | 1 | 1 | 1 | 5 | 1 | 0 | 0 | code compliant |
+14 | 13 | suborder | | 1 | 1 | 1 | 1 | 5 | 1 | 0 | 0 | code compliant |
+15 | 58 | family | | 1 | 1 | 1 | 1 | 5 | 1 | 0 | 0 | code compliant |
+16 | 60 | genus | | 1 | 1 | 1 | 1 | 5 | 1 | 0 | 0 | code compliant |
+17 | 65 | subclass | | 1 | 1 | 1 | 1 | 5 | 1 | 0 | 0 | code compliant |
+18 | 49 | phylum | | 1 | 1 | 1 | 1 | 5 | 1 | 0 | 0 | code compliant |
+19 | 67 | clade | | 10 | 0 | 1 | 1 | 2 | 1 | 0 | 0 | |
+20 | 19 | clade | | 10 | 1 | 1 | 1 | 2 | 1 | 1 | 0 | |
+21 | 69 | superclass | | 10 | 1 | 1 | 1 | 2 | 1 | 0 | 0 | code compliant |
+22 | 77 | order | | 10 | 1 | 1 | 1 | 2 | 1 | 0 | 0 | code compliant |
+23 | 101 | genus | | 10 | 1 | 1 | 1 | 2 | 1 | 0 | 0 | code compliant |
+24 | 23 | species | DR | 10 | 1 | 1 | 1 | 2 | 1 | 1 | 0 | code compliant; specified |
+25 | 69 | superclass | | 10 | 1 | 1 | 1 | 2 | 1 | 1 | 0 | code compliant |
+26 | 41 | clade | | 2 | 1 | 1 | 1 | 2 | 1 | 0 | 0 | |
+27 | 82 | order | | 5 | 0 | 1 | 1 | 2 | 1 | 0 | 0 | code compliant |
+28 | 84 | parvorder | | 5 | 1 | 1 | 1 | 2 | 1 | 0 | 0 | code compliant |
+29 | 85 | family | | 5 | 1 | 1 | 1 | 2 | 1 | 0 | 0 | code compliant |
+30 | 81 | genus | | 5 | 1 | 1 | 1 | 2 | 1 | 0 | 0 | code compliant |
+31 | 30 | species | HS | 5 | 1 | 1 | 1 | 2 | 1 | 1 | 0 | code compliant; specified |
+32 | 83 | order | | 6 | 0 | 1 | 1 | 2 | 1 | 0 | 0 | code compliant |
+33 | 86 | family | | 6 | 1 | 1 | 1 | 2 | 1 | 0 | 0 | code compliant |
+34 | 52 | genus | | 6 | 1 | 1 | 1 | 2 | 1 | 0 | 0 | code compliant |
+35 | 91 | species | MM | 6 | 1 | 1 | 1 | 2 | 1 | 1 | 0 | code compliant; specified |
+36 | 22 | suborder | | 10 | 1 | 1 | 1 | 2 | 1 | 1 | 0 | code compliant |
+37 | 54 | infraclass | | 10 | 1 | 1 | 1 | 2 | 1 | 0 | 0 | code compliant |
+38 | 78 | subcohort | | 10 | 1 | 1 | 1 | 2 | 1 | 0 | 0 | |
+39 | 93 | clade | | 10 | 1 | 1 | 1 | 2 | 1 | 1 | 0 | |
+40 | 39 | clade | | 10 | 1 | 1 | 1 | 2 | 1 | 1 | 0 | |
+41 | 53 | clade | | 2 | 1 | 1 | 1 | 2 | 1 | 1 | 0 | |
+42 | 2 | kingdom | | 4 | 0 | 1 | 1 | 1 | 1 | 0 | 0 | code compliant |
+43 | 2 | clade | | 4 | 0 | 1 | 1 | 1 | 1 | 1 | 0 | |
+44 | 43 | kingdom | | 1 | 0 | 1 | 1 | 1 | 1 | 0 | 0 | code compliant |
+45 | 10 | clade | | 1 | 1 | 1 | 1 | 5 | 1 | 1 | 0 | |
+46 | 45 | clade | | 1 | 1 | 1 | 1 | 5 | 1 | 1 | 0 | |
+47 | 17 | infraclass | | 1 | 1 | 1 | 1 | 5 | 1 | 0 | 0 | code compliant |
+48 | 47 | cohort | | 1 | 1 | 1 | 1 | 5 | 1 | 0 | 0 | code compliant |
+49 | 45 | clade | | 1 | 1 | 1 | 1 | 5 | 1 | 1 | 0 | |
+50 | 42 | phylum | | 4 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | code compliant |
+51 | 9 | order | | 4 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | code compliant |
+52 | 33 | subfamily | | 6 | 1 | 1 | 1 | 2 | 1 | 0 | 0 | code compliant |
+53 | 40 | class | | 2 | 0 | 1 | 1 | 2 | 1 | 0 | 0 | code compliant |
+54 | 74 | subclass | | 10 | 1 | 1 | 1 | 2 | 1 | 0 | 0 | code compliant |
+55 | 14 | infraorder | | 1 | 1 | 1 | 1 | 5 | 1 | 0 | 0 | code compliant |
+56 | 89 | no rank | | 1 | 1 | 1 | 1 | 5 | 1 | 1 | 0 | |
+57 | 56 | no rank | | 1 | 1 | 1 | 1 | 5 | 1 | 1 | 0 | |
+58 | 57 | superfamily | | 1 | 1 | 1 | 1 | 5 | 1 | 0 | 0 | code compliant |
+59 | 15 | subfamily | | 1 | 1 | 1 | 1 | 5 | 1 | 1 | 0 | code compliant |
+60 | 59 | tribe | | 1 | 1 | 1 | 1 | 5 | 1 | 1 | 0 | code compliant |
+61 | 12 | class | | 1 | 1 | 1 | 1 | 5 | 1 | 0 | 0 | code compliant |
+62 | 3 | clade | | 4 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | |
+63 | 64 | clade | | 4 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | |
+64 | 62 | clade | | 4 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | |
+65 | 61 | clade | | 1 | 1 | 1 | 1 | 5 | 1 | 1 | 0 | |
+66 | 92 | clade | | 1 | 1 | 1 | 1 | 5 | 1 | 1 | 0 | |
+67 | 18 | subphylum | | 10 | 0 | 1 | 1 | 2 | 0 | 0 | 0 | code compliant |
+68 | 20 | clade | | 10 | 1 | 1 | 1 | 2 | 1 | 1 | 0 | |
+69 | 68 | clade | | 10 | 1 | 1 | 1 | 2 | 1 | 0 | 0 | |
+70 | 50 | subphylum | | 4 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | code compliant |
+71 | 1 | cellular root | CO | 8 | 1 | 1 | 1 | 0 | 1 | 1 | 0 | |
+72 | 87 | subfamily | | 4 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | code compliant |
+73 | 72 | tribe | | 4 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | code compliant |
+74 | 21 | class | | 10 | 1 | 1 | 1 | 2 | 1 | 1 | 0 | code compliant |
+75 | 97 | clade | | 10 | 1 | 1 | 1 | 2 | 1 | 1 | 0 | |
+76 | 38 | clade | | 10 | 1 | 1 | 1 | 2 | 1 | 1 | 0 | |
+77 | 76 | superorder | | 10 | 1 | 1 | 1 | 2 | 1 | 1 | 0 | code compliant |
+78 | 75 | cohort | | 10 | 1 | 1 | 1 | 2 | 1 | 1 | 0 | code compliant |
+79 | 80 | clade | | 1 | 1 | 1 | 1 | 5 | 1 | 1 | 0 | |
+80 | 11 | clade | | 1 | 1 | 1 | 1 | 5 | 1 | 1 | 0 | |
+81 | 29 | subfamily | | 5 | 1 | 1 | 1 | 2 | 1 | 1 | 0 | code compliant |
+82 | 94 | superorder | | 2 | 1 | 1 | 1 | 2 | 1 | 0 | 0 | code compliant |
+83 | 82 | clade | | 2 | 1 | 1 | 1 | 2 | 1 | 0 | 0 | |
+84 | 88 | infraorder | | 5 | 1 | 1 | 1 | 2 | 1 | 1 | 0 | code compliant |
+85 | 28 | superfamily | | 5 | 1 | 1 | 1 | 2 | 1 | 1 | 0 | code compliant |
+86 | 99 | clade | | 6 | 1 | 1 | 1 | 2 | 1 | 0 | 0 | |
+87 | 6 | clade | | 4 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | |
+88 | 27 | suborder | | 5 | 1 | 1 | 1 | 2 | 1 | 0 | 0 | code compliant |
+89 | 90 | clade | | 1 | 1 | 1 | 1 | 5 | 1 | 1 | 0 | |
+90 | 55 | clade | | 1 | 1 | 1 | 1 | 5 | 1 | 1 | 0 | |
+91 | 34 | subgenus | | 6 | 1 | 1 | 1 | 2 | 1 | 0 | 0 | code compliant |
+92 | 46 | clade | | 1 | 1 | 1 | 1 | 5 | 1 | 0 | 0 | |
+93 | 25 | clade | | 10 | 1 | 1 | 1 | 2 | 1 | 1 | 0 | |
+94 | 26 | clade | | 2 | 1 | 1 | 1 | 2 | 1 | 1 | 0 | |
+95 | 4 | clade | | 4 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | |
+96 | 5 | subclass | | 4 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | code compliant |
+97 | 37 | clade | | 10 | 1 | 1 | 1 | 2 | 1 | 1 | 0 | |
+98 | 73 | subtribe | | 4 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | code compliant |
+99 | 32 | suborder | | 6 | 1 | 1 | 1 | 2 | 1 | 0 | 0 | code compliant |
+100 | 36 | family | | 10 | 1 | 1 | 1 | 2 | 1 | 0 | 0 | code compliant |
+101 | 100 | subfamily | | 10 | 1 | 1 | 1 | 2 | 1 | 0 | 0 | code compliant |
b
diff -r 0cdcf7e99b62 -r 1faba1aa14c1 test-data/nucleotide.fasta
--- a/test-data/nucleotide.fasta Mon Nov 10 15:12:53 2025 +0000
+++ b/test-data/nucleotide.fasta Fri Dec 12 11:14:34 2025 +0000
b
@@ -1,17 +1,23 @@
->sequence more text
-CTGTGCCTGTACACCCACATCGGCAGAAACATCTACTACGGCAGCTACCTGTACAGCGAG
-ACCTGGAACACCGGCATCATGCTGCTGCTGATCACCATGGCCACCGCCTTCATGGGCTAC
-GTGCTGCCCTGGGGCCAGATGAGCTTCTGGGGCGCCACCGTGATCACCAACCTGTTCAGC
-GCCATCCCCTACATCGGCACCAACCTGGTGGAGTGGATCTGGGGCGGCTTCAGCGTGGAC
-AAGGCCACCCTGAACAGATTCTTCGCCTTCCACTTCATCCTGTTCACCATGGTGGCCCTG
-GCCGGCGTGCACCTGACCTTCCTGCACGAGACCGGCAGCAACAACCCCCTGGGCCTGACC
-AGCGACAGCGACAAGATCCCCTTCCACCCCTACTACACCATCAAGGACTTCCTGGGCCTG
-CTGATCCTGATCCTGCTGCTGCTGCTGCTGGCCCTGCTGAGCCCCGACATGCTGGGCGAC
-CCCGACAACCACATGCCCGCCGACCCCCTGAACACCCCCCTGCACATCAAGCCCGAGTGG
-TACTTCCTGTTCGCCTACGCCATCCTGAGAAGCGTGCCCAACAAGCTGGGCGGCGTGCTG
-GCCCTGTTCCTGAGCATCGTGATCCTGGGCCTGATGCCCTTCCTGCACACCAGCAAGCAC
-AGAAGCATGATGCTGAGACCCCTGAGCCAGGCCCTGTTCTGGACCCTGACCATGGACCTG
-CTGACCCTGACCTGGATCGGCAGCCAGCCCGTGGAGTACCCCTACACCATCATCGGCCAG
-ATGGCCAGCATCCTGTACTTCAGCATCATCCTGGCCTTCCTGCCCATCGCCGGCNNNATC
-GAGAACTAC
-
+>NC_001646.1:5332-6871 Pongo pygmaeus mitochondrion, complete genome
+ATGTTCGCCGACCGCTGGCTATTCTCCACGAACCACAAAGATATTGGAACGCTATACCTGTTGTTCGGCG
+CATGAGCTGGTGTCCTAGGCACTGCCCTAAGCCTCCTCATTCGTGCTGAACTAGGCCAACCCGGCAACCT
+CCTAGGTAATGACCATATTTACAATGTCATCGTCACAGCCCATGCATTCGTAATAATTTTTTTCATGGTC
+ATGCCCATAATAATTGGAGGCTTTGGCAACTGACTAGTGCCCCTGATAATTGGCGCCCCTGATATGGCAT
+TCCCGCGCATAAATAACATAAGCTTCTGACTCCTCCTCCCCTCCTTCCTCCTATTACTCGCTTCTGCTAC
+AGTAGAGGCCGGAGCAGGAACGGGCTGAACAGTCTATCCACCCCTAGCAGGAAACTACTCTCACCCAGGA
+GCCTCTGTAGACTTGACAATCTTCTCTCTACACCTAGCAGGCATTTCCTCAATTCTAGGGGCTATCAATT
+TCATTACAACAATTATTAATATAAAACCCCCTGCAATATCCCAATATCAAACTCCCCTCTTCGTCTGATC
+AATCCTGATCACAGCAGTCCTACTTCTCCTCTCCCTCCCAGTCCTAGCCGCTGGCATCACCATACTACTA
+ACAGACCGCAACTTAAATACTACATTCTTTGACCCGGCTGGAGGTGGGGATCCTATCCTATACCAACACT
+TATTCTGATTTTTCGGCCACCCTGAAGTCTACATTCTCATCCTACCAGGTTTCGGCATAATCTCCCACAT
+CGTAACACACTACTCCGGAAAAAAAGAACCATTTGGGTATATAGGCATAGTCTGAGCCATAGTCTCAATT
+GGTTTCCTGGGTTTTATCGTATGAGCCCACCACATATTCACAGTAGGGATAGACGTGGACACACGAGCCT
+ACTTCACCTCCGCTACCATAATTATTGCCATCCCCACCGGCGTCAAAGTATTTAGCTGACTCGCTACACT
+CCACGGAAGCAACACTAAATGATCTGCCGCAATCCTCTGAGCCTTAGGATTCATTTTCCTCTTCACCGTA
+GGCGGCTTAACAGGCATCGTACTGGCAAACTCATCACTAGACATCGTATTACACGATACATACTACGTTG
+TAGCCCACTTTCACTACGTCTTATCAATAGGAGCTGTATTCGCCATCATAGGAGGCTTCATCCACTGGTT
+CCCACTATTCTCAGGCTACACCTTAAACCAGACCTATGCTAAAATTCACTTCATCACCATATTTGTCGGC
+GTAAATTTAACCTTCTTCCCGCAACATTTCCTTGGCCTATCAGGTATACCCCGACGCTACTCCGATTACC
+CCGACGCATATACCACATGAAATATTTTATCATCCGCAGGCTCATTTATCTCCCTAACAGCAGTTATACT
+AATAATTTTCATAATTTGAGAAGCCTTTGCCTCAAAACGAAAAGTCCCAATAATTGAACAACCTTCCACA
+AGCCTAGAGTGGTTATACGGATGCCCCCCACCCTACCATACGTTTGAAGAACCCGTCTATATAAAACCCG
b
diff -r 0cdcf7e99b62 -r 1faba1aa14c1 test-data/prot.accession2taxid
--- a/test-data/prot.accession2taxid Mon Nov 10 15:12:53 2025 +0000
+++ b/test-data/prot.accession2taxid Fri Dec 12 11:14:34 2025 +0000
b
@@ -1,4 +1,6 @@
 accession accession.version taxid gi
-AAD44166 AAD44166.1 2 5524211
-AAD44167 AAD44167.1 3 5524212
-
+YP_514675 YP_514675.1 8 3950761
+YP_009047267 YP_009047267.1 16 19893533
+NP_059333 NP_059333.1 24 140539
+YP_003024028 YP_003024028.1 31 4512
+NP_904330 NP_904330.1 35 17708
b
diff -r 0cdcf7e99b62 -r 1faba1aa14c1 test-data/protein.fasta
--- a/test-data/protein.fasta Mon Nov 10 15:12:53 2025 +0000
+++ b/test-data/protein.fasta Fri Dec 12 11:14:34 2025 +0000
[
@@ -1,9 +1,12 @@
->sequence more text
-LCLYTHIGRNIYYGSYLYSETWNTGIMLLLITMATAFMGYVLPWGQMSFWGATVITNLFSAIPYIGTNLV
-EWIWGGFSVDKATLNRFFAFHFILFTMVALAGVHLTFLHETGSNNPLGLTSDSDKIPFHPYYTIKDFLG
-LLILILLLLLLALLSPDMLGDPDNHMPADPLNTPLHIKPEWYFLFAYAILRSVPNKLGGVLALFLSIVIL
-GLMPFLHTSKHRSMMLRPLSQALFWTLTMDLLTLTWIGSQPVEYPYTIIGQMASILYFSIILAFLPIAGX
-IENY
+>NP_008227.1 cytochrome c oxidase subunit I (mitochondrion) [Pongo pygmaeus]
+MFADRWLFSTNHKDIGTLYLLFGAWAGVLGTALSLLIRAELGQPGNLLGNDHIYNVIVTAHAFVMIFFMV
+MPMMIGGFGNWLVPLMIGAPDMAFPRMNNMSFWLLLPSFLLLLASATVEAGAGTGWTVYPPLAGNYSHPG
+ASVDLTIFSLHLAGISSILGAINFITTIINMKPPAMSQYQTPLFVWSILITAVLLLLSLPVLAAGITMLL
+TDRNLNTTFFDPAGGGDPILYQHLFWFFGHPEVYILILPGFGMISHIVTHYSGKKEPFGYMGMVWAMVSI
+GFLGFIVWAHHMFTVGMDVDTRAYFTSATMIIAIPTGVKVFSWLATLHGSNTKWSAAILWALGFIFLFTV
+GGLTGIVLANSSLDIVLHDTYYVVAHFHYVLSMGAVFAIMGGFIHWFPLFSGYTLNQTYAKIHFITMFVG
+VNLTFFPQHFLGLSGMPRRYSDYPDAYTTWNILSSAGSFISLTAVMLMIFMIWEAFASKRKVPMIEQPST
+SLEWLYGCPPPYHTFEEPVYMKP
 >shuffled sequence that should go to unaligned
 XLPLILMLLGISPGSFEHTVAGGIWTSLMLFLPGYPGVGFLMLLVITVPALNFKFGFMLL
 LKPTTNIIKTLVLALTHADDPLSFPWLNYMPPAADFNGLFTNAGATTTLYQIPYEGSFYL
b
diff -r 0cdcf7e99b62 -r 1faba1aa14c1 test-data/protein.fasta.gz
b
Binary file test-data/protein.fasta.gz has changed
b
diff -r 0cdcf7e99b62 -r 1faba1aa14c1 test-data/taxon.tsv
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/taxon.tsv Fri Dec 12 11:14:34 2025 +0000
b
@@ -0,0 +1,2 @@
+32523
+7898
\ No newline at end of file
b
diff -r 0cdcf7e99b62 -r 1faba1aa14c1 tool-data/blastdb_p.loc
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/blastdb_p.loc Fri Dec 12 11:14:34 2025 +0000
b
@@ -0,0 +1,57 @@
+# This is a sample file distributed with Galaxy that is used to define a
+# list of protein domain databases, using three columns tab separated
+# (longer whitespace are TAB characters):
+#
+# <unique_id>{tab}<database_caption>{tab}<base_name_path>
+#
+# The captions typically contain spaces and might end with the build date.
+# It is important that the actual database name does not have a space in
+# it, and that there are only two tabs on each line.
+#
+# You can download the NCBI provided databases as tar-balls from here:
+# ftp://ftp.ncbi.nih.gov/pub/mmdb/cdd/little_endian/
+#
+# For simplicity, many Galaxy servers are configured to offer just a live
+# version of each NCBI BLAST database (updated with the NCBI provided
+# Perl scripts or similar). In this case, we recommend using the case
+# sensitive base-name of the NCBI BLAST databases as the unique id.
+# Consistent naming is important for sharing workflows between Galaxy
+# servers.
+#
+# For example, consider the NCBI Conserved Domains Database (CDD), where
+# you have downloaded and decompressed the files under the directory
+# /data/blastdb/domains/ meaning at the command line BLAST+ would be
+# run as follows and would look at the files /data/blastdb/domains/Cdd.*:
+#
+# $ rpsblast -db /data/blastdb/domains/Cdd -query ...
+#
+# In this case use Cdd (title case to match the NCBI file naming) as the
+# unique id in the first column of blastdb_d.loc, giving an entry like
+# this:
+#
+# Cdd{tab}NCBI Conserved Domains Database (CDD){tab}/data/blastdb/domains/Cdd
+#
+# Your blastdb_d.loc file should include an entry per line for each "base name"
+# you have stored. For example:
+#
+# Cdd{tab}NCBI CDD{tab}/data/blastdb/domains/Cdd
+# Kog{tab}KOG (eukaryotes){tab}/data/blastdb/domains/Kog
+# Cog{tab}COG (prokaryotes){tab}/data/blastdb/domains/Cog
+# Pfam{tab}Pfam-A{tab}/data/blastdb/domains/Pfam
+# Smart{tab}SMART{tab}/data/blastdb/domains/Smart
+# Tigr{tab}TIGR{tab}/data/blastdb/domains/Tigr
+# Prk{tab}Protein Clusters database{tab}/data/blastdb/domains/Prk
+# ...etc...
+#
+# Alternatively, rather than a "live" mirror of the NCBI databases which
+# are updated automatically, for full reproducibility the Galaxy Team
+# recommend saving date-stamped copies of the databases. In this case
+# your blastdb_d.loc file should include an entry per line for each
+# version you have stored. For example:
+#
+# Cdd_05Jun2010{tab}NCBI CDD 05 Jun 2010{tab}/data/blastdb/domains/05Jun2010/Cdd
+# Cdd_15Aug2010{tab}NCBI CDD 15 Aug 2010{tab}/data/blastdb/domains/15Aug2010/Cdd
+# ...etc...
+#
+# See also blastdb.loc which is for any nucleotide BLAST database, and
+# blastdb_p.loc which is for any protein BLAST database.
\ No newline at end of file
b
diff -r 0cdcf7e99b62 -r 1faba1aa14c1 tool-data/ncbi_taxonomy.loc.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/ncbi_taxonomy.loc.sample Fri Dec 12 11:14:34 2025 +0000
b
@@ -0,0 +1,5 @@
+# Tab separated fields where
+# value is unique key
+# name is descriptive name
+# path is path to directory containing names.dmp and nodes.dmp files
+#value name path
b
diff -r 0cdcf7e99b62 -r 1faba1aa14c1 tool_data_table_conf.xml.sample
--- a/tool_data_table_conf.xml.sample Mon Nov 10 15:12:53 2025 +0000
+++ b/tool_data_table_conf.xml.sample Fri Dec 12 11:14:34 2025 +0000
b
@@ -1,8 +1,19 @@
 <!-- Use the file tool_data_table_conf.xml.oldlocstyle if you don't want to update your loc files as changed in revision 4550:535d276c92bc-->
 <tables>
-    <!-- Locations of indexes in the Bowtie mapper format -->
+
+    <table name="blastdb_p" comment_char="#" allow_duplicate_entries="False">
+        <columns>value, name, path</columns>
+        <file path="tool-data/blastdb_p.loc" />
+    </table>
+
     <table name="diamond_database" comment_char="#">
         <columns>value, name, db_path</columns>
         <file path="tool-data/diamond_database.loc" />
     </table>
+
+    <!-- Locations of taxonomy data downloaded from NCBI -->
+    <table name="ncbi_taxonomy" comment_char="#">
+        <columns>value, name, path</columns>
+        <file path="tool-data/ncbi_taxonomy.loc" />
+    </table>
 </tables>
b
diff -r 0cdcf7e99b62 -r 1faba1aa14c1 tool_data_table_conf.xml.test
--- a/tool_data_table_conf.xml.test Mon Nov 10 15:12:53 2025 +0000
+++ b/tool_data_table_conf.xml.test Fri Dec 12 11:14:34 2025 +0000
b
@@ -1,7 +1,18 @@
 <tables>
-    <!-- Locations of all fasta files required to build Diamond databases -->
+    <table name="blastdb_p" comment_char="#">
+        <columns>value, name, path</columns>
+        <file path="${__HERE__}/test-data/blastdb_p.loc" />
+    </table>
+
     <table name="diamond_database" comment_char="#">
         <columns>value, name, db_path</columns>
         <file path="${__HERE__}/test-data/diamond_database.loc" />
     </table>
+
+    <!-- Locations of taxonomy data downloaded from NCBI -->
+    <table name="ncbi_taxonomy" comment_char="#">
+        <columns>value, name, path</columns>
+        <file path="${__HERE__}/test-data/ncbi_taxonomy.loc" />
+    </table>
+    
 </tables>