Repository 'profile2cami'
hg clone https://toolshed.g2.bx.psu.edu/repos/iuc/profile2cami

Changeset 0:0fd79958fac6 (2024-07-26)
Next changeset 1:b6dd55c620f8 (2024-08-14)
Commit message:
planemo upload for repository https://github.com/shenwei356/taxonkit commit 695ea582a8d3bf7845dd4cddbc8b591e4b6c4e82
added:
macros.xml
taxonkit_profile2cami.xml
test-data/abundance.tsv
test-data/ncbi_taxonomy.loc.test
test-data/output1_basic_functionality.tsv
test-data/output2_percentage_flag.tsv
test-data/output3_recompute_abd.tsv
test-data/output4_all_param.tsv
test-data/test-db/delnodes.dmp
test-data/test-db/division.dmp
test-data/test-db/gc.prt
test-data/test-db/gencode.dmp
test-data/test-db/merged.dmp
test-data/test-db/names.dmp
test-data/test-db/nodes.dmp
test-data/test-db/readme.txt
tool-data/ncbi_taxonomy.loc.sample
tool_data_table_conf.xml.sample
tool_data_table_conf.xml.test
b
diff -r 000000000000 -r 0fd79958fac6 macros.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml Fri Jul 26 09:26:02 2024 +0000
b
@@ -0,0 +1,22 @@
+ <macros>
+    <!-- Tokens substituted into the tool XML that imports this file. -->
+    <token name="@TOOL_VERSION@">0.17.0</token>
+    <token name="@VERSION_SUFFIX@">0</token>
+    <token name="@PROFILE@">21.05</token>
+
+    <!-- Package requirement pinned to @TOOL_VERSION@; the yield lets an importing tool append further requirements. -->
+    <xml name="requirements">
+        <requirements>
+            <requirement type="package" version="@TOOL_VERSION@">taxonkit</requirement>
+            <yield/>
+        </requirements>
+    </xml>
+
+    <!-- Cross-reference to the bio.tools registry entry. -->
+    <xml name="biotools">
+        <xrefs>
+            <xref type="bio.tools">taxonkit</xref>
+        </xrefs>
+    </xml>
+
+    <!-- Citation by DOI; the yield lets an importing tool append further citations. -->
+    <xml name="citations">
+        <citations>
+            <citation type="doi">10.1016/j.jgg.2021.03.006</citation>
+            <yield/>
+        </citations>
+    </xml>
+</macros>
\ No newline at end of file
b
diff -r 000000000000 -r 0fd79958fac6 taxonkit_profile2cami.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/taxonkit_profile2cami.xml Fri Jul 26 09:26:02 2024 +0000
[
@@ -0,0 +1,111 @@
+<tool id="profile2cami" name="Profile2CAMI" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
+    <!-- Galaxy wrapper around "taxonkit profile2cami"; version tokens, requirements, xrefs and citations are defined in macros.xml. -->
+    <description>Convert metagenomic profile table to CAMI format</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="biotools"/>
+    <expand macro="requirements"/>
+    <command detect_errors="exit_code">
+    <![CDATA[
+        taxonkit profile2cami
+        --data-dir '${taxonomy.fields.path}'
+        --abundance-field '${abundance_field}'
+        --taxid-field '${taxid_field}'
+
+        ## Each boolean expands to its flag when checked and to an empty string otherwise.
+        $percentage
+        $recompute_abd
+        $keep_zero
+        $no_sum_up
+
+        ## Optional values are only passed when the user supplied them.
+        #if $sample_id:
+            -s '${sample_id}'
+        #end if
+        #if $taxonomy_id:
+            -t '${taxonomy_id}'
+        #end if
+        #if $ranks:
+            --show-rank '${ranks}'
+        #end if
+        ## Quoted like every other value so the dataset path is always passed as a single argument.
+        '${input_file}'
+        > '${cami_output}'
+    ]]>
+    </command>
+    <inputs>
+        <param name="input_file" type="data" format="txt" label="Input Profile File" help="A tab-delimited profile file with TaxId and abundance columns." />
+        <!-- The path column of the selected ncbi_taxonomy entry is handed to taxonkit as its data directory. -->
+        <param name="taxonomy" argument="--data-dir" type="select" label="NCBI taxonomy" help="NCBI taxonomy database used to resolve the lineage of each TaxId in the profile.">
+            <options from_data_table="ncbi_taxonomy">
+                <validator message="No NCBI database is available" type="no_options"/>
+            </options>
+        </param>
+        <!-- Field indices are 1-based column numbers, hence the lower bound. -->
+        <param name="abundance_field" type="integer" min="1" value="2" label="Abundance Field Index" help="Field index of abundance in the input data." />
+        <param name="taxid_field" type="integer" min="1" value="1" label="TaxId Field Index" help="Field index of TaxId in the input data." />
+        <param name="percentage" type="boolean" checked="false" label="Abundance in Percentage" help="Check if the abundance values are in percentage." truevalue="-p" falsevalue=""/>
+        <param name="recompute_abd" type="boolean" checked="false" label="Recompute Abundance" help="Check to recompute abundance if some TaxIds are deleted in the current taxonomy version." truevalue="-R" falsevalue=""/>
+        <param name="keep_zero" type="boolean" checked="false" label="Keep Zero Abundances" help="Check to keep taxons with abundance of zero." truevalue="-0" falsevalue=""/>
+        <param name="no_sum_up" type="boolean" checked="false" label="Do Not Sum Up Abundance" help="Do not sum up abundance from child to parent TaxIds." truevalue="-S" falsevalue=""/>
+        <param name="sample_id" type="text" value="" label="Sample ID" help="Optional sample ID to include in the result file." />
+        <param name="taxonomy_id" type="text" value="" label="Taxonomy ID" help="Optional taxonomy ID to include in the result file." />
+        <!-- The explicit name is kept because the command template refers to this parameter as "ranks". -->
+        <param name="ranks" argument="--show-rank" type="select" multiple="true" label="Show Ranks" help="Specify the ranks to show in the result file (default [superkingdom,phylum,class,order,family,genus,species,strain]).">
+            <option value="superkingdom">Superkingdom</option>
+            <option value="phylum">Phylum</option>
+            <option value="class">Class</option>
+            <option value="order">Order</option>
+            <option value="family">Family</option>
+            <option value="genus">Genus</option>
+            <option value="species">Species</option>
+            <option value="strain">Strain</option>
+        </param>
+    </inputs>
+    <outputs>
+        <data name="cami_output" format="tsv" label="${tool.name} on ${on_string}" />
+    </outputs>
+    <tests>
+        <!-- Test 1: Basic functionality with default parameters -->
+        <test expect_num_outputs="1">
+            <param name="input_file" value="abundance.tsv" ftype="tsv" />
+            <output name="cami_output" file="output1_basic_functionality.tsv" />
+        </test>
+
+        <!-- Test 2: Using percentage flag -->
+        <test expect_num_outputs="1">
+            <param name="input_file" value="abundance.tsv" ftype="tsv" />
+            <param name="percentage" value="true" />
+            <output name="cami_output" file="output2_percentage_flag.tsv" />
+        </test>
+
+        <!-- Test 3: Recomputing abundance with deleted TaxIds -->
+        <test expect_num_outputs="1">
+            <param name="input_file" value="abundance.tsv" ftype="tsv" />
+            <param name="recompute_abd" value="true" />
+            <output name="cami_output" file="output3_recompute_abd.tsv" />
+        </test>
+
+        <!-- Test 4: Profile2Cami with all parameters checked -->
+        <test expect_num_outputs="1">
+            <param name="input_file" value="abundance.tsv" ftype="tsv" />
+            <param name="percentage" value="true" />
+            <param name="recompute_abd" value="true" />
+            <param name="keep_zero" value="true" />
+            <param name="no_sum_up" value="true" />
+            <output name="cami_output" file="output4_all_param.tsv" />
+        </test>
+
+        <!-- Test 5: Optional sample and taxonomy identifiers end up in the CAMI header -->
+        <test expect_num_outputs="1">
+            <param name="input_file" value="abundance.tsv" ftype="tsv" />
+            <param name="sample_id" value="sample1" />
+            <param name="taxonomy_id" value="2021-10-01" />
+            <output name="cami_output">
+                <assert_contents>
+                    <has_line line="@SampleID:sample1" />
+                    <has_line line="@TaxonomyID:2021-10-01" />
+                </assert_contents>
+            </output>
+        </test>
+    </tests>
+    <help>
+    <![CDATA[
+            **What is Profile2CAMI**
+
+            Profile2CAMI is a tool for converting metagenomic profile tables to CAMI format.
+
+            **Inputs**
+
+            - A tab-delimited profile file with TaxId and abundance columns.
+            - An NCBI taxonomy database selected from the ``ncbi_taxonomy`` data table.
+
+            **Outputs**
+
+            - A CAMI formatted file.
+
+            For more information, please refer to the TaxonKit documentation: https://bioinf.shenwei.me/taxonkit/usage/#profile2cami
+    ]]>
+    </help>
+    <expand macro="citations" />
+</tool>
\ No newline at end of file
b
diff -r 000000000000 -r 0fd79958fac6 test-data/abundance.tsv
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/abundance.tsv Fri Jul 26 09:26:02 2024 +0000
b
@@ -0,0 +1,4 @@
+83333 0.2 merged to 562
+83333 0.2 absord 562
+561 0.5 no change
+91347 0.1 deleted
\ No newline at end of file
b
diff -r 000000000000 -r 0fd79958fac6 test-data/ncbi_taxonomy.loc.test
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/ncbi_taxonomy.loc.test Fri Jul 26 09:26:02 2024 +0000
b
@@ -0,0 +1,1 @@
+test-db-tox Test Database ${__HERE__}/test-db
\ No newline at end of file
b
diff -r 000000000000 -r 0fd79958fac6 test-data/output1_basic_functionality.tsv
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/output1_basic_functionality.tsv Fri Jul 26 09:26:02 2024 +0000
b
@@ -0,0 +1,12 @@
+@SampleID:
+@Version:0.10.0
+@Ranks:superkingdom|phylum|class|order|family|genus|species|strain
+@TaxonomyID:
+@@TAXID RANK TAXPATH TAXPATHSN PERCENTAGE
+2 superkingdom 2 Bacteria 100.000000000000000
+1224 phylum 2|1224 Bacteria|Proteobacteria 100.000000000000000
+1236 class 2|1224|1236 Bacteria|Proteobacteria|Gammaproteobacteria 100.000000000000000
+91347 order 2|1224|1236|91347 Bacteria|Proteobacteria|Gammaproteobacteria|Enterobacteriales 100.000000000000000
+543 family 2|1224|1236|91347|543 Bacteria|Proteobacteria|Gammaproteobacteria|Enterobacteriales|Enterobacteriaceae 90.000000000000000
+561 genus 2|1224|1236|91347|543|561 Bacteria|Proteobacteria|Gammaproteobacteria|Enterobacteriales|Enterobacteriaceae|Escherichia 90.000000000000000
+562 species 2|1224|1236|91347|543|561|562 Bacteria|Proteobacteria|Gammaproteobacteria|Enterobacteriales|Enterobacteriaceae|Escherichia|Escherichia coli 40.000000000000000
b
diff -r 000000000000 -r 0fd79958fac6 test-data/output2_percentage_flag.tsv
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/output2_percentage_flag.tsv Fri Jul 26 09:26:02 2024 +0000
b
@@ -0,0 +1,12 @@
+@SampleID:
+@Version:0.10.0
+@Ranks:superkingdom|phylum|class|order|family|genus|species|strain
+@TaxonomyID:
+@@TAXID RANK TAXPATH TAXPATHSN PERCENTAGE
+2 superkingdom 2 Bacteria 1.000000000000000
+1224 phylum 2|1224 Bacteria|Proteobacteria 1.000000000000000
+1236 class 2|1224|1236 Bacteria|Proteobacteria|Gammaproteobacteria 1.000000000000000
+91347 order 2|1224|1236|91347 Bacteria|Proteobacteria|Gammaproteobacteria|Enterobacteriales 1.000000000000000
+543 family 2|1224|1236|91347|543 Bacteria|Proteobacteria|Gammaproteobacteria|Enterobacteriales|Enterobacteriaceae 0.900000000000000
+561 genus 2|1224|1236|91347|543|561 Bacteria|Proteobacteria|Gammaproteobacteria|Enterobacteriales|Enterobacteriaceae|Escherichia 0.900000000000000
+562 species 2|1224|1236|91347|543|561|562 Bacteria|Proteobacteria|Gammaproteobacteria|Enterobacteriales|Enterobacteriaceae|Escherichia|Escherichia coli 0.400000000000000
b
diff -r 000000000000 -r 0fd79958fac6 test-data/output3_recompute_abd.tsv
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/output3_recompute_abd.tsv Fri Jul 26 09:26:02 2024 +0000
b
@@ -0,0 +1,12 @@
+@SampleID:
+@Version:0.10.0
+@Ranks:superkingdom|phylum|class|order|family|genus|species|strain
+@TaxonomyID:
+@@TAXID RANK TAXPATH TAXPATHSN PERCENTAGE
+2 superkingdom 2 Bacteria 190.000000000000000
+1224 phylum 2|1224 Bacteria|Proteobacteria 190.000000000000000
+1236 class 2|1224|1236 Bacteria|Proteobacteria|Gammaproteobacteria 190.000000000000000
+91347 order 2|1224|1236|91347 Bacteria|Proteobacteria|Gammaproteobacteria|Enterobacteriales 190.000000000000000
+543 family 2|1224|1236|91347|543 Bacteria|Proteobacteria|Gammaproteobacteria|Enterobacteriales|Enterobacteriaceae 90.000000000000000
+561 genus 2|1224|1236|91347|543|561 Bacteria|Proteobacteria|Gammaproteobacteria|Enterobacteriales|Enterobacteriaceae|Escherichia 90.000000000000000
+562 species 2|1224|1236|91347|543|561|562 Bacteria|Proteobacteria|Gammaproteobacteria|Enterobacteriales|Enterobacteriaceae|Escherichia|Escherichia coli 40.000000000000000
b
diff -r 000000000000 -r 0fd79958fac6 test-data/output4_all_param.tsv
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/output4_all_param.tsv Fri Jul 26 09:26:02 2024 +0000
b
@@ -0,0 +1,12 @@
+@SampleID:
+@Version:0.10.0
+@Ranks:superkingdom|phylum|class|order|family|genus|species|strain
+@TaxonomyID:
+@@TAXID RANK TAXPATH TAXPATHSN PERCENTAGE
+2 superkingdom 2 Bacteria 0.500000000000000
+1224 phylum 2|1224 Bacteria|Proteobacteria 0.500000000000000
+1236 class 2|1224|1236 Bacteria|Proteobacteria|Gammaproteobacteria 0.500000000000000
+91347 order 2|1224|1236|91347 Bacteria|Proteobacteria|Gammaproteobacteria|Enterobacteriales 0.500000000000000
+543 family 2|1224|1236|91347|543 Bacteria|Proteobacteria|Gammaproteobacteria|Enterobacteriales|Enterobacteriaceae 0.500000000000000
+561 genus 2|1224|1236|91347|543|561 Bacteria|Proteobacteria|Gammaproteobacteria|Enterobacteriales|Enterobacteriaceae|Escherichia 0.500000000000000
+562 species 2|1224|1236|91347|543|561|562 Bacteria|Proteobacteria|Gammaproteobacteria|Enterobacteriales|Enterobacteriaceae|Escherichia|Escherichia coli 0.400000000000000
b
diff -r 000000000000 -r 0fd79958fac6 test-data/test-db/delnodes.dmp
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test-db/delnodes.dmp Fri Jul 26 09:26:02 2024 +0000
b
b'@@ -0,0 +1,15000 @@\n+2923441\t|\n+2923440\t|\n+2923439\t|\n+2923438\t|\n+2923437\t|\n+2923436\t|\n+2923435\t|\n+2923434\t|\n+2923433\t|\n+2923432\t|\n+2923431\t|\n+2923430\t|\n+2923429\t|\n+2923428\t|\n+2923427\t|\n+2923426\t|\n+2923425\t|\n+2923424\t|\n+2923423\t|\n+2923422\t|\n+2923421\t|\n+2923420\t|\n+2923419\t|\n+2923418\t|\n+2923417\t|\n+2923416\t|\n+2923415\t|\n+2923414\t|\n+2923413\t|\n+2923412\t|\n+2923411\t|\n+2923410\t|\n+2923409\t|\n+2923408\t|\n+2923407\t|\n+2923406\t|\n+2923405\t|\n+2923404\t|\n+2923403\t|\n+2923402\t|\n+2923401\t|\n+2923400\t|\n+2923399\t|\n+2923398\t|\n+2923397\t|\n+2923396\t|\n+2923395\t|\n+2923394\t|\n+2923393\t|\n+2923392\t|\n+2923391\t|\n+2923390\t|\n+2923389\t|\n+2923388\t|\n+2923387\t|\n+2923386\t|\n+2923385\t|\n+2923384\t|\n+2923383\t|\n+2923382\t|\n+2923381\t|\n+2923380\t|\n+2923379\t|\n+2923378\t|\n+2923377\t|\n+2923376\t|\n+2923375\t|\n+2923374\t|\n+2923373\t|\n+2923372\t|\n+2923371\t|\n+2923370\t|\n+2923369\t|\n+2923367\t|\n+2923366\t|\n+2923365\t|\n+2923364\t|\n+2923363\t|\n+2923362\t|\n+2923361\t|\n+2923360\t|\n+2923359\t|\n+2923358\t|\n+2923357\t|\n+2923356\t|\n+2923355\t|\n+2923354\t|\n+2923353\t|\n+2923351\t|\n+2923350\t|\n+2923349\t|\n+2923348\t|\n+2923347\t|\n+2923346\t|\n+2923345\t|\n+2923344\t|\n+2923343\t|\n+2923342\t|\n+2923341\t|\n+2923340\t|\n+2923339\t|\n+2923338\t|\n+2923337\t|\n+2923336\t|\n+2923335\t|\n+2923334\t|\n+2923333\t|\n+2923332\t|\n+2923331\t|\n+2923330\t|\n+2923329\t|\n+2923328\t|\n+2923327\t|\n+2923326\t|\n+2923324\t|\n+2923323\t|\n+2923322\t|\n+2923321\t|\n+2923320\t|\n+2923319\t|\n+2923318\t|\n+2923317\t|\n+2923316\t|\n+2923315\t|\n+2923314\t|\n+2923313\t|\n+2923312\t|\n+2923311\t|\n+2923310\t|\n+2923309\t|\n+2923308\t|\n+2923307\t|\n+2923306\t|\n+2923305\t|\n+2923304\t|\n+2923303\t|\n+2923302\t|\n+2923301\t|\n+2923300\t|\n+2923299\t|\n+2923298\t|\n+2923297\t|\n+2923296\t|\n+2923295\t|\n+2923294\t|\n+2923293\t|\n+2923292\t|\n+2923291\t|\n+2923287\t|\n+2923286\t|\n+2923285\t|\n+2923284\t|\n+
2923283\t|\n+2923282\t|\n+2923281\t|\n+2923280\t|\n+2923279\t|\n+2923278\t|\n+2923277\t|\n+2923276\t|\n+2923275\t|\n+2923274\t|\n+2923273\t|\n+2923272\t|\n+2923271\t|\n+2923270\t|\n+2923269\t|\n+2923268\t|\n+2923267\t|\n+2923266\t|\n+2923264\t|\n+2923263\t|\n+2923262\t|\n+2923261\t|\n+2923260\t|\n+2923259\t|\n+2923258\t|\n+2923257\t|\n+2923256\t|\n+2923255\t|\n+2923254\t|\n+2923253\t|\n+2923252\t|\n+2923251\t|\n+2923250\t|\n+2923249\t|\n+2923247\t|\n+2923246\t|\n+2923245\t|\n+2923244\t|\n+2923243\t|\n+2923242\t|\n+2923241\t|\n+2923240\t|\n+2923239\t|\n+2923238\t|\n+2923237\t|\n+2923236\t|\n+2923235\t|\n+2923234\t|\n+2923233\t|\n+2923232\t|\n+2923231\t|\n+2923230\t|\n+2923229\t|\n+2923228\t|\n+2923227\t|\n+2923226\t|\n+2923225\t|\n+2923224\t|\n+2923223\t|\n+2923222\t|\n+2923221\t|\n+2923220\t|\n+2923219\t|\n+2923218\t|\n+2923217\t|\n+2923216\t|\n+2923215\t|\n+2923214\t|\n+2923213\t|\n+2923212\t|\n+2923211\t|\n+2923210\t|\n+2923209\t|\n+2923208\t|\n+2923207\t|\n+2923206\t|\n+2923205\t|\n+2923204\t|\n+2923203\t|\n+2923202\t|\n+2923201\t|\n+2923200\t|\n+2923199\t|\n+2923198\t|\n+2923197\t|\n+2923196\t|\n+2923195\t|\n+2923194\t|\n+2923193\t|\n+2923192\t|\n+2923191\t|\n+2923190\t|\n+2923189\t|\n+2923188\t|\n+2923187\t|\n+2923186\t|\n+2923185\t|\n+2923184\t|\n+2923183\t|\n+2923182\t|\n+2923181\t|\n+2923180\t|\n+2923179\t|\n+2923178\t|\n+2923177\t|\n+2923176\t|\n+2923175\t|\n+2923174\t|\n+2923173\t|\n+2923172\t|\n+2923171\t|\n+2923170\t|\n+2923169\t|\n+2923168\t|\n+2923167\t|\n+2923166\t|\n+2923165\t|\n+2923164\t|\n+2923163\t|\n+2923162\t|\n+2923161\t|\n+2923160\t|\n+2923159\t|\n+2923158\t|\n+2923157\t|\n+2923156\t|\n+2923155\t|\n+2923154\t|\n+2923153\t|\n+2923152\t|\n+2923151\t|\n+2923150\t|\n+2923149\t|\n+2923148\t|\n+2923147\t|\n+2923146\t|\n+2923145\t|\n+2923144\t|\n+2923143\t|\n+2923142\t|\n+2923141\t|\n+2923140\t|\n+2923139\t|\n+2923138\t|\n+2923137\t|\n+2923136\t|\n+2923135\t|\n+2923134\t|\n+2923133\t|\n+2923132\t|\n+2923131\t|\n+2923130\t|\n+2923129\t|\n+2923128\t|\
n+2923127\t|\n+2923126\t|\n+2923125\t|\n+2923124\t|\n+2923123\t|\n+2923122\t|\n+2923121\t|\n+2923120\t|\n+2923119\t|\n+2923118\t|\n+2923117\t|\n+2923116\t|\n+2923115\t|\n+2923114\t|\n+2923113\t|\n+2923112\t|\n+2923111\t|\n+2923110\t|\n+2923109\t|\n+2923108\t|\n+2923107\t|\n+2923106\t|\n+2923105\t|\n+2923104\t|\n+2923103\t|\n+2923102\t|\n+2923101\t|\n+2923100\t|\n+2923099\t|\n+2923098\t|\n+2923097\t|\n+2923096\t|\n+2923095\t|\n+2923094\t|\n+2923093\t|\n+2923092\t|\n+2923091\t|\n+2923090\t|\n+2923089\t|\n+2923088\t|\n+2923087\t|\n+2923086\t|\n+2923085\t|\n+2923084\t|\n+2923083\t|\n+2923082\t|\n+2923081\t|\n+2923080\t|\n+2923078\t|\n+2923077\t|\n+2923076\t|\n+2923075\t|\n+2923074\t|\n+2923073\t|\n+2923072\t|\n+2923071'..b'302\t|\n+2901301\t|\n+2901300\t|\n+2901297\t|\n+2901292\t|\n+2901289\t|\n+2901278\t|\n+2901277\t|\n+2901276\t|\n+2901274\t|\n+2901273\t|\n+2901272\t|\n+2901265\t|\n+2901256\t|\n+2901255\t|\n+2901254\t|\n+2901253\t|\n+2901252\t|\n+2901251\t|\n+2901250\t|\n+2901249\t|\n+2901248\t|\n+2901247\t|\n+2901246\t|\n+2901245\t|\n+2901244\t|\n+2901243\t|\n+2901242\t|\n+2901238\t|\n+2901237\t|\n+2901235\t|\n+2901234\t|\n+2901233\t|\n+2901232\t|\n+2901231\t|\n+2901230\t|\n+2901229\t|\n+2901222\t|\n+2901221\t|\n+2901220\t|\n+2901219\t|\n+2901218\t|\n+2901217\t|\n+2901216\t|\n+2901215\t|\n+2901214\t|\n+2901213\t|\n+2901212\t|\n+2901211\t|\n+2901210\t|\n+2901208\t|\n+2901207\t|\n+2901206\t|\n+2901205\t|\n+2901204\t|\n+2901202\t|\n+2901201\t|\n+2901200\t|\n+2901199\t|\n+2901198\t|\n+2901197\t|\n+2901193\t|\n+2901191\t|\n+2901188\t|\n+2901186\t|\n+2901185\t|\n+2901184\t|\n+2901183\t|\n+2901182\t|\n+2901181\t|\n+2901180\t|\n+2901179\t|\n+2901178\t|\n+2901173\t|\n+2901171\t|\n+2901170\t|\n+2901169\t|\n+2901168\t|\n+2901167\t|\n+2901166\t|\n+2901165\t|\n+2901164\t|\n+2901163\t|\n+2901162\t|\n+2901161\t|\n+2901160\t|\n+2901159\t|\n+2901157\t|\n+2901156\t|\n+2901155\t|\n+2901147\t|\n+2901145\t|\n+2901144\t|\n+2901143\t|\n+2901139\t|\n+2901138\t|\n+2901137\t|\n+2901136\t|\n+2
901135\t|\n+2901134\t|\n+2901133\t|\n+2901132\t|\n+2901131\t|\n+2901130\t|\n+2901129\t|\n+2901128\t|\n+2901127\t|\n+2901126\t|\n+2901125\t|\n+2901124\t|\n+2901123\t|\n+2901122\t|\n+2901121\t|\n+2901120\t|\n+2901119\t|\n+2901118\t|\n+2901117\t|\n+2901116\t|\n+2901115\t|\n+2901114\t|\n+2901113\t|\n+2901112\t|\n+2901111\t|\n+2901110\t|\n+2901109\t|\n+2901108\t|\n+2901107\t|\n+2901106\t|\n+2901105\t|\n+2901104\t|\n+2901103\t|\n+2901102\t|\n+2901101\t|\n+2901100\t|\n+2901099\t|\n+2901098\t|\n+2901097\t|\n+2901096\t|\n+2901095\t|\n+2901094\t|\n+2901093\t|\n+2901092\t|\n+2901091\t|\n+2901090\t|\n+2901089\t|\n+2901088\t|\n+2901087\t|\n+2901086\t|\n+2901085\t|\n+2901084\t|\n+2901083\t|\n+2901082\t|\n+2901081\t|\n+2901080\t|\n+2901079\t|\n+2901078\t|\n+2901077\t|\n+2901076\t|\n+2901075\t|\n+2901074\t|\n+2901073\t|\n+2901072\t|\n+2901071\t|\n+2901070\t|\n+2901069\t|\n+2901068\t|\n+2901067\t|\n+2901066\t|\n+2901065\t|\n+2901064\t|\n+2901063\t|\n+2901062\t|\n+2901061\t|\n+2901060\t|\n+2901059\t|\n+2901058\t|\n+2901057\t|\n+2901056\t|\n+2901055\t|\n+2901054\t|\n+2901053\t|\n+2901052\t|\n+2901051\t|\n+2901050\t|\n+2901049\t|\n+2901048\t|\n+2901047\t|\n+2901046\t|\n+2901045\t|\n+2901044\t|\n+2901043\t|\n+2901042\t|\n+2901041\t|\n+2901040\t|\n+2901039\t|\n+2901038\t|\n+2901037\t|\n+2901036\t|\n+2901035\t|\n+2901034\t|\n+2901033\t|\n+2901032\t|\n+2901031\t|\n+2901030\t|\n+2901029\t|\n+2901028\t|\n+2901027\t|\n+2901026\t|\n+2901025\t|\n+2901024\t|\n+2901023\t|\n+2901022\t|\n+2901021\t|\n+2901020\t|\n+2901019\t|\n+2901018\t|\n+2901017\t|\n+2901016\t|\n+2901015\t|\n+2901014\t|\n+2901013\t|\n+2901012\t|\n+2901011\t|\n+2901010\t|\n+2901009\t|\n+2901008\t|\n+2901007\t|\n+2901006\t|\n+2901005\t|\n+2901004\t|\n+2901003\t|\n+2901002\t|\n+2901001\t|\n+2901000\t|\n+2900999\t|\n+2900998\t|\n+2900997\t|\n+2900996\t|\n+2900995\t|\n+2900994\t|\n+2900993\t|\n+2900992\t|\n+2900991\t|\n+2900990\t|\n+2900989\t|\n+2900988\t|\n+2900987\t|\n+2900986\t|\n+2900985\t|\n+2900984\t|\n+2900983\t|\n+2900982\t|\n
+2900981\t|\n+2900980\t|\n+2900979\t|\n+2900978\t|\n+2900977\t|\n+2900976\t|\n+2900975\t|\n+2900974\t|\n+2900973\t|\n+2900972\t|\n+2900971\t|\n+2900970\t|\n+2900969\t|\n+2900968\t|\n+2900967\t|\n+2900966\t|\n+2900965\t|\n+2900964\t|\n+2900963\t|\n+2900962\t|\n+2900961\t|\n+2900960\t|\n+2900959\t|\n+2900958\t|\n+2900957\t|\n+2900956\t|\n+2900955\t|\n+2900954\t|\n+2900953\t|\n+2900952\t|\n+2900951\t|\n+2900950\t|\n+2900949\t|\n+2900948\t|\n+2900947\t|\n+2900946\t|\n+2900945\t|\n+2900944\t|\n+2900943\t|\n+2900942\t|\n+2900941\t|\n+2900940\t|\n+2900939\t|\n+2900938\t|\n+2900937\t|\n+2900936\t|\n+2900935\t|\n+2900934\t|\n+2900933\t|\n+2900932\t|\n+2900931\t|\n+2900930\t|\n+2900929\t|\n+2900928\t|\n+2900927\t|\n+2900926\t|\n+2900925\t|\n+2900924\t|\n+2900923\t|\n+2900922\t|\n+2900921\t|\n+2900920\t|\n+2900919\t|\n+2900918\t|\n+2900917\t|\n+2900916\t|\n+2900915\t|\n+2900914\t|\n+2900913\t|\n+2900912\t|\n+2900911\t|\n+2900910\t|\n+2900909\t|\n+2900908\t|\n+2900907\t|\n+2900906\t|\n+2900905\t|\n+2900904\t|\n+2900903\t|\n+2900902\t|\n+2900901\t|\n+2900900\t|\n+2900899\t|\n+2900898\t|\n+2900897\t|\n+2900896\t|\n+2900895\t|\n+2900894\t|\n+2900893\t|\n+2900892\t|\n+2900891\t|\n+2900890\t|\n+2900889\t|\n+2900888\t|\n+2900887\t|\n+2900886\t|\n+2900885\t|\n+2900884\t|\n+2900883\t|\n+2900882\t|\n+2900881\t|\n+2900880\t|\n+2900879\t|\n+2900878\t|\n+2900877\t|\n+2900876\t|\n+2900875\t|\n+2900874\t|\n+2900873\t|\n+2900872\t|\n+2900871\t|\n+2900870\t|\n'
b
diff -r 000000000000 -r 0fd79958fac6 test-data/test-db/division.dmp
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test-db/division.dmp Fri Jul 26 09:26:02 2024 +0000
b
@@ -0,0 +1,12 @@
+0 | BCT | Bacteria |   |
+1 | INV | Invertebrates |   |
+2 | MAM | Mammals |   |
+3 | PHG | Phages |   |
+4 | PLN | Plants and Fungi |   |
+5 | PRI | Primates |   |
+6 | ROD | Rodents |   |
+7 | SYN | Synthetic and Chimeric |   |
+8 | UNA | Unassigned | No species nodes should inherit this division assignment |
+9 | VRL | Viruses |   |
+10 | VRT | Vertebrates |   |
+11 | ENV | Environmental samples | Anonymous sequences cloned directly from the environment |
b
diff -r 000000000000 -r 0fd79958fac6 test-data/test-db/gc.prt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test-db/gc.prt Fri Jul 26 09:26:02 2024 +0000
b
b'@@ -0,0 +1,358 @@\n+--**************************************************************************\n+--  This is the NCBI genetic code table\n+--  Initial base data set from Andrzej Elzanowski while at PIR International\n+--  Addition of Eubacterial and Alternative Yeast by J.Ostell at NCBI\n+--  Base 1-3 of each codon have been added as comments to facilitate\n+--    readability at the suggestion of Peter Rice, EMBL\n+--  Later additions by Taxonomy Group staff at NCBI\n+--\n+--  Version 4.6\n+--     Renamed genetic code 24 to Rhabdopleuridae Mitochondrial\n+--\n+--  Version 4.5\n+--     Added Cephalodiscidae mitochondrial genetic code 33\n+--\n+--  Version 4.4\n+--     Added GTG as start codon for genetic code 3\n+--     Added Balanophoraceae plastid genetic code 32\n+--\n+--  Version 4.3\n+--     Change to CTG -> Leu in genetic codes 27, 28, 29, 30\n+--\n+--  Version 4.2\n+--     Added Karyorelict nuclear genetic code 27\n+--     Added Condylostoma nuclear genetic code 28\n+--     Added Mesodinium nuclear genetic code 29\n+--     Added Peritrich nuclear genetic code 30\n+--     Added Blastocrithidia nuclear genetic code 31\n+--\n+--  Version 4.1\n+--     Added Pachysolen tannophilus nuclear genetic code 26\n+--\n+--  Version 4.0\n+--     Updated version to reflect numerous undocumented changes:\n+--     Corrected start codons for genetic code 25\n+--     Name of new genetic code is Candidate Division SR1 and Gracilibacteria\n+--     Added candidate division SR1 nuclear genetic code 25\n+--     Added GTG as start codon for genetic code 24\n+--     Corrected Pterobranchia Mitochondrial genetic code (24)\n+--     Added genetic code 24, Pterobranchia Mitochondrial\n+--     Genetic code 11 is now Bacterial, Archaeal and Plant Plastid\n+--     Fixed capitalization of mitochondrial in codes 22 and 23\n+--     Added GTG, ATA, and TTG as alternative start codons to code 13\n+--\n+--  Version 3.9\n+--     Code 14 differs from code 9 only by translating UAA to Tyr rather 
than\n+--     STOP.  A recent study (Telford et al, 2000) has found no evidence that\n+--     the codon UAA codes for Tyr in the flatworms, but other opinions exist.\n+--     There are very few GenBank records that are translated with code 14,\n+--     but a test translation shows that retranslating these records with code\n+--     9 can cause premature terminations.  Therefore, GenBank will maintain\n+--     code 14 until further information becomes available.\n+--\n+--  Version 3.8\n+--     Added GTG start to Echinoderm mitochondrial code, code 9\n+--\n+--  Version 3.7\n+--     Added code 23 Thraustochytrium mitochondrial code\n+--        formerly OGMP code 93\n+--        submitted by Gertraude Berger, Ph.D.\n+--\n+--  Version 3.6\n+--     Added code 22 TAG-Leu, TCA-stop\n+--        found in mitochondrial DNA of Scenedesmus obliquus\n+--        submitted by Gertraude Berger, Ph.D.\n+--        Organelle Genome Megasequencing Program, Univ Montreal\n+--\n+--  Version 3.5\n+--     Added code 21, Trematode Mitochondrial\n+--       (as deduced from: Garey & Wolstenholme,1989; Ohama et al, 1990)\n+--     Added code 16, Chlorophycean Mitochondrial\n+--       (TAG can translated to Leucine instaed to STOP in chlorophyceans\n+--        and fungi)\n+--\n+--  Version 3.4\n+--     Added CTG,TTG as allowed alternate start codons in Standard code.\n+--        Prats et al. 1989, Hann et al. 1992\n+--\n+--  Version 3.3 - 10/13/95\n+--     Added alternate intiation codon ATC to code 5\n+--        based on complete mitochondrial genome of honeybee\n+--        Crozier and Crozier (1993)\n+--\n+--  Version 3.2 - 6/24/95\n+--  Code       Comments\n+--   10        Alternative Ciliate Macronuclear renamed to Euplotid Macro...\n+--   15        Blepharisma Macro.. code added\n+--    5        Invertebrate Mito.. 
GTG allowed as alternate initiator\n+--   11        Eubacterial renamed to Bacterial as most alternate starts\n+--               have been found in Archea\n+--\n+--\n+--  Version 3.1 - 1995\n+--  Updated as per Andrzej Elzanowski at NCBI\n+--     Complete documentation in NCBI'..b'5 ,\n+  ncbieaa  "FFLLSSSSYY**CCGWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",\n+  sncbieaa "---M------**-----------------------M---------------M------------"\n+  -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG\n+  -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG\n+  -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG\n+ } ,\n+ {\n+  name "Pachysolen tannophilus Nuclear" ,\n+  id 26 ,\n+  ncbieaa  "FFLLSSSSYY**CC*WLLLAPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",\n+  sncbieaa "----------**--*----M---------------M----------------------------"\n+  -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG\n+  -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG\n+  -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG\n+ } ,\n+ {\n+  name "Karyorelict Nuclear" ,\n+  id 27 ,\n+  ncbieaa  "FFLLSSSSYYQQCCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",\n+  sncbieaa "--------------*--------------------M----------------------------"\n+  -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG\n+  -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG\n+  -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG\n+ } ,\n+ {\n+  name "Condylostoma Nuclear" ,\n+  id 28 ,\n+  ncbieaa  "FFLLSSSSYYQQCCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",\n+  sncbieaa "----------**--*--------------------M----------------------------"\n+  -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG\n+  -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG\n+  -- Base3  
TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG\n+ } ,\n+ {\n+  name "Mesodinium Nuclear" ,\n+  id 29 ,\n+  ncbieaa  "FFLLSSSSYYYYCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",\n+  sncbieaa "--------------*--------------------M----------------------------"\n+  -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG\n+  -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG\n+  -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG\n+ } ,\n+ {\n+  name "Peritrich Nuclear" ,\n+  id 30 ,\n+  ncbieaa  "FFLLSSSSYYEECC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",\n+  sncbieaa "--------------*--------------------M----------------------------"\n+  -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG\n+  -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG\n+  -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG\n+ } ,\n+ {\n+  name "Blastocrithidia Nuclear" ,\n+  id 31 ,\n+  ncbieaa  "FFLLSSSSYYEECCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",\n+  sncbieaa "----------**-----------------------M----------------------------"\n+  -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG\n+  -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG\n+  -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG\n+ } ,\n+ {\n+  name "Balanophoraceae Plastid" ,\n+  id 32 ,\n+  ncbieaa  "FFLLSSSSYY*WCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",\n+  sncbieaa "---M------*---*----M------------MMMM---------------M------------"\n+  -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG\n+  -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG\n+  -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG\n+ } ,\n+ {\n+  name "Cephalodiscidae Mitochondrial" ,\n+  id 33 ,\n+  ncbieaa  
"FFLLSSSSYYY*CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSSKVVVVAAAADDEEGGGG",\n+  sncbieaa "---M-------*-------M---------------M---------------M------------"\n+  -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG\n+  -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG\n+  -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG\n+ }\n+}\n'
b
diff -r 000000000000 -r 0fd79958fac6 test-data/test-db/gencode.dmp
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test-db/gencode.dmp Fri Jul 26 09:26:02 2024 +0000
b
@@ -0,0 +1,28 @@
+0 | | Unspecified |                                                                   |                                                                   |
+1 | | Standard | FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG  | ---M------**--*----M---------------M----------------------------  |
+2 | | Vertebrate Mitochondrial | FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSS**VVVVAAAADDEEGGGG  | ----------**--------------------MMMM----------**---M------------  |
+3 | | Yeast Mitochondrial | FFLLSSSSYY**CCWWTTTTPPPPHHQQRRRRIIMMTTTTNNKKSSRRVVVVAAAADDEEGGGG  | ----------**----------------------MM---------------M------------  |
+4 | | Mold Mitochondrial; Protozoan Mitochondrial; Coelenterate Mitochondrial; Mycoplasma; Spiroplasma | FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG  | --MM------**-------M------------MMMM---------------M------------  |
+5 | | Invertebrate Mitochondrial | FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSSSVVVVAAAADDEEGGGG  | ---M------**--------------------MMMM---------------M------------  |
+6 | | Ciliate Nuclear; Dasycladacean Nuclear; Hexamita Nuclear | FFLLSSSSYYQQCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG  | --------------*--------------------M----------------------------  |
+9 | | Echinoderm Mitochondrial; Flatworm Mitochondrial | FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG  | ----------**-----------------------M---------------M------------  |
+10 | | Euplotid Nuclear | FFLLSSSSYY**CCCWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG  | ----------**-----------------------M----------------------------  |
+11 | | Bacterial, Archaeal and Plant Plastid | FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG  | ---M------**--*----M------------MMMM---------------M------------  |
+12 | | Alternative Yeast Nuclear | FFLLSSSSYY**CC*WLLLSPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG  | ----------**--*----M---------------M----------------------------  |
+13 | | Ascidian Mitochondrial | FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSGGVVVVAAAADDEEGGGG  | ---M------**----------------------MM---------------M------------  |
+14 | | Alternative Flatworm Mitochondrial | FFLLSSSSYYY*CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG  | -----------*-----------------------M----------------------------  |
+15 | | Blepharisma Macronuclear | FFLLSSSSYY*QCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG  | ----------*---*--------------------M----------------------------  |
+16 | | Chlorophycean Mitochondrial | FFLLSSSSYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG  | ----------*---*--------------------M----------------------------  |
+21 | | Trematode Mitochondrial | FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNNKSSSSVVVVAAAADDEEGGGG  | ----------**-----------------------M---------------M------------  |
+22 | | Scenedesmus obliquus mitochondrial | FFLLSS*SYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG  | ------*---*---*--------------------M----------------------------  |
+23 | | Thraustochytrium mitochondrial code | FF*LSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG  | --*-------**--*-----------------M--M---------------M------------  |
+24 | | Rhabdopleuridae Mitochondrial | FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSSKVVVVAAAADDEEGGGG | ---M------**-------M---------------M---------------M------------ |
+25 | | Candidate Division SR1 and Gracilibacteria | FFLLSSSSYY**CCGWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG  | ---M------**-----------------------M---------------M------------  |
+26 | | Pachysolen tannophilus Nuclear | FFLLSSSSYY**CC*WLLLAPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG  | ----------**--*----M---------------M----------------------------  |
+27 | | Karyorelict Nuclear | FFLLSSSSYYQQCCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG  | --------------*--------------------M----------------------------  |
+28 | | Condylostoma Nuclear | FFLLSSSSYYQQCCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG  | ----------**--*--------------------M----------------------------  |
+29 | | Mesodinium Nuclear | FFLLSSSSYYYYCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG  | --------------*--------------------M----------------------------  |
+30 | | Peritrich Nuclear | FFLLSSSSYYEECC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG  | --------------*--------------------M----------------------------  |
+31 | | Blastocrithidia Nuclear | FFLLSSSSYYEECCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG  | ----------**-----------------------M----------------------------  |
+32 | | Balanophoraceae Plastid | FFLLSSSSYY*WCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG  | ---M------*---*----M------------MMMM---------------M------------  |
+33 | | Cephalodiscidae Mitochondrial | FFLLSSSSYYY*CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSSKVVVVAAAADDEEGGGG  | ---M-------*-------M---------------M---------------M------------  |
b
diff -r 000000000000 -r 0fd79958fac6 test-data/test-db/merged.dmp
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test-db/merged.dmp Fri Jul 26 09:26:02 2024 +0000
b
@@ -0,0 +1,2 @@
+2824115 | 483329 |
+2923440 | 2824115   |
b
diff -r 000000000000 -r 0fd79958fac6 test-data/test-db/names.dmp
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test-db/names.dmp Fri Jul 26 09:26:02 2024 +0000
[
@@ -0,0 +1,75 @@
+83333 | Escherichia coli K-12 | | scientific name |
+83333 | Escherichia coli K12 | | equivalent name |
+562 | "Bacillus coli" Migula 1895 | | authority |
+562 | "Bacterium coli commune" Escherich 1885 | | authority |
+562 | "Bacterium coli" (Migula 1895) Lehmann and Neumann 1896 | | authority |
+562 | ATCC 11775 | | type material |
+562 | Bacillus coli | | synonym |
+562 | Bacterium coli | | synonym |
+562 | Bacterium coli commune | | synonym |
+562 | CCUG 24 | | type material |
+562 | CCUG 29300 | | type material |
+562 | CIP 54.8 | | type material |
+562 | DSM 30083 | | type material |
+562 | Enterococcus coli | | synonym |
+562 | Escherchia coli | | misspelling |
+562 | Escherichia coli | | scientific name |
+562 | Escherichia coli (Migula 1895) Castellani and Chalmers 1919 | | authority |
+562 | Escherichia sp. MAR | | includes |
+562 | Escherichia/Shigella coli | | equivalent name |
+562 | Eschericia coli | | misspelling |
+562 | JCM 1649 | | type material |
+562 | LMG 2092 | | type material |
+562 | NBRC 102203 | | type material |
+562 | NCCB 54008 | | type material |
+562 | NCTC 9001 | | type material |
+562 | bacterium 10a | | includes |
+562 | bacterium E3 | | includes |
+561 | Escherchia | | misspelling |
+561 | Escherichia | | scientific name |
+561 | Escherichia Castellani and Chalmers 1919 | | authority |
+543 | Enterobacteraceae | | synonym |
+543 | Enterobacteraceae (ex Lapage 1979) Lapage 1982, fam. nov., nom. rev. | | synonym |
+543 | Enterobacteriaceae | | scientific name |
+543 | Enterobacteriaceae (ex Rahn 1937) Ewing et al. 1980, fam. nov., nom. rev. | | synonym |
+543 | Enterobacteriaceae Rahn 1937 | | synonym |
+543 | gamma-3 proteobacteria | gamma-3 proteobacteria <#1> | in-part |
+91347 | 'Enterobacteriales' | | synonym |
+91347 | Enterobacteriaceae and related endosymbionts | | synonym |
+91347 | Enterobacteriaceae group | | synonym |
+91347 | Enterobacteriales | | scientific name |
+91347 | enterobacteria | enterobacteria<blast91347> | blast name |
+91347 | gamma-3 proteobacteria | gamma-3 proteobacteria <#5> | in-part |
+1236 | Gammaproteobacteria | | scientific name |
+1236 | Gammaproteobacteria Garrity et al. 2005 | | synonym |
+1236 | Proteobacteria gamma subdivision | | synonym |
+1236 | Purple bacteria, gamma subdivision | | synonym |
+1236 | g-proteobacteria | gamma proteos<blast1236> | blast name |
+1236 | gamma proteobacteria | | synonym |
+1236 | gamma subdivision | | synonym |
+1236 | gamma subgroup | | synonym |
+1224 | Proteobacteria | | scientific name |
+1224 | Proteobacteria Garrity et al. 2005 | | authority |
+1224 | Proteobacteria [class] Stackebrandt et al. 1988 | | authority |
+1224 | not Proteobacteria Cavalier-Smith 2002 | | authority |
+1224 | proteobacteria | proteobacteria<blast1224> | blast name |
+1224 | purple bacteria | | common name |
+1224 | purple bacteria and relatives | | common name |
+1224 | purple non-sulfur bacteria | | common name |
+1224 | purple photosynthetic bacteria | | common name |
+1224 | purple photosynthetic bacteria and relatives | | common name |
+2 | Bacteria | Bacteria <prokaryote> | scientific name |
+2 | Monera | Monera <Bacteria> | in-part |
+2 | Procaryotae | Procaryotae <Bacteria> | in-part |
+2 | Prokaryota | Prokaryota <Bacteria> | in-part |
+2 | Prokaryotae | Prokaryotae <Bacteria> | in-part |
+2 | bacteria | bacteria <blast2> | blast name |
+2 | eubacteria | | genbank common name |
+2 | not Bacteria Haeckel 1894 | | synonym |
+2 | prokaryote | prokaryote <Bacteria> | in-part |
+2 | prokaryotes | prokaryotes <Bacteria> | in-part |
+1 | all | | synonym |
+1 | root | | scientific name |
+131567 | biota | | synonym |
+131567 | cellular organisms | | scientific name |
+
b
diff -r 000000000000 -r 0fd79958fac6 test-data/test-db/nodes.dmp
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test-db/nodes.dmp Fri Jul 26 09:26:02 2024 +0000
b
@@ -0,0 +1,10 @@
+83333 | 562 | no rank | | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | |
+562 | 561 | species | EC | 0 | 1 | 11 | 1 | 0 | 1 | 1 | 0 | |
+561 | 543 | genus | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | |
+543 | 91347 | family | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | |
+91347 | 1236 | order | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | |
+1236 | 1224 | class | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | |
+1224 | 2 | phylum | | 0 | 1 | 11 | 1 | 0 | 1 | 0 | 0 | |
+2 | 131567 | superkingdom | | 0 | 0 | 11 | 0 | 0 | 0 | 0 | 0 | |
+131567 | 1 | no rank | | 8 | 1 | 1 | 1 | 0 | 1 | 1 | 0 | |
+1 | 1 | no rank | | 8 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | |
b
diff -r 000000000000 -r 0fd79958fac6 test-data/test-db/readme.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test-db/readme.txt Fri Jul 26 09:26:02 2024 +0000
b
@@ -0,0 +1,61 @@
+*.dmp files are bcp-like dumps from the GenBank taxonomy database.
+
+General information.
+Field terminator is "\t|\t"
+Row terminator is "\t|\n"
+
+nodes.dmp file consists of taxonomy nodes. The description for each node includes the following
+fields:
+ tax_id -- node id in GenBank taxonomy database
+  parent tax_id -- parent node id in GenBank taxonomy database
+  rank -- rank of this node (superkingdom, kingdom, ...) 
+  embl code -- locus-name prefix; not unique
+  division id -- see division.dmp file
+  inherited div flag  (1 or 0) -- 1 if node inherits division from parent
+  genetic code id -- see gencode.dmp file
+  inherited GC  flag  (1 or 0) -- 1 if node inherits genetic code from parent
+  mitochondrial genetic code id -- see gencode.dmp file
+  inherited MGC flag  (1 or 0) -- 1 if node inherits mitochondrial gencode from parent
+  GenBank hidden flag (1 or 0)            -- 1 if name is suppressed in GenBank entry lineage
+  hidden subtree root flag (1 or 0)       -- 1 if this subtree has no sequence data yet
+  comments -- free-text comments and citations
+
+Taxonomy names file (names.dmp):
+ tax_id -- the id of the node associated with this name
+ name_txt -- the name itself
+ unique name -- the unique variant of this name if the name is not unique
+ name class -- (synonym, common name, ...)
+
+Divisions file (division.dmp):
+ division id -- taxonomy database division id
+ division cde -- GenBank division code (three characters), e.g. BCT, PLN, VRT, MAM, PRI...
+ division name -- full division name, e.g. Bacteria, Plants and Fungi, Vertebrates, Mammals, Primates...
+ comments
+
+Genetic codes file (gencode.dmp):
+ genetic code id -- GenBank genetic code id
+ abbreviation -- genetic code name abbreviation
+ name -- genetic code name
+ cde -- translation table for this genetic code
+ starts -- start codons for this genetic code
+
+Deleted nodes file (delnodes.dmp):
+ tax_id -- deleted node id
+
+Merged nodes file (merged.dmp):
+ old_tax_id                              -- id of the node which has been merged
+ new_tax_id                              -- id of the node which is the result of the merge
+
+Citations file (citations.dmp):
+ cit_id -- the unique id of citation
+ cit_key -- citation key
+ pubmed_id -- unique id in PubMed database (0 if not in PubMed)
+ medline_id -- unique id in MedLine database (0 if not in MedLine)
+ url -- URL associated with citation
+ text -- any text (usually article name and authors).
+ -- The following characters are escaped in this text by a backslash:
+ -- newline (appear as "\n"),
+ -- tab character ("\t"),
+ -- double quotes ('\"'),
+ -- backslash character ("\\").
+ taxid_list -- list of node ids separated by a single space
b
diff -r 000000000000 -r 0fd79958fac6 tool-data/ncbi_taxonomy.loc.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/ncbi_taxonomy.loc.sample Fri Jul 26 09:26:02 2024 +0000
b
@@ -0,0 +1,2 @@
+#value name path
+# test-db-tox "Test Database"  tool-data/test-db
\ No newline at end of file
b
diff -r 000000000000 -r 0fd79958fac6 tool_data_table_conf.xml.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.sample Fri Jul 26 09:26:02 2024 +0000
b
@@ -0,0 +1,8 @@
+<?xml version="1.0"?>
+<tables>
+        <!-- Locations of taxonomy data downloaded from NCBI -->
+    <table name="ncbi_taxonomy" comment_char="#">
+        <columns>value, name, path</columns>
+        <file path="tool-data/ncbi_taxonomy.loc" />
+    </table>
+</tables>
b
diff -r 000000000000 -r 0fd79958fac6 tool_data_table_conf.xml.test
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.test Fri Jul 26 09:26:02 2024 +0000
b
@@ -0,0 +1,8 @@
+<?xml version="1.0"?>
+<tables>
+        <!-- Locations of taxonomy data downloaded from NCBI -->
+    <table name="ncbi_taxonomy" comment_char="#">
+        <columns>value, name, path</columns>
+        <file path="${__HERE__}/test-data/ncbi_taxonomy.loc.test" />
+    </table>
+</tables>