Repository 'combine_metaphlan2_humann2'
hg clone https://toolshed.g2.bx.psu.edu/repos/bebatut/combine_metaphlan2_humann2

Changeset 3:01ac9954c27f (2023-07-20)
Previous changeset 2:fdfb35745104 (2022-10-19) Next changeset 4:662a334004b4 (2023-11-04)
Commit message:
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/combine_metaphlan2_humann2 commit b84cbcbffa25c55acd1a31df76e4a4f78772cbd7
modified:
test-data/humann2_gene_families_input.tabular
added:
combine_metaphlan_humann.py
combine_metaphlan_humann.xml
test-data/humann36_gene_families_input.tabular
test-data/humann36_pathways_input.tabular
test-data/metaphlan4_input.txt
removed:
combine_metaphlan2_humann2.py
combine_metaphlan2_humann2.xml
b
diff -r fdfb35745104 -r 01ac9954c27f combine_metaphlan2_humann2.py
--- a/combine_metaphlan2_humann2.py Wed Oct 19 14:44:00 2022 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
@@ -1,107 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-import argparse
-
-
-def extract_clade_abundance(metaphlan2_fp):
-    clade_abund = {}
-    with open(metaphlan2_fp, "r") as metaphlan2_f:
-        for line in metaphlan2_f.readlines():
-            if line.find("g__") == -1:
-                continue
-
-            split_line = line[:-1].split("\t")
-            taxo = split_line[0]
-            abundance = split_line[1]
-
-            genus = taxo[(taxo.find("g__") + 3):]
-            if genus.find("|") != -1:
-                genus = genus[: (genus.find("|"))]
-            clade_abund.setdefault(genus, {"abundance": 0, "species": {}})
-            if taxo.find("t__") != -1:
-                continue
-            elif taxo.find("s__") != -1:
-                species = taxo[(taxo.find("s__") + 3):]
-                clade_abund[genus]["species"].setdefault(species, abundance)
-            else:
-                clade_abund[genus]["abundance"] = abundance
-    return clade_abund
-
-
-def compute_overall_abundance(humann2_fp):
-    overall_abundance = 0
-    with open(humann2_fp, "r") as humann2_f:
-        for line in humann2_f.readlines():
-            if line.find("|") != -1 or line.startswith("#"):
-                continue
-            split_line = line[:-1].split("\t")
-            overall_abundance += float(split_line[1])
-    return overall_abundance
-
-
-def format_characteristic_name(name):
-    formatted_n = name
-    formatted_n = formatted_n.replace("/", " ")
-    formatted_n = formatted_n.replace("-", " ")
-    formatted_n = formatted_n.replace("'", "")
-    if formatted_n.find("(") != -1 and formatted_n.find(")") != -1:
-        open_bracket = formatted_n.find("(")
-        close_bracket = formatted_n.find(")") + 1
-        formatted_n = formatted_n[:open_bracket] + formatted_n[close_bracket:]
-    return formatted_n
-
-
-def combine_metaphlan2_humann2(args):
-    clade_abund = extract_clade_abundance(args.metaphlan2_fp)
-    overall_abund = compute_overall_abundance(args.humann2_fp)
-
-    with open(args.output_fp, "w") as output_f:
-        s = "genus\tgenus_abundance\tspecies\tspecies_abundance\t"
-        s = "%s\t%s_id\t%s_name\t%s_abundance\n" % (s, args.type, args.type, args.type)
-        output_f.write(s)
-        with open(args.humann2_fp, "r") as humann2_f:
-            for line in humann2_f.readlines():
-                if line.find("|") == -1:
-                    continue
-
-                split_line = line[:-1].split("\t")
-                abundance = 100 * float(split_line[1]) / overall_abund
-                annotation = split_line[0].split("|")
-                charact = annotation[0].split(":")
-                charact_id = charact[0]
-                char_name = ""
-                if len(charact) > 1:
-                    char_name = format_characteristic_name(charact[-1])
-                taxo = annotation[1].split(".")
-
-                if taxo[0] == "unclassified":
-                    continue
-                genus = taxo[0][3:]
-                species = taxo[1][3:]
-
-                if genus not in clade_abund:
-                    print("no %s found in %s" % (genus, args.metaphlan2_fp))
-                    continue
-                if species not in clade_abund[genus]["species"]:
-                    print(
-                        "no %s found in %s for % s"
-                        % (species, args.metaphlan2_fp, genus)
-                    )
-                    continue
-
-                s = "%s\t%s\t" % (genus, clade_abund[genus]["abundance"])
-                s += "%s\t%s\t" % (species, clade_abund[genus]["species"][species])
-                s += "%s\t%s\t%s\n" % (charact_id, char_name, abundance)
-                output_f.write(s)
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--humann2_fp", required=True)
-    parser.add_argument("--metaphlan2_fp", required=True)
-    parser.add_argument("--output_fp", required=True)
-    parser.add_argument("--type", required=True, choices=["gene_families", "pathways"])
-    args = parser.parse_args()
-
-    combine_metaphlan2_humann2(args)
b
diff -r fdfb35745104 -r 01ac9954c27f combine_metaphlan2_humann2.xml
--- a/combine_metaphlan2_humann2.xml Wed Oct 19 14:44:00 2022 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
[
@@ -1,81 +0,0 @@
-<tool id="combine_metaphlan2_humann2" name="Combine MetaPhlAn2 and HUMAnN2 outputs" version="0.2.0">
-    <description>to relate genus/species abundances and gene families/pathways abundances</description>
-    <requirements>
-    </requirements>
-    <stdio>
-        <exit_code range="1:" />
-        <exit_code range=":-1" />
-    </stdio>
-    <version_command></version_command>
-    <command><![CDATA[
-        python $__tool_directory__/combine_metaphlan2_humann2.py 
-            --metaphlan2_fp '$metaphlan2_file'
-            --humann2_fp '$humann2_file'
-            --type '$type'   
-
-            #if str($type) == 'gene_families'
-                --output_fp '$gene_families_output_file'
-            #else
-                --output_fp '$pathway_output_file'
-            #end if
-    ]]></command>
-    <inputs>
-        <param name="metaphlan2_file" format="txt,tabular" type="data" label="Input file corresponding to MetaPhlAn2 output" help="The MetaPhlAn2 output file contains relative abundance of clades at different taxonomic levels (--metaphlan2_file)"/>
-        <param name="humann2_file" format="txt,tabular" type="data" label="Input file corresponding to HUMAnN2 output" help="The HUMAnN2 output file contains relative abundance of gene families or pathways with corresponding taxonomic stratification (--humann2_file)"/>
-        <param name='type' type="select" label="Type of characteristics in HUMAnN2 file" help="(--type)">
-            <option value="gene_families" selected="true">Gene families</option>
-            <option value="pathways">Pathways</option>
-        </param>
-    </inputs>
-    <outputs>
-        <data name="gene_families_output_file" format="tabular"
-            label="${tool.name} on ${on_string}: Gene family abundances related to genus/species abundances" >
-            <filter>type=="gene_families"</filter>
-        </data>
-        <data name="pathway_output_file" format="tabular"
-            label="${tool.name} on ${on_string}: Pathway abundances related to genus/species abundances" >
-            <filter>type=="pathways"</filter>
-        </data>
-    </outputs>
-    <tests>
-     <test>
-     <param name="metaphlan2_file" value="metaphlan2_input.txt"/>
-     <param name="humann2_file" value="humann2_gene_families_input.tabular"/>
-     <param name='type' value="gene_families"/>
-     <output name="gene_families_output_file">
-                <assert_contents>
-                    <has_n_columns n="8"/>
-                    <has_n_lines n="29434"/>
-                    <has_text text="Staphylococcus_epidermidis"/>
-                    <has_text text="Putative transposon Tn552 DNA invertase bin3"/>
-                    <has_size value="3467947"/>
-                </assert_contents>
-            </output>
-     </test>
-     <test>
-      <param name="metaphlan2_file" value="metaphlan2_input.txt"/>
-      <param name="humann2_file" value="humann2_pathways_input.tabular"/>
-      <param name='type' value="pathways"/>
-             <output name="pathway_output_file">
-                <assert_contents>
-                    <has_n_columns n="8"/>
-                    <has_n_lines n="1533"/>
-                    <has_text text="Rhodobacter_sphaeroides"/>
-                    <has_text text="superpathway of acetyl CoA biosynthesis"/>
-                    <has_size value="186363"/>
-                </assert_contents>
-            </output>
-     </test>
-    </tests>
-    <help><![CDATA[
-**What it does**
-
-This tool combine MetaPhlAn2 outputs and HUMANnN2 outputs.
-
-For each gene families/pathways and the corresponding taxonomic stratification, 
-you get relative abundance of this gene family/pathway and the relative abundance 
-of corresponding species and genus.
-    ]]></help>
-    <citations>
-    </citations>
-</tool>
\ No newline at end of file
b
diff -r fdfb35745104 -r 01ac9954c27f combine_metaphlan_humann.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/combine_metaphlan_humann.py Thu Jul 20 10:07:12 2023 +0000
[
@@ -0,0 +1,118 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import argparse
+
+
+def extract_clade_abundance(metaphlan_fp):
+    clade_abund = {}
+    with open(metaphlan_fp, "r") as metaphlan_f:
+        is_metaphlan_v4 = False
+        for line in metaphlan_f.readlines():
+            if 'SGB' in line:
+                # New versions of metaphlan against a recent DB contain a header line with DB name, which contains SGB
+                is_metaphlan_v4 = True
+            if line.find("g__") == -1:
+                continue
+
+            split_line = line[:-1].split("\t")
+            taxo = split_line[0]
+            if is_metaphlan_v4:
+                # Column order in new metaphlan versions:
+                # clade_name NCBI_tax_id relative_abundance additional_species
+                abundance = split_line[2]
+            else:
+                # Column order in the old metaphlan versions:
+                # clade_name relative_abundance coverage average_genome_length_in_the_clade estimated_number_of_reads_from_the_clade
+                abundance = split_line[1]
+
+            genus = taxo[(taxo.find("g__") + 3):]
+            if genus.find("|") != -1:
+                genus = genus[: (genus.find("|"))]
+            clade_abund.setdefault(genus, {"abundance": 0, "species": {}})
+            if taxo.find("t__") != -1:
+                continue
+            elif taxo.find("s__") != -1:
+                species = taxo[(taxo.find("s__") + 3):]
+                clade_abund[genus]["species"].setdefault(species, abundance)
+            else:
+                clade_abund[genus]["abundance"] = abundance
+    return clade_abund
+
+
+def compute_overall_abundance(humann_fp):
+    overall_abundance = 0
+    with open(humann_fp, "r") as humann_f:
+        for line in humann_f.readlines():
+            if line.find("|") != -1 or line.startswith("#"):
+                continue
+            split_line = line[:-1].split("\t")
+            overall_abundance += float(split_line[1])
+    return overall_abundance
+
+
+def format_characteristic_name(name):
+    formatted_n = name
+    formatted_n = formatted_n.replace("/", " ")
+    formatted_n = formatted_n.replace("-", " ")
+    formatted_n = formatted_n.replace("'", "")
+    if formatted_n.find("(") != -1 and formatted_n.find(")") != -1:
+        open_bracket = formatted_n.find("(")
+        close_bracket = formatted_n.find(")") + 1
+        formatted_n = formatted_n[:open_bracket] + formatted_n[close_bracket:]
+    return formatted_n
+
+
+def combine_metaphlan_humann(args):
+    clade_abund = extract_clade_abundance(args.metaphlan_fp)
+    overall_abund = compute_overall_abundance(args.humann_fp)
+
+    with open(args.output_fp, "w") as output_f:
+        s = "genus\tgenus_abundance\tspecies\tspecies_abundance\t"
+        s = "%s\t%s_id\t%s_name\t%s_abundance\n" % (s, args.type, args.type, args.type)
+        output_f.write(s)
+        with open(args.humann_fp, "r") as humann_f:
+            for line in humann_f.readlines():
+                if line.find("|") == -1:
+                    continue
+
+                split_line = line[:-1].split("\t")
+                abundance = 100 * float(split_line[1]) / overall_abund
+                annotation = split_line[0].split("|")
+                charact = annotation[0].split(":")
+                charact_id = charact[0]
+                char_name = ""
+                if len(charact) > 1:
+                    char_name = format_characteristic_name(charact[-1])
+                taxo = annotation[1].split(".")
+
+                if taxo[0] == "unclassified":
+                    continue
+                genus = taxo[0][3:]
+                species = taxo[1][3:]
+
+                if genus not in clade_abund:
+                    print("no %s found in %s" % (genus, args.metaphlan_fp))
+                    continue
+                if species not in clade_abund[genus]["species"]:
+                    print(
+                        "No %s found in %s for % s"
+                        % (species, args.metaphlan_fp, genus)
+                    )
+                    continue
+
+                s = "%s\t%s\t" % (genus, clade_abund[genus]["abundance"])
+                s += "%s\t%s\t" % (species, clade_abund[genus]["species"][species])
+                s += "%s\t%s\t%s\n" % (charact_id, char_name, abundance)
+                output_f.write(s)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--humann_fp", required=True)
+    parser.add_argument("--metaphlan_fp", required=True)
+    parser.add_argument("--output_fp", required=True)
+    parser.add_argument("--type", required=True, choices=["gene_families", "pathways"])
+    args = parser.parse_args()
+
+    combine_metaphlan_humann(args)
b
diff -r fdfb35745104 -r 01ac9954c27f combine_metaphlan_humann.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/combine_metaphlan_humann.xml Thu Jul 20 10:07:12 2023 +0000
[
@@ -0,0 +1,109 @@
+<tool id="combine_metaphlan_humann" name="Combine MetaPhlAn and HUMAnN outputs" version="0.3.0" profile="22.01">
+    <description>to relate genus/species abundances and gene families/pathways abundances</description>
+    <requirements>
+        <requirement type="package" version="3.10">python</requirement>
+    </requirements>
+    <version_command></version_command>
+    <command detect_errors="aggressive"><![CDATA[
+        python '$__tool_directory__/combine_metaphlan_humann.py'
+            --metaphlan_fp '$metaphlan_file'
+            --humann_fp '$humann_file'
+            --type '$type'   
+
+            #if str($type) == 'gene_families'
+                --output_fp '$gene_families_output_file'
+            #else
+                --output_fp '$pathway_output_file'
+            #end if
+    ]]></command>
+    <inputs>
+        <param argument="--metaphlan_file" format="txt,tabular" type="data" label="Input file corresponding to MetaPhlAn output" help="The MetaPhlAn output file contains relative abundance of clades at different taxonomic levels"/>
+        <param argument="--humann_file" format="txt,tabular" type="data" label="Input file corresponding to HUMAnN output" help="The HUMAnN output file contains relative abundance of gene families or pathways with corresponding taxonomic stratification"/>
+        <param argument='--type' type="select" label="Type of characteristics in HUMAnN file">
+            <option value="gene_families" selected="true">Gene families</option>
+            <option value="pathways">Pathways</option>
+        </param>
+    </inputs>
+    <outputs>
+        <data name="gene_families_output_file" format="tabular"
+            label="${tool.name} on ${on_string}: Gene family abundances related to genus/species abundances" >
+            <filter>type=="gene_families"</filter>
+        </data>
+        <data name="pathway_output_file" format="tabular"
+            label="${tool.name} on ${on_string}: Pathway abundances related to genus/species abundances" >
+            <filter>type=="pathways"</filter>
+        </data>
+    </outputs>
+    <tests>
+        <!-- Using the output of the old (i.e. v2) HUMAnN and MetaPhlAn -->
+     <test expect_num_outputs="1">
+     <param name="metaphlan_file" value="metaphlan2_input.txt"/>
+     <param name="humann_file" value="humann2_gene_families_input.tabular"/>
+     <param name='type' value="gene_families"/>
+     <output name="gene_families_output_file">
+                <assert_contents>
+                    <has_n_columns n="8"/>
+                    <has_n_lines n="3043"/>
+                    <has_text text="Staphylococcus_epidermidis"/>
+                    <has_text text="Putative transposon Tn552 DNA invertase bin3"/>
+                    <has_size value="374787"/>
+                </assert_contents>
+            </output>
+     </test>
+     <test expect_num_outputs="1">
+      <param name="metaphlan_file" value="metaphlan2_input.txt"/>
+      <param name="humann_file" value="humann2_pathways_input.tabular"/>
+      <param name='type' value="pathways"/>
+             <output name="pathway_output_file">
+                <assert_contents>
+                    <has_n_columns n="8"/>
+                    <has_n_lines n="1533"/>
+                    <has_text text="Rhodobacter_sphaeroides"/>
+                    <has_text text="superpathway of acetyl CoA biosynthesis"/>
+                    <has_size value="186363"/>
+                </assert_contents>
+            </output>
+     </test>
+    
+     <!-- Using the output of HUMAnN v3 and MetaPhlAn v4 -->
+     <test expect_num_outputs="1">
+     <param name="metaphlan_file" value="metaphlan4_input.txt"/>
+     <param name="humann_file" value="humann36_gene_families_input.tabular"/>
+     <param name='type' value="gene_families"/>
+     <output name="gene_families_output_file">
+                <assert_contents>
+                    <has_n_columns n="8"/>
+                    <has_n_lines n="2242"/>
+                    <has_text text="Acetivibrio_thermocellus"/>
+                    <has_size value="213828"/>
+                </assert_contents>
+            </output>
+     </test>
+     <test expect_num_outputs="1">
+      <param name="metaphlan_file" value="metaphlan4_input.txt"/>
+      <param name="humann_file" value="humann36_pathways_input.tabular"/>
+      <param name='type' value="pathways"/>
+             <output name="pathway_output_file">
+                <assert_contents>
+                    <has_n_columns n="8"/>
+                    <has_n_lines n="49"/>
+                    <has_text text="Acetivibrio_thermocellus"/>
+                    <has_text text="preQ0 biosynthesis"/>
+                    <has_size value="6102"/>
+                </assert_contents>
+            </output>
+     </test>
+    </tests>
+    <help><![CDATA[
+**What it does**
+
+This tool combines MetaPhlAn outputs and HUMAnN outputs.
+
+For each gene family/pathway and its corresponding taxonomic stratification,
+you get the relative abundance of this gene family/pathway and the relative
+abundance of the corresponding species and genus.
+    ]]></help>
+    <citations>
+        <citation type="doi">10.7554/eLife.65088</citation>
+    </citations>
+</tool>
b
diff -r fdfb35745104 -r 01ac9954c27f test-data/humann2_gene_families_input.tabular
--- a/test-data/humann2_gene_families_input.tabular Wed Oct 19 14:44:00 2022 +0000
+++ b/test-data/humann2_gene_families_input.tabular Thu Jul 20 10:07:12 2023 +0000
b
b'@@ -4998,199100 +4998,3 @@\n UniRef50_Q9KX08: Putative ribosome biogenesis GTPase RsgA|g__Staphylococcus.s__Staphylococcus_aureus\t68.5175802082\n UniRef50_B8IA33: Fumarylacetoacetate (FAA) hydrolase\t141.3231115250\n UniRef50_B8IA33: Fumarylacetoacetate (FAA) hydrolase|g__Rhodobacter.s__Rhodobacter_sphaeroides\t139.6536624432\n-UniRef50_B8IA33: Fumarylacetoacetate (FAA) hydrolase|g__Pseudomonas.s__Pseudomonas_aeruginosa\t1.6694490818\n-UniRef50_Q5HKL3: RarD protein\t141.2937913397\n-UniRef50_Q5HKL3: RarD protein|g__Staphylococcus.s__Staphylococcus_aureus\t75.4443334388\n-UniRef50_Q5HKL3: RarD protein|g__Staphylococcus.s__Staphylococcus_epidermidis\t65.8494579009\n-UniRef50_P98056: Cytochrome c oxidase subunit 1 homolog\t141.2789852789\n-UniRef50_P98056: Cytochrome c oxidase subunit 1 homolog|g__Rhodobacter.s__Rhodobacter_sphaeroides\t141.2789852789\n-UniRef50_J9YR02: BioY family protein\t141.2515684687\n-UniRef50_J9YR02: BioY family protein|g__Streptococcus.s__Streptococcus_mutans\t141.2515684687\n-UniRef50_D3QDA3: Peptidase, M16 family\t141.2203427703\n-UniRef50_D3QDA3: Peptidase, M16 family|g__Staphylococcus.s__Staphylococcus_aureus\t74.7770938901\n-UniRef50_D3QDA3: Peptidase, M16 family|g__Staphylococcus.s__Staphylococcus_epidermidis\t66.4432488801\n-UniRef50_A6UJ92\t141.2154495421\n-UniRef50_A6UJ92|g__Rhodobacter.s__Rhodobacter_sphaeroides\t141.2154495421\n-UniRef50_Q59928: Acetylornithine aminotransferase\t141.1553501126\n-UniRef50_Q59928: Acetylornithine aminotransferase|g__Staphylococcus.s__Staphylococcus_epidermidis\t91.6696965661\n-UniRef50_Q59928: Acetylornithine aminotransferase|g__Streptococcus.s__Streptococcus_mutans\t49.4856535465\n-UniRef50_Q6GH28: Putative oligopeptide transport ATP-binding protein oppF2\t141.1374136621\n-UniRef50_Q6GH28: Putative oligopeptide transport ATP-binding protein oppF2|g__Staphylococcus.s__Staphylococcus_aureus\t76.9039310117\n-UniRef50_Q6GH28: Putative oligopeptide transport ATP-binding protein 
oppF2|g__Staphylococcus.s__Staphylococcus_epidermidis\t64.2334826504\n-UniRef50_Q18GT4: Ketol-acid reductoisomerase\t141.1077114723\n-UniRef50_Q18GT4: Ketol-acid reductoisomerase|g__Rhodobacter.s__Rhodobacter_sphaeroides\t134.2898797519\n-UniRef50_Q18GT4: Ketol-acid reductoisomerase|g__Acinetobacter.s__Acinetobacter_baumannii\t6.7608317134\n-UniRef50_Q18GT4: Ketol-acid reductoisomerase|unclassified\t0.0570000070\n-UniRef50_Q38YN7: Putative Holliday junction resolvase\t140.8793311776\n-UniRef50_Q38YN7: Putative Holliday junction resolvase|g__Staphylococcus.s__Staphylococcus_aureus\t140.8793311776\n-UniRef50_G7ZR43\t140.8561513628\n-UniRef50_G7ZR43|g__Staphylococcus.s__Staphylococcus_aureus\t133.2227176893\n-UniRef50_G7ZR43|g__Staphylococcus.s__Staphylococcus_epidermidis\t7.6334336734\n-UniRef50_Q2YXZ9: Probable CtpA-like serine protease\t140.8504434391\n-UniRef50_Q2YXZ9: Probable CtpA-like serine protease|g__Staphylococcus.s__Staphylococcus_epidermidis\t88.2267049391\n-UniRef50_Q2YXZ9: Probable CtpA-like serine protease|g__Staphylococcus.s__Staphylococcus_aureus\t52.6237385000\n-UniRef50_H8LDC1: Helix-turn-helix domain protein\t140.8006300981\n-UniRef50_H8LDC1: Helix-turn-helix domain protein|g__Staphylococcus.s__Staphylococcus_epidermidis\t103.1895152246\n-UniRef50_H8LDC1: Helix-turn-helix domain protein|g__Staphylococcus.s__Staphylococcus_aureus\t37.6111148735\n-UniRef50_Q4L675: Anthranilate synthase component II\t140.7381662526\n-UniRef50_Q4L675: Anthranilate synthase component II|g__Staphylococcus.s__Staphylococcus_epidermidis\t88.4762794479\n-UniRef50_Q4L675: Anthranilate synthase component II|g__Staphylococcus.s__Staphylococcus_aureus\t52.2618868047\n-UniRef50_Q3IV03\t140.6498028101\n-UniRef50_Q3IV03|g__Rhodobacter.s__Rhodobacter_sphaeroides\t133.9618423122\n-UniRef50_Q3IV03|unclassified\t6.6879604980\n-UniRef50_F4LW30: Galactitol-specific enzyme IIC component of PTS\t140.6201005299\n-UniRef50_F4LW30: Galactitol-specific enzyme IIC component of 
PTS|g__Staphylococcus.s__Staphylococcus_aureus\t140.6201005299\n-UniRef50_K0LNC6: Na+transporting ATP synthase\t140.6088040788\n-UniRef'..b'444DC: hypothetical protein\t0.0017973217\n-UniRef50_UPI00037444DC: hypothetical protein|unclassified\t0.0017973217\n-UniRef50_UPI000378A614: hypothetical protein\t0.0017691421\n-UniRef50_UPI000378A614: hypothetical protein|unclassified\t0.0017691421\n-UniRef50_R4LEH4: Yd repeat-containing protein\t0.0017472130\n-UniRef50_R4LEH4: Yd repeat-containing protein|unclassified\t0.0017472130\n-UniRef50_S4YMU8: Filamentous hemagglutinin\t0.0017432148\n-UniRef50_S4YMU8: Filamentous hemagglutinin|unclassified\t0.0017432148\n-UniRef50_UPI000443E2D6: PREDICTED: tetratricopeptide repeat protein 40\t0.0016819660\n-UniRef50_UPI000443E2D6: PREDICTED: tetratricopeptide repeat protein 40|unclassified\t0.0016819660\n-UniRef50_UPI00036DCFC8: hypothetical protein\t0.0016327044\n-UniRef50_UPI00036DCFC8: hypothetical protein|unclassified\t0.0016327044\n-UniRef50_UPI0001BF6B99: 90S preribosome component RRP12\t0.0016313392\n-UniRef50_UPI0001BF6B99: 90S preribosome component RRP12|unclassified\t0.0016313392\n-UniRef50_UPI000365699C: hypothetical protein\t0.0016224188\n-UniRef50_UPI000365699C: hypothetical protein|unclassified\t0.0016224188\n-UniRef50_UPI000344F009: hypothetical protein\t0.0015763429\n-UniRef50_UPI000344F009: hypothetical protein|unclassified\t0.0015763429\n-UniRef50_U6M5E8\t0.0015640805\n-UniRef50_U6M5E8|unclassified\t0.0015640805\n-UniRef50_UPI000349BE1A: hypothetical protein\t0.0015464606\n-UniRef50_UPI000349BE1A: hypothetical protein|unclassified\t0.0015464606\n-UniRef50_UPI0003773ED0: hypothetical protein\t0.0014975465\n-UniRef50_UPI0003773ED0: hypothetical protein|unclassified\t0.0014975465\n-UniRef50_R0ISA3\t0.0014873041\n-UniRef50_R0ISA3|unclassified\t0.0014873041\n-UniRef50_F4GI46\t0.0014628012\n-UniRef50_F4GI46|unclassified\t0.0014628012\n-UniRef50_UPI0002D336FD: hypothetical protein\t0.0014421264\n-UniRef50_UPI0002D336FD: 
hypothetical protein|unclassified\t0.0014421264\n-UniRef50_C0N8P3: Type I secretion target GGXGXDXXX repeat protein domain protein\t0.0014134994\n-UniRef50_C0N8P3: Type I secretion target GGXGXDXXX repeat protein domain protein|unclassified\t0.0014134994\n-UniRef50_W7A2A5\t0.0013833667\n-UniRef50_W7A2A5|unclassified\t0.0013833667\n-UniRef50_D2QX58: Peptidase domain protein\t0.0013763922\n-UniRef50_D2QX58: Peptidase domain protein|unclassified\t0.0013763922\n-UniRef50_UPI0004446091: PREDICTED: LOW QUALITY PROTEIN: histone-lysine N-methyltransferase 2C\t0.0013693947\n-UniRef50_UPI0004446091: PREDICTED: LOW QUALITY PROTEIN: histone-lysine N-methyltransferase 2C|unclassified\t0.0013693947\n-UniRef50_UPI0003644B17: hypothetical protein, partial\t0.0013488169\n-UniRef50_UPI0003644B17: hypothetical protein, partial|unclassified\t0.0013488169\n-UniRef50_A0A011N6I9\t0.0013414891\n-UniRef50_A0A011N6I9|unclassified\t0.0013414891\n-UniRef50_A0A058ZAA0\t0.0012939818\n-UniRef50_A0A058ZAA0|unclassified\t0.0012939818\n-UniRef50_UPI000468C770: hypothetical protein\t0.0012465542\n-UniRef50_UPI000468C770: hypothetical protein|unclassified\t0.0012465542\n-UniRef50_UPI00036FCEE3: hypothetical protein\t0.0012113651\n-UniRef50_UPI00036FCEE3: hypothetical protein|unclassified\t0.0012113651\n-UniRef50_UPI00035C33AC: hypothetical protein\t0.0010786375\n-UniRef50_UPI00035C33AC: hypothetical protein|unclassified\t0.0010786375\n-UniRef50_N1Q3A9\t0.0009733696\n-UniRef50_N1Q3A9|unclassified\t0.0009733696\n-UniRef50_A0A031GKF7: Polymorphic membrane protein, Filamentous hemagglutinin/Adhesin\t0.0008849280\n-UniRef50_A0A031GKF7: Polymorphic membrane protein, Filamentous hemagglutinin/Adhesin|unclassified\t0.0008849280\n-UniRef50_D8M1X0: Singapore isolate B (sub-type 7) whole genome shotgun sequence assembly, scaffold_16\t0.0008849116\n-UniRef50_D8M1X0: Singapore isolate B (sub-type 7) whole genome shotgun sequence assembly, 
scaffold_16|unclassified\t0.0008849116\n-UniRef50_A8LV91\t0.0008555354\n-UniRef50_A8LV91|unclassified\t0.0008555354\n-UniRef50_D3E2A1: Adhesin-like protein\t0.0007987026\n-UniRef50_D3E2A1: Adhesin-like protein|unclassified\t0.0007987026\n-UniRef50_U6MJL1\t0.0007379580\n-UniRef50_U6MJL1|unclassified\t0.0007379580\n\\ No newline at end of file\n'
b
diff -r fdfb35745104 -r 01ac9954c27f test-data/humann36_gene_families_input.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/humann36_gene_families_input.tabular Thu Jul 20 10:07:12 2023 +0000
b
b'@@ -0,0 +1,6393 @@\n+# Gene Family\thumann_Abundance-RPKs\n+UNMAPPED\t94157.0000000000\n+UniRef90_A3DCI4\t42213.2758828385\n+UniRef90_A3DCI4|g__Acetivibrio.s__Acetivibrio_thermocellus\t42205.5707425926\n+UniRef90_A3DCI4|unclassified\t7.7051402458\n+UniRef90_A3DCB9\t39287.6314397701\n+UniRef90_A3DCB9|g__Acetivibrio.s__Acetivibrio_thermocellus\t39273.2031556915\n+UniRef90_A3DCB9|unclassified\t14.4282840786\n+UniRef90_A3DC67\t33187.2752874343\n+UniRef90_A3DC67|g__Acetivibrio.s__Acetivibrio_thermocellus\t33173.4939517261\n+UniRef90_A3DC67|unclassified\t13.7813357082\n+UniRef90_A3DBR3\t23099.5863265035\n+UniRef90_A3DBR3|g__Acetivibrio.s__Acetivibrio_thermocellus\t23099.5863265035\n+UniRef90_A3DI60\t19908.5320286869\n+UniRef90_A3DI60|g__Acetivibrio.s__Acetivibrio_thermocellus\t19908.5320286869\n+UniRef90_G2JC59\t17973.0999765074\n+UniRef90_G2JC59|g__Acetivibrio.s__Acetivibrio_thermocellus\t17967.4502589933\n+UniRef90_G2JC59|unclassified\t5.6497175141\n+UniRef90_A3DEF8\t8784.6701267235\n+UniRef90_A3DEF8|g__Acetivibrio.s__Acetivibrio_thermocellus\t8784.6701267235\n+UniRef90_B5Y8J9\t6444.1520775882\n+UniRef90_B5Y8J9|g__Coprothermobacter.s__Coprothermobacter_proteolyticus\t6444.1520775882\n+UniRef90_G2JC80\t6352.4725274725\n+UniRef90_G2JC80|g__Acetivibrio.s__Acetivibrio_thermocellus\t6333.3333333333\n+UniRef90_G2JC80|unclassified\t19.1391941392\n+UniRef90_A3DHD7\t6300.0000000000\n+UniRef90_A3DHD7|g__Acetivibrio.s__Acetivibrio_thermocellus\t6300.0000000000\n+UniRef90_UPI00003C9096\t5440.2852049911\n+UniRef90_UPI00003C9096|g__Acetivibrio.s__Acetivibrio_thermocellus\t5440.2852049911\n+UniRef90_A3DBD1\t5228.5275293678\n+UniRef90_A3DBD1|g__Acetivibrio.s__Acetivibrio_thermocellus\t5228.5275293678\n+UniRef90_B5Y7J1\t4886.8772665304\n+UniRef90_B5Y7J1|g__Coprothermobacter.s__Coprothermobacter_proteolyticus\t4877.5749409490\n+UniRef90_B5Y7J1|unclassified\t9.3023255814\n+UniRef90_A3DK33\t4847.6911046572\n+UniRef90_A3DK33|g__Acetivibrio.s__Acetivibrio_thermocellus\t4847.6911046572\n+Un
iRef90_A3DI53\t4497.2048074566\n+UniRef90_A3DI53|g__Acetivibrio.s__Acetivibrio_thermocellus\t4497.2048074566\n+UniRef90_A3DDB7\t4259.5094204382\n+UniRef90_A3DDB7|g__Acetivibrio.s__Acetivibrio_thermocellus\t4259.5094204382\n+UniRef90_A3DIZ6\t4066.5364815483\n+UniRef90_A3DIZ6|g__Acetivibrio.s__Acetivibrio_thermocellus\t4066.5364815483\n+UniRef90_A3DF72\t3886.8805031285\n+UniRef90_A3DF72|g__Acetivibrio.s__Acetivibrio_thermocellus\t3886.8805031285\n+UniRef90_A3DEF4\t3725.9575790837\n+UniRef90_A3DEF4|g__Acetivibrio.s__Acetivibrio_thermocellus\t3725.9575790837\n+UniRef90_A3DJT8\t3675.2912070884\n+UniRef90_A3DJT8|g__Acetivibrio.s__Acetivibrio_thermocellus\t3675.2912070884\n+UniRef90_A3DEF7\t3565.1931988761\n+UniRef90_A3DEF7|g__Acetivibrio.s__Acetivibrio_thermocellus\t3550.6874875446\n+UniRef90_A3DEF7|unclassified\t14.5057113315\n+UniRef90_W5SYI9\t3113.2956007559\n+UniRef90_W5SYI9|unclassified\t3113.2956007559\n+UniRef90_Q8GEF9\t2946.4796211538\n+UniRef90_Q8GEF9|unclassified\t2946.4796211538\n+UniRef90_A3DHZ3\t2882.3529411765\n+UniRef90_A3DHZ3|g__Acetivibrio.s__Acetivibrio_thermocellus\t2882.3529411765\n+UniRef90_A3DC76\t2725.8973504340\n+UniRef90_A3DC76|g__Acetivibrio.s__Acetivibrio_thermocellus\t2725.8973504340\n+UniRef90_A3DE79\t2702.1266968326\n+UniRef90_A3DE79|g__Acetivibrio.s__Acetivibrio_thermocellus\t2702.1266968326\n+UniRef90_A3DBK1\t2694.5061274276\n+UniRef90_A3DBK1|g__Acetivibrio.s__Acetivibrio_thermocellus\t2694.5061274276\n+UniRef90_A3DF83\t2692.7087960061\n+UniRef90_A3DF83|g__Acetivibrio.s__Acetivibrio_thermocellus\t2692.7087960061\n+UniRef90_B5Y935\t2440.8093147270\n+UniRef90_B5Y935|g__Coprothermobacter.s__Coprothermobacter_proteolyticus\t2440.8093147270\n+UniRef90_A3DEM9\t2396.7775807888\n+UniRef90_A3DEM9|g__Acetivibrio.s__Acetivibrio_thermocellus\t2396.7775807888\n+UniRef90_B5Y742\t2374.1258357332\n+UniRef90_B5Y742|g__Coprothermobacter.s__Coprothermobacter_proteolyticus\t2374.1258357332\n+UniRef90_UPI0001C1468A\t2330.3621184267\n+UniRef90_UPI0001C1468A|g__A
cetivibrio.s__Acetivibrio_thermocellus\t2330.3621184267\n+UniRef90_A3DDQ7\t2277.0867600003\n+UniRef90_A3DDQ7|g__Acetivibrio.s__Acetivibrio_thermocellu'..b'93046292\n+UniRef90_A0A376TLE6\t2.7874315825\n+UniRef90_A0A376TLE6|unclassified\t2.7874315825\n+UniRef90_UPI000DDE62FE\t2.7216667800\n+UniRef90_UPI000DDE62FE|unclassified\t2.7216667800\n+UniRef90_A0A357AIJ3\t2.6461389890\n+UniRef90_A0A357AIJ3|unclassified\t2.6461389890\n+UniRef90_UPI0009875C6E\t2.6315789474\n+UniRef90_UPI0009875C6E|unclassified\t2.6315789474\n+UniRef90_A0A3D3QTT5\t2.4975222993\n+UniRef90_A0A3D3QTT5|unclassified\t2.4975222993\n+UniRef90_UPI000B6039CE\t2.4834437086\n+UniRef90_UPI000B6039CE|unclassified\t2.4834437086\n+UniRef90_A0A1B1YNW6\t2.4739225771\n+UniRef90_A0A1B1YNW6|unclassified\t2.4739225771\n+UniRef90_W5S5F3\t2.4341148999\n+UniRef90_W5S5F3|unclassified\t2.4341148999\n+UniRef90_W5S9S1\t2.4341148999\n+UniRef90_W5S9S1|unclassified\t2.4341148999\n+UniRef90_UPI000C7DA0AE\t2.4268007336\n+UniRef90_UPI000C7DA0AE|unclassified\t2.4268007336\n+UniRef90_A0A3B8J649\t2.4002809077\n+UniRef90_A0A3B8J649|unclassified\t2.4002809077\n+UniRef90_A0A162MND7\t2.3923444976\n+UniRef90_A0A162MND7|unclassified\t2.3923444976\n+UniRef90_UPI0008D9B061\t2.3800084950\n+UniRef90_UPI0008D9B061|unclassified\t2.3800084950\n+UniRef90_W4V9N2\t2.3649971475\n+UniRef90_W4V9N2|unclassified\t2.3649971475\n+UniRef90_M1YXY6\t2.2653263486\n+UniRef90_M1YXY6|unclassified\t2.2653263486\n+UniRef90_O27365\t2.2372806123\n+UniRef90_O27365|unclassified\t2.2372806123\n+UniRef90_A0A2K9E423\t2.2175274658\n+UniRef90_A0A2K9E423|unclassified\t2.2175274658\n+UniRef90_A0A2K2FN24\t2.2095097179\n+UniRef90_A0A2K2FN24|unclassified\t2.2095097179\n+UniRef90_R6CBA5\t2.1551724138\n+UniRef90_R6CBA5|unclassified\t2.1551724138\n+UniRef90_A0A328PEH5\t2.1486843000\n+UniRef90_A0A328PEH5|unclassified\t2.1486843000\n+UniRef90_A0A3D4VQP6\t2.1276595745\n+UniRef90_A0A3D4VQP6|unclassified\t2.1276595745\n+UniRef90_O26139\t2.1196420160\n+UniRef90_O26139|unclassified\t2.119
6420160\n+UniRef90_A0A140LCX8\t2.1093529946\n+UniRef90_A0A140LCX8|unclassified\t2.1093529946\n+UniRef90_X1J9F7\t1.8298059965\n+UniRef90_X1J9F7|unclassified\t1.8298059965\n+UniRef90_A0A117KRB1\t1.8119740034\n+UniRef90_A0A117KRB1|unclassified\t1.8119740034\n+UniRef90_A0A1F9CUU1\t1.6058644812\n+UniRef90_A0A1F9CUU1|unclassified\t1.6058644812\n+UniRef90_A5D2N8\t1.5956362082\n+UniRef90_A5D2N8|unclassified\t1.5956362082\n+UniRef90_X1LZV2\t1.5948963317\n+UniRef90_X1LZV2|unclassified\t1.5948963317\n+UniRef90_UPI000DD697AB\t1.5851270470\n+UniRef90_UPI000DD697AB|unclassified\t1.5851270470\n+UniRef90_A0A1B2CU56\t1.5777772956\n+UniRef90_A0A1B2CU56|unclassified\t1.5777772956\n+UniRef90_A0A372HJC7\t1.4739212069\n+UniRef90_A0A372HJC7|unclassified\t1.4739212069\n+UniRef90_F4LVJ1\t1.2944593157\n+UniRef90_F4LVJ1|unclassified\t1.2944593157\n+UniRef90_A0A2S8DIE5\t1.2835673811\n+UniRef90_A0A2S8DIE5|unclassified\t1.2835673811\n+UniRef90_A0A101HUR7\t1.1696034784\n+UniRef90_A0A101HUR7|unclassified\t1.1696034784\n+UniRef90_A4J2K8\t1.1696034784\n+UniRef90_A4J2K8|unclassified\t1.1696034784\n+UniRef90_A5D3F7\t1.1696034784\n+UniRef90_A5D3F7|unclassified\t1.1696034784\n+UniRef90_A0A3B8J8R7\t1.1013215859\n+UniRef90_A0A3B8J8R7|unclassified\t1.1013215859\n+UniRef90_A0A2K9EDL4\t1.0729613734\n+UniRef90_A0A2K9EDL4|unclassified\t1.0729613734\n+UniRef90_A0A1M4SS56\t1.0460251046\n+UniRef90_A0A1M4SS56|unclassified\t1.0460251046\n+UniRef90_E3N9M8\t1.0460251046\n+UniRef90_E3N9M8|unclassified\t1.0460251046\n+UniRef90_UPI000BB7D2FD\t0.8818342152\n+UniRef90_UPI000BB7D2FD|unclassified\t0.8818342152\n+UniRef90_UPI000D6F978D\t0.8503401361\n+UniRef90_UPI000D6F978D|unclassified\t0.8503401361\n+UniRef90_A0A1C0ACE1\t0.8031316965\n+UniRef90_A0A1C0ACE1|unclassified\t0.8031316965\n+UniRef90_R9R0L9\t0.8012635864\n+UniRef90_R9R0L9|unclassified\t0.8012635864\n+UniRef90_A0A037Z7M6\t0.7685924790\n+UniRef90_A0A037Z7M6|unclassified\t0.7685924790\n+UniRef90_A0A0L6ZD62\t0.7685924790\n+UniRef90_A0A0L6ZD62|unclassified\t0.768592479
0\n+UniRef90_UPI0009B7FD86\t0.6327050825\n+UniRef90_UPI0009B7FD86|unclassified\t0.6327050825\n+UniRef90_UPI000AB0D734\t0.5923196517\n+UniRef90_UPI000AB0D734|unclassified\t0.5923196517\n+UniRef90_A0A2S8DNM3\t0.5555555556\n+UniRef90_A0A2S8DNM3|unclassified\t0.5555555556\n+UniRef90_A0A2L0NH00\t0.2389765701\n+UniRef90_A0A2L0NH00|unclassified\t0.2389765701\n'
b
diff -r fdfb35745104 -r 01ac9954c27f test-data/humann36_pathways_input.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/humann36_pathways_input.tabular Thu Jul 20 10:07:12 2023 +0000
b
b"@@ -0,0 +1,116 @@\n+# Pathway\thumann_Abundance\n+UNMAPPED\t8349.9735860182\n+UNINTEGRATED\t48556.8350742608\n+UNINTEGRATED|g__Acetivibrio.s__Acetivibrio_thermocellus\t29162.0390929203\n+UNINTEGRATED|unclassified\t3307.1774531381\n+PWY-6609: adenine and adenosine salvage III\t277.0510927865\n+PWY-6609: adenine and adenosine salvage III|g__Acetivibrio.s__Acetivibrio_thermocellus\t66.3688569288\n+PWY-6609: adenine and adenosine salvage III|unclassified\t11.9769595308\n+PWY-1042: glycolysis IV\t194.2219430842\n+PWY-5100: pyruvate fermentation to acetate and lactate II\t129.0053457248\n+PWY-5100: pyruvate fermentation to acetate and lactate II|g__Acetivibrio.s__Acetivibrio_thermocellus\t69.7497669984\n+PWY-7221: guanosine ribonucleotides de novo biosynthesis\t121.7671397650\n+PWY-7221: guanosine ribonucleotides de novo biosynthesis|g__Acetivibrio.s__Acetivibrio_thermocellus\t108.5388331853\n+PWY-7221: guanosine ribonucleotides de novo biosynthesis|unclassified\t13.6910424631\n+PWY-6703: preQ0 biosynthesis\t103.6377362280\n+PWY-6703: preQ0 biosynthesis|g__Acetivibrio.s__Acetivibrio_thermocellus\t98.9807370179\n+PWY-7228: superpathway of guanosine nucleotides de novo biosynthesis I\t83.3982268723\n+PWY-7228: superpathway of guanosine nucleotides de novo biosynthesis I|g__Acetivibrio.s__Acetivibrio_thermocellus\t48.4938466340\n+RIBOSYN2-PWY: flavin biosynthesis I (bacteria and plants)\t77.2232042652\n+RIBOSYN2-PWY: flavin biosynthesis I (bacteria and plants)|g__Acetivibrio.s__Acetivibrio_thermocellus\t53.2211975186\n+VALSYN-PWY: L-valine biosynthesis\t77.0829759735\n+VALSYN-PWY: L-valine biosynthesis|g__Acetivibrio.s__Acetivibrio_thermocellus\t77.0829759735\n+PWY-7208: superpathway of pyrimidine nucleobases salvage\t77.0563435419\n+PWY-7208: superpathway of pyrimidine nucleobases salvage|g__Acetivibrio.s__Acetivibrio_thermocellus\t45.1294786723\n+PWY-7208: superpathway of pyrimidine nucleobases salvage|unclassified\t17.1808137447\n+ILEUSYN-PWY: L-isoleucine biosynthesis 
I (from threonine)\t70.6881932740\n+ILEUSYN-PWY: L-isoleucine biosynthesis I (from threonine)|g__Acetivibrio.s__Acetivibrio_thermocellus\t70.6881932740\n+PWY-5981: CDP-diacylglycerol biosynthesis III\t70.6859344494\n+PWY-3841: folate transformations II (plants)\t68.9410605440\n+PWY-3841: folate transformations II (plants)|g__Acetivibrio.s__Acetivibrio_thermocellus\t65.3076297526\n+PWY-5103: L-isoleucine biosynthesis III\t66.6594375601\n+PWY-5103: L-isoleucine biosynthesis III|g__Acetivibrio.s__Acetivibrio_thermocellus\t66.6594375601\n+PWY-7229: superpathway of adenosine nucleotides de novo biosynthesis I\t60.8146639896\n+PWY-7229: superpathway of adenosine nucleotides de novo biosynthesis I|g__Acetivibrio.s__Acetivibrio_thermocellus\t43.7454671075\n+PWY0-1319: CDP-diacylglycerol biosynthesis II\t60.4369653934\n+PWY-5667: CDP-diacylglycerol biosynthesis I\t60.4369653934\n+COA-PWY-1: superpathway of coenzyme A biosynthesis III (mammals)\t53.6643541116\n+COA-PWY-1: superpathway of coenzyme A biosynthesis III (mammals)|g__Acetivibrio.s__Acetivibrio_thermocellus\t44.5540259674\n+ANAGLYCOLYSIS-PWY: glycolysis III (from glucose)\t53.4593266289\n+TRNA-CHARGING-PWY: tRNA charging\t50.1452719795\n+TRNA-CHARGING-PWY: tRNA charging|g__Acetivibrio.s__Acetivibrio_thermocellus\t40.2804860544\n+UDPNAGSYN-PWY: UDP-N-acetyl-D-glucosamine biosynthesis I\t50.0293370404\n+UDPNAGSYN-PWY: UDP-N-acetyl-D-glucosamine biosynthesis I|g__Acetivibrio.s__Acetivibrio_thermocellus\t43.2002393401\n+P41-PWY: pyruvate fermentation to acetate and (S)-lactate I\t49.9950777370\n+P41-PWY: pyruvate fermentation to acetate and (S)-lactate I|g__Acetivibrio.s__Acetivibrio_thermocellus\t45.0063828591\n+COA-PWY: coenzyme A biosynthesis I (prokaryotic)\t46.0146775357\n+COA-PWY: coenzyme A biosynthesis I (prokaryotic)|g__Acetivibrio.s__Acetivibrio_thermocellus\t37.7428378807\n+PWY-7851: coenzyme A biosynthesis II (eukaryotic)\t46.0146775357\n+PWY-7851: coenzyme A biosynthesis II 
(eukaryotic)|g__Acetivibrio.s__Acetivibrio_thermocellus\t37.7428378807\n+PWY-5695: inosine 5'-phosphate degradation\t44.9825390520\n+PW"..b"9529719544\n+NONMEVIPP-PWY: methylerythritol phosphate pathway I\t33.6589833555\n+NONMEVIPP-PWY: methylerythritol phosphate pathway I|g__Acetivibrio.s__Acetivibrio_thermocellus\t29.0719820809\n+PWY-6387: UDP-N-acetylmuramoyl-pentapeptide biosynthesis I (meso-diaminopimelate containing)\t33.0594389993\n+PWY-6387: UDP-N-acetylmuramoyl-pentapeptide biosynthesis I (meso-diaminopimelate containing)|g__Acetivibrio.s__Acetivibrio_thermocellus\t20.8795281789\n+SER-GLYSYN-PWY: superpathway of L-serine and glycine biosynthesis I\t32.8269248922\n+SER-GLYSYN-PWY: superpathway of L-serine and glycine biosynthesis I|unclassified\t12.3167606082\n+PWY-6122: 5-aminoimidazole ribonucleotide biosynthesis II\t30.5171500072\n+PWY-6122: 5-aminoimidazole ribonucleotide biosynthesis II|g__Acetivibrio.s__Acetivibrio_thermocellus\t23.3388473782\n+PWY-6277: superpathway of 5-aminoimidazole ribonucleotide biosynthesis\t30.5171500072\n+PWY-6277: superpathway of 5-aminoimidazole ribonucleotide biosynthesis|g__Acetivibrio.s__Acetivibrio_thermocellus\t23.3388473782\n+PYRIDNUCSYN-PWY: NAD de novo biosynthesis I (from aspartate)\t29.9045452545\n+PYRIDNUCSYN-PWY: NAD de novo biosynthesis I (from aspartate)|g__Acetivibrio.s__Acetivibrio_thermocellus\t20.7694090097\n+PWY-6386: UDP-N-acetylmuramoyl-pentapeptide biosynthesis II (lysine-containing)\t29.7827132121\n+PWY-6386: UDP-N-acetylmuramoyl-pentapeptide biosynthesis II (lysine-containing)|g__Acetivibrio.s__Acetivibrio_thermocellus\t24.3303646214\n+PWY-6124: inosine-5'-phosphate biosynthesis II\t29.6777862771\n+PWY-6124: inosine-5'-phosphate biosynthesis II|g__Acetivibrio.s__Acetivibrio_thermocellus\t27.8207756981\n+PEPTIDOGLYCANSYN-PWY: peptidoglycan biosynthesis I (meso-diaminopimelate containing)\t29.4351322618\n+PEPTIDOGLYCANSYN-PWY: peptidoglycan biosynthesis I (meso-diaminopimelate 
containing)|g__Acetivibrio.s__Acetivibrio_thermocellus\t21.3306301878\n+PWY-2942: L-lysine biosynthesis III\t28.9072019154\n+PWY-2942: L-lysine biosynthesis III|g__Acetivibrio.s__Acetivibrio_thermocellus\t28.9072019154\n+PWY0-1296: purine ribonucleosides degradation\t28.8114422948\n+PWY0-1296: purine ribonucleosides degradation|g__Acetivibrio.s__Acetivibrio_thermocellus\t21.7825853640\n+PANTOSYN-PWY: superpathway of coenzyme A biosynthesis I (bacteria)\t28.6286133554\n+PANTOSYN-PWY: superpathway of coenzyme A biosynthesis I (bacteria)|g__Acetivibrio.s__Acetivibrio_thermocellus\t25.9318707800\n+PWY-6123: inosine-5'-phosphate biosynthesis I\t28.5956903171\n+PWY-6123: inosine-5'-phosphate biosynthesis I|g__Acetivibrio.s__Acetivibrio_thermocellus\t27.1410372715\n+PWY-6385: peptidoglycan biosynthesis III (mycobacteria)\t28.0459769653\n+PWY-6385: peptidoglycan biosynthesis III (mycobacteria)|g__Acetivibrio.s__Acetivibrio_thermocellus\t20.5915247014\n+PWY-7953: UDP-N-acetylmuramoyl-pentapeptide biosynthesis III (meso-diaminopimelate containing)\t27.3969889419\n+PWY0-1586: peptidoglycan maturation (meso-diaminopimelate containing)\t25.5073725776\n+PWY0-1586: peptidoglycan maturation (meso-diaminopimelate containing)|g__Acetivibrio.s__Acetivibrio_thermocellus\t25.5073725776\n+PANTO-PWY: phosphopantothenate biosynthesis I\t23.5721171877\n+PANTO-PWY: phosphopantothenate biosynthesis I|g__Acetivibrio.s__Acetivibrio_thermocellus\t21.8182330339\n+PWY-7220: adenosine deoxyribonucleotides de novo biosynthesis II\t22.4257853098\n+PWY-7220: adenosine deoxyribonucleotides de novo biosynthesis II|g__Acetivibrio.s__Acetivibrio_thermocellus\t19.9656642609\n+PWY-7222: guanosine deoxyribonucleotides de novo biosynthesis II\t22.4257853098\n+PWY-7222: guanosine deoxyribonucleotides de novo biosynthesis II|g__Acetivibrio.s__Acetivibrio_thermocellus\t19.9656642609\n+PWY-6700: queuosine biosynthesis I (de novo)\t20.5232364544\n+PWY-6700: queuosine biosynthesis I (de 
novo)|g__Acetivibrio.s__Acetivibrio_thermocellus\t19.1237279391\n+PWY-6147: 6-hydroxymethyl-dihydropterin diphosphate biosynthesis I\t18.5007041182\n+PWY-6147: 6-hydroxymethyl-dihydropterin diphosphate biosynthesis I|g__Acetivibrio.s__Acetivibrio_thermocellus\t18.3787528306\n"
b
diff -r fdfb35745104 -r 01ac9954c27f test-data/metaphlan4_input.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/metaphlan4_input.txt Thu Jul 20 10:07:12 2023 +0000
b
@@ -0,0 +1,20 @@
+#mpa_vOct22_CHOCOPhlAnSGB_202212
+#/usr/local/tools/_conda/envs/__metaphlan@4.0.6/bin/metaphlan /data/dnb08/galaxy_db/files/3/4/6/dataset_3461ff07-1b4d-4e57-b746-40d9d8066f01.dat,/data/dnb08/galaxy_db/files/e/a/b/dataset_eab63cfc-85ed-4f05-a3eb-f87cb309da27.dat --input_type fastq --read_min_len 70 --bt2_ps very-sensitive --min_mapq_val 5 --bowtie2db /data/db/data_managers/metaphlan/data/mpa_vOct22_CHOCOPhlAnSGB_202212 --index mpa_vOct22_CHOCOPhlAnSGB_202212 -t rel_ab --tax_lev a --min_cu_len 2000 --add_viruses --stat_q 0.1 --perc_nonzero 0.33 --avoid_disqm --sample_id_key SampleID --sample_id Metaphlan_Analysis -o /data/jwd05e/main/059/485/59485896/outputs/galaxy_dataset_dbfb8cc7-14d1-4318-a3b7-2a988fa2ff7f.dat --bowtie2out bowtie2out -s /data/jwd05e/main/059/485/59485896/outputs/galaxy_dataset_1afff43f-1b78-4574-b961-e7c7e7b902f2.dat --biom /data/jwd05e/main/059/485/59485896/outputs/galaxy_dataset_f4980675-ac4a-44fa-9e21-4c33652e5e87.dat --nproc 1
+#465754 reads processed
+#SampleID Metaphlan_Analysis
+#clade_name NCBI_tax_id relative_abundance additional_species
+k__Bacteria 2 100.0
+k__Bacteria|p__Firmicutes 2|1239 68.23371
+k__Bacteria|p__Coprothermobacterota 2|2138240 31.76629
+k__Bacteria|p__Firmicutes|c__Clostridia 2|1239|186801 68.23371
+k__Bacteria|p__Coprothermobacterota|c__Coprothermobacteria 2|2138240|2138243 31.76629
+k__Bacteria|p__Firmicutes|c__Clostridia|o__Eubacteriales 2|1239|186801|186802 68.23371
+k__Bacteria|p__Coprothermobacterota|c__Coprothermobacteria|o__Coprothermobacterales 2|2138240|2138243|2138246 31.76629
+k__Bacteria|p__Firmicutes|c__Clostridia|o__Eubacteriales|f__Oscillospiraceae 2|1239|186801|186802|216572 68.23371
+k__Bacteria|p__Coprothermobacterota|c__Coprothermobacteria|o__Coprothermobacterales|f__Coprothermobacteraceae 2|2138240|2138243|2138246|2138247 31.76629
+k__Bacteria|p__Firmicutes|c__Clostridia|o__Eubacteriales|f__Oscillospiraceae|g__Acetivibrio 2|1239|186801|186802|216572|35829 68.23371
+k__Bacteria|p__Coprothermobacterota|c__Coprothermobacteria|o__Coprothermobacterales|f__Coprothermobacteraceae|g__Coprothermobacter 2|2138240|2138243|2138246|2138247|68335 31.76629
+k__Bacteria|p__Firmicutes|c__Clostridia|o__Eubacteriales|f__Oscillospiraceae|g__Acetivibrio|s__Acetivibrio_thermocellus 2|1239|186801|186802|216572|35829|1515 68.23371
+k__Bacteria|p__Coprothermobacterota|c__Coprothermobacteria|o__Coprothermobacterales|f__Coprothermobacteraceae|g__Coprothermobacter|s__Coprothermobacter_proteolyticus 2|2138240|2138243|2138246|2138247|68335|35786 31.76629
+k__Bacteria|p__Firmicutes|c__Clostridia|o__Eubacteriales|f__Oscillospiraceae|g__Acetivibrio|s__Acetivibrio_thermocellus|t__SGB8476 2|1239|186801|186802|216572|35829|1515| 68.23371
+k__Bacteria|p__Coprothermobacterota|c__Coprothermobacteria|o__Coprothermobacterales|f__Coprothermobacteraceae|g__Coprothermobacter|s__Coprothermobacter_proteolyticus|t__SGB8555 2|2138240|2138243|2138246|2138247|68335|35786| 31.76629