Repository 'eggnog_mapper'
hg clone https://toolshed.g2.bx.psu.edu/repos/galaxyp/eggnog_mapper

Changeset 12:9d1fbff733cf (2022-07-19)
Previous changeset 11:5a30ae278db0 (2022-06-20) Next changeset 13:844fa988236b (2023-09-04)
Commit message:
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/eggnog_mapper/eggnog_mapper commit 07877ba6d3fb6e28e94500f2392db6393cf325fd
modified:
eggnog_macros.xml
eggnog_mapper.xml
added:
test-data/DIA_nlim.emapper.annotations_cached
removed:
test-data/DIA_nlim.emapper.annotations
test-data/DIA_nlim.emapper.annotations_orthologs
test-data/nlim_1033.fasta
test-data/scoped.emapper.annotations
test-data/scoped.emapper.annotations_orthologs
test-data/scoped.emapper.seed_orthologs
b
diff -r 5a30ae278db0 -r 9d1fbff733cf eggnog_macros.xml
--- a/eggnog_macros.xml Mon Jun 20 12:49:52 2022 +0000
+++ b/eggnog_macros.xml Tue Jul 19 15:14:52 2022 +0000
[
@@ -1,7 +1,7 @@
 <?xml version="1.0"?>
 <macros>
    <token name="@TOOL_VERSION@">2.1.8</token>
-   <token name="@VERSION_SUFFIX@">2.1.8</token>
+   <token name="@VERSION_SUFFIX@">3</token>
    <token name="@EGGNOG_DB_VERSION@">5.0.2</token>
     <!--
     # DB versionning was super confusing for eggnog-mapper 2.0.x:
@@ -64,10 +64,82 @@
             </output>
         </test>
     </xml>
+
+    <xml name="fasta_input"> <!-- Shared inputs: the FASTA dataset plus a conditional describing what kind of sequences it holds -->
+        <param argument="-i" name="input" type="data" format="fasta" label="Fasta sequences to annotate"/>
+        <conditional name="input_trans"> <!-- each sequence type exposes only the extra parameters listed in its when branch -->
+            <param argument="--itype" type="select" label="Type of sequences">
+                <option value="proteins" selected="true">proteins</option>
+                <option value="CDS">CDS</option>
+                <option value="genome">genome</option>
+                <option value="metagenome">metagenome</option>
+            </param>
+            <when value="proteins"/> <!-- protein input has no extra parameters -->
+            <when value="CDS">
+                <param argument="--translate" type="boolean" truevalue="--translate" falsevalue="" checked="false"
+                    label="Translate CDS to proteins before search"/>
+            </when>
+            <when value="genome">
+                <param argument="--translate" type="boolean" truevalue="--translate" falsevalue="" checked="false"
+                    label="Translate predicted CDS from blastx hits to proteins"/>
+                <param argument="--genepred" type="select" label="Gene prediction method">
+                    <option value="search">Inferred from Diamond/MMseqs2 blastx hits</option>
+                    <option value="prodigal" selected="true">Performed using Prodigal</option>
+                </param>
+            </when>
+            <when value="metagenome"> <!-- same parameters as the genome branch -->
+                <param argument="--translate" type="boolean" truevalue="--translate" falsevalue="" checked="false"
+                    label="Translate predicted CDS from blastx hits to proteins"/>
+                <param argument="--genepred" type="select" label="Gene prediction method">
+                    <option value="search">Inferred from Diamond/MMseqs2 blastx hits</option>
+                    <option value="prodigal" selected="true">Performed using Prodigal</option>
+                </param>
+            </when>
+        </conditional>
+    </xml>
+    <!-- Common options for search filtering (applies to diamond and mmseqs only) -->
+    <xml name="common_search_options"> <!-- every threshold is optional; each label names the quantity that parameter filters on -->
+        <param argument="--query_cover" type="integer" optional="true" min="0" max="100" label="Minimum query coverage" help="Report only alignments above the given percentage of query cover" />
+        <param argument="--subject_cover" type="integer" optional="true" min="0" max="100" label="Minimum subject coverage" help="Report only alignments above the given percentage of subject cover" />
+        <param argument="--pident" type="integer" optional="true" min="0" max="100" label="Minimum percentage identity" help="Report only alignments above the given percentage of identity" />
+        <param argument="--evalue" type="float" optional="true" min="0" label="Maximum e-value" help="Report only alignments below or equal the e-value" />
+        <param argument="--score" type="float" value="0.001" optional="true" min="0" label="Minimum score" help="Report only alignments above or equal the score" /> <!-- NOTE(review): a preset of 0.001 looks like an e-value threshold rather than a score; confirm which parameter it was meant for -->
+    </xml>
+    <token name="@SEED_ORTHOLOG_COLUMNS@">query_name,seed_eggNOG_ortholog,seed_ortholog_evalue,seed_ortholog_score,query_start,query_end,seed_start,seed_end,pident,query_cov,seed_cov</token> <!-- comma-separated list of 11 column names; presumably the column names of the seed orthologs table (verify where the token is expanded) -->
+    
     <xml name="stdout_assertion">
         <assert_stdout>
             <has_line line="#  emapper-@TOOL_VERSION@"/>
             <has_line line="FINISHED"/>
+            <yield/> <!-- replaced by the child elements placed inside the expand call, so a test can append its own stdout checks (e.g. a has_text on an expected option) -->
         </assert_stdout>
     </xml>
+    <xml name="seed_orthologs_assertion"> <!-- Reusable test check for the tabular "seed_orthologs" output -->
+        <output name="seed_orthologs" ftype="tabular">
+            <assert_contents>
+                <has_line line="#qseqid&#009;sseqid&#009;evalue&#009;bitscore&#009;qstart&#009;qend&#009;sstart&#009;send&#009;pident&#009;qcov&#009;scov"/> <!-- header line with 11 tab-separated column names (&#009; is the tab character) -->
+                <has_line_matching expression="(\S+\t){2}[-+.e\d]+\t[.\d]+(\t\d+){4}(\t[.\d]+){3}" n="1"/> <!-- data row shape: two non-blank ids, e-value, score, four integer coordinates, three decimal percentages; n gives the required number of matching lines -->
+            </assert_contents>
+        </output>
+    </xml>
+    <xml name="annotations_assertion" token_columns="21" token_add_metadata_columm_names="" token_add_column_names="" token_add_column_re=""> <!-- Reusable test check for the tabular "annotations" output; the token_ attributes supply default values for the @...@ placeholders used below -->
+        <output name="annotations" ftype="tabular">
+            <metadata name="columns" value="@COLUMNS@" /> <!-- expected column count, 21 unless the expanding test overrides it -->
+            <!-- <metadata name="column_names" value="query,seed_ortholog,evalue,score,max_annot_lvl,COG_category,Description,Preferred_name,GOs,EC,KEGG_ko,KEGG_Pathway,KEGG_Module,KEGG_Reaction,KEGG_rclass,BRITE,KEGG_TC,CAZy,BiGG_Reaction,PFAMseggNOG_OGs@ADD_METADATA_COLUMN_NAMES@" /> --> <!-- NOTE(review): this disabled check spells the placeholder COLUMN while the token attribute above is spelled columm, and its name list differs from the header asserted below; reconcile before re-enabling -->
+            <assert_contents>
+                <has_line line="#query&#009;seed_ortholog&#009;evalue&#009;score&#009;eggNOG_OGs&#009;max_annot_lvl&#009;COG_category&#009;Description&#009;Preferred_name&#009;GOs&#009;EC&#009;KEGG_ko&#009;KEGG_Pathway&#009;KEGG_Module&#009;KEGG_Reaction&#009;KEGG_rclass&#009;BRITE&#009;KEGG_TC&#009;CAZy&#009;BiGG_Reaction&#009;PFAMs@ADD_COLUMN_NAMES@"/> <!-- 21-column header; extra header text can be appended through the add_column_names token -->
+                <has_line_matching expression="(\S+\t){2}[-+.e\d]+\t[.\d]+(\t\S+){7}\tko:\S+(\t\S+){9}@ADD_COLUMN_RE@" n="1"/> <!-- data row with 21 fields whose 12th field (KEGG_ko in the header) must start with ko:; the add_column_re token extends the pattern for extra columns -->
+            </assert_contents>
+        </output>
+    </xml>
+    <xml name="annotations_orthologs_assertion"> <!-- Reusable test check for the tabular "annotations_orthologs" output -->
+        <output name="annotations_orthologs" ftype="tabular">
+            <metadata name="columns" value="4" />
+            <metadata name="column_names" value="query,orth_type,species,orthologs" /> <!-- same four names as the header line asserted below -->
+            <assert_contents>
+                <has_line line="#query&#009;orth_type&#009;species&#009;orthologs"/>
+                <has_line_matching expression="\S+\t(one2one|many2one|one2many|many2many|seed)(\t[^\t]+){2}" n="2"/> <!-- data row: query id, one of the listed orthology types, then two free-text fields that may contain spaces; n gives the required number of matching lines -->
+            </assert_contents>
+        </output>
+    </xml>
 </macros>
b
diff -r 5a30ae278db0 -r 9d1fbff733cf eggnog_mapper.xml
--- a/eggnog_mapper.xml Mon Jun 20 12:49:52 2022 +0000
+++ b/eggnog_mapper.xml Tue Jul 19 15:14:52 2022 +0000
[
b'@@ -6,370 +6,472 @@\n     <expand macro="requirements"/>\n     <expand macro="version_command"/>\n     <command detect_errors="aggressive"><![CDATA[\n-        emapper.py\n-        --data_dir \'$eggnog_data.fields.path\'\n-        -m \'$seed_ortho_options.ortho_method.m\'\n-        --itype \'${input_trans.itype}\'\n-        #if $input_trans.itype in [\'CDS\', \'genome\', \'metagenome\']:\n-            $input_trans.translate\n-        #end if\n-        #if $input_trans.itype in [\'genome\', \'metagenome\']:\n-            --genepred $input_trans.genepred\n+        #if $ortho_method.m == "no_search"\n+            cat \n+            #for aht in $ortho_method.annotate_hits_table\n+                $aht\n+            #end for\n+            > annotate_hits_table.tsv\n+            &&\n         #end if\n \n-        ## Diamond option\n-        #if $seed_ortho_options.ortho_method.m == "diamond":\n-            --matrix \'$seed_ortho_options.ortho_method.matrix_gapcosts.matrix\'\n-            $seed_ortho_options.ortho_method.matrix_gapcosts.gap_costs\n-            --sensmode $seed_ortho_options.ortho_method.sensmode\n-            $seed_ortho_options.ortho_method.dmnd_iterate\n-            $seed_ortho_options.ortho_method.dmnd_ignore_warnings\n-        #elif $seed_ortho_options.ortho_method.m == "mmseqs":\n-            --start_sens $seed_ortho_options.ortho_method.start_sens\n-            --sens_steps $seed_ortho_options.ortho_method.sens_steps\n-            --final_sens $seed_ortho_options.ortho_method.final_sens\n+        emapper.py\n+        --data_dir \'$eggnog_data.fields.path\'\n+        -m \'$ortho_method.m\'\n+\n+        #if $ortho_method.m in [\'diamond\', \'mmseqs\', \'cache\']:\n+            -i \'$ortho_method.input\'\n+            --itype \'$ortho_method.input_trans.itype\'\n+            #if $ortho_method.input_trans.itype in [\'CDS\', \'genome\', \'metagenome\']:\n+                $ortho_method.input_trans.translate\n+            #end if\n+            #if 
$ortho_method.input_trans.itype in [\'genome\', \'metagenome\']:\n+                --genepred $ortho_method.input_trans.genepred\n+            #end if\n+        #elif $ortho_method.m == "no_search"\n+            --annotate_hits_table annotate_hits_table.tsv\n         #end if\n-\n-        ## Common options for search filtering\n-        #if $seed_ortho_options.query_cover:\n-        --query_cover $seed_ortho_options.query_cover\n-        #end if\n-        #if $seed_ortho_options.subject_cover:\n-        --subject_cover $seed_ortho_options.subject_cover\n-        #end if\n-        #if $seed_ortho_options.pident:\n-        --pident $seed_ortho_options.pident\n+        \n+        #if $ortho_method.m == \'cache\'\n+            --cache \'$ortho_method.cache\'\n         #end if\n \n-        #if $annotation_options.tax_scope:\n-            --tax_scope=$annotation_options.tax_scope\n-        #end if\n-        #if $annotation_options.target_orthologs:\n-            --target_orthologs=$annotation_options.target_orthologs\n+        #if $ortho_method.m in [\'diamond\', \'mmseqs\']:\n+            ## Diamond option\n+            #if $ortho_method.m == "diamond":\n+                --matrix \'$ortho_method.matrix_gapcosts.matrix\'\n+                $ortho_method.matrix_gapcosts.gap_costs\n+                --sensmode $ortho_method.sensmode\n+                $ortho_method.dmnd_iterate\n+                $ortho_method.dmnd_ignore_warnings\n+            #elif $ortho_method.m == "mmseqs":\n+                --start_sens $ortho_method.start_sens\n+                --sens_steps $ortho_method.sens_steps\n+                --final_sens $ortho_method.final_sens\n+            #end if\n+\n+            ## Common options for search filtering (applies to diamond and mmseqs only)\n+            #if str($ortho_method.query_cover):\n+                --query_cover $ortho_method.query_cover\n+            #end if\n+            #if str($ortho_method.subject_cover):\n+                --subject_cover 
$ortho_method.subject_cover\n+            #end if\n+            #if str($ortho_method.pident):\n+                --pident $ortho_method.pident\n+            #end if\n'..b'_orthologs" ftype="tabular"/>\n-            <expand macro="stdout_assertion"/>\n+            <section name="output_options">\n+                <param name="report_orthologs" value="true"/>\n+                <param name="no_file_comments" value="true"/>\n+            </section>\n+            <expand macro="seed_orthologs_assertion"/>\n+            <expand macro="annotations_assertion"/>\n+            <expand macro="annotations_orthologs_assertion"/>\n+            <expand macro="stdout_assertion">\n+                <has_text text="--tax_scope=651137"/>\n+            </expand>\n         </test>\n+\n+        <!-- test setting a diamond option-->\n         <test expect_num_outputs="3">\n-            <param name="input" value="Nmar_0135.fa" ftype="fasta"/>\n             <param name="eggnog_data" value="@EGGNOG_DB_VERSION@"/> <!-- not passed in test, but required for test to work -->\n-            <section name="seed_ortho_options">\n-                <conditional name="ortho_method">\n-                    <param name="m" value="diamond" />\n-                    <param name="sensmode" value="fast" />\n-                </conditional>\n+            <conditional name="ortho_method">\n+                <param name="m" value="diamond" />\n+                <param name="input" value="Nmar_0135.fa" ftype="fasta"/>\n+                <param name="sensmode" value="fast" />\n+            </conditional>\n+            <section name="output_options">\n+                <param name="report_orthologs" value="true"/>\n+                <param name="no_file_comments" value="true"/>\n             </section>\n-            <param name="report_orthologs" value="true"/>\n-            <param name="no_file_comments" value="true"/>\n-            <output name="seed_orthologs" file="DIA_nlim.emapper.seed_orthologs" ftype="tabular" 
compare="sim_size"/>\n-            <output name="annotations" file="DIA_nlim.emapper.annotations" ftype="tabular" compare="sim_size"/>\n-            <output name="annotations_orthologs" file="DIA_nlim.emapper.annotations_orthologs" ftype="tabular"/>\n-            <expand macro="stdout_assertion"/>\n+            <expand macro="seed_orthologs_assertion"/>\n+            <expand macro="annotations_assertion"/>\n+            <expand macro="annotations_orthologs_assertion"/>\n+            <expand macro="stdout_assertion">\n+                <has_text text="--sensmode fast"/>\n+            </expand>\n         </test>\n         <!-- not enabled as it requires a specific .db file, hard to minimize -->\n         <!--test>\n+            <param name="eggnog_data" value="@EGGNOG_DB_VERSION@"/>\n             <param name="input" value="Nmar_0135.fa" ftype="fasta"/>\n-            <param name="eggnog_data" value="@EGGNOG_DB_VERSION@"/>\n             <section name="seed_ortho_options">\n                 <conditional name="ortho_method">\n                     <param name="m" value="mmseqs" />\n@@ -435,6 +537,28 @@\n - ``COG_functional_categories``: COG functional category inferred from best matching OG\n - ``eggNOG_free_text_description``\n \n+**Recommendation for large input data**\n+\n+EggNOG-mapper consists of two phases\n+\n+1. finding seed orthologous sequences (compute intensive)\n+2. expanding annotations (IO intensive)\n+\n+by default (i.e. if *Method to search seed orthologs* is not *Skip search stage...* and *Annotate seed orthologs* is *Yes*)\n+both phases are executed within one tool run. \n+\n+For large input FASTA datasets it can be favourable to split this into two separate\n+tool runs as follows:\n+\n+1. Split the FASTA (e.g. 1M seqs per data set)\n+2. Run the search phase only (set *Annotate seed orthologs* to *No*) on the separate FASTA files.\n+3. 
Run the annotation phase (set *Method to search seed orthologs* to *Skip search stage...*)\n+\n+See also the `eggNOG-mapper wiki <https://github.com/eggnogdb/eggnog-mapper/wiki/eggNOG-mapper-v2.1.5-to-v2.1.8#Setting_up_large_annotation_jobs>`_.\n+\n+Another alternative is to use cached annotations (produced in a run with --md5 enabled).\n+\n+\n     ]]></help>\n     <expand macro="citations"/>\n </tool>\n'
b
diff -r 5a30ae278db0 -r 9d1fbff733cf test-data/DIA_nlim.emapper.annotations
--- a/test-data/DIA_nlim.emapper.annotations Mon Jun 20 12:49:52 2022 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,2 +0,0 @@
-#query seed_ortholog evalue score eggNOG_OGs max_annot_lvl COG_category Description Preferred_name GOs EC KEGG_ko KEGG_Pathway KEGG_Module KEGG_Reaction KEGG_rclass BRITE KEGG_TC CAZy BiGG_Reaction PFAMs
-Nmar_0135 436308.Nmar_0135 7.67e-188 503.0 COG1083@1|root,arCOG04817@2157|Archaea,41T2K@651137|Thaumarchaeota 651137|Thaumarchaeota M Cytidylyltransferase - - - ko:K07257 - - - - ko00000 - - - -
b
diff -r 5a30ae278db0 -r 9d1fbff733cf test-data/DIA_nlim.emapper.annotations_cached
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/DIA_nlim.emapper.annotations_cached Tue Jul 19 15:14:52 2022 +0000
b
@@ -0,0 +1,2 @@
+#query seed_ortholog evalue score eggNOG_OGs max_annot_lvl COG_category Description Preferred_name GOs EC KEGG_ko KEGG_Pathway KEGG_Module KEGG_Reaction KEGG_rclass BRITE KEGG_TC CAZy BiGG_Reaction PFAMs md5
+Nmar_0135 436308.Nmar_0135 7.67e-188 503.0 COG1083@1|root,arCOG04817@2157|Archaea,41T2K@651137|Thaumarchaeota 651137|Thaumarchaeota M Cytidylyltransferase - - - ko:K07257 - - - - ko00000 - - - - 9fb52f96004c566b17893d597c94054e
b
diff -r 5a30ae278db0 -r 9d1fbff733cf test-data/DIA_nlim.emapper.annotations_orthologs
--- a/test-data/DIA_nlim.emapper.annotations_orthologs Mon Jun 20 12:49:52 2022 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,3 +0,0 @@
-#query orth_type species orthologs
-Nmar_0135 one2one Marine Group I thaumarchaeote SCGC AB-629-I23(1131266) *ARWQ01000003_gene1537
-Nmar_0135 seed Nitrosopumilus maritimus SCM1(436308) *Nmar_0135
b
diff -r 5a30ae278db0 -r 9d1fbff733cf test-data/nlim_1033.fasta
--- a/test-data/nlim_1033.fasta Mon Jun 20 12:49:52 2022 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,7 +0,0 @@
->Nlim_1033886738.Nlim_1033Cytidyltransferase-like protein 
-MELIKKSILTELYLSGITGKSHIDNLTKKGFTQKLIDLE
-IDELIKNKLVKEDRAILTELGRSSLRVVLAGGVFDIIHP
-GHIYTLNAAKSLGDVLIVVVATDNTALKMKKRQPLHSKE
-QRQELVNSLIMVDLCLIGQEDDIFKTVNLVKPQIIALGY
-DQVHQEKFIIDGCKKIQLDAKVARLQSPIPESSSSKIQK
-EYGESIHGI
b
diff -r 5a30ae278db0 -r 9d1fbff733cf test-data/scoped.emapper.annotations
--- a/test-data/scoped.emapper.annotations Mon Jun 20 12:49:52 2022 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,2 +0,0 @@
-#query seed_ortholog evalue score eggNOG_OGs max_annot_lvl COG_category Description Preferred_name GOs EC KEGG_ko KEGG_Pathway KEGG_Module KEGG_Reaction KEGG_rclass BRITE KEGG_TC CAZy BiGG_Reaction PFAMs
-Nmar_0135 436308.Nmar_0135 7.67e-188 503.0 COG1083@1|root,arCOG04817@2157|Archaea,41T2K@651137|Thaumarchaeota 651137|Thaumarchaeota M Cytidylyltransferase - - - ko:K07257 - - - - ko00000 - - - -
b
diff -r 5a30ae278db0 -r 9d1fbff733cf test-data/scoped.emapper.annotations_orthologs
--- a/test-data/scoped.emapper.annotations_orthologs Mon Jun 20 12:49:52 2022 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,3 +0,0 @@
-#query orth_type species orthologs
-Nmar_0135 one2one Marine Group I thaumarchaeote SCGC AB-629-I23(1131266) *ARWQ01000003_gene1537
-Nmar_0135 seed Nitrosopumilus maritimus SCM1(436308) *Nmar_0135
b
diff -r 5a30ae278db0 -r 9d1fbff733cf test-data/scoped.emapper.seed_orthologs
--- a/test-data/scoped.emapper.seed_orthologs Mon Jun 20 12:49:52 2022 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,2 +0,0 @@
-#qseqid sseqid evalue bitscore qstart qend sstart send pident qcov scov
-Nmar_0135 436308.Nmar_0135 7.67e-188 503.0 1 252 1 252 100.0 100.0 100.0