# HG changeset patch # User galaxyp # Date 1693831629 0 # Node ID 844fa988236b9a07be8f1de9931cf7cb1fc806c9 # Parent 9d1fbff733cfbc64fb3538e8c604517d9f43cedb planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/eggnog_mapper/eggnog_mapper commit 468bd31b8858adbba2854f118e4cbe31f4cd68cb diff -r 9d1fbff733cf -r 844fa988236b README --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README Mon Sep 04 12:47:09 2023 +0000 @@ -0,0 +1,20 @@ +This folder contains three tools: + +1. eggnog_mapper: which runs the search and annotation phases in a single tool +2. eggnog_mapper_search: which implements the search phase +3. eggnog_mapper_annotate: which implements the annotation phase + +While the search phase of eggnog_mapper is very CPU intensive and is efficient +also for a larger number of threads, the annotation phase is very IO intensive +and can be very inefficient (depending on the configuration, e.g. if the +reference data is located on a slow partition). + +While for most applications eggnog_mapper will be sufficient, separating the +two phases can be more efficient: + +- sending eggnog_mapper_search to a destination using many threads +- and eggnog_mapper_annotate to a destination using a small number of threads + +Admins can choose to set the environment variable ``EGGNOG_DBMEM=--dbmem`` +which will copy the complete EggNOG annotation DB into memory, which is usually +much faster than using multiple cores (but needs approx. 37GB of RAM). 
\ No newline at end of file diff -r 9d1fbff733cf -r 844fa988236b eggnog_macros.xml --- a/eggnog_macros.xml Tue Jul 19 15:14:52 2022 +0000 +++ b/eggnog_macros.xml Mon Sep 04 12:47:09 2023 +0000 @@ -3,6 +3,7 @@ 2.1.8 3 5.0.2 + 22.01 + @@ -105,41 +127,494 @@ - query_name,seed_eggNOG_ortholog,seed_ortholog_evalue,seed_ortholog_score,query_start,query_end,seed_start,seed_end,pident,query_cov,seed_cov + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + value.metadata.columns == 11 + + + + + + + value.metadata.columns == 22 + + + + + + annotate_hits_table.tsv + && + #end if + ]]> + + - - - - + + +

+ - +

+ + + + + + + - + + query_name,seed_eggNOG_ortholog,seed_ortholog_evalue,seed_ortholog_score,query_start,query_end,seed_start,seed_end,pident,query_cov,seed_cov + + + + ortho_method['m'] not in ['no_search', 'cache'] + + + + + + + + - + + + + + + + + + Min E-value expected when searching for seed eggNOG ortholog. Applies to phmmer/diamond searches. + Queries not having a significant seed orthologs (E-value less than threshold) will not be annotated. + + + + + Min bit score expected when searching for seed eggNOG ortholog. + Queries not having a significant seed orthologs will not be annotated. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ortho_method['m'] == 'cache' and ortho_method['output_no_annotations'] + + + + + + ortho_method['m'] != 'cache' + output_options['report_orthologs'] + + + + + + + + - + + + diff -r 9d1fbff733cf -r 844fa988236b eggnog_mapper.xml --- a/eggnog_mapper.xml Tue Jul 19 15:14:52 2022 +0000 +++ b/eggnog_mapper.xml Mon Sep 04 12:47:09 2023 +0000 @@ -1,4 +1,4 @@ - + functional sequence annotation by orthology eggnog_macros.xml @@ -6,86 +6,15 @@ annotate_hits_table.tsv - && - #end if + @MERGE_ANNOTATIONS@ emapper.py - --data_dir '$eggnog_data.fields.path' - -m '$ortho_method.m' - - #if $ortho_method.m in ['diamond', 'mmseqs', 'cache']: - -i '$ortho_method.input' - --itype '$ortho_method.input_trans.itype' - #if $ortho_method.input_trans.itype in ['CDS', 'genome', 'metagenome']: - $ortho_method.input_trans.translate - #end if - #if $ortho_method.input_trans.itype in ['genome', 'metagenome']: - --genepred $ortho_method.input_trans.genepred - #end if - #elif $ortho_method.m == "no_search" - --annotate_hits_table annotate_hits_table.tsv - #end if - - #if $ortho_method.m == 'cache' - --cache '$ortho_method.cache' - #end if - - #if $ortho_method.m in ['diamond', 'mmseqs']: - ## Diamond option - #if $ortho_method.m == "diamond": - --matrix '$ortho_method.matrix_gapcosts.matrix' - $ortho_method.matrix_gapcosts.gap_costs - 
--sensmode $ortho_method.sensmode - $ortho_method.dmnd_iterate - $ortho_method.dmnd_ignore_warnings - #elif $ortho_method.m == "mmseqs": - --start_sens $ortho_method.start_sens - --sens_steps $ortho_method.sens_steps - --final_sens $ortho_method.final_sens - #end if - - ## Common options for search filtering (applies to diamond and mmseqs only) - #if str($ortho_method.query_cover): - --query_cover $ortho_method.query_cover - #end if - #if str($ortho_method.subject_cover): - --subject_cover $ortho_method.subject_cover - #end if - #if str($ortho_method.pident): - --pident $ortho_method.pident - #end if - #if str($ortho_method.evalue): - --evalue $ortho_method.evalue - #end if - #if str($ortho_method.score): - --score $ortho_method.score - #end if - #end if - + @DB_TOKEN@ + @ORTHO_SEARCH_TOKEN@ #if $annotation_options.no_annot == "--no_annot" --no_annot #else - #if str($annotation_options.seed_ortholog_evalue): - --seed_ortholog_evalue $annotation_options.seed_ortholog_evalue - #end if - #if str($annotation_options.seed_ortholog_score): - --seed_ortholog_score $annotation_options.seed_ortholog_score - #end if - #if $annotation_options.tax_scope: - --tax_scope=$annotation_options.tax_scope - #end if - #if $annotation_options.target_orthologs: - --target_orthologs=$annotation_options.target_orthologs - #end if - #if $annotation_options.go_evidence: - --go_evidence=$annotation_options.go_evidence - #end if + @ANNOTATION_TOKEN@ #end if $output_options.no_file_comments $output_options.report_orthologs @@ -96,261 +25,27 @@ --temp_dir \${TEMP:-\$_GALAXY_JOB_TMP_DIR} ]]> - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - value.metadata.columns == 11 - - - - - - - value.metadata.columns == 22 - - - - - 
+ + - - - Min E-value expected when searching for seed eggNOG ortholog. Applies to phmmer/diamond searches. - Queries not having a significant seed orthologs (E-value less than threshold) will not be annotated. - - - - - Min bit score expected when searching for seed eggNOG ortholog. - Queries not having a significant seed orthologs will not be annotated. - - - - - - - - - - - - - - - + -

- - - -

+ - - ortho_method['m'] not in ['no_search', 'cache'] - - - - - + + annotation_options['no_annot'] == '' - - - - - - - - - - - - - ortho_method['m'] != 'cache' and output_options['report_orthologs'] - - - - - - ortho_method['m'] == 'cache' and output_options['output_no_annotations'] - + + @@ -363,7 +58,6 @@

@@ -382,7 +76,6 @@

@@ -397,12 +90,12 @@

- +

- - - + + + @@ -416,7 +109,6 @@

@@ -439,7 +131,6 @@

@@ -459,7 +150,6 @@

@@ -502,40 +192,11 @@ Outputs ------- -**seed_orthologs** - -each line in the file provides the best match of each query within the best Orthologous Group (OG) -reported in the [project].hmm_hits file, obtained running PHMMER against all sequences within the best OG. -The seed ortholog is used to fetch fine-grained orthology relationships from eggNOG. -If using the diamond search mode, seed orthologs are directly -obtained from the best matching sequences by running DIAMOND against the whole eggNOG protein space. - -**annotations** - -This file provides final annotations of each query. Tab-delimited columns in the file are: +@HELP_SEARCH_OUTPUTS@ -- ``query_name``: query sequence name -- ``seed_eggNOG_ortholog``: best protein match in eggNOG -- ``seed_ortholog_evalue``: best protein match (e-value) -- ``seed_ortholog_score``: best protein match (bit-score) -- ``predicted_taxonomic_group`` -- ``predicted_protein_name``: Predicted protein name for query sequences -- ``GO_terms``: Comma delimited list of predicted Gene Ontology terms -- ``EC_number`` -- ``KEGG_KO`` -- ``KEGG_Pathway``: Comma delimited list of predicted KEGG pathways -- ``KEGG_Module`` -- ``KEGG_Reaction`` -- ``KEGG_rclass`` -- ``BRITE`` -- ``KEGG_TC`` -- ``CAZy`` -- ``BiGG_Reactions`` -- ``Annotation_tax_scope``: The taxonomic scope used to annotate this query sequence -- ``Matching_OGs``: Comma delimited list of matching eggNOG Orthologous Groups -- ``best_OG|evalue|score``: Best matching Orthologous Groups (deprecated, use smallest from eggnog OGs) -- ``COG_functional_categories``: COG functional category inferred from best matching OG -- ``eggNOG_free_text_description`` +@HELP_ANNOTATION_OUTPUTS@ + + **Recommentation for large input data** @@ -558,7 +219,6 @@ Another alternative is to use cached annotations (produced in a run with --md5 enabled). 
- ]]> diff -r 9d1fbff733cf -r 844fa988236b eggnog_mapper_annotate.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/eggnog_mapper_annotate.xml Mon Sep 04 12:47:09 2023 +0000 @@ -0,0 +1,141 @@ + + annotation phase + + eggnog_macros.xml + + + + + + + + +

+ +

+ + + + + + + + + + + + + + + + +

+ + +

+ + + + + + + + + + + + +

+ + + +

+ + + + + + + + + + + + + + + +

+ +

+ + +

+ + + + + + + + `_. +Outputs +------- + +@HELP_ANNOTATION_OUTPUTS@ + +**Recommendation for large input data** + +EggNOG-mapper consists of two phases + +1. finding seed orthologous sequences (compute intensive) +2. expanding annotations (IO intensive) + +by default (i.e. if *Method to search seed orthologs* is not *Skip search stage...* and *Annotate seed orthologs* is *Yes*) +both phases are executed within one tool run. + +For large input FASTA datasets it can be favourable to split this into two separate +tool runs as follows: + +1. Split the FASTA (e.g. 1M seqs per data set) +2. Run the search phase only (set *Annotate seed orthologs* to *No*) on the separate FASTA files. +3. Run the annotation phase (set *Method to search seed orthologs* to *Skip search stage...*) + +`See also <https://github.com/eggnogdb/eggnog-mapper/wiki/eggNOG-mapper-v2.1.5-to-v2.1.8#Setting_up_large_annotation_jobs>`_ + +Another alternative is to use cached annotations (produced in a run with --md5 enabled). + + + ]]> + + diff -r 9d1fbff733cf -r 844fa988236b eggnog_mapper_search.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/eggnog_mapper_search.xml Mon Sep 04 12:47:09 2023 +0000 @@ -0,0 +1,101 @@ + + search phase + + eggnog_macros.xml + + + + + + + + + + + + + + + + + + + +

+ +

+ + + + + + + + + + + + +

+ +

+ + + + + + + `_. +Outputs +------- + +@HELP_SEARCH_OUTPUTS@ + +**Recommendation for large input data** + +EggNOG-mapper consists of two phases + +1. finding seed orthologous sequences (compute intensive) +2. expanding annotations (IO intensive) + +by default (i.e. if *Method to search seed orthologs* is not *Skip search stage...* and *Annotate seed orthologs* is *Yes*) +both phases are executed within one tool run. + +For large input FASTA datasets it can be favourable to split this into two separate +tool runs as follows: + +1. Split the FASTA (e.g. 1M seqs per data set) +2. Run the search phase only (set *Annotate seed orthologs* to *No*) on the separate FASTA files. +3. Run the annotation phase (set *Method to search seed orthologs* to *Skip search stage...*) + +`See also <https://github.com/eggnogdb/eggnog-mapper/wiki/eggNOG-mapper-v2.1.5-to-v2.1.8#Setting_up_large_annotation_jobs>`_ + +Another alternative is to use cached annotations (produced in a run with --md5 enabled). + + + ]]> + +