diff mmseqs2_taxonomy_assignment.xml @ 2:876d26806584 draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/mmsesq2 commit 611b90f1628037f05d85905c88629a422d0a2053
author iuc
date Mon, 14 Apr 2025 18:39:38 +0000
parents d0acde079e2e
children
line wrap: on
line diff
--- a/mmseqs2_taxonomy_assignment.xml	Thu Mar 27 16:43:10 2025 +0000
+++ b/mmseqs2_taxonomy_assignment.xml	Mon Apr 14 18:39:38 2025 +0000
@@ -16,20 +16,25 @@
     --dbtype '$createdb.alph_type.dbtype'
     --shuffle $createdb.shuffle &&
 
-cp -r '$createtaxdb.database_type.mmseqs2_db_select.fields.path'/database* . &&
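+## Test-only: when download_tax_db is 'true', copy the selected reference DB into the working directory and run createtaxdb on the copy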
+#if str($download_tax_db) == 'true':
+cp -r '$database.database_type.mmseqs2_db_select.fields.path'/database* . &&
 
 mmseqs createtaxdb
     database
-    'tmp'
-    #if $createtaxdb.tax_mapping_file
-    --tax-mapping-file '$createtaxdb.tax_mapping_file'
-    #end if
-    --tax-mapping-mode '$createtaxdb.tax_mapping_mode'
-    --threads "\${GALAXY_SLOTS:-1}" &&
+    'tmp' &&
+#end if
+##
 
 #if $filtertaxseqdb.taxon_list
     mmseqs filtertaxseqdb
-        'database'
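+        ## Test-only branch: 'database' is the local copy made when download_tax_db is 'true'; otherwise the installed DB path is used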
+        #if str($download_tax_db) == 'true':
+            'database'
+        ##
+        #else
+            '$database.database_type.mmseqs2_db_select.fields.path'/database
+        #end if
         'database_filtered'
         --taxon-list '$filtertaxseqdb.taxon_list'
             &&
@@ -40,7 +45,13 @@
     #if $filtertaxseqdb.taxon_list
         'database_filtered'
     #else 
-        'database'
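+        ## Test-only branch: 'database' is the local copy made when download_tax_db is 'true'; otherwise the installed DB path is used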
+        #if str($download_tax_db) == 'true':
+            'database'
+        ##
+        #else
+            '$database.database_type.mmseqs2_db_select.fields.path'/database
+        #end if
     #end if
     'output_taxonomy'
     'tmp'
@@ -64,6 +75,7 @@
     --mask $taxonomy.prefilter.mask
     --mask-prob $taxonomy.prefilter.mask_prob
     --mask-lower-case $taxonomy.prefilter.mask_lower_case
+    --mask-n-repeat $taxonomy.prefilter.mask_n_repeat
     --min-ungapped-score $taxonomy.prefilter.min_ungapped_score
     --spaced-kmer-mode $taxonomy.prefilter.spaced_kmer_mode
     ##--spaced-kmer-pattern STR        User-specified spaced k-mer pattern []
@@ -137,11 +149,10 @@
     --translate $taxonomy.misc.translate
     --use-all-table-starts $taxonomy.misc.use_all_table_starts
     --id-offset $taxonomy.misc.id_offset
-    --add-orf-stop $taxonomy.misc.add_orf_stop
     --sequence-overlap $taxonomy.misc.sequence_overlap
     --sequence-split-mode $taxonomy.misc.sequence_split_mode
     --headers-split-mode $taxonomy.misc.headers_split_mode
-    --search-type $createtaxdb.database_type.search_type
+    --search-type $database.database_type.search_type
     --prefilter-mode $taxonomy.misc.prefilter_mode
 
     ##Common options
@@ -182,7 +193,13 @@
     #if $filtertaxseqdb.taxon_list
         'database_filtered'
     #else 
-        'database'
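+        ## Test-only branch: 'database' is the local copy made when download_tax_db is 'true'; otherwise the installed DB path is used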
+        #if str($download_tax_db) == 'true':
+            'database'
+        ##
+        #else
+            '$database.database_type.mmseqs2_db_select.fields.path'/database
+        #end if
     #end if
         'output_taxonomy'
         'taxo_result.txt'
@@ -195,7 +212,13 @@
     #if $filtertaxseqdb.taxon_list
         'database_filtered'
     #else 
-        'database'
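+        ## Test-only branch: 'database' is the local copy made when download_tax_db is 'true'; otherwise the installed DB path is used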
+        #if str($download_tax_db) == 'true':
+            'database'
+        ##
+        #else
+            '$database.database_type.mmseqs2_db_select.fields.path'/database
+        #end if
     #end if
         'output_taxonomy'
         'taxo_result.html'
@@ -206,6 +229,9 @@
 
     ]]></command>
     <inputs>
+        <!-- Used only by the tests: setting this to "true" makes the command copy the reference DB locally and run "mmseqs createtaxdb", so the taxonomy part of the DB is downloaded at test time instead of being stored in the test data directory (it is too large and cannot be reduced). -->
+        <param name="download_tax_db" type="hidden" value=""/>
+        <!-- -->
         <section name="createdb" title="Convert FASTA/Q file(s) to MMseqs sequence DB format"  expanded="true">
             <param name="input_fasta" type="data" format="fasta,fastq,fasta.gz,fastq.gz" label="Input fasta file" help="" />
             <conditional name="alph_type">
@@ -224,13 +250,11 @@
             </conditional>
             <param argument="--shuffle" type="boolean" checked="true" label="Shuffle input database" truevalue="1" falsevalue="0" optional="true" help="" />
         </section>
-        <section name="createtaxdb" title="Add taxonomic labels to reference sequence DB" expanded="true">
+        <section name="database" title="Choose the taxonomic reference database that you want to use" expanded="true">
             <conditional name="database_type">
                 <param name="type" type="select" label="Database type" help="" >
                     <option value="amino_acid_tax" selected="true">Amino acid with taxonomy information</option>
                     <option value="nucleotides_tax">Nucleotides with taxonomy information</option>
-                    <option value="amino_acid">Amino acid without taxonomy information</option>
-                    <option value="nucleotides">Nucleotides without taxonomy information</option>
                 </param>
                 <when value="amino_acid_tax">
                     <param name="mmseqs2_db_select" type="select" label="MMseqs2 databases">
@@ -252,32 +276,7 @@
                     </param>
                     <expand macro="search_type_nt" />
                 </when>
-                <when value="amino_acid">
-                    <param name="mmseqs2_db_select" type="select" label="MMseqs2 databases">
-                        <options from_data_table="mmseqs2_databases">
-                            <filter type="static_value" value="aminoacid" column="type"/>
-                            <filter type="static_value" value="no" column="taxonomy"/>
-                            <validator message="No mmseqs2 database is available" type="no_options"/>
-                        </options>
-                    </param>
-                    <expand macro="search_type_aa" />
-                </when>
-                <when value="nucleotides">
-                    <param name="mmseqs2_db_select" type="select" label="MMseqs2 databases">
-                        <options from_data_table="mmseqs2_databases">
-                            <filter type="static_value" value="nucleotide" column="type"/>
-                            <filter type="static_value" value="no" column="taxonomy"/>
-                            <validator message="No mmseqs2 database is available" type="no_options"/>
-                        </options>
-                    </param>
-                    <expand macro="search_type_nt" />
-                </when>
             </conditional>
-            <param argument="--tax-mapping-file" type="data" format="tabular,tsv,txt" label="File to map sequence identifier to taxonomical identifier" optional="true"/>
-            <param argument="--tax-mapping-mode" type="select" label="Map taxonomy based on sequence database" help="" >
-                <option value="0" selected="true">0: .lookup file</option>
-                <option value="1">1: .source file</option>
-            </param>
         </section>
         <section name="filtertaxseqdb" title="Filter taxonomy sequence database">
             <param argument="--taxon-list" type="text" optional="true" value="" label="Taxonomy ID" help="Possibly multiple values separated by ','"/>
@@ -432,7 +431,6 @@
                 <param argument="--translate" type="boolean" checked="false" truevalue="1" falsevalue="0" label="Translate ORF to amino acid" help=""/>
                 <param argument="--use-all-table-starts" type="boolean" checked="false" truevalue="1" falsevalue="0" label="Use all alternatives for a start codon in the genetic table, if false - only ATG (AUG)" help=""/>
                 <param argument="--id-offset" type="integer" min="0" value="0" label="Numeric ids in index file are offset by this value" help=""/>
-                <param argument="--add-orf-stop" type="boolean" checked="false" truevalue="1" falsevalue="0" label="Add stop codon '*' at complete start and end" help=""/>
                 <param argument="--sequence-overlap" type="integer" min="0" value="0" label="Overlap between sequences" help=""/>
                 <param argument="--sequence-split-mode" type="select" label="Sequence split mode" help="" >
                     <option value="0">Copy data</option>
@@ -494,13 +492,14 @@
     <tests>
         <!-- Test with Kraken report -->
         <test expect_num_outputs="2">
+            <param name="download_tax_db" value="true"/>
             <section name="createdb">
                 <param name="input_fasta" value="light_mystery_reads.fasta" ftype="fasta"/>
                 <conditional name="alph_type">
                     <param name="dbtype" value="2"/>
                 </conditional>
             </section>
-            <section name="createtaxdb">
+            <section name="database">
                 <conditional name="database_type">
                     <param name="type" value="amino_acid_tax"/>
                     <param name="mmseqs2_db_select" value="UniProtKB/Swiss-Prot-15.6f452-10022025" />
@@ -509,28 +508,33 @@
             <section name="filtertaxseqdb">
                 <param name="taxon_list" value="2" />
             </section>
+            <section name="taxonomy">
+                <section name="prefilter">
+                    <param name="mask_n_repeat" value="1" />
+                </section>
+            </section>
             <conditional name="krona_report">
                 <param name="keep_report" value="No"/>
             </conditional>
             <output name="output_taxonomy_tsv" ftype="tabular">
                 <assert_contents>
-                    <has_line line="MYSTERY.222&#009;1236&#009;class&#009;Gammaproteobacteria&#009;1&#009;1&#009;1&#009;1.000"/>
                     <has_line line="MYSTERY.64&#009;119060&#009;family&#009;Burkholderiaceae&#009;1&#009;1&#009;1&#009;1.000"/>
                     <has_n_columns n="8"/>
                 </assert_contents>
             </output>
             <output name="output_taxonomy_kraken" ftype="txt">
                 <assert_contents>
-                    <has_text text="93.3333"/>
-                    <has_text text="33.3333"/>
+                    <has_text text="kingdom"/>
+                    <has_text text="Pseudomonadati"/>
                 </assert_contents>
             </output>
         </test>
         <test expect_num_outputs="2">
+            <param name="download_tax_db" value="true"/>
             <section name="createdb">
                 <param name="input_fasta" value="light_mystery_reads.fasta" ftype="fasta"/>
             </section>
-            <section name="createtaxdb">
+            <section name="database">
                 <conditional name="database_type">
                     <param name="type" value="amino_acid_tax"/>
                     <param name="mmseqs2_db_select" value="UniProtKB/Swiss-Prot-15.6f452-10022025" />
@@ -553,10 +557,11 @@
             </output>
         </test>
         <test expect_num_outputs="3">
+            <param name="download_tax_db" value="true"/>
             <section name="createdb">
                 <param name="input_fasta" value="light_mystery_reads.fasta" ftype="fasta"/>
             </section>
-            <section name="createtaxdb">
+            <section name="database">
                 <conditional name="database_type">
                     <param name="type" value="amino_acid_tax"/>
                     <param name="mmseqs2_db_select" value="UniProtKB/Swiss-Prot-15.6f452-10022025" />
@@ -576,8 +581,8 @@
             </output>
             <output name="output_taxonomy_kraken" ftype="txt">
                 <assert_contents>
-                    <has_text text="93.3333"/>
-                    <has_text text="33.3333"/>
+                    <has_text text="kingdom"/>
+                    <has_text text="Pseudomonadati"/>
                 </assert_contents>
             </output>
         </test>
@@ -596,9 +601,6 @@
 * Convert FASTA/Q file(s) to MMseqs sequence DB format
     *mmseqs createdb <i:fastaFile1[.gz|.bz2]> ... <i:fastaFileN[.gz|.bz2]>|<i:stdin> <o:sequenceDB> [options]*
 
-* Add taxonomic labels to sequence DB
-    *mmseqs createtaxdb <i:sequenceDB> <tmpDir> [options]*
-
 * Filter taxonomy sequence database
     *mmseqs filtertaxseqdb <i:taxSeqDB> <o:taxSeqDB> [options]*