Mercurial > repos > earlhaminst > lotus2

--- a/lotus2.xml	Wed May 19 02:38:24 2021 +0000
+++ b/lotus2.xml	Wed May 19 19:15:08 2021 +0000
@@ -1,7 +1,7 @@
 <tool id="lotus2" name="LotuS2" version="@VERSION@" profile="20.01">
     <description>fast OTU processing pipeline</description>
     <macros>
-        <token name="@VERSION@">2.05.1</token>
+        <token name="@VERSION@">2.06</token>
         <xml name="refDB_macro">
             <param argument="-refDB" type="select" label="Reference Database">
                 <option value="SLV" selected="true">Silva LSU (23/28S) or SSU (16/18S) (SLV)</option>
@@ -11,6 +11,10 @@
                 <option value="beetax">Bee gut specific database and tax names (beetax)</option>
                 <option value="HITdb">Human gut microbiota (HITdb)</option>
             </param>
+            <param argument="-useBestBlastHitOnly" type="boolean" truevalue="1" falsevalue="0" checked="false" label="Use the best Blast hit only" help="Do not use LCA (lowest common ancestor) to determine the most likely taxonomic level (not recommended)" />
+        </xml>
+        <xml name="id_macro">
+            <param argument="-id" type="float" min="0" max="1" value="0.97" label="Clustering threshold for OTUs" />
         </xml>
     </macros>
     <requirements>
@@ -18,12 +22,28 @@
     </requirements>
     <version_command>lotus2 --version</version_command>
     <command detect_errors="exit_code"><![CDATA[
+#import os.path
+#import re
+#def symlink_basename($f):
+    #set fn = re.sub('[^\w\-_.]', '_', $f.element_identifier)
+    #if fn.endswith('.gz'):
+        #set fn = fn[:-3]
+    #end if
+    #for ext in ('.fq', '.fastq', '.fastqsanger'):
+        #if fn.endswith($ext):
+            #set fn = fn[:-len($ext)]
+            #break
+        #end if
+    #end for
+$fn#slurp
+#end def
+
 mkdir input
 &&
 #if $inputs.paired_or_single == 'single':
-    #for i, f in enumerate($inputs.input):
+    #for f in $inputs.input:
         #set ext = $f.ext.replace('sanger', '')
-        ln -s '$f' 'input/input${i}.${ext}' &&
+        ln -s '$f' 'input/${symlink_basename(f)}.${ext}' &&
     #end for
 #elif $inputs.paired_or_single == 'paired':
     #for i, f in enumerate($inputs.left_input):
@@ -35,11 +55,11 @@
         ln -s '$f' 'input/input${i}.2.${ext}' &&
     #end for
 #else:
-    #for i, f in enumerate($inputs.pair_input):
+    #for f in $inputs.pair_input:
         #set ext = $f.forward.ext.replace('sanger', '')
-        ln -s '$f.forward' 'input/input${i}.1.${ext}' &&
+        ln -s '$f.forward' 'input/${symlink_basename(f)}.1.${ext}' &&
         #set ext = $f.reverse.ext.replace('sanger', '')
-        ln -s '$f.reverse' 'input/input${i}.2.${ext}' &&
+        ln -s '$f.reverse' 'input/${symlink_basename(f)}.2.${ext}' &&
     #end for
 #end if

@@ -62,9 +82,19 @@
 #if $reversePrimer:
     -reversePrimer '$reversePrimer'
 #end if
+## A built-in genome is a row of the all_fasta data table: pass its path column, not the row id.
+#if $offtarget_cond.offtargetDB == 'cached':
+    -offtargetDB '$offtarget_cond.ref_file.fields.path'
+#elif $offtarget_cond.offtargetDB == 'history':
+    -offtargetDB '$offtarget_cond.ref_file'
+#end if

--clustering $clu_args.clustering
--id $clu_args.id
+-clustering $clu_args.clu_cond.clustering
+#if $clu_args.clu_cond.clustering in ('1', '3'):
+    -id $clu_args.clu_cond.id
+#elif $clu_args.clu_cond.clustering == '2':
+    -swarm_distance $clu_args.clu_cond.swarm_distance
+#end if
 #if $clu_args.derepMin:
     -derepMin '$clu_args.derepMin'
 #end if
@@ -79,6 +106,7 @@
     -utax_thr $tax_args.aligner_cond.utax_thr
 #else:
     -refDB $tax_args.aligner_cond.refDB
+    -useBestBlastHitOnly $tax_args.aligner_cond.useBestBlastHitOnly
 #end if
 -amplicon_type $tax_args.amplicon_type
 -tax_group $tax_args.tax_group
@@ -87,6 +115,8 @@
 -LCA_cover $tax_args.LCA_cover
 -LCA_frac $tax_args.LCA_frac
 -greengenesSpecies $tax_args.greengenesSpecies
+-lulu $tax_args.lulu
+-buildPhylo $tax_args.buildPhylo

 ; EXIT_VALUE=\$? ;

@@ -99,16 +129,17 @@
         <conditional name="inputs">
             <param name="paired_or_single" type="select" label="Paired or Single-end data?">
                 <option value="single" selected="true">Single-end</option>
-                <option value="paired">Paired-end</option>
                 <option value="paired_collection">Paired-end collection</option>
             </param>
             <when value="single">
                 <param name="input" type="data" format="fastqsanger,fastqsanger.gz" multiple="true" label="Single-end reads" />
             </when>
+<!--
             <when value="paired">
                 <param name="left_input" type="data" format="fastqsanger,fastqsanger.gz" multiple="true" label="Left/Forward strand reads" />
                 <param name="right_input" type="data" format="fastqsanger,fastqsanger.gz" multiple="true" label="Right/Reverse strand reads" />
             </when>
+-->
             <when value="paired_collection">
                 <param name="pair_input" type="data_collection" collection_type="list:paired" format="fastqsanger,fastqsanger.gz" label="List of paired reads" />
             </when>
@@ -122,33 +153,66 @@
         <param argument="-barcode" type="data" format="fastqsanger" optional="true" label="Barcode (MID) sequences (optional)" help="FASTQ file with barcodes (in the processed mi/hiSeq format), if provided by the sequencer" />
         <param argument="-forwardPrimer" type="text" value="" label="Forward primer used to amplify DNA region" help="E.g. 16S primer fwd" />
         <param argument="-reversePrimer" type="text" value="" label="Reverse primer used to amplify DNA region" help="E.g. 16S primer rev" />
+        <conditional name="offtarget_cond">
+            <param argument="-offtargetDB" type="select" label="Remove likely contaminant OTUs/ASVs based on alignment to host genome" help="Useful for low-bacterial biomass samples to remove possible host genome contaminations">
+                <option value="no" selected="true">Disabled</option>
+                <option value="cached">Use a built-in genome</option>
+                <option value="history">Use a genome from history</option>
+            </param>
+            <when value="no" />
+            <when value="cached">
+                <param name="ref_file" type="select" label="Using reference genome" help="Select genome from the list">
+                    <options from_data_table="all_fasta">
+                        <filter type="sort_by" column="2" />
+                        <validator type="no_options" message="No reference genomes are available" />
+                    </options>
+                </param>
+            </when>
+            <when value="history">
+                <param name="ref_file" type="data" format="fasta" label="FASTA reference genome" />
+            </when>
+        </conditional>
         <section name="clu_args" title="Clustering Options">
-            <param argument="-clustering" type="select" label="Clustering algorithm">
-                <option value="1">UPARSE</option>
-                <option value="2">swarm</option>
-                <option value="3">cd-hit</option>
-                <option value="6">unoise3</option>
-                <option value="7" selected="true">dada2</option>
-            </param>
-            <param argument="-id" type="float" min="0" max="1" value="0.97" label="Clustering threshold for OTUs" />
-            <param argument="-derepMin" type="text" value="" label="Minimum size of dereplicated raw reads" help="E.g. 4:1,4:2,3:3 . See http://lotus2.earlham.ac.uk/images/Derep_options.pdf for how to specify this parameter" />
+            <conditional name="clu_cond">
+                <param argument="-clustering" type="select" label="Clustering algorithm">
+                    <option value="1">UPARSE</option>
+                    <option value="2">swarm</option>
+                    <option value="3">cd-hit</option>
+                    <option value="6">unoise3</option>
+                    <option value="7" selected="true">dada2</option>
+                </param>
+                <when value="1">
+                    <expand macro="id_macro" />
+                </when>
+                <when value="2">
+                    <param argument="-swarm_distance" type="integer" min="1" value="1" label="Clustering threshold for OTUs when using swarm clustering" />
+                </when>
+                <when value="3">
+                    <expand macro="id_macro" />
+                </when>
+                <when value="6">
+                </when>
+                <when value="7">
+                </when>
+            </conditional>
+            <param argument="-derepMin" type="text" value="" label="Minimum size of dereplicated raw reads (optional)" help="E.g. 4:1,4:2,3:3 . See http://lotus2.earlham.ac.uk/images/Derep_options.pdf for how to specify this parameter. If not specified, LotuS2 will select an appropriate default for the chosen clustering algorithm." />
             <param argument="-deactivateChimeraCheck" type="select" label="Chimera check">
                 <option value="0" selected="true">OTU chimera checks</option>
                 <option value="1">No chimera check at all</option>
-                <option value="2">Deactivate deNovo chimera check</option>
-                <option value="3">Deactivate ref based chimera check</option>
+                <option value="2">Disable deNovo chimera check</option>
+                <option value="3">Disable ref based chimera check</option>
             </param>
             <param argument="-chim_skew" type="integer" min="0" value="2" label="Skew in chimeric fragment abundance" />
             <param argument="-readOverlap" type="integer" min="0" value="300" label="Maximum number of basepairs that two reads are overlapping" />
         </section>
         <section name="tax_args" title="Taxonomy Options">
             <conditional name="aligner_cond">
-                <param argument="-taxAligner" type="select" label="Taxonomy aligner">
-                    <option value="0" selected="true">Deactivated (just use RDP)</option>
-                    <option value="1">Blast</option>
-                    <option value="2">Use LAMBDA to search against a 16S reference database for taxonomic profiling of OTUs</option>
-                    <option value="3">Use UTAX with custom databases</option>
-                    <option value="4">Use VSEARCH to align OTUs to custom databases</option>
+                <param argument="-taxAligner" type="select" label="Taxonomy aligner for taxonomic profiling of OTUs">
+                    <option value="0" selected="true">RDPclassifier (max likelihood)</option>
+                    <option value="1">Blast LCA against custom reference database</option>
+                    <option value="2">LAMBDA LCA against custom reference database</option>
+                    <option value="3">UTAX likelihood corrected</option>
+                    <option value="4">VSEARCH LCA against custom reference database</option>
                 </param>
                 <when value="0">
                     <param argument="-rdp_thr" type="float" min="0" max="1" value="0.8" label="Confidence threshold for RDP"/>
@@ -182,6 +246,12 @@
             <param argument="-LCA_cover" type="float" min="0" max="1" value="0.9" label="Minimum horizontal coverage of an OTU sequence against ref DB"/>
             <param argument="-LCA_frac" type="float" min="0" max="1" value="0.9" label="Minimum fraction of reads with identical taxonomy"/>
             <param argument="-greengenesSpecies" type="boolean" truevalue="1" falsevalue="0" checked="false" label="Create greengenes output labels instead of OTU" />
+            <param argument="-lulu" type="boolean" truevalue="1" falsevalue="0" checked="false" label="Use LULU to merge OTUs based on their occurrence" />
+            <param argument="-buildPhylo" type="select" label="Build OTU phylogeny">
+                <option value="0">Disable</option>
+                <option value="1" selected="true">Use fasttree2</option>
+                <option value="2">Use iqtree2</option>
+            </param>
         </section>
     </inputs>

@@ -190,9 +260,8 @@
         <data name="otu_biom" format="biom" label="${tool.name} on ${on_string}: biom-formatted OTU abundance matrix" from_work_dir="output/OTU.biom" />
         <data name="otu_fna" format="fasta" label="${tool.name} on ${on_string}: FASTA-formatted extended OTU seed sequences" from_work_dir="output/OTU.fna" />
         <data name="OTUphylo_nwk" format="newick" label="${tool.name} on ${on_string}: Newick-formatted phylogenetic tree between sequences" from_work_dir="output/OTUphylo.nwk" />
-        <data name="hiera_blast" format="tabular" label="${tool.name} on ${on_string}: OTU taxonomy assignments based on Blastn" from_work_dir="output/hiera_BLAST.txt" />
-        <data name="hiera_rdp" format="tabular" label="${tool.name} on ${on_string}: OTU taxonomy assignments based on RDP classifier" from_work_dir="output/hiera_RDP.txt" />
-        <data name="primary" format="tar" label="${tool.name} on ${on_string}: All output files" from_work_dir="output.tar.gz" />
+        <data name="mapping" format="tabular" label="${tool.name} on ${on_string}: mapping file" from_work_dir="output/primary/in.map" />
+        <data name="outputs" format="tar" label="${tool.name} on ${on_string}: All output files" from_work_dir="output.tar.gz" />
     </outputs>

     <tests>
@@ -203,7 +272,7 @@
             <param name="clustering" value="3" />
             <output name="otu" file="OTU.txt" compare="sim_size" />
             <output name="otu_fna" file="OTU.fna" compare="sim_size" />
-            <output name="hiera_rdp" file="hiera_RDP.txt" compare="sim_size" />
+            <output name="mapping" file="mapping.txt" />
         </test>
     </tests>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/mapping.txt	Wed May 19 19:15:08 2021 +0000
@@ -0,0 +1,3 @@
+#SampleID	fastqFile	SequencingRun
+SMPL0	Anh_sample1.fastq.gz	a
+SMPL1	Anh_sample2.fastq.gz	a
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/all_fasta.loc.sample	Wed May 19 19:15:08 2021 +0000
@@ -0,0 +1,18 @@
+#This file lists the locations and dbkeys of all the fasta files
+#under the "genome" directory (a directory that contains a directory
+#for each build). The script extract_fasta.py will generate the file
+#all_fasta.loc. This file has the format (white space characters are
+#TAB characters):
+#
+#<unique_build_id>	<dbkey>	<display_name>	<file_path>
+#
+#So, all_fasta.loc could look something like this:
+#
+#apiMel3	apiMel3	Honeybee (Apis mellifera): apiMel3	/path/to/genome/apiMel3/apiMel3.fa
+#hg19canon	hg19	Human (Homo sapiens): hg19 Canonical	/path/to/genome/hg19/hg19canon.fa
+#hg19full	hg19	Human (Homo sapiens): hg19 Full	/path/to/genome/hg19/hg19full.fa
+#
+#Your all_fasta.loc file should contain an entry for each individual
+#fasta file. So there will be multiple fasta files for each build,
+#such as with hg19 above.
+#
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.sample	Wed May 19 19:15:08 2021 +0000
@@ -0,0 +1,9 @@
+<!-- Use the file tool_data_table_conf.xml.oldlocstyle if you don't want to update your loc files as changed in revision 4550:535d276c92bc -->
+<tables>
+    <!-- Locations of all fasta files under genome directory -->
+    <table name="all_fasta" comment_char="#" allow_duplicate_entries="False">
+        <columns>value, dbkey, name, path</columns>
+        <file path="tool-data/all_fasta.loc" />
+    </table>
+</tables>
+