diff umi-tools_extract.xml @ 15:27ac32a22ad2 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/umi_tools commit bf6a3aa532e8f9d122da4c1e39f3e256ae587b79"
author iuc
date Mon, 13 Sep 2021 14:52:06 +0000
parents 9fa7803d1c51
children 7accf7407811
line wrap: on
line diff
--- a/umi-tools_extract.xml	Wed Jun 02 18:27:33 2021 +0000
+++ b/umi-tools_extract.xml	Mon Sep 13 14:52:06 2021 +0000
@@ -1,118 +1,96 @@
-<tool id="umi_tools_extract" name="UMI-tools extract" version="@VERSION@.2">
+<tool id="umi_tools_extract" name="UMI-tools extract" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
     <description>Extract UMI from fastq files</description>
+    <expand macro="bio_tools"/>
     <macros>
         <import>macros.xml</import>
-        <macro name="out_conditional">
-            <actions>
-                <conditional name="input_type.type">
-                    <when value="paired_collection" >
-                        <action type="format">
-                            <option type="from_param" name="input_type.input_readpair" param_attribute="forward.ext" />
-                        </action>
-                    </when>
-                    <when value="paired" >
-                        <action type="format">
-                            <option type="from_param" name="input_type.input_read1"  param_attribute="ext" />
-                        </action>
-                    </when>
-                </conditional>
-            </actions>
-        </macro>
     </macros>
     <expand macro="requirements" />
     <command detect_errors="exit_code"><![CDATA[
     @COMMAND_LINK@
 
     umi_tools extract
-            --extract-method='$extract_method.value'
-            --bc-pattern='$bc_pattern'
-
-            #if $input_type.type == 'single':
-                #if $gz:
-                    --stdin=input_single.gz
-                    --stdout out.gz
-                #else
-                    --stdin=input_single.txt
-                    --stdout '$out'
-                #end if
+        
+        @FASTQ_BARCODE_EXTRACTION_OPTIONS@
+        #if $input_type_cond.input_type == 'single':
+            #if $gz:
+                --stdin=input_single.gz
+                --stdout out.gz
+            #else
+                --stdin=input_single.txt
+                --stdout '$out'
+            #end if
+        #else:
+            #if $gz:
+                --stdin=input_read1.gz
+                --read2-in=input_read2.gz
+                --stdout out1.gz
+                --read2-out=out2.gz
             #else:
-                #if $gz:
-                    --stdin=input_read1.gz
-                    --read2-in=input_read2.gz
-                    --stdout out1.gz
-                    --read2-out=out2.gz
-                #else:
-                    --stdin=input_read1.txt
-                    --read2-in=input_read2.txt
-                    --stdout '$out1'
+                --stdin=input_read1.txt
+                --read2-in=input_read2.txt
+                #if $input_type_cond.input_type == 'paired'
+                    --stdout '$out'
                     --read2-out='$out2'
-                #end if
-                #if $input_type.barcode.barcode_select == "both_reads":
-                    --split-barcode
-                    --bc-pattern2='$input_type.barcode.bc_pattern2'
+                #else
+                    --stdout '$out_paired_collection.forward'
+                    --read2-out='$out_paired_collection.reverse'
                 #end if
             #end if
+            $input_type_cond.reconcile_pairs
+        #end if
 
-            #if $barcodes.use_barcodes.value == 'yes':
-                --filter-cell-barcode
-                --whitelist='$barcodes.filter_barcode_file'
-                '$barcodes.filter_correct.value'
-            #end if
+        #if $whitelist
+            --whitelist='$whitelist'
+        #end if
+        #if $blacklist
+            --blacklist='$blacklist'
+        #end if
+        $error_correct_cell.value
 
-            #if not $prime3:
-                --3prime
-            #end if
-            #if $quality.quality_selector =='true':
+        #if $quality.quality_selector =='true':
+            #if str($quality.quality_filter_threshold) != ''
                 --quality-filter-threshold '$quality.quality_filter_threshold'
-                --quality-encoding '$quality.quality_encoding'
+            #end if
+            #if str($quality.quality_filter_mask) != ''
+                --quality-filter-mask '$quality.quality_filter_mask'
+            #end if
+            #if $input_type_cond.input_type != 'paired_collection'
+                #set input=$input_type_cond.input_read1
+            #else
+                #set input=$input_type_cond.input_readpair.forward
             #end if
-            #if $print_log == "1":
-                --log='$out_log'
+            --quality-encoding
+            #if $input.ext.startswith("fastqillumina")
+                phred64
+            #else if $input.ext.startswith("fastqsolexa")
+                solexa
+            #else
+                phred33
             #end if
+        #end if
+        @LOG@
         #if $gz:
-            #if $input_type.type == 'single':
+            #if $input_type_cond.input_type == 'single':
                 && mv out.gz '$out'
+            #else if $input_type_cond.input_type == 'paired' 
+                && mv out1.gz '$out'
+                && mv out2.gz '$out2'
             #else
-                && mv out1.gz '$out1'
-                && mv out2.gz '$out2'
+                && mv out1.gz '$out_paired_collection.forward'
+                && mv out2.gz '$out_paired_collection.reverse'
             #end if
         #end if
     ]]></command>
     <inputs>
-        <expand macro="input_types" />
-
-        <conditional name="barcodes" >
-            <param name="use_barcodes" argument="--filter-cell-barcode" type="select" label="Use Known Barcodes?" >
-                <option value="yes">Yes</option>
-                <option value="no" selected="true" >No</option>
-            </param>
-            <when value="no" />
-            <when value="yes" >
-                <param name="filter_barcode_file" type="data" format="tabular,tsv" label="Barcode File" />
-                <param name="filter_correct" argument="--error-correct-cell" type="boolean" truevalue="--error-correct-cell" falsevalue="" checked="false" label="Apply correction to cell barcodes?" help="This only applies if your barcode file has two columns output from the umi_tools whitelist command." />
-            </when>
-        </conditional>
+        <expand macro="input_types">
+                <param argument="--reconcile-pairs" type="boolean" truevalue="--reconcile-pairs" falsevalue="" checked="false" label="Allow unpaired reads" help="Allow the presences of reads in read2 input that are not present in read1 input. This allows cell barcode filtering of read1s without considering read2s" />
+        </expand>
+        <expand macro="fastq_barcode_extraction_options_macro"/>
 
-        <param name="extract_method" type="select" label="Method to extract barcodes" >
-            <option value="regex">Regular Expressions</option>
-            <option value="string" selected="true">String</option>
-        </param>
-            
-        <param name="bc_pattern" argument="--bc-pattern" type="text" label="Barcode pattern for first read"
-            help="Use this option to specify the format of the UMI/barcode. Use Ns to
-                    represent the random positions and Xs to indicate the bc positions.
-                    Bases with Ns will be extracted and added to the read name. Remaining
-                    bases, marked with an X will be reattached to the read.">
-            <expand macro="barcode_sanitizer" />
-        </param>
+        <param argument="--whitelist" type="data" optional="true" format="tabular,tsv" label="Allowlist of accepted barcodes" />
+        <param argument="--blacklist" type="data" optional="true" format="tabular,tsv" label="Denylist of rejected barcodes" help="Cell barcodes listed in this file are rejected" />
+        <param argument="--error-correct-cell" type="boolean" truevalue="--error-correct-cell" falsevalue="" checked="false" label="Apply correction to cell barcodes?" help="This only applies if your barcode file has two columns output from the umi_tools whitelist command" />
 
-        <param name="prime3" argument="--3prime" type="boolean" label="Is the barcode at the 5' end?"
-            truevalue="1" falsevalue="0" checked="true"
-            help="By default the barcode is assumed to be on the 5' end of the read, but
-                use this option to sepecify that it is on the 3' end instead." />
-        <param name="print_log" argument="-L" type="boolean" label="Output log?"
-            truevalue="1" falsevalue="0" checked="true"
-            help="Choose if you want to generate a text file containing logging information." />
         <conditional name="quality">
             <param name="quality_selector" type="select" label="Enable quality filter?" >
                 <option value="false">No</option>
@@ -121,45 +99,46 @@
             <when value="false">
             </when>
             <when value="true">
-                <param name="quality_filter_threshold" label="Phred score threshold"
-                    type="integer" value="20" argument="--quality-filter-threshold"
-                    help="Remove reads where any UMI base quality score falls below this threshold." />
-                <param name="quality_encoding" argument="--quality-encoding" type="select" label="Library type"
-                    help="Quality score encoding. Choose from phred33 [33-77], phred64 [64-106] or solexa [59-106].">
-                    <option value="phred33">phred33 [33-77]</option>
-                    <option value="phred64">phred64 [64-106]</option>
-                    <option value="solexa">solexa [59-106]</option>
-                </param>
+                <param argument="--quality-filter-threshold" label="Phred score threshold"
+                    type="integer" value="" optional="true"
+                    help="Remove reads where any UMI base quality score falls below this threshold" />
+                <param argument="--quality-filter-mask" label="Mask UMI bases below threshold"
+                    type="integer" value="" optional="true"
+                    help="If a UMI base has a quality below this threshold,
+                    replace the base with 'N'" />
             </when>
         </conditional>
+        <expand macro="log_input_macro"/>
     </inputs>
     <outputs>
-        <data name="out" format_source="input_single" label="Reads: ${tool.name} on ${on_string}" >
-            <filter>input_type['type'] == "single"</filter>
+        <data name="out" format_source="input_read1" label="${tool.name} on ${on_string}: Reads" >
+            <filter>input_type_cond['input_type'] in ['single', 'paired']</filter>
         </data>
-        <data name="out1" format_source="input_read1" label="Reads1: ${tool.name} on ${on_string}" >
-            <filter>input_type['type'] != "single"</filter>
-            <expand macro="out_conditional" />
+        <data name="out2" format_source="input_read2" label="${tool.name} on ${on_string}: Reads2" >
+            <filter>input_type_cond['input_type'] == 'paired'</filter>
         </data>
-        <data name="out2" format_source="input_read2" label="Reads2: ${tool.name} on ${on_string}" >
-            <filter>input_type['type'] != "single"</filter>
-            <expand macro="out_conditional" />
-        </data>
-        
-        <data name="out_log" format="txt">
-            <filter>print_log == True</filter>
-        </data>
+        <collection name="out_paired_collection" type="paired" label="${tool.name} on ${on_string}: Reads">
+            <data name="forward" format_source="input_readpair" />
+            <data name="reverse" format_source="input_readpair" />
+            <filter>input_type_cond['input_type'] == 'paired_collection'</filter>
+        </collection>
+        <expand macro="log_output_macro"/>
     </outputs>
     <tests>
         <test expect_num_outputs="2">
-            <param name="type" value="single" />
-            <param name="input_single" value="t_R1.fastq" ftype="fastq" />
-            <param name="bc_pattern" value="XXXNNN" />
-            <param name="prime3" value="0" />
+            <conditional name="input_type_cond">
+                <param name="input_type" value="single" />
+                <param name="input_read1" value="t_R1.fastq" ftype="fastqsanger" />
+                <param name="bc_pattern" value="XXXNNN" />
+            </conditional>
+            <conditional name="extract_method_cond">
+                <param name="prime3" value="true" />
+            </conditional>
             <param name="quality_selector" value="true" />
             <param name="quality_filter_threshold" value="10" />
             <param name="quality_encoding" value="phred33" />
-            <output name="out" file="out_SE.fastq" ftype="fastq" />
+            <param name="log" value="true"/>
+            <output name="out" file="out_SE.fastq" ftype="fastqsanger" />
             <output name="out_log" >
                 <assert_contents>
                     <has_text text="Input Reads: 100" />
@@ -169,12 +148,15 @@
             </output>
         </test>
         <test expect_num_outputs="3">
-            <param name="type" value="paired" />
-            <param name="input_read1" value="t_R1.fastq.gz" ftype="fastq.gz" />
-            <param name="input_read2" value="t_R2.fastq.gz" ftype="fastq.gz" />
-            <param name="bc_pattern" value="NNNXXX" />
-            <output name="out1" file="out_R1.fastq.gz" decompress="true" lines_diff="2" ftype="fastq.gz" />
-            <output name="out2" file="out_R2.fastq.gz" decompress="true" lines_diff="2" ftype="fastq.gz" />
+            <conditional name="input_type_cond">
+                <param name="input_type" value="paired" />
+                <param name="input_read1" value="t_R1.fastq.gz" ftype="fastqsanger.gz" />
+                <param name="input_read2" value="t_R2.fastq.gz" ftype="fastqsanger.gz" />
+                <param name="bc_pattern" value="NNNXXX" />
+            </conditional>
+            <param name="log" value="true"/>
+            <output name="out" file="out_R1.fastq.gz" decompress="true" lines_diff="2" ftype="fastqsanger.gz" />
+            <output name="out2" file="out_R2.fastq.gz" decompress="true" lines_diff="2" ftype="fastqsanger.gz" />
             <output name="out_log" >
                 <assert_contents>
                     <has_text text="Input Reads: 100" />
@@ -182,18 +164,23 @@
                 </assert_contents>
             </output>
         </test>
-        <test expect_num_outputs="3">
-            <param name="type" value="paired_collection" /> <!-- same as before, but uncompressed -->
-            <param name="paired_type" value="no" />
-            <param name="input_readpair" >
-                <collection type="paired" >
-                    <element name="forward" ftype="fastq" value="t_R1.fastq" />
-                    <element name="reverse" ftype="fastq" value="t_R2.fastq" />
-                </collection>
-            </param>
-            <param name="bc_pattern" value="NNNXXX" />
-            <output name="out1" file="out_R1.fastq" ftype="fastq" />
-            <output name="out2" file="out_R2.fastq" ftype="fastq" />
+        <test expect_num_outputs="4">
+            <conditional name="input_type_cond">
+                <param name="input_type" value="paired_collection" /> <!-- same as before, but uncompressed -->
+                <param name="paired_type" value="no" />
+                <param name="input_readpair">
+                    <collection type="paired" >
+                        <element name="forward" ftype="fastqsanger" value="t_R1.fastq" />
+                        <element name="reverse" ftype="fastqsanger" value="t_R2.fastq" />
+                    </collection>
+                </param>
+                <param name="bc_pattern" value="NNNXXX" />
+            </conditional>
+            <param name="log" value="true"/>
+            <output_collection name="out_paired_collection" type="paired">
+                <element name="forward" file="out_R1.fastq" ftype="fastqsanger" />
+                <element name="reverse" file="out_R2.fastq" ftype="fastqsanger" />
+            </output_collection>
             <output name="out_log" >
                 <assert_contents>
                     <has_text text="Input Reads: 100" />
@@ -202,113 +189,87 @@
             </output>
         </test>
         <test expect_num_outputs="3">
-            <param name="type" value="paired" />
-            <param name="input_read1" value="scrb_seq_fastq.1.gz" ftype="fastq.gz" />
-            <param name="input_read2" value="scrb_seq_fastq.2.gz" ftype="fastq.gz" />
+            <conditional name="input_type_cond">
+                <param name="input_type" value="paired" />
+                <param name="input_read1" value="scrb_seq_fastq.1.gz" ftype="fastqsanger.gz" />
+                <param name="input_read2" value="scrb_seq_fastq.2.gz" ftype="fastqsanger.gz" />
+                <param name="bc_pattern" value="CCCCCCNNNNNNNNNN" />
+            </conditional>
             <param name="extract_method" value="string" />
-            <param name="bc_pattern" value="CCCCCCNNNNNNNNNN" />
-            <param name="use_barcodes" value="yes" />
-            <param name="filter_barcode_file" value="scrb_seq_barcodes" />
-            <output name="out2" file="scrb_extract.fastq.gz" decompress="true"  ftype="fastq.gz" />
+            <param name="whitelist" value="scrb_seq_barcodes" />
+            <param name="log" value="true"/>
+            <output name="out2" file="scrb_extract.fastq.gz" decompress="true"  ftype="fastqsanger.gz" />
         </test>
         <test expect_num_outputs="3"><!-- same as above but with regex barcode-->
-            <param name="type" value="paired" />
-            <param name="input_read1" value="scrb_seq_fastq.1.gz" ftype="fastq.gz" />
-            <param name="input_read2" value="scrb_seq_fastq.2.gz" ftype="fastq.gz" />
+            <conditional name="input_type_cond">
+                <param name="input_type" value="paired" />
+                <param name="input_read1" value="scrb_seq_fastq.1.gz" ftype="fastqsanger.gz" />
+                <param name="input_read2" value="scrb_seq_fastq.2.gz" ftype="fastqsanger.gz" />
+                <param name="bc_pattern" value="^(?P&lt;cell_1&gt;.{6})(?P&lt;umi_1&gt;.{10})" />
+            </conditional>
             <param name="extract_method" value="regex" />
-            <param name="bc_pattern" value="^(?P&lt;cell_1&gt;.{6})(?P&lt;umi_1&gt;.{10})" />
-            <param name="use_barcodes" value="yes" />
-            <param name="filter_barcode_file" value="scrb_seq_barcodes" />
-            <output name="out2" file="scrb_extract.fastq.gz" decompress="true" ftype="fastq.gz" />
+            <param name="whitelist" value="scrb_seq_barcodes" />
+            <param name="log" value="true"/>
+            <output name="out2" file="scrb_extract.fastq.gz" decompress="true" ftype="fastqsanger.gz" />
         </test>
         <test expect_num_outputs="2"><!-- CelSeq2 example -->
-            <param name="type" value="paired" />
-            <param name="input_read1" value="read_R1.200.gz" ftype="fastq.gz" />
-            <param name="input_read2" value="read_R2.200.gz" ftype="fastq.gz" />
+            <conditional name="input_type_cond">
+                <param name="input_type" value="paired" />
+                <param name="input_read1" value="read_R1.200.gz" ftype="fastqsanger.gz" />
+                <param name="input_read2" value="read_R2.200.gz" ftype="fastqsanger.gz" />
+                <param name="bc_pattern" value="NNNNNNCCCCCC" />
+            </conditional>
             <param name="extract_method" value="string" />
-            <param name="bc_pattern" value="NNNNNNCCCCCC" />
-            <output name="out1" file="read_R1.200_extracted.fastq.gz" ftype="fastq.gz" decompress="true" lines_diff="1" />
-            <output name="out2" file="read_R2.200_extracted.fastq.gz" ftype="fastq.gz" decompress="true" lines_diff="1" />
-            <param name="print_log" value="false"/>
+            <output name="out" file="read_R1.200_extracted.fastq.gz" ftype="fastqsanger.gz" decompress="true" lines_diff="1" />
+            <output name="out2" file="read_R2.200_extracted.fastq.gz" ftype="fastqsanger.gz" decompress="true" lines_diff="1" />
         </test>
     </tests>
     <help><![CDATA[
 
-
-UMI-tools extract.py - Extract UMI from fastq
-=============================================
-
-Purpose
--------
+extract - Extract UMI from fastq
+================================
 
 Extract UMI barcode from a read and add it to the read name, leaving
-any sample barcode in place. Can deal with paired end reads and UMIs
-split across the paired ends
-
-Options
--------
+any sample barcode in place.
 
---split-barcode
-       By default the UMI is assumed to be on the first read. Use this
-       option if the UMI is contained on both reads and specify the
-       pattern of the barcode/UMI on the second read using the option
-       ``--bc-pattern2``
+Can deal with paired end reads and UMIs
+split across the paired ends. Can also optionally extract cell
+barcodes and append these to the read name. See the section below
+for an explanation of how to encode the barcode pattern(s) to
+specify the position of the UMI +/- cell barcode.
+
 
---bc-pattern
-       Use this option to specify the format of the UMI/barcode. Use Ns to
-       represent the random positions and Xs to indicate the bc positions.
-       Bases with Ns will be extracted and added to the read name. Remaining
-       bases, marked with an X will be reattached to the read.
-
-       E.g. If the pattern is NNXXNN,
-       Then the read:
+Filtering and correcting cell barcodes
+--------------------------------------
 
-       @HISEQ:87:00000000 read1
-       AAGGTTGCTGATTGGATGGGCTAG
-       DA1AEBFGGCG01DFH00B1FF0B
-       +
-
-       will become:
-       @HISEQ:87:00000000_AATT read1
-       GGGCTGATTGGATGGGCTAG
-       1AFGGCG01DFH00B1FF0B
-       +
+``umi_tools extract`` can optionally filter cell barcodes against a user-supplied
+whitelist (``--whitelist``). If a whitelist is not available for your data,
+e.g.
+if you have performed droplet-based scRNA-Seq, you can use the
+whitelist tool.
 
---bc-pattern2
-       Use this option to specify the format of the UMI/barcode for
-       the second read pair if required. If --bc-pattern2 is not
-       supplied, this defaults to the same pattern as --bc-pattern
+Cell barcodes which do not match the whitelist (user-generated or
+automatically generated) can also be optionally corrected using the
+``--error-correct-cell`` option.
 
---3prime
-       By default the barcode is assumed to be on the 5' end of the read, but
-       use this option to sepecify that it is on the 3' end instead
-
--L
-       Specify a log file to retain logging information and final statistics
-
---split-barcode
-       barcode is split across read pair
+The whitelist should be in the following format (tab-separated)::
 
---quality-filter-threshold=QUALITY_FILTER_THRESHOLD
-       Remove reads where any UMI base quality score falls
-       below this threshold
---quality-encoding=QUALITY_ENCODING
-       Quality score encoding. Choose from phred33[33-77]
-       phred64 [64-106] or solexa [59-106]
-
-Usage:
-------
+        AAAAAA    AGAAAA
+        AAAATC
+        AAACAT
+        AAACTA    AAACTN,GAACTA
+        AAATAC
+        AAATCA    GAATCA
+        AAATGT    AAAGGT,CAATGT
 
-For single ended reads:
-        umi_tools extract --bc-pattern=[PATTERN] -L extract.log [OPTIONS]
-
-reads from stdin and outputs to stdout.
+Where column 1 is the whitelisted cell barcodes and column 2 is
+the list (comma-separated) of other cell barcodes which should be
+corrected to the barcode in column 1. If the ``--error-correct-cell``
+option is not used, this column will be ignored. Any additional columns
+in the whitelist input, such as the counts columns from the output of
+umi_tools whitelist, will be ignored.
 
-For paired end reads:
-        umi_tools extract --bc-pattern=[PATTERN] --read2-in=[FASTQIN] --read2-out=[FASTQOUT] -L extract.log [OPTIONS]
-
-reads end one from stdin and end two from FASTQIN and outputs end one to stdin
-and end two to FASTQOUT.
+@FASTQ_BARCODE_EXTRACTION_HELP@
 
     ]]></help>
     <expand macro="citations" />
˭/I-./5@zEe9z%% dqf&& PiCs,nhfhfΠ`@PZ\XRs=7D@QjA+/?%IEyV )i9%\%V y99\ `P