diff umi-tools_whitelist.xml @ 0:b911124762a8 draft

planemo upload commit 8da5246c32d60a49e6b6b9027c9adc0a31d4bc5a
author iuc
date Sun, 25 Feb 2018 13:07:58 -0500
parents
children dac4e7dc837d
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/umi-tools_whitelist.xml	Sun Feb 25 13:07:58 2018 -0500
@@ -0,0 +1,285 @@
+<tool id="umi_tools_whitelist" name="UMI-tools whitelist" version="@VERSION@.0">
+    <!-- One-line summary of what the tool does. -->
+    <description>Extract cell barcodes from FASTQ files</description>
+    <!-- Shared definitions are imported from macros.xml (not visible in this file);
+         the "requirements" macro expanded below is presumably defined there. -->
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="requirements" />
+    <!-- Cheetah template for the job command line: runs umi_tools whitelist on the selected
+         reads, then assembles an HTML report from the plots written under the OUT prefix. -->
+    <command detect_errors="exit_code"><![CDATA[
+        ## Gzipped inputs are symlinked to names ending in .gz; $gz records that the
+        ## symlinked names, rather than the dataset paths, must be passed to umi_tools.
+        #set $gz = False
+        #if $input_type.type == 'single':
+            #if $input_type.input_single.is_of_type("fastq.gz", "fastqsanger.gz"):
+                ln -s '$input_type.input_single' input_single.gz &&
+                #set $gz = True
+            #end if
+        #else
+            #if $input_type.input_read1.is_of_type("fastq.gz", "fastqsanger.gz"):
+                ln -s '$input_type.input_read1' input_read1.gz &&
+                ln -s '$input_type.input_read2' input_read2.gz &&
+                #set $gz = True
+            #end if
+        #end if
+        umi_tools whitelist
+            --bc-pattern='$bc_pattern'
+            --subset-reads='$subset_reads'
+            #if $input_type.type == 'single':
+                #if $gz:
+                    --stdin=input_single.gz
+                #else
+                    --stdin='$input_type.input_single'
+                #end if
+            #else:
+                #if $gz:
+                    --stdin=input_read1.gz
+                    --read2-in=input_read2.gz
+                #else:
+                    --stdin='$input_type.input_read1'
+                    --read2-in='$input_type.input_read2'
+                #end if
+                ## barcode_select takes the values "first_read_only" or "both_reads";
+                ## the second-read pattern only exists in the "both_reads" branch.
+                #if $input_type.barcode.barcode_select == "both_reads":
+                    --bc-pattern2='$input_type.barcode.bc_pattern2'
+                #end if
+            #end if
+            #if $celloptions.use_cell_opts == "advanced":
+                ## A value of 0 is treated as "not set" and the option is omitted.
+                #if $celloptions.set_cell_number != "0":
+                   --set-cell-number=$celloptions.set_cell_number
+                #end if
+                #if $celloptions.expect_cells != "0":
+                   --expect-cells=$celloptions.expect_cells
+                #end if
+                --error-correct-threshold=$celloptions.error_correct_thresh
+            #end if
+            --method=$method
+            --plot-prefix=OUT
+
+            $prime3
+
+            #if $log:
+                --log='$out_log'
+            #end if
+
+            > '$out_whitelist' &&
+
+            ## Copy the OUT_*.png plots next to the HTML output and write a small page
+            ## that references them by file name.
+            mkdir '${ out_html_report.files_path }' &&
+            cp OUT_*.png '${ out_html_report.files_path }' &&
+
+            echo "<html>
+            <head></head><body>
+            <h1>Cell and Count Metrics</h1>
+            <img src=\"OUT_cell_barcode_count_density.png\" ><br />
+            <img src=\"OUT_cell_barcode_knee.png\" ><br />
+            <img src=\"OUT_cell_barcode_counts.png\" ><br />
+            </body></html>" > '$out_html_report'
+            &&
+            mv OUT_cell_thresholds.tsv '$out_thresh'
+    ]]></command>
+    <inputs>
+        <!-- Library layout: one FASTQ dataset for single-end, two for paired-end. For paired-end
+             data the nested "barcode" conditional decides whether a second-read pattern is asked for. -->
+        <conditional name="input_type">
+            <param name="type" type="select" label="Library type">
+                <option value="single">Single-end</option>
+                <option value="paired">Paired-end</option>
+            </param>
+            <when value="single">
+                <param name="input_single" type="data" format="fastq,fastq.gz" label="Reads in FASTQ format" />
+            </when>
+            <when value="paired">
+                <!-- Distinct labels so the two mates can be told apart in the form. -->
+                <param name="input_read1" type="data" format="fastq,fastq.gz" label="Read 1 in FASTQ format" />
+                <param name="input_read2" type="data" format="fastq,fastq.gz" label="Read 2 in FASTQ format" />
+                <conditional name="barcode">
+                    <param name="barcode_select" argument="--split-barcode" type="select" label="Barcode on both reads?">
+                        <option value="first_read_only">Barcode on first read only</option>
+                        <option value="both_reads">Barcode on both reads</option>
+                    </param>
+                    <when value="first_read_only"/>
+                    <when value="both_reads">
+                        <param name="bc_pattern2" argument="--bc-pattern2" type="text" value="" label="Barcode pattern for second read"
+                            help="Use this option to specify the format of the UMI/barcode for
+                                  the second read pair if required.">
+                        </param>
+                    </when>
+                </conditional>
+            </when>
+        </conditional>
+        <!-- Pattern describing the layout of the first read; the letter meanings in the help
+             text follow the N/C/X legend given in this tool's help section. -->
+        <param name="bc_pattern" argument="--bc-pattern" type="text" label="Barcode pattern for first read"
+            help="Use this option to specify the format of the UMI/barcode. Use Ns to
+                    represent the UMI positions, Cs to represent the cell barcode positions
+                    and Xs to represent the sample positions. Bases marked with an N or a C
+                    will be extracted and added to the read name. Remaining bases, marked
+                    with an X, will be reattached to the read.">
+        </param>
+        <!-- Chooses whether cell barcodes are ranked by read count or by unique UMI count. -->
+        <param name="method" argument="--method" type="select" label="Count reads or UMIs"
+               help="Many published protocols rank CBs by the number of reads the CBs appear in. However you could also use the number of unique UMIs a CB is associated with. Note that this is still an approximation to the number of transcripts captured because the same UMI could be associated with two different transcripts and be counted as independent." >
+            <option value="reads" selected="true" />
+            <option value="umis" />
+        </param>
+
+        <!-- Boolean switch: contributes the 3prime flag to the command line when checked,
+             and an empty string otherwise. -->
+        <param name="prime3" argument="--3prime" type="boolean"
+            truevalue="--3prime" falsevalue=""
+            label="Is barcode on 3' end of the read?"
+            help="By default the barcode is assumed to be on the 5' end of the read, but
+                use this option to specify that it is on the 3' end instead." />
+        <!-- Non-negative integer that is always forwarded to the subset-reads option. -->
+        <param name="subset_reads" argument="--subset-reads"
+               type="integer" min="0" value="0"
+               label="Use the first N reads to automatically identify the true cell barcodes." />
+        <!-- Optional cell-number tuning. In the command template a value of 0 for the first two
+             parameters means the corresponding option is left off the command line. -->
+        <conditional name="celloptions" >
+            <param name="use_cell_opts" type="select" label="Cell parameters" >
+                <option value="defaults" selected="true">Use Defaults</option>
+                <option value="advanced">Advanced Options</option>
+            </param>
+            <when value="defaults"/>
+            <when value="advanced">
+                <param name="set_cell_number" type="integer" min="0" value="0" label="Specify the number of cell barcodes to accept" />
+                <param name="expect_cells" type="integer" min="0" value="0" label="Prior expectation on the upper limit on the number of cells sequenced" />
+                <param name="error_correct_thresh" type="integer" min="0" value="0" label="Hamming distance for correction of barcodes to whitelist barcodes. Set to zero to generate no error correcting metrics." />
+            </when>
+        </conditional>
+        <!-- Boolean switch for the log output. No explicit name is given; the command template
+             and the output filter refer to it as "log", presumably derived from the argument. -->
+        <param type="boolean" argument="--log"
+            truevalue="--log" falsevalue=""
+            label="Output log?"
+            help="Choose if you want to generate a text file containing logging information." />
+
+    </inputs>
+    <!-- Datasets produced by a run. The log dataset is only created when the "log" input is
+         switched on (see its filter); the other three are always produced. -->
+    <outputs>
+        <data format="tabular" name="out_whitelist" label="${tool.name} on ${on_string}: Whitelist"/>
+        <data format="txt" name="out_log" label="${tool.name} on ${on_string}: logfile" >
+            <filter>log</filter>
+        </data>
+        <data format="html" name="out_html_report" label="${tool.name} on ${on_string}: Webpage" />
+        <data format="tabular" name="out_thresh" label="${tool.name} on ${on_string}: TSV Cell Thresholds" />
+    </outputs>
+    <!-- Functional tests. The gzipped fixtures are declared as fastq.gz so that the command
+         template takes its gz symlink branch instead of the plain-FASTQ branch. -->
+    <tests>
+        <!-- Single-end run without logging: whitelist, thresholds and HTML report are expected. -->
+        <test expect_num_outputs="3">
+            <param name="type" value="single" />
+            <param name="input_single" value="t_R2.fastq.gz" ftype="fastq.gz" />
+            <param name="bc_pattern" value="CCCCCCCCNNNNNNNN" />
+            <param name="method" value="reads" />
+            <param name="prime3" value="true" />
+            <output name="out_whitelist" file="out_wl_single.txt" lines_diff="40" />
+            <output name="out_thresh" file="out_wl_single.tresh.tab" />
+            <output name="out_html_report" file="out_wl_single.html" />
+        </test>
+        <!-- Paired-end run with barcodes on both reads, advanced cell options and logging:
+             all four outputs are expected. -->
+        <test expect_num_outputs="4">
+            <param name="type" value="paired" />
+            <param name="input_read1" value="t_R1.fastq.gz" ftype="fastq.gz" />
+            <param name="input_read2" value="t_R2.fastq.gz" ftype="fastq.gz" />
+            <param name="barcode_select" value="both_reads" />
+            <param name="bc_pattern"  value="CCCNNNNNNNNXXXXX" />
+            <param name="bc_pattern2" value="CCCCCCCCNNNNNNNN" />
+            <param name="method" value="reads" />
+            <param name="prime3" value="false" />
+            <param name="use_cell_opts" value="advanced" />
+            <param name="expect_cells" value="5" />
+            <param name="error_correct_thresh" value="3" />
+            <param name="log" value="true" />
+            <output name="out_whitelist" file="out_wl_paired.txt" />
+            <output name="out_log" file="out_wl_paired.log" lines_diff="40" />
+            <output name="out_html_report" file="out_wl_paired.html" />
+            <output name="out_thresh" file="out_wl_paired.tresh.tab" />
+        </test>
+    </tests>
+    <help><![CDATA[
+
+
+UMI-tools whitelist - Extract barcodes from fastq
+==================================================
+
+Purpose
+-------
+
+Extract cell barcodes and identify the most likely true barcodes using
+the 'knee' method.
+
+Options
+-------
+
+--bc-pattern
+       This should be used where the barcodes are always in the same
+       place in the read.
+
+       - N = UMI position (required)
+       - C = cell barcode position (optional)
+       - X = sample position (optional)
+
+       Bases with Ns and Cs will be extracted and added to the read
+       name. The corresponding sequence qualities will be removed from
+       the read. Bases with an X will be reattached to the read.
+
+       E.g. If the pattern is NNNNCC,
+       Then the read:
+       @HISEQ:87:00000000 read1
+       AAGGTTGCTGATTGGATGGGCTAG
+       +
+       DA1AEBFGGCG01DFH00B1FF0B
+       will become:
+       @HISEQ:87:00000000_TT_AAGG read1
+       GCTGATTGGATGGGCTAG
+       +
+       FGGCG01DFH00B1FF0B
+
+       where 'TT' is the cell barcode and 'AAGG' is the UMI.
+
+
+--set-cell-number
+        Use this option to explicitly set the number of cell barcodes
+        which should be accepted. Note that the exact number of cell
+        barcodes in the outputted whitelist may be slightly less than
+        this if there are multiple cells observed with the same
+        frequency at the threshold between accepted and rejected cell
+        barcodes.
+
+--expect-cells=[EXPECTED_CELLS]
+        An upper limit estimate for the number of inputted cells. The knee
+        method will now select the first threshold (ordered ascendingly)
+        which results in the number of cell barcodes accepted being <=
+        EXPECTED_CELLS and > EXPECTED_CELLS * 0.1.
+
+
+--bc-pattern2
+       Use this option to specify the format of the UMI/barcode for
+       the second read pair if required. If --bc-pattern2 is not
+       supplied, this defaults to the same pattern as --bc-pattern
+
+--3prime
+       By default the barcode is assumed to be on the 5' end of the read, but
+       use this option to specify that it is on the 3' end instead
+
+Usage:
+------
+
+For single ended reads:
+        umi_tools whitelist --bc-pattern=[PATTERN] -L extract.log
+        [OPTIONS]
+
+reads from stdin and outputs to stdout.
+
+For paired end reads where the cell barcode is split across the read pairs:
+        umi_tools whitelist --bc-pattern=[PATTERN]
+        --bc-pattern2=[PATTERN] --read2-in=[FASTQIN] -L extract.log
+        [OPTIONS]
+
+reads end one from stdin and end two from FASTQIN and outputs to stdout.
+
+
+Output:
+-------
+
+The whitelist is outputted as 4 tab-separated columns:
+
+    1. whitelisted cell barcode
+    2. Other cell barcode(s) (comma-separated) to correct to the
+       whitelisted barcode
+    3. Count for whitelisted cell barcodes
+    4. Count(s) for the other cell barcode(s) (comma-separated)
+
+example output:
+
+    AAAAAA      AGAAAA          146     1
+    AAAATC                      22
+    AAACAT                      21
+    AAACTA      AAACTN,GAACTA   27      1,1
+    AAATAC                      72
+    AAATCA      GAATCA          37      3
+    AAATGT      AAAGGT,CAATGT   41      1,1
+    AAATTG      CAATTG          36      1
+    AACAAT                      18
+    AACATA                      24
+
+If --error-correct-threshold is set to 0, columns 2 and 4 will be empty.
+
+
+    ]]></help>
+    <!-- Citation entries come from the "citations" macro, presumably defined in the imported macros.xml. -->
+    <expand macro="citations" />
+</tool>