view umi-tools_whitelist.xml @ 2:c1743022a8cd draft

planemo upload commit e274ffd67bede657f4f5a2c6524023ff57f0db87
author iuc
date Thu, 29 Mar 2018 18:24:04 -0400
parents dac4e7dc837d
children 853f74e08009
line wrap: on
line source

<tool id="umi_tools_whitelist" name="UMI-tools whitelist" version="@VERSION@.1">
    <description>Extract cell barcodes from FASTQ files</description>
    <macros>
        <import>macros.xml</import>
        <!-- Shared by the "paired" and "paired_collection" input cases below.
             Lets the user state whether the barcode is on the first read only
             or on both reads; the second-read pattern is only requested in the
             latter case. The command template reads these values as
             $input_type.barcode.barcode_select and
             $input_type.barcode.bc_pattern2. -->
        <macro name="barcode2_conditional" >
            <conditional name="barcode">
                <param name="barcode_select" argument="--split-barcode" type="select" label="Barcode on both reads?">
                    <option value="first_read_only">Barcode on first read only</option>
                    <option value="both_reads">Barcode on both reads</option>
                </param>
                <when value="first_read_only"/>
                <when value="both_reads">
                    <param name="bc_pattern2" argument="--bc-pattern2" type="text" value="" label="Barcode pattern for second read"
                           help="Use this option to specify the format of the UMI/barcode for
                                 the second read pair if required.">
                    </param>
                </when>
            </conditional>
        </macro>
    </macros>
    <expand macro="requirements" />
    <command detect_errors="exit_code"><![CDATA[
        ## Stage the inputs. Gzipped datasets are symlinked to working-directory
        ## names ending in .gz (presumably so that umi_tools recognises the
        ## compression from the file name); $gz records whether that happened.
        #set $gz = False
        #if $input_type.type == 'single':
            #if $input_type.input_single.is_of_type("fastq.gz", "fastqsanger.gz"):
                ln -s '$input_type.input_single' input_single.gz &&
                #set $gz = True
            #end if
        #else
            ## Resolve both mates once so that the rest of the template does not
            ## need to care whether they came from two datasets or a collection.
            #if $input_type.type == 'paired':
                #set $read1 = $input_type.input_read1
                #set $read2 = $input_type.input_read2
            #else  ## paired_collection
                #set $read1 = $input_type.input_readpair.forward
                #set $read2 = $input_type.input_readpair.reverse
            #end if
            #if $read1.is_of_type("fastq.gz", "fastqsanger.gz"):
                ln -s '$read1' input_read1.gz &&
                ln -s '$read2' input_read2.gz &&
                #set $gz = True
            #end if
        #end if
        umi_tools whitelist
            --bc-pattern='$bc_pattern'
            --subset-reads='$subset_reads'
            #if $input_type.type == 'single':
                #if $gz:
                    --stdin=input_single.gz
                #else
                    --stdin='$input_type.input_single'
                #end if
            #else:  ## equally valid for both 'paired' and 'paired_collection'
                #if $gz:
                    --stdin=input_read1.gz
                    --read2-in=input_read2.gz
                #else:
                    ## $read1/$read2 are defined for both paired cases; using
                    ## $input_type.input_read1 here would fail for collections.
                    --stdin='$read1'
                    --read2-in='$read2'
                #end if
                ## The select's values are 'first_read_only' and 'both_reads';
                ## only the latter exposes a second-read pattern.
                #if $input_type.barcode.barcode_select == "both_reads":
                    --bc-pattern2='$input_type.barcode.bc_pattern2'
                #end if
            #end if
            #if $celloptions.use_cell_opts == "advanced":
                ## A value of 0 means "not set" for the two cell-count options.
                #if $celloptions.set_cell_number != "0":
                   --set-cell-number=$celloptions.set_cell_number
                #end if
                #if $celloptions.expect_cells != "0":
                   --expect-cells=$celloptions.expect_cells
                #end if
                --error-correct-threshold=$celloptions.error_correct_thresh
            #end if
            --method=$method
            --plot-prefix=OUT

            $prime3

            #if $log:
                --log='$out_log'
            #end if

            --log2stderr

            > '$out_whitelist' &&

            ## Collect the OUT_*.png plots next to the HTML report and write a
            ## minimal page that embeds them.
            mkdir '${ out_html_report.files_path }' &&
            cp OUT_*.png '${ out_html_report.files_path }' &&

            echo "<html>
            <head></head><body>
            <h1>Cell and Count Metrics</h1>
            <img src=\"OUT_cell_barcode_count_density.png\" ><br />
            <img src=\"OUT_cell_barcode_knee.png\" ><br />
            <img src=\"OUT_cell_barcode_counts.png\" ><br />
            </body></html>" > '$out_html_report'
            &&
            mv OUT_cell_thresholds.tsv '$out_thresh'
    ]]></command>
    <inputs>
        <!-- Library layout: a single FASTQ, two separate mate datasets, or a
             paired dataset collection. Both paired layouts also ask whether
             the barcode spans both reads. -->
        <conditional name="input_type">
            <param name="type" type="select" label="Library type">
                <option value="single">Single-end</option>
                <option value="paired">Paired-end</option>
                <option value="paired_collection">Paired-end Dataset Collection</option>
            </param>
            <when value="single">
                <param name="input_single" type="data" format="fastq,fastq.gz" label="Reads in FASTQ format" />
            </when>
            <when value="paired">
                <param name="input_read1" type="data" format="fastq,fastq.gz" label="Read 1 in FASTQ format" />
                <param name="input_read2" type="data" format="fastq,fastq.gz" label="Read 2 in FASTQ format" />
                <expand macro="barcode2_conditional" />
            </when>
            <when value="paired_collection">
                <param name="input_readpair" type="data_collection" collection_type="paired" format="fastq,fastq.gz" label="Reads in FASTQ format" />
                <expand macro="barcode2_conditional" />
            </when>
        </conditional>
        <param name="bc_pattern" argument="--bc-pattern" type="text" label="Barcode pattern for first read"
            help="Use this option to specify the format of the UMI/barcode. Use Ns to
                    represent the random positions and Xs to indicate the bc positions.
                    Bases with Ns will be extracted and added to the read name. Remaining
                    bases, marked with an X will be reattached to the read.">
        </param>
        <param name="method" argument="--method" type="select" label="Count reads or UMIs"
               help="Many published protocols rank CBs by the number of reads the CBs appear in. However you could also use the number of unique UMIs a CB is associated with. Note that this is still an approximation to the number of transcripts captured because the same UMI could be associated with two different transcripts and be counted as independent." >
            <option value="reads" selected="true" />
            <option value="umis" />
        </param>

        <param argument="--3prime" name="prime3" type="boolean" label="Is barcode on 3' end of the read?"
            truevalue="--3prime" falsevalue=""
            help="By default the barcode is assumed to be on the 5' end of the read, but
                use this option to specify that it is on the 3' end instead." />
        <param name="subset_reads" argument="--subset-reads" type="integer" min="0" value="0" label="Use the first N reads to automatically identify the true cell barcodes." />
        <!-- Optional cell-number tuning. In the command template a value of 0
             for either of the two cell-count fields means the option is not
             passed to umi_tools. -->
        <conditional name="celloptions" >
            <param name="use_cell_opts" type="select" label="Cell parameters" >
                <option value="defaults" selected="True">Use Defaults</option>
                <option value="advanced">Advanced Options</option>
            </param>
            <when value="defaults"/>
            <when value="advanced">
                <param name="set_cell_number" type="integer" min="0" value="0" label="Specify the number of cell barcodes to accept" />
                <param name="expect_cells" type="integer" min="0" value="0" label="Prior expectation on the upper limit on the number of cells sequenced" />
                <param name="error_correct_thresh" type="integer" min="0" value="0" label="Hamming distance for correction of barcodes to whitelist barcodes. Set to zero to generate no error correcting metrics." />
            </when>
        </conditional>
        <param argument="--log" type="boolean" label="Output log?" truevalue="--log" falsevalue=""
            help="Choose if you want to generate a text file containing logging information." />

    </inputs>
    <outputs>
        <!-- Standard output of umi_tools whitelist. -->
        <data name="out_whitelist" format="tabular" label="${tool.name} on ${on_string}: Whitelist"/>
        <!-- Only produced when the "Output log?" option is enabled. -->
        <data name="out_log" format="txt" label="${tool.name} on ${on_string}: logfile" >
            <filter>log</filter>
        </data>
        <!-- HTML page written by the command; the OUT_*.png plots it embeds are
             copied into this dataset's files_path. -->
        <data name="out_html_report" format="html" label="${tool.name} on ${on_string}: Webpage" />
        <!-- The OUT_cell_thresholds.tsv file moved into place by the command. -->
        <data name="out_thresh" format="tabular" label="${tool.name} on ${on_string}: TSV Cell Thresholds" />
    </outputs>
    <tests>
        <!-- Single-end input with the barcode on the 3' end; logging is off,
             so only three outputs are expected. -->
        <test expect_num_outputs="3">
            <param name="type" value="single" />
            <param name="input_single" value="t_R2.fastq.gz" ftype="fastq" />
            <param name="bc_pattern" value="CCCCCCCCNNNNNNNN" />
            <param name="method" value="reads" />
            <param name="prime3" value="true" />
            <output name="out_whitelist" file="out_wl_single.txt" lines_diff="40" />
            <output name="out_thresh" file="out_wl_single.tresh.tab" />
            <output name="out_html_report" file="out_wl_single.html" />
        </test>
        <!-- Paired-end input as two datasets, barcode on both reads, advanced
             cell options and logging enabled (four outputs). -->
        <test expect_num_outputs="4">
            <param name="type" value="paired" />
            <param name="input_read1" value="t_R1.fastq.gz" ftype="fastq" />
            <param name="input_read2" value="t_R2.fastq.gz" ftype="fastq" />
            <param name="barcode_select" value="both_reads" />
            <param name="bc_pattern"  value="CCCNNNNNNNNXXXXX" />
            <param name="bc_pattern2" value="CCCCCCCCNNNNNNNN" />
            <param name="method" value="reads" />
            <param name="prime3" value="false" />
            <param name="use_cell_opts" value="advanced" />
            <param name="expect_cells" value="5" />
            <param name="error_correct_thresh" value="3" />
            <param name="log" value="true" />
            <output name="out_whitelist" file="out_wl_paired.txt" />
            <output name="out_log" file="out_wl_paired.log" lines_diff="40" />
            <output name="out_html_report" file="out_wl_paired.html" />
            <output name="out_thresh" file="out_wl_paired.tresh.tab" />
        </test>
        <test expect_num_outputs="4"> <!-- As previous, identical outputs but paired collection input -->
            <param name="type" value="paired_collection" />
            <param name="input_readpair" >
                <collection type="paired">
                    <element name="forward" ftype="fastq.gz" value="t_R1.fastq.gz" />
                    <element name="reverse" ftype="fastq.gz" value="t_R2.fastq.gz" />
                </collection>
            </param>
            <param name="barcode_select" value="both_reads" />
            <param name="bc_pattern"  value="CCCNNNNNNNNXXXXX" />
            <param name="bc_pattern2" value="CCCCCCCCNNNNNNNN" />
            <param name="method" value="reads" />
            <param name="prime3" value="false" />
            <param name="use_cell_opts" value="advanced" />
            <param name="expect_cells" value="5" />
            <param name="error_correct_thresh" value="3" />
            <param name="log" value="true" />
            <output name="out_whitelist" file="out_wl_paired.txt" />
            <output name="out_log" file="out_wl_paired.log" lines_diff="40" />
            <output name="out_html_report" file="out_wl_paired.html" />
            <output name="out_thresh" file="out_wl_paired.tresh.tab" />
        </test>
    </tests>
    <help><![CDATA[


UMI-tools whitelist - Extract barcodes from fastq
==================================================

Purpose
-------

Extract cell barcodes and identify the most likely true barcodes using
the 'knee' method.

Options
-------

--bc-pattern
       This should be used where the barcodes are always in the same
       place in the read.

       - N = UMI position (required)
       - C = cell barcode position (optional)
       - X = sample position (optional)

       Bases with Ns and Cs will be extracted and added to the read
       name. The corresponding sequence qualities will be removed from
       the read. Bases with an X will be reattached to the read.

       E.g. If the pattern is NNNNCC,
       Then the read:
       @HISEQ:87:00000000 read1
       AAGGTTGCTGATTGGATGGGCTAG
       DA1AEBFGGCG01DFH00B1FF0B
       +
       will become:
       @HISEQ:87:00000000_TT_AAGG read1
       GCTGATTGGATGGGCTAG
       FGGCG01DFH00B1FF0B
       +

       where 'TT' is the cell barcode and 'AAGG' is the UMI.


--set-cell-number
        Use this option to explicitly set the number of cell barcodes
        which should be accepted. Note that the exact number of cell
        barcodes in the outputted whitelist may be slightly less than
        this if there are multiple cells observed with the same
        frequency at the threshold between accepted and rejected cell
        barcodes.

--expect-cells=[EXPECTED_CELLS]
        An upper limit estimate for the number of inputted cells. The knee
        method will now select the first threshold (in ascending order)
        which results in the number of cell barcodes accepted being <=
        EXPECTED_CELLS and > EXPECTED_CELLS * 0.1.


--bc-pattern2
       Use this option to specify the format of the UMI/barcode for
       the second read pair if required. If --bc-pattern2 is not
       supplied, this defaults to the same pattern as --bc-pattern

--3prime
       By default the barcode is assumed to be on the 5' end of the read, but
       use this option to specify that it is on the 3' end instead

Usage:
------

For single ended reads:
        umi_tools whitelist --bc-pattern=[PATTERN] -L extract.log
        [OPTIONS]

reads from stdin and outputs to stdout.

For paired end reads where the cell barcodes is split across the read pairs:
        umi_tools whitelist --bc-pattern=[PATTERN]
        --bc-pattern2=[PATTERN] --read2-in=[FASTQIN] -L extract.log
        [OPTIONS]

reads end one from stdin and end two from FASTQIN and outputs to stdout


Output:
-------

The whitelist is outputted as 4 tab-separated columns:

    1. whitelisted cell barcode
    2. Other cell barcode(s) (comma-separated) to correct to the
       whitelisted barcode
    3. Count for whitelisted cell barcodes
    4. Count(s) for the other cell barcode(s) (comma-separated)

example output:

    AAAAAA      AGAAAA          146     1
    AAAATC                      22
    AAACAT                      21
    AAACTA      AAACTN,GAACTA   27      1,1
    AAATAC                      72
    AAATCA      GAATCA          37      3
    AAATGT      AAAGGT,CAATGT   41      1,1
    AAATTG      CAATTG          36      1
    AACAAT                      18
    AACATA                      24

If --error-correct-threshold is set to 0, columns 2 and 4 will be empty.


    ]]></help>
    <expand macro="citations" />
</tool>