view umi-tools_whitelist.xml @ 16:953b4821b183 draft default tip

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/umi_tools commit 31bad8c5bf75981eafcd19ec6b00593f184fdeb8"
author iuc
date Thu, 18 Nov 2021 08:26:33 +0000
parents 345bdf4546fd
children
line wrap: on
line source

<tool id="umi_tools_whitelist" name="UMI-tools whitelist" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
    <description>Extract cell barcodes from FASTQ files</description>
    <expand macro="bio_tools"/>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="requirements" />
    <!-- Runs umi_tools whitelist, then assembles an HTML report from the plots it
         writes and moves the threshold table to its output dataset. -->
    <command detect_errors="exit_code"><![CDATA[
        #import json
        @COMMAND_LINK@
        umi_tools whitelist

            @FASTQ_BARCODE_EXTRACTION_OPTIONS@

            --subset-reads='$subset_reads'
            $allow_threshold_error
            ## Optional select: the flag is only passed when a value was chosen.
            #if $ed_above_threshold:
                --ed-above-threshold='$ed_above_threshold'
            #end if
            --knee-method='$knee_method'
            ## Input file names are fixed local names; presumably they are staged by
            ## the command-link macro expanded above (verify against macros.xml).
            #if $input_type_cond.input_type == 'single':
                #if $gz:
                    --stdin=input_single.gz
                #else:
                    --stdin=input_single.txt
                #end if
            #else:  ## equally valid for both 'paired' and 'paired_collection'
                #if $gz:
                    --stdin=input_read1.gz
                    --read2-in=input_read2.gz
                #else:
                    --stdin=input_read1.txt
                    --read2-in=input_read2.txt
                #end if
            #end if
            #if $celloptions.use_cell_opts == "advanced":
                ## Both cell-number options are optional integers; skip them when left blank.
                #if str($celloptions.set_cell_number) != '':
                   --set-cell-number=$celloptions.set_cell_number
                #end if
                #if str($celloptions.expect_cells) != '':
                   --expect-cells=$celloptions.expect_cells
                #end if
                --error-correct-threshold=$celloptions.error_correct_threshold
            #end if
            --method='$method'
            ## Plots and the threshold table are written with the prefix OUT and collected below.
            --plot-prefix=OUT

            @LOG@
            > '$out_whitelist' &&

            ## -p keeps the job from failing when the extra-files directory already exists.
            mkdir -p '${ out_html_report.files_path }' &&
            cp OUT_*.png '${ out_html_report.files_path }' &&

            echo "<html>
            <head></head><body>
            <h1>Cell and Count Metrics</h1>
            <img src=\"OUT_cell_barcode_count_density.png\" ><br />
            <img src=\"OUT_cell_barcode_knee.png\" ><br />
            <img src=\"OUT_cell_barcode_counts.png\" ><br />
            </body></html>" > '$out_html_report'
            &&
            mv OUT_cell_thresholds.tsv '$out_thresh'
    ]]></command>
    <inputs>
        <!-- Read inputs and barcode extraction options are defined in the imported macros -->
        <expand macro="input_types" />
        <expand macro="fastq_barcode_extraction_options_macro"/>

        <!-- Quantity used to rank cell barcodes (CBs): read count or unique UMI count -->
        <param argument="--method" type="select" label="Count reads or UMIs"
               help="Many published protocols rank CBs by the number of reads the CBs appear in. However you could also use the number of unique UMIs a CB is associated with. Note that this is still an approximation to the number of transcripts captured because the same UMI could be associated with two different transcripts and be counted as independent">
            <option value="reads" selected="true">Reads</option>
            <option value="umis">UMIs</option>
        </param>
        <!-- TODO: expect-cells cannot be used with the 'distance' knee method. This is only
             stated in the expect-cells help text below; it is not enforced by the form. -->
        <param argument="--knee-method" type="select" label="Method for detection of knee">
            <option value="distance" selected="true">Distance</option>
            <option value="density">Density</option>
        </param>
        <param argument="--subset-reads" type="integer" min="0" value="0" label="Use the first N reads to automatically identify the true cell barcodes" />
        <param argument="--allow-threshold-error" type="boolean" truevalue="--allow-threshold-error" falsevalue="" label="Don't select a threshold" help="Will still output the plots if requested"/>
        <!-- Optional: when nothing is selected the flag is omitted from the command line -->
        <param argument="--ed-above-threshold" type="select" optional="true" label="Detect and correct CBs above the threshold" help="which may be sequence errors from another CB">
            <option value="correct">Correct</option>
            <option value="discard">Discard</option>
        </param>
        <!-- Cell-number options are only passed to the tool when 'advanced' is chosen -->
        <conditional name="celloptions">
            <param name="use_cell_opts" type="select" label="Cell parameters">
                <option value="defaults" selected="true">Use Defaults</option>
                <option value="advanced">Advanced Options</option>
            </param>
            <when value="defaults"/>
            <when value="advanced">
                <param argument="--set-cell-number" type="integer" min="0" optional="true" label="Specify the number of cell barcodes to accept" />
                <param argument="--expect-cells" type="integer" min="0" optional="true" label="Prior expectation on the upper limit on the number of cells sequenced"
                       help="Cannot be combined with the 'distance' knee method; choose 'density' when setting this value" />
                <param argument="--error-correct-threshold" type="integer" min="0" value="1" label="Hamming distance for correction of barcodes to whitelist barcodes. Set to zero to generate no error correcting metrics" />
            </when>
        </conditional>
        <expand macro="log_input_macro"/>
    </inputs>
    <outputs>
        <!-- Whitelist table: receives the standard output of the whitelist command -->
        <data name="out_whitelist"
              format="tabular"
              label="${tool.name} on ${on_string}: Whitelist"/>

        <!-- Reads rejected by the regex pattern. Each variant is created only for the
             regex extraction method with the filtered-out switch enabled, and only for
             the input layout named in its filter. -->
        <data name="filtered_out"
              format_source="input_read1"
              label="${tool.name} on ${on_string}: reads not matching regex pattern">
            <filter>extract_method_cond['extract_method'] == 'regex' and extract_method_cond['filtered_out_bool'] and input_type_cond['input_type'] in ['single', 'paired']</filter>
        </data>
        <data name="filtered_out_paired"
              format_source="input_read2"
              label="${tool.name} on ${on_string}: reads not matching regex pattern">
            <filter>extract_method_cond['extract_method'] == 'regex' and extract_method_cond['filtered_out_bool'] and input_type_cond['input_type'] == 'paired'</filter>
        </data>
        <collection name="filtered_out_paired_collection"
                    type="paired"
                    label="${tool.name} on ${on_string}: reads not matching regex pattern">
            <data name="forward" format_source="input_readpair" />
            <data name="reverse" format_source="input_readpair" />
            <filter>extract_method_cond['extract_method'] == 'regex' and extract_method_cond['filtered_out_bool'] and input_type_cond['input_type'] == 'paired_collection'</filter>
        </collection>

        <!-- Optional log output is defined in the imported macros -->
        <expand macro="log_output_macro"/>

        <!-- HTML page embedding the generated plots, and the cell threshold table -->
        <data name="out_html_report"
              format="html"
              label="${tool.name} on ${on_string}: Webpage" />
        <data name="out_thresh"
              format="tabular"
              label="${tool.name} on ${on_string}: TSV Cell Thresholds" />
    </outputs>
    <tests>
        <!-- Test 1: single gzipped FASTQ, barcode at the 3' end, read counting, density knee -->
        <test expect_num_outputs="3">
            <conditional name="input_type_cond">
                <param name="input_type" value="single" />
                <param name="input_read1" value="t_R2.fastq.gz" ftype="fastqsanger.gz" />
                <param name="bc_pattern" value="CCCCCCCCNNNNNNNN" />
            </conditional>
            <conditional name="extract_method_cond">
                <param name="prime3" value="true" />
            </conditional>
            <param name="method" value="reads" />
            <param name="knee_method" value="density" />
            <output name="out_whitelist" file="out_wl_single.txt" lines_diff="40" />
            <output name="out_thresh" file="out_wl_single.tresh.tab" />
            <output name="out_html_report" file="out_wl_single.html" />
        </test>
        <!-- Test 2: paired gzipped FASTQ files, advanced cell options, log output enabled -->
        <test expect_num_outputs="4">
            <conditional name="input_type_cond">
                <param name="input_type" value="paired" />
                <param name="input_read1" value="t_R1.fastq.gz" ftype="fastqsanger.gz" />
                <param name="input_read2" value="t_R2.fastq.gz" ftype="fastqsanger.gz" />
                <param name="bc_pattern" value="CCCNNNNNNNNXXXXX" />
                <!-- <param name="bc_pattern2" value="CCCCCCCCNNNNNNNN" /> -->
            </conditional>
            <param name="method" value="reads" />
            <conditional name="extract_method_cond">
                <param name="prime3" value="false" />
            </conditional>
            <param name="use_cell_opts" value="advanced" />
            <param name="knee_method" value="density" />
            <param name="expect_cells" value="5" />
            <param name="error_correct_threshold" value="3" />
            <param name="log" value="true" />
            <output name="out_whitelist" file="out_wl_paired.txt" />
            <output name="out_log" file="out_wl_paired.log" lines_diff="50" />
            <output name="out_html_report" file="out_wl_paired.html" />
            <output name="out_thresh" file="out_wl_paired.tresh.tab" />
        </test>
        <!-- Test 3: as previous, identical outputs but paired collection input -->
        <test expect_num_outputs="4">
            <conditional name="input_type_cond">
                <param name="input_type" value="paired_collection" />
                <param name="input_readpair">
                    <collection type="paired">
                        <element name="forward" ftype="fastqsanger.gz" value="t_R1.fastq.gz" />
                        <element name="reverse" ftype="fastqsanger.gz" value="t_R2.fastq.gz" />
                    </collection>
                </param>
                <param name="bc_pattern" value="CCCNNNNNNNNXXXXX" />
                <!-- <param name="bc_pattern2" value="CCCCCCCCNNNNNNNN" /> -->
            </conditional>
            <param name="method" value="reads" />
            <conditional name="extract_method_cond">
                <param name="prime3" value="false" />
            </conditional>
            <param name="use_cell_opts" value="advanced" />
            <param name="knee_method" value="density" />
            <param name="expect_cells" value="5" />
            <param name="error_correct_threshold" value="3" />
            <param name="log" value="true" />
            <param name="filtered_out_bool" value="true" />
            <output name="out_whitelist" file="out_wl_paired.txt" />
            <output name="out_log" file="out_wl_paired.log" lines_diff="50" />
            <output name="out_html_report" file="out_wl_paired.html" />
            <output name="out_thresh" file="out_wl_paired.tresh.tab" />
        </test>
        <!-- Test 4: error report on not accepting regex and lt and gt symbols.
             Uses regex extraction with UMI counting and checks the filtered-out reads. -->
        <test expect_num_outputs="4">
            <conditional name="input_type_cond">
                <param name="input_type" value="single" />
                <param name="input_read1" value="testYYY.40k.fastq.gz" ftype="fastqsanger.gz" />
            </conditional>
            <param name="bc_pattern" value="(?P&#60;cell_1&#62;.{8,10})(?P&#60;discard_1&#62;ACTGGCCTGCGA){s&#60;=3}(?P&#60;cell_2&#62;.{9})(?P&#60;discard_2&#62;GGTAGCGGTGACA){s&#60;=3}(?P&#60;cell_3&#62;.{9})(?P&#60;umi_1&#62;.{8})T{3}.*" />
            <param name="extract_method" value="regex" />
            <param name="method" value="umis" />
            <param name="knee_method" value="density" />
            <param name="prime3" value="true" />
            <param name="filtered_out_bool" value="true" />
            <output name="out_whitelist" file="out_wl_user.single.txt" />
            <output name="out_thresh" file="out_wl_user.single.tresh.tab" />
            <output name="out_html_report" file="out_wl_user.single.html" />
            <output name="filtered_out">
                <assert_contents>
                    <has_text text="@A00250:74:HFMFVDSXX:2:1101:1027:1016 1:N:0:GTAGAGGA" />
                </assert_contents>
            </output>
        </test>
    </tests>
    <help><![CDATA[


UMI-tools whitelist - Extract barcodes from fastq
==================================================

Purpose
-------

Extract cell barcodes and identify the most likely true barcodes using
the 'knee' method.

@FASTQ_BARCODE_EXTRACTION_HELP@



Output:
-------

The whitelist is written as four tab-separated columns:

    1. whitelisted cell barcode
    2. Other cell barcode(s) (comma-separated) to correct to the
       whitelisted barcode
    3. Count for whitelisted cell barcodes
    4. Count(s) for the other cell barcode(s) (comma-separated)

Example output::

    AAAAAA      AGAAAA          146     1
    AAAATC                      22
    AAACAT                      21
    AAACTA      AAACTN,GAACTA   27      1,1
    AAATAC                      72
    AAATCA      GAATCA          37      3
    AAATGT      AAAGGT,CAATGT   41      1,1
    AAATTG      CAATTG          36      1
    AACAAT                      18
    AACATA                      24

If --error-correct-threshold is set to 0, columns 2 and 4 will be empty.


    ]]></help>
    <expand macro="citations" />
</tool>