view umi-tools_whitelist.xml @ 18:e4e12f0b3af1 draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/umi_tools commit bc1b362f6783d3fc0ed0f42c14687001d7ff5f7a
author iuc
date Sat, 05 Oct 2024 13:07:41 +0000
parents 953b4821b183
children
line wrap: on
line source

<tool id="umi_tools_whitelist" name="UMI-tools whitelist" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
    <description>Extract cell barcodes from FASTQ files</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="bio_tools"/>
    <expand macro="requirements" />
    <command detect_errors="exit_code"><![CDATA[
        ## Runs umi_tools whitelist, writes the whitelist from stdout, then
        ## publishes the OUT_* plots as an HTML report and the thresholds
        ## table as a separate dataset.
        ## NOTE(review): the gz variable and the input_* file names used below
        ## are not defined in this file; presumably they are prepared by the
        ## COMMAND_LINK macro token in macros.xml - confirm there.
        #import json
        @COMMAND_LINK@
        umi_tools whitelist

            @FASTQ_BARCODE_EXTRACTION_OPTIONS@

            --subset-reads='$subset_reads'
            $allow_threshold_error
            ## Optional select: the flag is only emitted when a value was chosen.
            #if $ed_above_threshold:
                --ed-above-threshold='$ed_above_threshold'
            #end if
            --knee-method='$knee_method'
            ## Pick the staged input file name(s) matching the input layout
            ## and compression.
            #if $input_type_cond.input_type == 'single':
                #if $gz:
                    --stdin=input_single.gz
                #else:
                    --stdin=input_single.txt
                #end if
            #else:  ## equally valid for both 'paired' and 'paired_collection'
                #if $gz:
                    --stdin=input_read1.gz
                    --read2-in=input_read2.gz
                #else:
                    --stdin=input_read1.txt
                    --read2-in=input_read2.txt
                #end if
            #end if
            #if $celloptions.use_cell_opts == "advanced":
                ## Optional integers render as an empty string when left blank.
                #if str($celloptions.set_cell_number) != '':
                   --set-cell-number=$celloptions.set_cell_number
                #end if
                #if str($celloptions.expect_cells) != '':
                   --expect-cells=$celloptions.expect_cells
                #end if
                --error-correct-threshold=$celloptions.error_correct_threshold
            #end if
            --method='$method'
            ## Plots and the thresholds table are written with the OUT prefix
            ## and collected below.
            --plot-prefix=OUT

            @LOG@
            > '$out_whitelist' &&

            ## -p keeps this step from failing when the directory already exists.
            mkdir -p '${ out_html_report.files_path }' &&
            cp OUT_*.png '${ out_html_report.files_path }' &&

            echo "<html>
            <head></head><body>
            <h1>Cell and Count Metrics</h1>
            <img src=\"OUT_cell_barcode_count_density.png\" ><br />
            <img src=\"OUT_cell_barcode_knee.png\" ><br />
            <img src=\"OUT_cell_barcode_counts.png\" ><br />
            </body></html>" > '$out_html_report'
            &&
            mv OUT_cell_thresholds.tsv '$out_thresh'
    ]]></command>
    <inputs>
        <!-- Input layout (single / paired / paired collection) and barcode
             extraction options are shared with other tools via macros.xml. -->
        <expand macro="input_types" />
        <expand macro="fastq_barcode_extraction_options_macro"/>

        <!-- What is counted per cell barcode (CB) when ranking barcodes. -->
        <param argument="--method" type="select" label="Count reads or UMIs"
               help="Many published protocols rank CBs by the number of reads the CBs appear in. However you could also use the number of unique UMIs a CB is associated with. Note that this is still an approximation to the number of transcripts captured because the same UMI could be associated with two different transcripts and be counted as independent" >
            <option value="reads" selected="true">Reads</option>
            <option value="umis">UMIs</option>
        </param>
        <!-- TODO Cannot use expect-cells with 'distance' knee method.
             This is only documented in the expect-cells help text below;
             the wrapper does not enforce it. -->
        <param argument="--knee-method" type="select" label="Method for detection of knee">
            <option value="distance" selected="true">Distance</option>
            <option value="density">Density</option>
        </param>
        <param argument="--subset-reads" type="integer" min="0" value="0" label="Use the first N reads to automatically identify the true cell barcodes" />
        <param argument="--allow-threshold-error" type="boolean" truevalue="--allow-threshold-error" falsevalue="" label="Don't select a threshold" help="Will still output the plots if requested"/>
        <!-- Optional: when nothing is selected the flag is omitted from the command. -->
        <param argument="--ed-above-threshold" type="select" optional="true" label="Detect and correct CBs above the threshold" help="which may be sequence errors from another CB">
            <option value="correct">Correct</option>
            <option value="discard">Discard</option>
        </param>
        <!-- Cell-number options are only passed to the command in 'advanced' mode. -->
        <conditional name="celloptions" >
            <param name="use_cell_opts" type="select" label="Cell parameters" >
                <option value="defaults" selected="true">Use Defaults</option>
                <option value="advanced">Advanced Options</option>
            </param>
            <when value="defaults"/>
            <when value="advanced">
                <param argument="--set-cell-number" type="integer" min="0" optional="true" label="Specify the number of cell barcodes to accept" />
                <param argument="--expect-cells" type="integer" min="0" optional="true" label="Prior expectation on the upper limit on the number of cells sequenced" help="Cannot be used with the 'Distance' knee method; select 'Density' as the method for detection of knee when setting this value" />
                <param argument="--error-correct-threshold" type="integer" min="0" value="1" label="Hamming distance for correction of barcodes to whitelist barcodes. Set to zero to generate no error correcting metrics" />
            </when>
        </conditional>
        <expand macro="log_input_macro"/>
    </inputs>
    <outputs>
        <!-- Whitelist table; the command redirects stdout of umi_tools into it. -->
        <data name="out_whitelist"
              format="tabular"
              label="${tool.name} on ${on_string}: Whitelist"/>

        <!-- Reads rejected by the regex pattern. Each variant below is only
             created for regex extraction with filtered_out_bool enabled, and
             is selected by the input layout. -->
        <!-- Forward (or only) reads: single and paired dataset input. -->
        <data name="filtered_out"
              format_source="input_read1"
              label="${tool.name} on ${on_string}: Forward reads not matching regex pattern">
            <filter>extract_method_cond['extract_method'] == 'regex' and extract_method_cond['filtered_out_bool'] and input_type_cond['input_type'] in ['single', 'paired']</filter>
        </data>
        <!-- Reverse reads: paired dataset input only. -->
        <data name="filtered_out_paired"
              format_source="input_read2"
              label="${tool.name} on ${on_string}: Reverse reads not matching regex pattern">
            <filter>extract_method_cond['extract_method'] == 'regex' and extract_method_cond['filtered_out_bool'] and input_type_cond['input_type'] == 'paired'</filter>
        </data>
        <!-- Both mates as a paired collection: paired collection input only. -->
        <collection name="filtered_out_paired_collection"
                    type="paired"
                    label="${tool.name} on ${on_string}: Reads not matching regex pattern">
            <data name="forward" format_source="input_readpair"/>
            <data name="reverse" format_source="input_readpair"/>
            <filter>extract_method_cond['extract_method'] == 'regex' and extract_method_cond['filtered_out_bool'] and input_type_cond['input_type'] == 'paired_collection'</filter>
        </collection>

        <expand macro="log_output_macro"/>

        <!-- HTML page embedding the OUT_* plots copied into its files directory. -->
        <data name="out_html_report"
              format="html"
              label="${tool.name} on ${on_string}: Webpage"/>
        <!-- Thresholds table moved from OUT_cell_thresholds.tsv by the command. -->
        <data name="out_thresh"
              format="tabular"
              label="${tool.name} on ${on_string}: TSV Cell Thresholds"/>
    </outputs>
    <tests>
        <!-- Single gzipped FASTQ input, counting reads with the density knee
             method; expects the whitelist, thresholds table and HTML report. -->
        <test expect_num_outputs="3">
            <conditional name="input_type_cond">
                <param name="input_type" value="single"/>
                <param name="input_read1" value="t_R2.fastq.gz" ftype="fastqsanger.gz"/>
                <param name="bc_pattern" value="CCCCCCCCNNNNNNNN"/>
            </conditional>
            <conditional name="extract_method_cond">
                <param name="prime3" value="true"/>
            </conditional>
            <param name="method" value="reads"/>
            <param name="knee_method" value="density"/>
            <output name="out_whitelist" file="out_wl_single.txt" lines_diff="40"/>
            <output name="out_thresh" file="out_wl_single.tresh.tab"/>
            <output name="out_html_report" file="out_wl_single.html"/>
        </test>
        <!-- Paired gzipped FASTQ datasets with advanced cell options and the
             log output enabled; the log counts as the fourth output. -->
        <test expect_num_outputs="4">
            <conditional name="input_type_cond">
                <param name="input_type" value="paired"/>
                <param name="input_read1" value="t_R1.fastq.gz" ftype="fastqsanger.gz"/>
                <param name="input_read2" value="t_R2.fastq.gz" ftype="fastqsanger.gz"/>
                <param name="bc_pattern" value="CCCNNNNNNNNXXXXX"/>
                <!-- <param name="bc_pattern2" value="CCCCCCCCNNNNNNNN" /> -->
            </conditional>
            <conditional name="extract_method_cond">
                <param name="prime3" value="false"/>
            </conditional>
            <param name="method" value="reads"/>
            <param name="knee_method" value="density"/>
            <param name="use_cell_opts" value="advanced"/>
            <param name="expect_cells" value="5"/>
            <param name="error_correct_threshold" value="3"/>
            <param name="log" value="true"/>
            <output name="out_whitelist" file="out_wl_paired.txt"/>
            <output name="out_log" file="out_wl_paired.log" lines_diff="50"/>
            <output name="out_html_report" file="out_wl_paired.html"/>
            <output name="out_thresh" file="out_wl_paired.tresh.tab"/>
        </test>
        <!-- Same settings and expected output files as the paired-dataset
             test, but the reads are supplied as a paired collection. -->
        <test expect_num_outputs="4">
            <conditional name="input_type_cond">
                <param name="input_type" value="paired_collection"/>
                <param name="input_readpair">
                    <collection type="paired">
                        <element name="forward" ftype="fastqsanger.gz" value="t_R1.fastq.gz"/>
                        <element name="reverse" ftype="fastqsanger.gz" value="t_R2.fastq.gz"/>
                    </collection>
                </param>
                <param name="bc_pattern" value="CCCNNNNNNNNXXXXX"/>
                <!-- <param name="bc_pattern2" value="CCCCCCCCNNNNNNNN" /> -->
            </conditional>
            <conditional name="extract_method_cond">
                <param name="prime3" value="false"/>
            </conditional>
            <param name="method" value="reads"/>
            <param name="knee_method" value="density"/>
            <param name="use_cell_opts" value="advanced"/>
            <param name="expect_cells" value="5"/>
            <param name="error_correct_threshold" value="3"/>
            <param name="log" value="true"/>
            <param name="filtered_out_bool" value="true"/>
            <output name="out_whitelist" file="out_wl_paired.txt"/>
            <output name="out_log" file="out_wl_paired.log" lines_diff="50"/>
            <output name="out_html_report" file="out_wl_paired.html"/>
            <output name="out_thresh" file="out_wl_paired.tresh.tab"/>
        </test>
        <!-- Follows an error report about regex patterns containing the
             less-than and greater-than symbols not being accepted: runs a
             regex barcode pattern on single-end input, counting UMIs, and
             checks the dataset of reads that did not match the pattern. -->
        <test expect_num_outputs="4">
            <conditional name="input_type_cond">
                <param name="input_type" value="single"/>
                <param name="input_read1" value="testYYY.40k.fastq.gz" ftype="fastqsanger.gz"/>
            </conditional>
            <param name="extract_method" value="regex"/>
            <param name="bc_pattern" value="(?P&#60;cell_1&#62;.{8,10})(?P&#60;discard_1&#62;ACTGGCCTGCGA){s&#60;=3}(?P&#60;cell_2&#62;.{9})(?P&#60;discard_2&#62;GGTAGCGGTGACA){s&#60;=3}(?P&#60;cell_3&#62;.{9})(?P&#60;umi_1&#62;.{8})T{3}.*"/>
            <param name="prime3" value="true"/>
            <param name="filtered_out_bool" value="true"/>
            <param name="method" value="umis"/>
            <param name="knee_method" value="density"/>
            <output name="out_whitelist" file="out_wl_user.single.txt"/>
            <output name="out_thresh" file="out_wl_user.single.tresh.tab"/>
            <output name="out_html_report" file="out_wl_user.single.html"/>
            <output name="filtered_out">
                <assert_contents>
                    <has_text text="@A00250:74:HFMFVDSXX:2:1101:1027:1016 1:N:0:GTAGAGGA"/>
                </assert_contents>
            </output>
        </test>
    </tests>
    <help><![CDATA[


UMI-tools whitelist - Extract barcodes from fastq
==================================================

Purpose
-------

Extract cell barcodes and identify the most likely true barcodes using
the 'knee' method.

@FASTQ_BARCODE_EXTRACTION_HELP@



Output:
-------

The whitelist is written as four tab-separated columns:

    1. whitelisted cell barcode
    2. Other cell barcode(s) (comma-separated) to correct to the
       whitelisted barcode
    3. Count for whitelisted cell barcodes
    4. Count(s) for the other cell barcode(s) (comma-separated)

Example output::

    AAAAAA      AGAAAA          146     1
    AAAATC                      22
    AAACAT                      21
    AAACTA      AAACTN,GAACTA   27      1,1
    AAATAC                      72
    AAATCA      GAATCA          37      3
    AAATGT      AAAGGT,CAATGT   41      1,1
    AAATTG      CAATTG          36      1
    AACAAT                      18
    AACATA                      24

If --error-correct-threshold is set to 0, columns 2 and 4 will be empty.


    ]]></help>
    <expand macro="citations" />
</tool>