Mercurial > repos > iuc > umi_tools_extract
diff umi-tools_extract.xml @ 15:27ac32a22ad2 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/umi_tools commit bf6a3aa532e8f9d122da4c1e39f3e256ae587b79"
author | iuc |
---|---|
date | Mon, 13 Sep 2021 14:52:06 +0000 |
parents | 9fa7803d1c51 |
children | 7accf7407811 |
line wrap: on
line diff
--- a/umi-tools_extract.xml Wed Jun 02 18:27:33 2021 +0000 +++ b/umi-tools_extract.xml Mon Sep 13 14:52:06 2021 +0000 @@ -1,118 +1,96 @@ -<tool id="umi_tools_extract" name="UMI-tools extract" version="@VERSION@.2"> +<tool id="umi_tools_extract" name="UMI-tools extract" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@"> <description>Extract UMI from fastq files</description> + <expand macro="bio_tools"/> <macros> <import>macros.xml</import> - <macro name="out_conditional"> - <actions> - <conditional name="input_type.type"> - <when value="paired_collection" > - <action type="format"> - <option type="from_param" name="input_type.input_readpair" param_attribute="forward.ext" /> - </action> - </when> - <when value="paired" > - <action type="format"> - <option type="from_param" name="input_type.input_read1" param_attribute="ext" /> - </action> - </when> - </conditional> - </actions> - </macro> </macros> <expand macro="requirements" /> <command detect_errors="exit_code"><![CDATA[ @COMMAND_LINK@ umi_tools extract - --extract-method='$extract_method.value' - --bc-pattern='$bc_pattern' - - #if $input_type.type == 'single': - #if $gz: - --stdin=input_single.gz - --stdout out.gz - #else - --stdin=input_single.txt - --stdout '$out' - #end if + + @FASTQ_BARCODE_EXTRACTION_OPTIONS@ + #if $input_type_cond.input_type == 'single': + #if $gz: + --stdin=input_single.gz + --stdout out.gz + #else + --stdin=input_single.txt + --stdout '$out' + #end if + #else: + #if $gz: + --stdin=input_read1.gz + --read2-in=input_read2.gz + --stdout out1.gz + --read2-out=out2.gz #else: - #if $gz: - --stdin=input_read1.gz - --read2-in=input_read2.gz - --stdout out1.gz - --read2-out=out2.gz - #else: - --stdin=input_read1.txt - --read2-in=input_read2.txt - --stdout '$out1' + --stdin=input_read1.txt + --read2-in=input_read2.txt + #if $input_type_cond.input_type == 'paired' + --stdout '$out' --read2-out='$out2' - #end if - #if $input_type.barcode.barcode_select == "both_reads": - 
--split-barcode - --bc-pattern2='$input_type.barcode.bc_pattern2' + #else + --stdout '$out_paired_collection.forward' + --read2-out='$out_paired_collection.reverse' #end if #end if + $input_type_cond.reconcile_pairs + #end if - #if $barcodes.use_barcodes.value == 'yes': - --filter-cell-barcode - --whitelist='$barcodes.filter_barcode_file' - '$barcodes.filter_correct.value' - #end if + #if $whitelist + --whitelist='$whitelist' + #end if + #if $blacklist + --blacklist='$blacklist' + #end if + $error_correct_cell.value - #if not $prime3: - --3prime - #end if - #if $quality.quality_selector =='true': + #if $quality.quality_selector =='true': + #if str($quality.quality_filter_threshold) != '' --quality-filter-threshold '$quality.quality_filter_threshold' - --quality-encoding '$quality.quality_encoding' + #end if + #if str($quality.quality_filter_mask) != '' + --quality-filter-mask '$quality.quality_filter_mask' + #end if + #if $input_type_cond.input_type != 'paired_collection' + #set input=$input_type_cond.input_read1 + #else + #set input=$input_type_cond.input_readpair.forward #end if - #if $print_log == "1": - --log='$out_log' + --quality-encoding + #if $input.ext.startswith("fastqillumina") + phred64 + #else if $input.ext.startswith("fastqsolexa") + solexa + #else + phred33 #end if + #end if + @LOG@ #if $gz: - #if $input_type.type == 'single': + #if $input_type_cond.input_type == 'single': && mv out.gz '$out' + #else if $input_type_cond.input_type == 'paired' + && mv out1.gz '$out' + && mv out2.gz '$out2' #else - && mv out1.gz '$out1' - && mv out2.gz '$out2' + && mv out1.gz '$out_paired_collection.forward' + && mv out2.gz '$out_paired_collection.reverse' #end if #end if ]]></command> <inputs> - <expand macro="input_types" /> - - <conditional name="barcodes" > - <param name="use_barcodes" argument="--filter-cell-barcode" type="select" label="Use Known Barcodes?" 
> - <option value="yes">Yes</option> - <option value="no" selected="true" >No</option> - </param> - <when value="no" /> - <when value="yes" > - <param name="filter_barcode_file" type="data" format="tabular,tsv" label="Barcode File" /> - <param name="filter_correct" argument="--error-correct-cell" type="boolean" truevalue="--error-correct-cell" falsevalue="" checked="false" label="Apply correction to cell barcodes?" help="This only applies if your barcode file has two columns output from the umi_tools whitelist command." /> - </when> - </conditional> + <expand macro="input_types"> + <param argument="--reconcile-pairs" type="boolean" truevalue="--reconcile-pairs" falsevalue="" checked="false" label="Allow unpaired reads" help="Allow the presences of reads in read2 input that are not present in read1 input. This allows cell barcode filtering of read1s without considering read2s" /> + </expand> + <expand macro="fastq_barcode_extraction_options_macro"/> - <param name="extract_method" type="select" label="Method to extract barcodes" > - <option value="regex">Regular Expressions</option> - <option value="string" selected="true">String</option> - </param> - - <param name="bc_pattern" argument="--bc-pattern" type="text" label="Barcode pattern for first read" - help="Use this option to specify the format of the UMI/barcode. Use Ns to - represent the random positions and Xs to indicate the bc positions. - Bases with Ns will be extracted and added to the read name. 
Remaining - bases, marked with an X will be reattached to the read."> - <expand macro="barcode_sanitizer" /> - </param> + <param argument="--whitelist" type="data" optional="true" format="tabular,tsv" label="Allowlist of accepted barcodes" /> + <param argument="--blacklist" type="data" optional="true" format="tabular,tsv" label="Denylist of accepted barcodes" /> + <param argument="--error-correct-cell" type="boolean" truevalue="--error-correct-cell" falsevalue="" checked="false" label="Apply correction to cell barcodes?" help="This only applies if your barcode file has two columns output from the umi_tools whitelist command" /> - <param name="prime3" argument="--3prime" type="boolean" label="Is the barcode at the 5' end?" - truevalue="1" falsevalue="0" checked="true" - help="By default the barcode is assumed to be on the 5' end of the read, but - use this option to sepecify that it is on the 3' end instead." /> - <param name="print_log" argument="-L" type="boolean" label="Output log?" - truevalue="1" falsevalue="0" checked="true" - help="Choose if you want to generate a text file containing logging information." /> <conditional name="quality"> <param name="quality_selector" type="select" label="Enable quality filter?" > <option value="false">No</option> @@ -121,45 +99,46 @@ <when value="false"> </when> <when value="true"> - <param name="quality_filter_threshold" label="Phred score threshold" - type="integer" value="20" argument="--quality-filter-threshold" - help="Remove reads where any UMI base quality score falls below this threshold." /> - <param name="quality_encoding" argument="--quality-encoding" type="select" label="Library type" - help="Quality score encoding. 
Choose from phred33 [33-77], phred64 [64-106] or solexa [59-106]."> - <option value="phred33">phred33 [33-77]</option> - <option value="phred64">phred64 [64-106]</option> - <option value="solexa">solexa [59-106]</option> - </param> + <param argument="--quality-filter-threshold" label="Phred score threshold" + type="integer" value="" optional="true" + help="Remove reads where any UMI base quality score falls below this threshold" /> + <param argument="--quality-filter-mask" label="Mask UMI bases below threshold" + type="integer" value="" optional="true" + help="If a UMI base has a quality below this threshold, + replace the base with 'N'" /> </when> </conditional> + <expand macro="log_input_macro"/> </inputs> <outputs> - <data name="out" format_source="input_single" label="Reads: ${tool.name} on ${on_string}" > - <filter>input_type['type'] == "single"</filter> + <data name="out" format_source="input_read1" label="${tool.name} on ${on_string}: Reads" > + <filter>input_type_cond['input_type'] in ['single', 'paired']</filter> </data> - <data name="out1" format_source="input_read1" label="Reads1: ${tool.name} on ${on_string}" > - <filter>input_type['type'] != "single"</filter> - <expand macro="out_conditional" /> + <data name="out2" format_source="input_read2" label="${tool.name} on ${on_string}: Reads2" > + <filter>input_type_cond['input_type'] == 'paired'</filter> </data> - <data name="out2" format_source="input_read2" label="Reads2: ${tool.name} on ${on_string}" > - <filter>input_type['type'] != "single"</filter> - <expand macro="out_conditional" /> - </data> - - <data name="out_log" format="txt"> - <filter>print_log == True</filter> - </data> + <collection name="out_paired_collection" type="paired" label="${tool.name} on ${on_string}: Reads"> + <data name="forward" format_source="input_readpair" /> + <data name="reverse" format_source="input_readpair" /> + <filter>input_type_cond['input_type'] == 'paired_collection'</filter> + </collection> + <expand 
macro="log_output_macro"/> </outputs> <tests> <test expect_num_outputs="2"> - <param name="type" value="single" /> - <param name="input_single" value="t_R1.fastq" ftype="fastq" /> - <param name="bc_pattern" value="XXXNNN" /> - <param name="prime3" value="0" /> + <conditional name="input_type_cond"> + <param name="input_type" value="single" /> + <param name="input_read1" value="t_R1.fastq" ftype="fastqsanger" /> + <param name="bc_pattern" value="XXXNNN" /> + </conditional> + <conditional name="extract_method_cond"> + <param name="prime3" value="true" /> + </conditional> <param name="quality_selector" value="true" /> <param name="quality_filter_threshold" value="10" /> <param name="quality_encoding" value="phred33" /> - <output name="out" file="out_SE.fastq" ftype="fastq" /> + <param name="log" value="true"/> + <output name="out" file="out_SE.fastq" ftype="fastqsanger" /> <output name="out_log" > <assert_contents> <has_text text="Input Reads: 100" /> @@ -169,12 +148,15 @@ </output> </test> <test expect_num_outputs="3"> - <param name="type" value="paired" /> - <param name="input_read1" value="t_R1.fastq.gz" ftype="fastq.gz" /> - <param name="input_read2" value="t_R2.fastq.gz" ftype="fastq.gz" /> - <param name="bc_pattern" value="NNNXXX" /> - <output name="out1" file="out_R1.fastq.gz" decompress="true" lines_diff="2" ftype="fastq.gz" /> - <output name="out2" file="out_R2.fastq.gz" decompress="true" lines_diff="2" ftype="fastq.gz" /> + <conditional name="input_type_cond"> + <param name="input_type" value="paired" /> + <param name="input_read1" value="t_R1.fastq.gz" ftype="fastqsanger.gz" /> + <param name="input_read2" value="t_R2.fastq.gz" ftype="fastqsanger.gz" /> + <param name="bc_pattern" value="NNNXXX" /> + </conditional> + <param name="log" value="true"/> + <output name="out" file="out_R1.fastq.gz" decompress="true" lines_diff="2" ftype="fastqsanger.gz" /> + <output name="out2" file="out_R2.fastq.gz" decompress="true" lines_diff="2" ftype="fastqsanger.gz" /> 
<output name="out_log" > <assert_contents> <has_text text="Input Reads: 100" /> @@ -182,18 +164,23 @@ </assert_contents> </output> </test> - <test expect_num_outputs="3"> - <param name="type" value="paired_collection" /> <!-- same as before, but uncompressed --> - <param name="paired_type" value="no" /> - <param name="input_readpair" > - <collection type="paired" > - <element name="forward" ftype="fastq" value="t_R1.fastq" /> - <element name="reverse" ftype="fastq" value="t_R2.fastq" /> - </collection> - </param> - <param name="bc_pattern" value="NNNXXX" /> - <output name="out1" file="out_R1.fastq" ftype="fastq" /> - <output name="out2" file="out_R2.fastq" ftype="fastq" /> + <test expect_num_outputs="4"> + <conditional name="input_type_cond"> + <param name="input_type" value="paired_collection" /> <!-- same as before, but uncompressed --> + <param name="paired_type" value="no" /> + <param name="input_readpair"> + <collection type="paired" > + <element name="forward" ftype="fastqsanger" value="t_R1.fastq" /> + <element name="reverse" ftype="fastqsanger" value="t_R2.fastq" /> + </collection> + </param> + <param name="bc_pattern" value="NNNXXX" /> + </conditional> + <param name="log" value="true"/> + <output_collection name="out_paired_collection" type="paired"> + <element name="forward" file="out_R1.fastq" ftype="fastqsanger" /> + <element name="reverse" file="out_R2.fastq" ftype="fastqsanger" /> + </output_collection> <output name="out_log" > <assert_contents> <has_text text="Input Reads: 100" /> @@ -202,113 +189,87 @@ </output> </test> <test expect_num_outputs="3"> - <param name="type" value="paired" /> - <param name="input_read1" value="scrb_seq_fastq.1.gz" ftype="fastq.gz" /> - <param name="input_read2" value="scrb_seq_fastq.2.gz" ftype="fastq.gz" /> + <conditional name="input_type_cond"> + <param name="input_type" value="paired" /> + <param name="input_read1" value="scrb_seq_fastq.1.gz" ftype="fastqsanger.gz" /> + <param name="input_read2" 
value="scrb_seq_fastq.2.gz" ftype="fastqsanger.gz" /> + <param name="bc_pattern" value="CCCCCCNNNNNNNNNN" /> + </conditional> <param name="extract_method" value="string" /> - <param name="bc_pattern" value="CCCCCCNNNNNNNNNN" /> - <param name="use_barcodes" value="yes" /> - <param name="filter_barcode_file" value="scrb_seq_barcodes" /> - <output name="out2" file="scrb_extract.fastq.gz" decompress="true" ftype="fastq.gz" /> + <param name="whitelist" value="scrb_seq_barcodes" /> + <param name="log" value="true"/> + <output name="out2" file="scrb_extract.fastq.gz" decompress="true" ftype="fastqsanger.gz" /> </test> <test expect_num_outputs="3"><!-- same as above but with regex barcode--> - <param name="type" value="paired" /> - <param name="input_read1" value="scrb_seq_fastq.1.gz" ftype="fastq.gz" /> - <param name="input_read2" value="scrb_seq_fastq.2.gz" ftype="fastq.gz" /> + <conditional name="input_type_cond"> + <param name="input_type" value="paired" /> + <param name="input_read1" value="scrb_seq_fastq.1.gz" ftype="fastqsanger.gz" /> + <param name="input_read2" value="scrb_seq_fastq.2.gz" ftype="fastqsanger.gz" /> + <param name="bc_pattern" value="^(?P<cell_1>.{6})(?P<umi_1>.{10})" /> + </conditional> <param name="extract_method" value="regex" /> - <param name="bc_pattern" value="^(?P<cell_1>.{6})(?P<umi_1>.{10})" /> - <param name="use_barcodes" value="yes" /> - <param name="filter_barcode_file" value="scrb_seq_barcodes" /> - <output name="out2" file="scrb_extract.fastq.gz" decompress="true" ftype="fastq.gz" /> + <param name="whitelist" value="scrb_seq_barcodes" /> + <param name="log" value="true"/> + <output name="out2" file="scrb_extract.fastq.gz" decompress="true" ftype="fastqsanger.gz" /> </test> <test expect_num_outputs="2"><!-- CelSeq2 example --> - <param name="type" value="paired" /> - <param name="input_read1" value="read_R1.200.gz" ftype="fastq.gz" /> - <param name="input_read2" value="read_R2.200.gz" ftype="fastq.gz" /> + <conditional 
name="input_type_cond"> + <param name="input_type" value="paired" /> + <param name="input_read1" value="read_R1.200.gz" ftype="fastqsanger.gz" /> + <param name="input_read2" value="read_R2.200.gz" ftype="fastqsanger.gz" /> + <param name="bc_pattern" value="NNNNNNCCCCCC" /> + </conditional> <param name="extract_method" value="string" /> - <param name="bc_pattern" value="NNNNNNCCCCCC" /> - <output name="out1" file="read_R1.200_extracted.fastq.gz" ftype="fastq.gz" decompress="true" lines_diff="1" /> - <output name="out2" file="read_R2.200_extracted.fastq.gz" ftype="fastq.gz" decompress="true" lines_diff="1" /> - <param name="print_log" value="false"/> + <output name="out" file="read_R1.200_extracted.fastq.gz" ftype="fastqsanger.gz" decompress="true" lines_diff="1" /> + <output name="out2" file="read_R2.200_extracted.fastq.gz" ftype="fastqsanger.gz" decompress="true" lines_diff="1" /> </test> </tests> <help><![CDATA[ - -UMI-tools extract.py - Extract UMI from fastq -============================================= - -Purpose -------- +extract - Extract UMI from fastq +================================ Extract UMI barcode from a read and add it to the read name, leaving -any sample barcode in place. Can deal with paired end reads and UMIs -split across the paired ends - -Options -------- +any sample barcode in place ---split-barcode - By default the UMI is assumed to be on the first read. Use this - option if the UMI is contained on both reads and specify the - pattern of the barcode/UMI on the second read using the option - ``--bc-pattern2`` +Can deal with paired end reads and UMIs +split across the paired ends. Can also optionally extract cell +barcodes and append these to the read name also. See the section below +for an explanation for how to encode the barcode pattern(s) to +specify the position of the UMI +/- cell barcode. + ---bc-pattern - Use this option to specify the format of the UMI/barcode. 
Use Ns to - represent the random positions and Xs to indicate the bc positions. - Bases with Ns will be extracted and added to the read name. Remaining - bases, marked with an X will be reattached to the read. - - E.g. If the pattern is NNXXNN, - Then the read: +Filtering and correcting cell barcodes +-------------------------------------- - @HISEQ:87:00000000 read1 - AAGGTTGCTGATTGGATGGGCTAG - DA1AEBFGGCG01DFH00B1FF0B - + - - will become: - @HISEQ:87:00000000_AATT read1 - GGGCTGATTGGATGGGCTAG - 1AFGGCG01DFH00B1FF0B - + +``umi_tools extract`` can optionally filter cell barcodes against a user-supplied +whitelist (``--whitelist``). If a whitelist is not available for your data, +e.g. +if you have performed droplet-based scRNA-Seq, you can use the +whitelist tool. ---bc-pattern2 - Use this option to specify the format of the UMI/barcode for - the second read pair if required. If --bc-pattern2 is not - supplied, this defaults to the same pattern as --bc-pattern +Cell barcodes which do not match the whitelist (user-generated or +automatically generated) can also be optionally corrected using the +``--error-correct-cell`` option. ---3prime - By default the barcode is assumed to be on the 5' end of the read, but - use this option to sepecify that it is on the 3' end instead - --L - Specify a log file to retain logging information and final statistics - ---split-barcode - barcode is split across read pair +The whitelist should be in the following format (tab-separated):: ---quality-filter-threshold=QUALITY_FILTER_THRESHOLD - Remove reads where any UMI base quality score falls - below this threshold ---quality-encoding=QUALITY_ENCODING - Quality score encoding. 
Choose from phred33[33-77] - phred64 [64-106] or solexa [59-106] - -Usage: ------- + AAAAAA AGAAAA + AAAATC + AAACAT + AAACTA AAACTN,GAACTA + AAATAC + AAATCA GAATCA + AAATGT AAAGGT,CAATGT -For single ended reads: - umi_tools extract --bc-pattern=[PATTERN] -L extract.log [OPTIONS] - -reads from stdin and outputs to stdout. +Where column 1 is the whitelisted cell barcodes and column 2 is +the list (comma-separated) of other cell barcodes which should be +corrected to the barcode in column 1. If the ``--error-correct-cell`` +option is not used, this column will be ignored. Any additional columns +in the whitelist input, such as the counts columns from the output of +umi_tools whitelist, will be ignored. -For paired end reads: - umi_tools extract --bc-pattern=[PATTERN] --read2-in=[FASTQIN] --read2-out=[FASTQOUT] -L extract.log [OPTIONS] - -reads end one from stdin and end two from FASTQIN and outputs end one to stdin -and end two to FASTQOUT. +@FASTQ_BARCODE_EXTRACTION_HELP@ ]]></help> <expand macro="citations" />