Mercurial > repos > iuc > umi_tools_extract
view umi-tools_extract.xml @ 11:a472e995d157 draft
"planemo upload commit 28e58376e1d70e38276873a7d5e2ab44db88c2c0"
author | iuc |
---|---|
date | Tue, 27 Aug 2019 17:10:51 -0400 |
parents | 828dba98cdb4 |
children | d5ff68d2d5ff |
line wrap: on
line source
<tool id="umi_tools_extract" name="UMI-tools extract" version="@VERSION@.0">
    <description>Extract UMI from fastq files</description>
    <macros>
        <import>macros.xml</import>
        <!-- Shared output action: make a paired output inherit the datatype of
             the matching input (collection forward element, or read 1). -->
        <macro name="out_conditional">
            <actions>
                <conditional name="input_type.type">
                    <when value="paired_collection">
                        <action type="format">
                            <option type="from_param" name="input_type.input_readpair" param_attribute="forward.ext" />
                        </action>
                    </when>
                    <when value="paired">
                        <action type="format">
                            <option type="from_param" name="input_type.input_read1" param_attribute="ext" />
                        </action>
                    </when>
                </conditional>
            </actions>
        </macro>
    </macros>
    <expand macro="requirements" />
    <command detect_errors="exit_code"><![CDATA[
        ## NOTE(review): @COMMAND_LINK@ comes from macros.xml; it presumably links the
        ## inputs to input_*.gz / input_*.txt and sets $gz - confirm against macros.xml.
        @COMMAND_LINK@
        umi_tools extract
            --extract-method='$extract_method.value'
            --bc-pattern='$bc_pattern'
        #if $input_type.type == 'single':
            #if $gz:
                --stdin=input_single.gz
                --stdout out.gz
            #else
                --stdin=input_single.txt
                --stdout '$out'
            #end if
        #else:
            #if $gz:
                --stdin=input_read1.gz
                --read2-in=input_read2.gz
                --stdout out1.gz
                --read2-out=out2.gz
            #else:
                --stdin=input_read1.txt
                --read2-in=input_read2.txt
                --stdout '$out1'
                --read2-out='$out2'
            #end if
            #if $input_type.barcode.barcode_select == "both_reads":
                --split-barcode
                --bc-pattern2='$input_type.barcode.bc_pattern2'
            #end if
        #end if
        #if $barcodes.use_barcodes.value == 'yes':
            --filter-cell-barcode
            --whitelist='$barcodes.filter_barcode_file'
            ## Render the boolean through its truevalue/falsevalue. Using ".value"
            ## here would emit the raw Python bool ('True'/'False') as a stray
            ## positional argument and never pass --error-correct-cell.
            $barcodes.filter_correct
        #end if
        ## The checkbox asks "is the barcode at the 5' end?", so the 3' flag is
        ## added only when it is unchecked.
        #if not $prime3:
            --3prime
        #end if
        #if $quality.quality_selector =='true':
            --quality-filter-threshold '$quality.quality_filter_threshold'
            --quality-encoding '$quality.quality_encoding'
        #end if
        #if $print_log == "1":
            --log='$out_log'
        #else
            --supress-stats
        #end if
        ## Compressed runs write to local *.gz names first; move them onto the
        ## Galaxy output datasets once extraction has succeeded.
        #if $gz:
            #if $input_type.type == 'single':
                && mv out.gz '$out'
            #else
                && mv out1.gz '$out1'
                && mv out2.gz '$out2'
            #end if
        #end if
    ]]></command>
    <inputs>
        <expand macro="input_types" />
        <!-- Optional whitelist of known cell barcodes. -->
        <conditional name="barcodes">
            <param name="use_barcodes" argument="--filter-cell-barcode" type="select" label="Use Known Barcodes?">
                <option value="yes">Yes</option>
                <option value="no" selected="true">No</option>
            </param>
            <when value="no" />
            <when value="yes">
                <param name="filter_barcode_file" type="data" format="tabular,tsv" label="Barcode File" />
                <param name="filter_correct"
                       argument="--error-correct-cell"
                       type="boolean"
                       truevalue="--error-correct-cell"
                       falsevalue=""
                       checked="false"
                       label="Apply correction to cell barcodes?"
                       help="This only applies if your barcode file has two columns output from the umi_tools whitelist command." />
            </when>
        </conditional>
        <param name="extract_method" type="select" label="Method to extract barcodes">
            <option value="regex">Regular Expressions</option>
            <option value="string" selected="true">String</option>
        </param>
        <param name="bc_pattern"
               argument="--bc-pattern"
               type="text"
               label="Barcode pattern for first read"
               help="Use this option to specify the format of the UMI/barcode. Use Ns to represent the random positions and Xs to indicate the bc positions. Bases with Ns will be extracted and added to the read name. Remaining bases, marked with an X will be reattached to the read.">
            <!-- Allow the punctuation needed for regex barcode patterns;
                 any other character is dropped. -->
            <sanitizer invalid_char="">
                <valid initial="string.letters,string.digits">
                    <add value="!="/>
                    <add value="-"/>
                    <add value="_"/>
                    <add value="."/>
                    <add value="?"/>
                    <add value="&lt;"/><!-- left triangle bracket -->
                    <add value="&gt;"/><!-- right triangle bracket -->
                    <add value="["/> <!-- left square bracket -->
                    <add value="]"/> <!-- right square bracket -->
                    <add value="^"/> <!-- caret -->
                    <add value="{"/> <!-- left curly -->
                    <add value="}"/> <!-- right curly -->
                    <add value="("/> <!-- left parenthesis -->
                    <add value=")"/> <!-- right parenthesis -->
                </valid>
            </sanitizer>
        </param>
        <param name="prime3"
               argument="--3prime"
               type="boolean"
               label="Is the barcode at the 5' end?"
               truevalue="1"
               falsevalue="0"
               checked="true"
               help="By default the barcode is assumed to be on the 5' end of the read, but use this option to specify that it is on the 3' end instead." />
        <param name="print_log"
               argument="-L"
               type="boolean"
               label="Output log?"
               truevalue="1"
               falsevalue="0"
               checked="true"
               help="Choose if you want to generate a text file containing logging information." />
        <conditional name="quality">
            <param name="quality_selector" type="select" label="Enable quality filter?">
                <option value="false">No</option>
                <option value="true">Yes</option>
            </param>
            <when value="false">
            </when>
            <when value="true">
                <param name="quality_filter_threshold"
                       label="Phred score threshold"
                       type="integer"
                       value="20"
                       argument="--quality-filter-threshold"
                       help="Remove reads where any UMI base quality score falls below this threshold." />
                <param name="quality_encoding"
                       argument="--quality-encoding"
                       type="select"
                       label="Library type"
                       help="Quality score encoding. Choose from phred33 [33-77], phred64 [64-106] or solexa [59-106].">
                    <option value="phred33">phred33 [33-77]</option>
                    <option value="phred64">phred64 [64-106]</option>
                    <option value="solexa">solexa [59-106]</option>
                </param>
            </when>
        </conditional>
    </inputs>
    <outputs>
        <!-- Single-end output; only created for single-end input. -->
        <data name="out" format_source="input_single" label="Reads: ${tool.name} on ${on_string}">
            <filter>input_type['type'] == "single"</filter>
        </data>
        <!-- Paired outputs; created for both the paired and paired_collection modes. -->
        <data name="out1" format_source="input_read1" label="Reads1: ${tool.name} on ${on_string}">
            <filter>input_type['type'] != "single"</filter>
            <expand macro="out_conditional" />
        </data>
        <data name="out2" format_source="input_read2" label="Reads2: ${tool.name} on ${on_string}">
            <filter>input_type['type'] != "single"</filter>
            <expand macro="out_conditional" />
        </data>
        <data name="out_log" format="txt">
            <filter>print_log == True</filter>
        </data>
    </outputs>
    <tests>
        <test><!-- single-end, 3' barcode, quality filter enabled -->
            <param name="type" value="single" />
            <param name="input_single" value="t_R1.fastq" ftype="fastq" />
            <param name="bc_pattern" value="XXXNNN" />
            <param name="prime3" value="0" />
            <param name="quality_selector" value="true" />
            <param name="quality_filter_threshold" value="10" />
            <param name="quality_encoding" value="phred33" />
            <output name="out" file="out_SE.fastq" ftype="fastq" />
            <output name="out_log">
                <assert_contents>
                    <has_text text="Input Reads: 100" />
                    <has_text text="umi quality: 28" />
                    <has_text text="Reads output: 72" />
                </assert_contents>
            </output>
        </test>
        <test><!-- paired, gzip-compressed inputs -->
            <param name="type" value="paired" />
            <param name="input_read1" value="t_R1.fastq.gz" ftype="fastq.gz" />
            <param name="input_read2" value="t_R2.fastq.gz" ftype="fastq.gz" />
            <param name="bc_pattern" value="NNNXXX" />
            <output name="out1" file="out_R1.fastq.gz" decompress="true" lines_diff="2" ftype="fastq.gz" />
            <output name="out2" file="out_R2.fastq.gz" decompress="true" lines_diff="2" ftype="fastq.gz" />
            <output name="out_log">
                <assert_contents>
                    <has_text text="Input Reads: 100" />
                    <has_text text="Reads output: 100" />
                </assert_contents>
            </output>
        </test>
        <test>
            <param name="type" value="paired_collection" />
            <!-- same as before, but uncompressed -->
            <param name="paired_type" value="no" />
            <param name="input_readpair">
                <collection type="paired">
                    <element name="forward" ftype="fastq" value="t_R1.fastq" />
                    <element name="reverse" ftype="fastq" value="t_R2.fastq" />
                </collection>
            </param>
            <param name="bc_pattern" value="NNNXXX" />
            <output name="out1" file="out_R1.fastq" ftype="fastq" />
            <output name="out2" file="out_R2.fastq" ftype="fastq" />
            <output name="out_log">
                <assert_contents>
                    <has_text text="Input Reads: 100" />
                    <has_text text="Reads output: 100" />
                </assert_contents>
            </output>
        </test>
        <test><!-- paired, string barcode with a cell-barcode whitelist -->
            <param name="type" value="paired" />
            <param name="input_read1" value="scrb_seq_fastq.1.gz" ftype="fastq.gz" />
            <param name="input_read2" value="scrb_seq_fastq.2.gz" ftype="fastq.gz" />
            <param name="extract_method" value="string" />
            <param name="bc_pattern" value="CCCCCCNNNNNNNNNN" />
            <param name="use_barcodes" value="yes" />
            <param name="filter_barcode_file" value="scrb_seq_barcodes" />
            <output name="out2" file="scrb_extract.fastq.gz" decompress="true" ftype="fastq.gz" />
        </test>
        <test><!-- same as above but with regex barcode-->
            <param name="type" value="paired" />
            <param name="input_read1" value="scrb_seq_fastq.1.gz" ftype="fastq.gz" />
            <param name="input_read2" value="scrb_seq_fastq.2.gz" ftype="fastq.gz" />
            <param name="extract_method" value="regex" />
            <param name="bc_pattern" value="^(?P&lt;cell_1&gt;.{6})(?P&lt;umi_1&gt;.{10})" />
            <param name="use_barcodes" value="yes" />
            <param name="filter_barcode_file" value="scrb_seq_barcodes" />
            <output name="out2" file="scrb_extract.fastq.gz" decompress="true" ftype="fastq.gz" />
        </test>
        <test><!-- CelSeq2 example -->
            <param name="type" value="paired" />
            <param name="input_read1" value="read_R1.200.gz" ftype="fastq.gz" />
            <param name="input_read2" value="read_R2.200.gz" ftype="fastq.gz" />
            <param name="extract_method" value="string" />
            <param name="bc_pattern" value="NNNNNNCCCCCC" />
            <output name="out1" file="read_R1.200_extracted.fastq.gz" ftype="fastq.gz" decompress="true" lines_diff="1" />
            <output name="out2" file="read_R2.200_extracted.fastq.gz" ftype="fastq.gz" decompress="true" lines_diff="1" />
        </test>
    </tests>
    <help><![CDATA[
UMI-tools extract.py - Extract UMI from fastq
=============================================

Purpose
-------

Extract UMI barcode from a read and add it to the read name, leaving
any sample barcode in place. Can deal with paired end reads and UMIs
split across the paired ends

Options
-------

--split-barcode
       By default the UMI is assumed to be on the first read. Use this
       option if the UMI is contained on both reads and specify the
       pattern of the barcode/UMI on the second read using the option
       ``--bc-pattern2``

--bc-pattern
       Use this option to specify the format of the UMI/barcode. Use Ns to
       represent the random positions and Xs to indicate the bc positions.
       Bases with Ns will be extracted and added to the read name. Remaining
       bases, marked with an X will be reattached to the read.

       E.g. If the pattern is NNXXNN,
       Then the read::

           @HISEQ:87:00000000 read1
           AAGGTTGCTGATTGGATGGGCTAG
           DA1AEBFGGCG01DFH00B1FF0B
           +

       will become::

           @HISEQ:87:00000000_AATT read1
           GGGCTGATTGGATGGGCTAG
           1AFGGCG01DFH00B1FF0B
           +

--bc-pattern2
       Use this option to specify the format of the UMI/barcode for
       the second read pair if required. If --bc-pattern2 is not
       supplied, this defaults to the same pattern as --bc-pattern

--3prime
       By default the barcode is assumed to be on the 5' end of the read, but
       use this option to specify that it is on the 3' end instead

-L
       Specify a log file to retain logging information and final statistics

--split-barcode
       barcode is split across read pair

--quality-filter-threshold=QUALITY_FILTER_THRESHOLD
       Remove reads where any UMI base quality score falls below this threshold

--quality-encoding=QUALITY_ENCODING
       Quality score encoding. Choose from phred33[33-77] phred64 [64-106] or solexa [59-106]

Usage:
------

For single ended reads::

        umi_tools extract --bc-pattern=[PATTERN] -L extract.log [OPTIONS]

reads from stdin and outputs to stdout.

For paired end reads::

        umi_tools extract --bc-pattern=[PATTERN] --read2-in=[FASTQIN] --read2-out=[FASTQOUT] -L extract.log [OPTIONS]

reads end one from stdin and end two from FASTQIN and outputs end one to stdout
and end two to FASTQOUT.
    ]]></help>
    <expand macro="citations" />
</tool>