Mercurial > repos > iuc > umi_tools_whitelist
view umi-tools_whitelist.xml @ 9:0c721837cbcf draft
planemo upload commit a7a086ce7d7d84f53d4a022fa1da25ef7b9a5b9a
author | iuc |
---|---|
date | Fri, 20 Jul 2018 03:49:25 -0400 |
parents | 095c349b4343 |
children | 3adbf2fa0928 |
line wrap: on
line source
<tool id="umi_tools_whitelist" name="UMI-tools whitelist" version="@VERSION@.1">
    <description>Extract cell barcodes from FASTQ files</description>
    <!-- The "requirements", "input_types" and "citations" macros expanded below are imported
         from macros.xml. NOTE(review): the @VERSION@ and @COMMAND_LINK@ tokens are presumably
         defined there as well; confirm against macros.xml. -->
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="requirements" />
    <command detect_errors="exit_code"><![CDATA[
        ## NOTE(review): $gz and the input_single.* / input_read*.* file names are not defined
        ## in this file; presumably @COMMAND_LINK@ sets them up. Confirm against macros.xml.
        @COMMAND_LINK@
        umi_tools whitelist
            --bc-pattern='$bc_pattern'
            --subset-reads='$subset_reads'
        #if $input_type.type == 'single':
            #if $gz:
                --stdin=input_single.gz
            #else:
                --stdin=input_single.txt
            #end if
        #else:
            ## equally valid for both 'paired' and 'paired_collection'
            #if $gz:
                --stdin=input_read1.gz
                --read2-in=input_read2.gz
            #else:
                --stdin=input_read1.txt
                --read2-in=input_read2.txt
            #end if
            #if $input_type.barcode.barcode_select == "both_reads":
                --bc-pattern2='$input_type.barcode.bc_pattern2'
            #end if
        #end if
        #if $celloptions.use_cell_opts == "advanced":
            ## A value of 0 means "not set": the corresponding option is omitted.
            #if $celloptions.set_cell_number != "0":
                --set-cell-number=$celloptions.set_cell_number
            #end if
            #if $celloptions.expect_cells != "0":
                --expect-cells=$celloptions.expect_cells
            #end if
            --error-correct-threshold=$celloptions.error_correct_thresh
        #end if
            --method=$method
            --plot-prefix=OUT
            $prime3
        #if $log:
            --log='$out_log'
        #end if
            --log2stderr > '$out_whitelist'
        ## Collect the OUT_*.png plots into the HTML report's extra-files directory.
        && mkdir '${ out_html_report.files_path }'
        && cp OUT_*.png '${ out_html_report.files_path }'
        ## The HTML string is kept on a single line so the emitted report stays byte-identical.
        && echo "<html> <head></head><body> <h1>Cell and Count Metrics</h1> <img src=\"OUT_cell_barcode_count_density.png\" ><br /> <img src=\"OUT_cell_barcode_knee.png\" ><br /> <img src=\"OUT_cell_barcode_counts.png\" ><br /> </body></html>" > '$out_html_report'
        && mv OUT_cell_thresholds.tsv '$out_thresh'
    ]]></command>
    <inputs>
        <expand macro="input_types" />
        <param name="bc_pattern" argument="--bc-pattern" type="text" label="Barcode pattern for first read"
               help="Use this option to specify the format of the UMI/barcode. Use Ns to represent the random positions and Xs to indicate the bc positions. Bases with Ns will be extracted and added to the read name. Remaining bases, marked with an X will be reattached to the read.">
        </param>
        <param name="method" argument="--method" type="select" label="Count reads or UMIs"
               help="Many published protocols rank CBs by the number of reads the CBs appear in. However you could also use the number of unique UMIs a CB is associated with. Note that this is still an approximation to the number of transcripts captured because the same UMI could be associated with two different transcripts and be counted as independent.">
            <option value="reads" selected="true" />
            <option value="umis" />
        </param>
        <param argument="--3prime" name="prime3" type="boolean" label="Is barcode on 3' end of the read?"
               truevalue="--3prime" falsevalue=""
               help="By default the barcode is assumed to be on the 5' end of the read, but use this option to specify that it is on the 3' end instead." />
        <param name="subset_reads" argument="--subset-reads" type="integer" min="0" value="0"
               label="Use the first N reads to automatically identify the true cell barcodes." />
        <conditional name="celloptions">
            <param name="use_cell_opts" type="select" label="Cell parameters">
                <option value="defaults" selected="true">Use Defaults</option>
                <option value="advanced">Advanced Options</option>
            </param>
            <when value="defaults"/>
            <when value="advanced">
                <!-- set_cell_number and expect_cells are only passed to umi_tools when non-zero -->
                <param name="set_cell_number" type="integer" min="0" value="0"
                       label="Specify the number of cell barcodes to accept" />
                <param name="expect_cells" type="integer" min="0" value="0"
                       label="Prior expectation on the upper limit on the number of cells sequenced" />
                <param name="error_correct_thresh" type="integer" min="0" value="0"
                       label="Hamming distance for correction of barcodes to whitelist barcodes. Set to zero to generate no error correcting metrics." />
            </when>
        </conditional>
        <param argument="--log" type="boolean" label="Output log?" truevalue="--log" falsevalue=""
               help="Choose if you want to generate a text file containing logging information." />
    </inputs>
    <outputs>
        <data name="out_whitelist" format="tabular" label="${tool.name} on ${on_string}: Whitelist"/>
        <!-- The log dataset is only created when the "Output log?" option is enabled -->
        <data name="out_log" format="txt" label="${tool.name} on ${on_string}: logfile">
            <filter>log</filter>
        </data>
        <data name="out_html_report" format="html" label="${tool.name} on ${on_string}: Webpage" />
        <data name="out_thresh" format="tabular" label="${tool.name} on ${on_string}: TSV Cell Thresholds" />
    </outputs>
    <tests>
        <!-- Single-end input, log disabled: whitelist, thresholds and HTML report -->
        <test expect_num_outputs="3">
            <param name="type" value="single" />
            <param name="input_single" value="t_R2.fastq.gz" ftype="fastq" />
            <param name="bc_pattern" value="CCCCCCCCNNNNNNNN" />
            <param name="method" value="reads" />
            <param name="prime3" value="true" />
            <output name="out_whitelist" file="out_wl_single.txt" lines_diff="40" />
            <output name="out_thresh" file="out_wl_single.tresh.tab" />
            <output name="out_html_report" file="out_wl_single.html" />
        </test>
        <!-- Paired input with advanced cell options and logging enabled -->
        <test expect_num_outputs="4">
            <param name="type" value="paired" />
            <param name="input_read1" value="t_R1.fastq.gz" ftype="fastq" />
            <param name="input_read2" value="t_R2.fastq.gz" ftype="fastq" />
            <param name="barcode_select" value="first_read_only" />
            <param name="bc_pattern" value="CCCNNNNNNNNXXXXX" />
            <param name="bc_pattern2" value="CCCCCCCCNNNNNNNN" />
            <param name="method" value="reads" />
            <param name="prime3" value="false" />
            <param name="use_cell_opts" value="advanced" />
            <param name="expect_cells" value="5" />
            <param name="error_correct_thresh" value="3" />
            <param name="log" value="true" />
            <output name="out_whitelist" file="out_wl_paired.txt" />
            <output name="out_log" file="out_wl_paired.log" lines_diff="40" />
            <output name="out_html_report" file="out_wl_paired.html" />
            <output name="out_thresh" file="out_wl_paired.tresh.tab" />
        </test>
        <test expect_num_outputs="4">
            <!-- As previous, identical outputs but paired collection input -->
            <param name="type" value="paired_collection" />
            <param name="input_readpair">
                <collection type="paired">
                    <element name="forward" ftype="fastq.gz" value="t_R1.fastq.gz" />
                    <element name="reverse" ftype="fastq.gz" value="t_R2.fastq.gz" />
                </collection>
            </param>
            <param name="barcode_select" value="first_read_only" />
            <param name="bc_pattern" value="CCCNNNNNNNNXXXXX" />
            <param name="bc_pattern2" value="CCCCCCCCNNNNNNNN" />
            <param name="method" value="reads" />
            <param name="prime3" value="false" />
            <param name="use_cell_opts" value="advanced" />
            <param name="expect_cells" value="5" />
            <param name="error_correct_thresh" value="3" />
            <param name="log" value="true" />
            <output name="out_whitelist" file="out_wl_paired.txt" />
            <output name="out_log" file="out_wl_paired.log" lines_diff="40" />
            <output name="out_html_report" file="out_wl_paired.html" />
            <output name="out_thresh" file="out_wl_paired.tresh.tab" />
        </test>
    </tests>
    <help><![CDATA[
UMI-tools whitelist - Extract barcodes from fastq
==================================================

Purpose
-------

Extract cell barcodes and identify the most likely true barcodes using the 'knee' method.

Options
-------

--bc-pattern
    This should be used where the barcodes are always in the same place in the read.

    - N = UMI position (required)
    - C = cell barcode position (optional)
    - X = sample position (optional)

    Bases with Ns and Cs will be extracted and added to the read name.
    The corresponding sequence qualities will be removed from the read.
    Bases with an X will be reattached to the read.

    E.g. If the pattern is NNNNCC, Then the read::

        @HISEQ:87:00000000 read1
        AAGGTTGCTGATTGGATGGGCTAG
        DA1AEBFGGCG01DFH00B1FF0B
        +

    will become::

        @HISEQ:87:00000000_TT_AAGG read1
        GCTGATTGGATGGGCTAG
        1AFGGCG01DFH00B1FF0B
        +

    where 'TT' is the cell barcode and 'AAGG' is the UMI.

--set-cell-number
    Use this option to explicitly set the number of cell barcodes which should be accepted.
    Note that the exact number of cell barcodes in the outputted whitelist may be slightly
    less than this if there are multiple cells observed with the same frequency at the
    threshold between accepted and rejected cell barcodes.

--expect-cells=[EXPECTED_CELLS]
    An upper limit estimate for the number of inputted cells. The knee method will now
    select the first threshold (ordered ascendingly) which results in the number of cell
    barcodes accepted being <= EXPECTED_CELLS and > EXPECTED_CELLS * 0.1.

--bc-pattern2
    Use this option to specify the format of the UMI/barcode for the second read pair if
    required. If --bc-pattern2 is not supplied, this defaults to the same pattern as
    --bc-pattern

--3prime
    By default the barcode is assumed to be on the 5' end of the read, but use this option
    to specify that it is on the 3' end instead

Usage:
------

For single ended reads::

    umi_tools whitelist --bc-pattern=[PATTERN] -L extract.log [OPTIONS]

reads from stdin and outputs to stdout.

For paired end reads where the cell barcode is split across the read pairs::

    umi_tools whitelist --bc-pattern=[PATTERN] --bc-pattern2=[PATTERN] --read2-in=[FASTQIN] -L extract.log [OPTIONS]

reads end one from stdin and end two from FASTQIN and outputs to stdout

Output:
-------

The whitelist is outputted as 4 tab-separated columns:

1. whitelisted cell barcode
2. Other cell barcode(s) (comma-separated) to correct to the whitelisted barcode
3. Count for whitelisted cell barcodes
4. Count(s) for the other cell barcode(s) (comma-separated)

example output::

    AAAAAA    AGAAAA           146    1
    AAAATC                     22
    AAACAT                     21
    AAACTA    AAACTN,GAACTA    27     1,1
    AAATAC                     72
    AAATCA    GAATCA           37     3
    AAATGT    AAAGGT,CAATGT    41     1,1
    AAATTG    CAATTG           36     1
    AACAAT                     18
    AACATA                     24

If --error-correct-threshold is set to 0, columns 2 and 4 will be empty.
    ]]></help>
    <expand macro="citations" />
</tool>