Mercurial > repos > iuc > umi_tools_extract
view macros.xml @ 16:7accf7407811 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/umi_tools commit b5b83fd1cf0a44edcd9a6cc7562a3add2f47bd3f"
author | iuc |
---|---|
date | Sat, 23 Oct 2021 20:38:07 +0000 |
parents | 27ac32a22ad2 |
children | f3759eec3018 |
line wrap: on
line source
<?xml version="1.0"?>
<!--
    Shared Galaxy macros for the umi_tools wrappers.

    Conventions used in this file:
      * <xml>/<macro> elements are form fragments pulled in with <expand macro="..."/>.
      * <token> elements are text snippets substituted wherever their name appears;
        the command snippets are Cheetah, the help snippets are reStructuredText.
      * Attribute values are kept on a single line because XML turns every line break
        inside an attribute into a space.
-->
<macros>
    <!-- macros applying to all umi_tools -->
    <token name="@TOOL_VERSION@">1.1.2</token>
    <token name="@VERSION_SUFFIX@">1</token>
    <token name="@PROFILE@">21.01</token>

    <!-- Package requirement pinned to the tool version; extra requirements can be yielded in. -->
    <xml name="requirements">
        <requirements>
            <requirement type="package" version="@TOOL_VERSION@">umi_tools</requirement>
            <yield />
        </requirements>
    </xml>

    <xml name="citations">
        <citations>
            <citation type="doi">10.1101/gr.209601.116</citation>
            <citation type="bibtex">
                @misc{githubUMI-tools,
                    title = {UMI-tools},
                    publisher = {GitHub},
                    journal = {GitHub repository},
                    url = {https://github.com/CGATOxford/UMI-tools},
                }
            </citation>
        </citations>
    </xml>

    <!-- Form section "advanced" and the matching command snippet (random seed only). -->
    <xml name="advanced_options_macro">
        <section name="advanced" title="Extra parameters" expanded="false">
            <param argument="--random-seed" type="integer" min="0" optional="true" label="Random Seed" />
        </section>
    </xml>
    <token name="@ADVANCED_OPTIONS@"><![CDATA[
        ## the seed is optional: an unset integer renders as an empty string
        #if str($advanced.random_seed) != ''
            --random-seed='$advanced.random_seed'
        #end if
    ]]></token>

    <!-- macros for extract and whitelist-->

    <!--
        Sanitizer for barcode patterns and regexes: letters, digits and the listed
        punctuation pass through, every other character is dropped.
        The angle brackets must be written as entities inside attribute values.
    -->
    <macro name="barcode_sanitizer" >
        <sanitizer invalid_char="">
            <valid initial="string.letters,string.digits">
                <add value="*" /><!-- asterisk -->
                <add value="," /><!-- comma -->
                <add value="." /><!-- period -->
                <add value="&lt;" /><!-- less than -->
                <add value="=" /><!-- equals sign -->
                <add value="&gt;" /><!-- greater than -->
                <add value="?" /><!-- question mark -->
                <add value="_" /><!-- underscore -->
                <add value="(" /><!-- left parenthesis -->
                <add value=")" /><!-- right parenthesis -->
                <add value="[" /><!-- left square bracket -->
                <add value="]" /><!-- right square bracket -->
                <add value="{" /><!-- left brace -->
                <add value="}" /><!-- right brace -->
                <add value="^" /><!-- caret -->
                <add value="-" /><!-- hyphen -->
                <add value="!" /><!-- exclamation mark -->
            </valid>
        </sanitizer>
    </macro>

    <!-- Sanitizer for BAM tag names: letters and digits only. -->
    <xml name="sanitize_tag" >
        <sanitizer invalid_char="">
            <valid initial="string.letters,string.digits" />
        </sanitizer>
    </xml>

    <!-- Mandatory barcode pattern for the first (or only) read. -->
    <macro name="barcode1_macro" >
        <param argument="--bc-pattern"
               type="text"
               label="Barcode pattern for first read"
               help="Use this option to specify the format of the UMI/barcode. Use Ns to represent the random positions and Xs to indicate the bc positions. Bases with Ns will be extracted and added to the read name. Remaining bases, marked with an X will be reattached to the read">
            <validator type="empty_field" />
            <expand macro="barcode_sanitizer" />
        </param>
    </macro>

    <!-- Optional barcode pattern for the second read of a pair. -->
    <macro name="barcode2_macro" >
        <param argument="--bc-pattern2"
               type="text"
               value=""
               label="Barcode pattern for second read"
               help="Use this option to specify the format of the UMI/barcode for the second read pair if required" >
            <expand macro="barcode_sanitizer" />
        </param>
    </macro>

    <!-- not just fastq because this would allow also fastqcsanger -->
    <token name="@FASTQ_FORMATS@">fastqsanger,fastqsanger.gz,fastqillumina,fastqillumina.gz,fastqsolexa,fastqsolexa.gz</token>

    <xml name="bio_tools">
        <xrefs>
            <xref type="bio.tools">umi-tools</xref>
        </xrefs>
    </xml>

    <!--
        Input selection: single-end, paired-end datasets, or a paired collection.
        The two paired branches yield so that the calling tool can add parameters.
    -->
    <xml name="input_types">
        <conditional name="input_type_cond">
            <param name="input_type" type="select" label="Library type">
                <option value="single">Single-end</option>
                <option value="paired">Paired-end</option>
                <option value="paired_collection">Paired-end Dataset Collection</option>
            </param>
            <when value="single">
                <param name="input_read1" type="data" format="@FASTQ_FORMATS@" label="Reads in FASTQ format" />
                <expand macro="barcode1_macro"/>
            </when>
            <when value="paired">
                <param name="input_read1" type="data" format="@FASTQ_FORMATS@" label="Reads in FASTQ format" />
                <param name="input_read2" type="data" format="@FASTQ_FORMATS@" label="Reads in FASTQ format" />
                <expand macro="barcode1_macro"/>
                <expand macro="barcode2_macro"/>
                <yield/>
            </when>
            <when value="paired_collection">
                <param name="input_readpair" type="data_collection" collection_type="paired" format="@FASTQ_FORMATS@" label="Reads in FASTQ format" />
                <expand macro="barcode1_macro"/>
                <expand macro="barcode2_macro"/>
                <yield/>
            </when>
        </conditional>
    </xml>

    <token name="@COMMAND_LINK@"><![CDATA[
        ## Symlink the inputs under fixed names (.gz or .txt) and record in gz
        ## whether they are compressed; the compression of read 1 decides for the pair.
        #set $gz = False
        #if $input_type_cond.input_type == 'single':
            #if $input_type_cond.input_read1.is_of_type("fastq.gz", "fastqsanger.gz"):
                ln -s '$input_type_cond.input_read1' input_single.gz &&
                #set $gz = True
            #else
                ln -s '$input_type_cond.input_read1' input_single.txt &&
            #end if
        #elif $input_type_cond.input_type == 'paired':
            #if $input_type_cond.input_read1.is_of_type("fastq.gz", "fastqsanger.gz"):
                ln -s '$input_type_cond.input_read1' input_read1.gz &&
                ln -s '$input_type_cond.input_read2' input_read2.gz &&
                #set $gz = True
            #else
                ln -s '$input_type_cond.input_read1' input_read1.txt &&
                ln -s '$input_type_cond.input_read2' input_read2.txt &&
            #end if
        #else
            ## paired_collection
            #if $input_type_cond.input_readpair.forward.is_of_type("fastq.gz", "fastqsanger.gz"):
                ln -s '$input_type_cond.input_readpair.forward' input_read1.gz &&
                ln -s '$input_type_cond.input_readpair.reverse' input_read2.gz &&
                #set $gz = True
            #else
                ln -s '$input_type_cond.input_readpair.forward' input_read1.txt &&
                ln -s '$input_type_cond.input_readpair.reverse' input_read2.txt &&
            #end if
        #end if
    ]]></token>

    <!-- macros for count, dedup, and group -->

    <token name="@LINK_SAM_BAM_INPUT@"><![CDATA[
        ## Both branches leave the data at 'input.bam' with an index next to it.
        #if $input.is_of_type("sam"):
            ## TODO dedup has problems with SAM input in some cases
            ## https://github.com/CGATOxford/UMI-tools/issues/483
            ## so convert it to sorted BAM for now
            ## #set $input_file = $input
            samtools sort --no-PG '$input' > 'input.bam' &&
            samtools index -b 'input.bam' &&
            #set $input_file = 'input.bam'
        #else:
            ln -sf '${input}' 'input.bam' &&
            ln -sf '$input.metadata.bam_index' 'input.bam.bai' &&
            #set $input_file = 'input.bam'
        #end if
    ]]></token>

    <token name="@SET_INPUT_TYPE@"><![CDATA[
        ## TODO see comment in LINK_SAM_BAM_INPUT
        ## #if $input.is_of_type("sam"):
        ##     --in-sam
        ## #end if
    ]]></token>

    <!-- Extraction method (string or regex) plus the read-name suffix switch. -->
    <xml name="fastq_barcode_extraction_options_macro">
        <conditional name="extract_method_cond">
            <param argument="--extract-method"
                   type="select"
                   label="Barcode Extraction Method"
                   help="If bracketed expressions are used in the above barcode pattern, then set this to 'regex'. Otherwise leave as 'string'" >
                <option value="string" selected="true" />
                <option value="regex" />
            </param>
            <when value="string">
                <param argument="--3prime"
                       name="prime3"
                       type="boolean"
                       label="Is barcode on 3' end of the read?"
                       truevalue="--3prime"
                       falsevalue=""
                       help="By default the barcode is assumed to be on the 5' end of the read, but use this option to specify that it is on the 3' end instead. This option only works with ``--extract-method=string`` since 3' encoding can be specified explicitly with a regex, e.g ``.*(?P&lt;umi_1&gt;.{5})$``" />
            </when>
            <when value="regex">
                <param name="filtered_out_bool" type="boolean" label="Write out reads not matching regex pattern"/>
            </when>
        </conditional>
        <param argument="--ignore-read-pair-suffixes"
               type="boolean"
               truevalue="--ignore-read-pair-suffixes"
               falsevalue=""
               label="Ignore '/1' and '/2' read name suffixes"/>
    </xml>
    <token name="@FASTQ_BARCODE_EXTRACTION_OPTIONS@"><![CDATA[
        ## fastq barcode extraction options:
        --extract-method='$extract_method_cond.extract_method'
        --bc-pattern='$input_type_cond.bc_pattern'
        ## bc_pattern2 only exists in the two paired branches of input_type_cond
        #if $input_type_cond.input_type != 'single' and $input_type_cond.bc_pattern2 != ''
            --bc-pattern2='$input_type_cond.bc_pattern2'
        #end if
        #if $extract_method_cond.extract_method == 'string'
            $extract_method_cond.prime3
        #else if $extract_method_cond.filtered_out_bool
            ## the filtered_out* outputs are declared by the tool using this token
            #if $input_type_cond.input_type == 'single':
                --filtered-out='$filtered_out'
            #else if $input_type_cond.input_type == 'paired':
                --filtered-out='$filtered_out'
                --filtered-out2='$filtered_out_paired'
            #else
                --filtered-out='$filtered_out_paired_collection.forward'
                --filtered-out2='$filtered_out_paired_collection.reverse'
            #end if
        #end if
        $ignore_read_pair_suffixes
    ]]></token>
    <token name="@FASTQ_BARCODE_EXTRACTION_HELP@"><![CDATA[
There are two methods enabled to extract the umi barcode (+/- cell
barcode). For both methods, the patterns should be provided using the
``--bc-pattern`` and ``--bc-pattern2`` options.

- ``string``

  This should be used where the barcodes are always in the same
  place in the read.

  - N = UMI position (required)
  - C = cell barcode position (optional)
  - X = sample position (optional)

  Bases with Ns and Cs will be extracted and added to the read
  name. The corresponding sequence qualities will be removed from the
  read. Bases with an X will be reattached to the read.

  E.g. If the pattern is `NNNNCC`,
  Then the read::

      @HISEQ:87:00000000 read1
      AAGGTTGCTGATTGGATGGGCTAG
      +
      DA1AEBFGGCG01DFH00B1FF0B

  will become::

      @HISEQ:87:00000000_TT_AAGG read1
      GCTGATTGGATGGGCTAG
      +
      FGGCG01DFH00B1FF0B

  where 'TT' is the cell barcode and 'AAGG' is the UMI.

- ``regex``

  This method allows for more flexible barcode extraction and
  should be used where the cell barcodes are variable in
  length. Alternatively, the regex option can also be used to
  filter out reads which do not contain an expected adapter
  sequence. UMI-tools uses the regex module rather than the more
  standard re module since the former also enables fuzzy matching.

  The regex must contain groups to define how the barcodes are
  encoded in the read. The expected groups in the regex are:

  - umi_n = UMI positions, where n can be any value (required)
  - cell_n = cell barcode positions, where n can be any value (optional)
  - discard_n = positions to discard, where n can be any value (optional)

  UMI positions and cell barcode positions will be extracted and
  added to the read name. The corresponding sequence qualities will
  be removed from the read. Discard bases and the corresponding
  quality scores will be removed from the read. All bases matched by
  other groups or components of the regex will be reattached to the
  read sequence.

  For example, the following regex can be used to extract reads from
  the Klein et al inDrop data::

      (?P<cell_1>.{8,12})(?P<discard_1>GAGTGATTGCTTGTGACGCCTT)(?P<cell_2>.{8})(?P<umi_1>.{6})T{3}.*

  Where only reads with a 3' T-tail and `GAGTGATTGCTTGTGACGCCTT` in
  the correct position to yield two cell barcodes of 8-12 and 8bp
  respectively, and a 6bp UMI will be retained.

  You can also specify fuzzy matching to allow errors. For example if
  the discard group above was specified as below this would enable
  matches with up to 2 errors in the discard_1 group.

  ::

      (?P<discard_1>GAGTGATTGCTTGTGACGCCTT){s<=2}

  Note that all UMIs must be the same length for downstream
  processing with dedup, group or count commands
]]></token>

    <!-- Where the UMI (and cell barcode) is stored for already-mapped reads. -->
    <xml name="barcode_options_macro">
        <conditional name="bc" >
            <param argument="--extract-umi-method"
                   type="select"
                   label="Umi Extract Method"
                   help="How are the barcodes encoded in the read?" >
                <option value="read_id" selected="true">Barcodes are contained at the end of the read separated by a delimiter</option>
                <option value="tag" >Barcodes are contained in tags</option>
                <option value="umis" >Barcodes were extracted using umis</option>
            </param>
            <when value="read_id" >
                <param argument="--umi-separator" type="text" label="Delimiter between read id and the UMI" value="_" >
                    <sanitizer invalid_char="" >
                        <!-- The value is single-quoted on the command line, so an apostrophe must not get through. -->
                        <valid initial="string.punctuation">
                            <remove value="&apos;" />
                        </valid>
                    </sanitizer>
                </param>
            </when>
            <when value="tag" >
                <param argument="--umi-tag" type="text" label="Tag which contains the UMI" value="RX" >
                    <expand macro="sanitize_tag" />
                </param>
                <param argument="--umi-tag-split" type="text" label="Separate the UMI in tag by SPLIT" help="and take the first element"/>
                <param argument="--umi-tag-delimiter" type="text" label="Separate the UMI in tag by DELIMITER" help="and concatenate the elements"/>
                <param argument="--cell-tag" type="text" label="Tag which contains the cell barcode" >
                    <expand macro="sanitize_tag" />
                </param>
                <param argument="--cell-tag-split" type="text" label="Separate the cell barcode in tag by SPLIT" help="and take the first element"/>
                <param argument="--cell-tag-delimiter" type="text" label="Separate the cell barcode in tag by DELIMITER" help="and concatenate the elements"/>
            </when>
            <when value="umis"/>
        </conditional>
    </xml>
    <token name="@BARCODE_OPTIONS@"><![CDATA[
        --extract-umi-method $bc.extract_umi_method
        #if str($bc.extract_umi_method) == 'read_id':
            --umi-separator '$bc.umi_separator'
        #else if str($bc.extract_umi_method) == 'tag':
            --umi-tag '$bc.umi_tag'
            #if $bc.umi_tag_split != ''
                --umi-tag-split '$bc.umi_tag_split'
            #end if
            #if $bc.umi_tag_delimiter != ''
                --umi-tag-delimiter '$bc.umi_tag_delimiter'
            #end if
            ## only pass a cell tag when one was entered, like the other optional tag options
            #if str($bc.cell_tag) != ''
                --cell-tag '$bc.cell_tag'
            #end if
            #if $bc.cell_tag_split != ''
                --cell-tag-split '$bc.cell_tag_split'
            #end if
            #if $bc.cell_tag_delimiter != ''
                --cell-tag-delimiter '$bc.cell_tag_delimiter'
            #end if
        #end if
    ]]></token>
    <token name="@BARCODE_HELP@"><![CDATA[
Extracting barcodes
-------------------

It is assumed that the FASTQ files were processed with ``umi_tools extract``
before mapping and thus the UMI is the last word of the read name. e.g:

    @HISEQ:87:00000000_AATT

where ``AATT`` is the UMI sequence.

If you have used an alternative method which does not separate the
read id and UMI with a "_", such as bcl2fastq which uses ":", you can
specify the separator with the option ``--umi-separator=<sep>``,
replacing <sep> with e.g ":".

Alternatively, if your UMIs are encoded in a tag, you can specify this
by setting the option --extract-umi-method=tag and set the tag name
with the --umi-tag option. For example, if your UMIs are encoded in
the 'UM' tag, provide the following options:
``--extract-umi-method=tag`` ``--umi-tag=UM``

Finally, if you have used umis to extract the UMI +/- cell barcode,
you can specify ``--extract-umi-method=umis``

The start position of a read is considered to be the start of its alignment
minus any soft clipped bases. A read aligned at position 500 with
cigar 2S98M will be assumed to start at position 498.
]]></token>

    <!--
        UMI grouping parameters.
        NOTE(review): no option of the method select is marked as selected, so the form
        presumably preselects the first entry (unique), while the help text calls
        directional the default. Confirm which default is intended before changing it.
    -->
    <xml name="umi_grouping_options_macro">
        <section name="umi" title="UMI grouping options">
            <param argument="--method"
                   type="select"
                   label="Method used to identify PCR duplicates within reads"
                   help="All methods start by identifying the reads with the same mapping position">
                <option value="unique">Reads group share the exact same UMI</option>
                <option value="percentile">Reads group share the exact same UMI. UMIs with counts less than 1% of the median counts for UMIs at the same position are ignored</option>
                <option value="cluster">Identify clusters based on hamming distance</option>
                <option value="adjacency">Identify clusters based on hamming distance and resolve networks by using the node counts</option>
                <option value="directional">Identify clusters based on distance and counts, restrict network expansion by threshold</option>
            </param>
            <param argument="--edit-distance-threshold"
                   type="integer"
                   value="1"
                   label="Edit distance threshold"
                   help="For the adjacency and cluster methods the threshold for the edit distance to connect two UMIs in the network can be increased. The default value of 1 works best unless the UMI is very long (&gt;14bp)" />
            <param argument="--spliced-is-unique"
                   type="boolean"
                   truevalue="--spliced-is-unique"
                   falsevalue=""
                   label="Spliced reads are unique"
                   help="Causes two reads that start in the same position on the same strand and having the same UMI to be considered unique if one is spliced and the other is not. (Uses the 'N' cigar operation to test for splicing)" />
            <param argument="--soft-clip-threshold"
                   type="integer"
                   value="4"
                   label="Soft clip threshold"
                   help="Mappers that soft clip, will sometimes do so rather than mapping a spliced read if there is only a small overhang over the exon junction. By setting this option, you can treat reads with at least this many bases soft-clipped at the 3' end as spliced" />
            <param argument="--read-length"
                   type="boolean"
                   truevalue="--read-length"
                   falsevalue=""
                   label="Use the read length as a criterion when deduping" />
        </section>
    </xml>
    <token name="@UMI_GROUPING_OPTIONS@"><![CDATA[
        --method $umi.method
        --edit-distance-threshold $umi.edit_distance_threshold
        $umi.spliced_is_unique
        --soft-clip-threshold $umi.soft_clip_threshold
        $umi.read_length
    ]]></token>
    <token name="@UMI_GROUPING_HELP@"><![CDATA[
UMI grouping options
--------------------

Grouping Method
...............

What method to use to identify group of reads with the same (or
similar) UMI(s)?

All methods start by identifying the reads with the same mapping position.

The simplest methods, unique and percentile, group reads with
the exact same UMI. The network-based methods, cluster, adjacency and
directional, build networks where nodes are UMIs and edges connect UMIs
with an edit distance <= threshold (usually 1). The groups of reads
are then defined from the network in a method-specific manner. For all
the network-based methods, each read group is equivalent to one read
count for the gene.

- unique
      Reads group share the exact same UMI

- percentile
      Reads group share the exact same UMI. UMIs with counts < 1% of the
      median counts for UMIs at the same position are ignored.

- cluster
      Identify clusters of connected UMIs (based on hamming distance
      threshold). Each network is a read group

- adjacency
      Cluster UMIs as above. For each cluster, select the node (UMI)
      with the highest counts. Visit all nodes one edge away. If all
      nodes have been visited, stop. Otherwise, repeat with remaining
      nodes until all nodes have been visited. Each step
      defines a read group.

- directional (default)
      Identify clusters of connected UMIs (based on hamming distance
      threshold) and umi A counts >= (2* umi B counts) - 1. Each
      network is a read group.
]]></token>

    <!--
        SAM/BAM handling parameters.
        The three read-handling selects offer discard, use and output (the last value
        was previously "correct"); verify these against the umi_tools option list.
    -->
    <xml name="sambam_options_macro">
        <section name="sambam" title="SAM/BAM options">
            <param argument="--mapping-quality" type="integer" value="0" label="Minimum mapping quality for a read to be retained"/>
            <param argument="--unmapped-reads" type="select" label="How to handle unmapped reads">
                <option value="discard">discard</option>
                <option value="use">use</option>
                <option value="output">output</option>
            </param>
            <param argument="--chimeric-pairs" type="select" optional="true" label="How to handle chimeric read pairs (default: use)">
                <option value="discard">discard</option>
                <option value="use">use</option>
                <option value="output">output</option>
            </param>
            <param argument="--unpaired-reads" type="select" optional="true" label="How to handle unpaired reads (default: use)">
                <option value="discard">discard</option>
                <option value="use">use</option>
                <option value="output">output</option>
            </param>
            <param argument="--ignore-umi" type="boolean" truevalue="--ignore-umi" falsevalue="" label="Ignore UMI and dedup only on position"/>
            <param argument="--ignore-tlen" type="boolean" truevalue="--ignore-tlen" falsevalue="" label="Dedup paired end reads based solely on read1" help="whether or not the template length is the same"/>
            <param argument="--chrom" type="text" value="" label="Consider only chromosome" help="If a value is given only a single chromosome with the given name is considered"/>
            <param argument="--subset" type="float" min="0.0" max="1.0" value="1.0" label="Only consider a random selection of the reads" />
            <!--in-sam is set automatically-->
            <param argument="--paired" type="boolean" truevalue="--paired" falsevalue="" label="BAM is paired end" help="This will also force the use of the template length to determine reads with the same mapping coordinates" />
        </section>
    </xml>
    <token name="@SAMBAM_OPTIONS@"><![CDATA[
        --mapping-quality $sambam.mapping_quality
        --unmapped-reads $sambam.unmapped_reads
        ## optional selects: only emitted when a value was chosen
        #if $sambam.chimeric_pairs
            --chimeric-pairs $sambam.chimeric_pairs
        #end if
        #if $sambam.unpaired_reads
            --unpaired-reads $sambam.unpaired_reads
        #end if
        $sambam.ignore_umi
        $sambam.ignore_tlen
        #if str($sambam.chrom) != ''
            --chrom '$sambam.chrom'
        #end if
        --subset $sambam.subset
        $sambam.paired
        @SET_INPUT_TYPE@
    ]]></token>

    <!-- per-gene is hard coded in count
         https://github.com/CGATOxford/UMI-tools/blob/c3ead0792ad590822ca72239ef01b8e559802da9/umi_tools/count.py#L92
         hence we need a specialized macro here

         TODO count used XF as default for gene-tag now I set it explicitly for the tests
              but we could as well parametrize the macro and set tool specific defaults
    -->
    <xml name="fullsc_options_macro">
        <expand macro="sc_options_macro">
            <param argument="--per-gene"
                   type="boolean"
                   truevalue="--per-gene"
                   falsevalue=""
                   label="Deduplicate per gene"
                   help="Must combine with either --gene-tag or --per-contig. As for --per-contig except with this option you can align to a reference transcriptome with more than one transcript per gene. You need to also provide a map of genes to transcripts. This will also add a metacontig ('MC') tag to the output BAM file" />
        </expand>
    </xml>
    <token name="@FULLSC_OPTIONS@"><![CDATA[
        $sc.per_gene
        @SC_OPTIONS@
    ]]></token>

    <!-- Single-cell section "sc"; the yield at the top is where the per-gene switch is injected. -->
    <xml name="sc_options_macro">
        <section name="sc" title="Single-cell RNA-Seq options">
            <yield/>
            <param argument="--gene-tag"
                   type="text"
                   optional="true"
                   label="Deduplicate by this gene tag"
                   help="As --per-gene except here the gene information is encoded in the bam read tag specified so you do not need to supply the mapping file">
                <expand macro="sanitize_tag" />
            </param>
            <param argument="--assigned-status-tag"
                   type="text"
                   optional="true"
                   label="Bam tag describing whether read is assigned to a gene"
                   help="By default, this is set as the same tag as --gene-tag">
                <expand macro="sanitize_tag" />
            </param>
            <param argument="--skip-tags-regex" name="skip_tags_regex" type="text" label="Skip any reads where the gene matches this tag" value="" >
                <expand macro="barcode_sanitizer" />
            </param>
            <param argument="--per-contig"
                   type="boolean"
                   truevalue="--per-contig"
                   falsevalue=""
                   label="Deduplicate per contig"
                   help="Field 3 in BAM; RNAME. All reads with the same contig will be considered to have the same alignment position. This is useful if your library prep generates PCR duplicates with non identical alignment positions such as CEL-Seq. In this case, you would align to a reference transcriptome with one transcript per gene" />
            <param argument="--gene-transcript-map" type="data" format="tabular" optional="true" label="Tabular file mapping genes to transcripts" />
            <param argument="--per-cell" name="per_cell" type="boolean" truevalue="--per-cell" falsevalue="" label="Group reads only if they have the same cell barcode" />
        </section>
    </xml>
    <token name="@SC_OPTIONS@"><![CDATA[
        #if str($sc.gene_tag) != "":
            --gene-tag '$sc.gene_tag'
        #end if
        #if str($sc.assigned_status_tag) != "":
            --assigned-status-tag '$sc.assigned_status_tag'
        #end if
        #if str($sc.skip_tags_regex) != "":
            --skip-tags-regex '$sc.skip_tags_regex'
        #end if
        $sc.per_contig
        #if $sc.gene_transcript_map:
            --gene-transcript-map '$sc.gene_transcript_map'
        #end if
        $sc.per_cell
    ]]></token>

    <!-- Options shared by the group and dedup tools. -->
    <xml name="groupdedup_options_macro">
        <section name="gd" title="group/dedup specific options">
            <param argument="--buffer-whole-contig"
                   type="boolean"
                   truevalue="--buffer-whole-contig"
                   falsevalue=""
                   label="Read whole contig before outputting bundles"
                   help="Guarantees that no reads are missed, but increases memory usage" />
            <!-- TODO this option is hidden on the CLI. Should we expose it? -->
            <param argument="--whole-contig"
                   type="boolean"
                   truevalue="--whole-contig"
                   falsevalue=""
                   label="Consider all alignments to a single contig together"
                   help="This is useful if you have aligned to a transcriptome multi-fasta" />
            <param argument="--multimapping-detection-method"
                   type="select"
                   optional="true"
                   label="BAM Tag indicating multimapping"
                   help="Some aligners identify multimapping using bam tags. Setting this option to NH, X0 or XT will use these tags when selecting the best read amongst reads with the same position and umi">
                <option value="NH">NH</option>
                <option value="X0">X0</option>
                <option value="XT">XT</option>
            </param>
        </section>
    </xml>
    <token name="@GROUPDEDUP_OPTIONS@"><![CDATA[
        $gd.buffer_whole_contig
        $gd.whole_contig
        ## The select only carries the tag name, so the flag has to be written here,
        ## and nothing may be emitted when no tag was chosen.
        #if $gd.multimapping_detection_method
            --multimapping-detection-method $gd.multimapping_detection_method
        #end if
    ]]></token>

    <!-- Optional log file: form switch, filtered output dataset, and command snippet. -->
    <xml name="log_input_macro">
        <param argument="--log"
               type="boolean"
               label="Output log?"
               truevalue="--log"
               falsevalue=""
               help="Choose if you want to generate a text file containing logging information" />
    </xml>
    <xml name="log_output_macro">
        <data name="out_log" format="txt" label="${tool.name} on ${on_string}: logfile" >
            <filter>log</filter>
        </data>
    </xml>
    <token name="@LOG@"><![CDATA[
        #if $log:
            --log='$out_log'
        #end if
        --log2stderr
    ]]></token>
</macros>