view macros.xml @ 17:f3759eec3018 draft default tip

"planemo upload for repository commit 31bad8c5bf75981eafcd19ec6b00593f184fdeb8"
author iuc
date Thu, 18 Nov 2021 08:24:00 +0000
parents 7accf7407811
line wrap: on
line source

<?xml version="1.0"?>

    <!-- macros applying to all umi_tools -->

    <!-- Upstream umi_tools release; substituted into the version of the package requirement below. -->
    <token name="@TOOL_VERSION@">1.1.2</token>
    <!-- Wrapper revision counter; presumably appended to the tool version by the individual tool files (usage not visible here): confirm. -->
    <token name="@VERSION_SUFFIX@">2</token>
    <!-- Presumably the Galaxy profile declared by the individual tool files (usage not visible here): confirm. -->
    <token name="@PROFILE@">21.01</token>
    <!-- Shared package requirements. The yield lets an expanding tool append
         further requirement elements to the same list.
         NOTE(review): the @LINK_SAM_BAM_INPUT@ token calls samtools, which is not
         listed here; confirm that tools using it supply a samtools requirement. -->
    <xml name="requirements">
        <requirements>
            <requirement type="package" version="@TOOL_VERSION@">umi_tools</requirement>
            <yield />
        </requirements>
    </xml>
    <!-- Citations shared by all umi_tools wrappers: the paper DOI plus a BibTeX
         entry for the source repository. -->
    <xml name="citations">
        <citations>
            <citation type="doi">10.1101/gr.209601.116</citation>
            <citation type="bibtex">
              @misc{githubumi_tools,
                title = {UMI-tools},
                publisher = {GitHub},
                journal = {GitHub repository},
                url = {https://github.com/CGATOxford/UMI-tools},
              }
            </citation>
        </citations>
    </xml>
    <!-- Collapsed "Extra parameters" section holding the optional random seed;
         its value is consumed by the @ADVANCED_OPTIONS@ token. -->
    <xml name="advanced_options_macro">
        <section name="advanced" title="Extra parameters" expanded="false">
            <param argument="--random-seed" type="integer" min="0" optional="true" label="Random Seed" />
        </section>
    </xml>
    <token name="@ADVANCED_OPTIONS@"><![CDATA[
        ## Command-line part for the "advanced" section: the seed is optional,
        ## so it is only forwarded when the field was filled in.
        #if str($advanced.random_seed) != ''
            --random-seed $advanced.random_seed
        #end if
    ]]></token>
    <!-- macros for extract and whitelist-->
    <!-- Sanitizer for barcode patterns and regular expressions: letters, digits
         and the punctuation listed below are accepted; invalid_char is empty, so
         any other character is replaced by nothing. -->
    <macro name="barcode_sanitizer" >
        <sanitizer invalid_char="">
            <valid initial="string.letters,string.digits">
                <add value="&#42;" /><!-- asterisk -->
                <add value="&#44;" /><!-- comma -->
                <add value="&#46;" /><!-- period -->
                <add value="&#60;" /><!-- less than -->
                <add value="&#61;" /><!-- equals sign -->
                <add value="&#62;" /><!-- greater than -->
                <add value="&#63;" /><!-- question mark -->
                <add value="&#95;" /><!-- underscore -->
                <add value="&#40;" /><!-- left bracket -->
                <add value="&#41;" /><!-- right bracket -->
                <add value="&#91;"/> <!-- left square bracket -->
                <add value="&#93;"/> <!-- right square bracket -->
                <add value="&#123;"/><!-- left brace -->
                <add value="&#125;"/><!-- right brace -->
                <add value="&#94;"/> <!-- caret -->
                <add value="&#36;"/> <!-- dollar sign-->
                <add value="&#43;" /><!-- plus sign -->
                <add value="-"/>
                <add value="!"/>
            </valid>
        </sanitizer>
    </macro>
    <!-- Sanitizer for tag names: only letters and digits are accepted. -->
    <xml name="sanitize_tag" >
        <sanitizer invalid_char="">
            <valid initial="string.letters,string.digits" />
        </sanitizer>
    </xml>
    <!-- Required barcode pattern for the first read; must not be empty and is
         filtered through barcode_sanitizer. -->
    <macro name="barcode1_macro" >
        <param argument="--bc-pattern" type="text" label="Barcode pattern for first read"
            help="Use this option to specify the format of the UMI/barcode. Use Ns to
                    represent the random positions and Xs to indicate the bc positions.
                    Bases with Ns will be extracted and added to the read name. Remaining
                    bases, marked with an X will be reattached to the read">
            <validator type="empty_field" />
            <expand macro="barcode_sanitizer" />
        </param>
    </macro>
    <!-- Barcode pattern for the second read; defaults to empty and is filtered
         through barcode_sanitizer. -->
    <macro name="barcode2_macro" >
        <param argument="--bc-pattern2" type="text" value="" label="Barcode pattern for second read"
            help="Use this option to specify the format of the UMI/barcode for
                the second read pair if required" >
            <expand macro="barcode_sanitizer" />
        </param>
    </macro>
    <!-- Accepted FASTQ datatypes, used as the format of every read input below.
         They are listed explicitly rather than as plain fastq because that would also allow fastqcsanger. -->
    <token name="@FASTQ_FORMATS@">fastqsanger,fastqsanger.gz,fastqillumina,fastqillumina.gz,fastqsolexa,fastqsolexa.gz</token>
    <!-- Cross-reference to the bio.tools registry entry of umi-tools. -->
    <xml name="bio_tools">
        <xrefs>
            <xref type="bio.tools">umi-tools</xref>
        </xrefs>
    </xml>
    <!-- Read input selection: single-end, two paired datasets, or a paired
         collection. Every branch asks for the first barcode pattern; the paired
         branches additionally ask for the second one. The @COMMAND_LINK@ token
         reads the same parameter names. -->
    <xml name="input_types">
        <conditional name="input_type_cond">
            <param name="input_type" type="select" label="Library type">
                <option value="single">Single-end</option>
                <option value="paired">Paired-end</option>
                <option value="paired_collection">Paired-end Dataset Collection</option>
            </param>
            <when value="single">
                <param name="input_read1" type="data" format="@FASTQ_FORMATS@" label="Reads in FASTQ format" />
                <expand macro="barcode1_macro"/>
            </when>
            <when value="paired">
                <param name="input_read1" type="data" format="@FASTQ_FORMATS@" label="Reads in FASTQ format" />
                <param name="input_read2" type="data" format="@FASTQ_FORMATS@" label="Reads in FASTQ format" />
                <expand macro="barcode1_macro"/>
                <expand macro="barcode2_macro"/>
            </when>
            <when value="paired_collection">
                <param name="input_readpair" type="data_collection" collection_type="paired" format="@FASTQ_FORMATS@" label="Reads in FASTQ format" />
                <expand macro="barcode1_macro"/>
                <expand macro="barcode2_macro"/>
            </when>
        </conditional>
    </xml>
    <token name="@COMMAND_LINK@"><![CDATA[
        ## Symlink the selected reads to fixed names in the working directory.
        ## Compressed inputs get a .gz name and set $gz to True; all other
        ## inputs get a .txt name and leave $gz at False.
        ## For paired data only the first/forward read is inspected and both
        ## mates are assumed to share its compression.
        #set $gz = False
        #if $input_type_cond.input_type == 'single':
            #if $input_type_cond.input_read1.is_of_type("fastq.gz", "fastqsanger.gz"):
                ln -s '$input_type_cond.input_read1' input_single.gz &&
                #set $gz = True
            #else
                ln -s '$input_type_cond.input_read1' input_single.txt &&
            #end if
        #elif $input_type_cond.input_type == 'paired':
            #if $input_type_cond.input_read1.is_of_type("fastq.gz", "fastqsanger.gz"):
                ln -s '$input_type_cond.input_read1' input_read1.gz &&
                ln -s '$input_type_cond.input_read2' input_read2.gz &&
                #set $gz = True
            #else
                ln -s '$input_type_cond.input_read1' input_read1.txt &&
                ln -s '$input_type_cond.input_read2' input_read2.txt &&
            #end if
        #else  ## paired_collection
            #if $input_type_cond.input_readpair.forward.is_of_type("fastq.gz", "fastqsanger.gz"):
                ln -s '$input_type_cond.input_readpair.forward' input_read1.gz &&
                ln -s '$input_type_cond.input_readpair.reverse' input_read2.gz &&
                #set $gz = True
            #else
                ln -s '$input_type_cond.input_readpair.forward' input_read1.txt &&
                ln -s '$input_type_cond.input_readpair.reverse' input_read2.txt &&
            #end if
        #end if
    ]]></token>

    <!-- macros for count, dedup, and group -->

    <token name="@LINK_SAM_BAM_INPUT@"><![CDATA[
        ## Make the alignment available as an indexed 'input.bam' and record
        ## that name in $input_file. SAM input is converted (sorted + indexed);
        ## any other input is symlinked together with its existing index.
        #if $input.is_of_type("sam"):
            ## TODO dedup has problems with SAM input in some cases
            ## so convert it to sorted BAM for now
            ## #set $input_file = $input
            samtools sort --no-PG '$input' > 'input.bam' &&
            samtools index -b 'input.bam' &&
            #set $input_file = 'input.bam'
        #else
            ln -sf '${input}' 'input.bam' &&
            ln -sf '$input.metadata.bam_index' 'input.bam.bai' &&
            #set $input_file = 'input.bam'
        #end if
    ]]></token>
    <token name="@SET_INPUT_TYPE@"><![CDATA[
        ## Currently emits nothing: the --in-sam switch below stays disabled
        ## while SAM input is converted to BAM beforehand.
        ## TODO see comment in LINK_SAM_BAM_INPUT
        ## #if $input.is_of_type("sam"):
        ##     --in-sam
        ## #end if
    ]]></token>

    <!-- Form elements for barcode extraction: the extraction method with its
         method-specific option (3' switch for string, filtered-out switch for
         regex), followed by the read-pair-suffix switch. -->
    <xml name="fastq_barcode_extraction_options_macro">
        <conditional name="extract_method_cond">
            <param argument="--extract-method" type="select" label="Barcode Extraction Method"
                   help="If bracketed expressions are used in the above barcode pattern, then set this to 'regex'. Otherwise leave as 'string'" >
                <option value="string" selected="true">String</option>
                <option value="regex">Regex</option>
            </param>
            <when value="string">
                <param argument="--3prime" name="prime3" type="boolean" label="Is barcode on 3' end of the read?"
                    truevalue="--3prime" falsevalue=""
                    help="By default the barcode is assumed to be on the 5' end of the read, but
                        use this option to specify that it is on the 3' end instead. 
                        This option only works with ``--extract-method=string``
                        since 3' encoding can be specified explicitly with a regex, e.g
                        ``.*(?P&lt;umi_1&gt;.{5})$``" />
            </when>
            <when value="regex">
                <param name="filtered_out_bool" type="boolean" label="Write out reads not matching regex pattern"/>
            </when>
        </conditional>
        <param argument="--ignore-read-pair-suffixes" type="boolean" truevalue="--ignore-read-pair-suffixes" falsevalue="" label="Ignore '\1' and '\2' read name suffixes"/>
    </xml>
        ## fastq barcode extraction options:
        #if $input_type_cond.input_type != 'single' and $input_type_cond.bc_pattern2 != ''
        #end if
        #if $extract_method_cond.extract_method == 'string'
        #else if $extract_method_cond.filtered_out_bool
            #if $input_type_cond.input_type == 'single':
            #else if $input_type_cond.input_type == 'paired':
            #end if
        #end if
        There are two methods enabled to extract the umi barcode (+/-
        cell barcode). For both methods, the patterns should be provided
        using the ``--bc-pattern`` and ``--bc-pattern2`` options.
   - ``string``
         This should be used where the barcodes are always in the same
         place in the read.
         - N = UMI position (required)
         - C = cell barcode position (optional)
         - X = sample position (optional)
         Bases with Ns and Cs will be extracted and added to the read
         name. The corresponding sequence qualities will be removed from
         the read. Bases with an X will be reattached to the read.
         E.g. If the pattern is `NNNNCC`,
         Then the read::
             @HISEQ:87:00000000 read1
         will become::
             @HISEQ:87:00000000_TT_AAGG read1
         where 'TT' is the cell barcode and 'AAGG' is the UMI.
    - ``regex``
         This method allows for more flexible barcode extraction and
         should be used where the cell barcodes are variable in
         length. Alternatively, the regex option can also be used to
         filter out reads which do not contain an expected adapter
         sequence. UMI-tools uses the regex module rather than the more
         standard re module since the former also enables fuzzy matching
         The regex must contain groups to define how the barcodes are
         encoded in the read. The expected groups in the regex are:
         umi_n = UMI positions, where n can be any value (required)
         cell_n = cell barcode positions, where n can be any value (optional)
         discard_n = positions to discard, where n can be any value (optional)
         UMI positions and cell barcode positions will be extracted and
         added to the read name. The corresponding sequence qualities
         will be removed from the read.
         Discard bases and the corresponding quality scores will be
         removed from the read. All bases matched by other groups or
         components of the regex will be reattached to the read sequence
         For example, the following regex can be used to extract reads
         from the Klein et al inDrop data::
         Where only reads with a 3' T-tail and `GAGTGATTGCTTGTGACGCCTT` in
         the correct position to yield two cell barcodes of 8-12 and 8bp
         respectively, and a 6bp UMI will be retained.
         You can also specify fuzzy matching to allow errors. For example if
         the discard group above was specified as below this would enable
         matches with up to 2 errors in the discard_1 group.
         Note that all UMIs must be the same length for downstream
         processing with dedup, group or count commands]]></token> 

    <!-- Form elements describing where the UMI (and cell barcode) of an aligned
         read is stored: in the read id, in tags, or as written by umis. The
         @BARCODE_OPTIONS@ token reads these parameters through $bc. -->
    <xml name="barcode_options_macro">
        <conditional name="bc" >
            <param argument="--extract-umi-method" type="select" label="Umi Extract Method" help="How are the barcodes encoded in the read?" >
                <option value="read_id" selected="true">Barcodes are contained at the end of the read seperated by a delimiter</option>
                <option value="tag" >Barcodes are contained in tags</option>
                <option value="umis" >Barcodes were extracted using umis</option>
            </param>
            <when value="read_id" >
                <param argument="--umi-separator" type="text" label="Delimiter between read id and the UMI" value="_" >
                    <sanitizer invalid_char="" >
                        <valid initial="string.punctuation" />
                    </sanitizer>
                </param>
            </when>
            <when value="tag" >
                <param argument="--umi-tag" type="text" label="Tag which contains the UMI" value="RX" >
                    <expand macro="sanitize_tag" />
                </param>
                <param argument="--umi-tag-split" type="text" label="Separate the UMI in tag by SPLIT" help="and take the first element"/>
                <param argument="--umi-tag-delimiter" type="text" label="Separate the UMI in tag by DELIMITER" help="and concatenate the elements"/>
                <param argument="--cell-tag" type="text" label="Tag which contains the cell barcode" >
                    <expand macro="sanitize_tag" />
                </param>
                <param argument="--cell-tag-split" type="text" label="Separate the cell barcode in tag by SPLIT" help="and take the first element"/>
                <param argument="--cell-tag-delimiter" type="text" label="Separate the cell barcode in tag by DELIMITER" help="and concatenate the elements"/>
            </when>
            <when value="umis"/>
        </conditional>
    </xml>
    <token name="@BARCODE_OPTIONS@"><![CDATA[
        ## Command-line part for the "bc" conditional. The method is always
        ## passed; the separator or tag options follow depending on it, and the
        ## split/delimiter options are only added when they are non-empty.
        --extract-umi-method $bc.extract_umi_method
        #if str($bc.extract_umi_method) == 'read_id':
            --umi-separator '$bc.umi_separator'
        #else if str($bc.extract_umi_method) == 'tag':
            --umi-tag '$bc.umi_tag'
            #if $bc.umi_tag_split != ''
                --umi-tag-split '$bc.umi_tag_split'
            #end if
            #if $bc.umi_tag_delimiter != ''
                --umi-tag-delimiter '$bc.umi_tag_delimiter'
            #end if
            --cell-tag '$bc.cell_tag'
            #if $bc.cell_tag_split != ''
                --cell-tag-split '$bc.cell_tag_split'
            #end if
            #if $bc.cell_tag_delimiter != ''
                --cell-tag-delimiter '$bc.cell_tag_delimiter'
            #end if
        #end if
    ]]></token>
    <token name="@BARCODE_HELP@"><![CDATA[ 
Extracting barcodes
It is assumed that the FASTQ files were processed with ``umi_tools
extract`` before mapping and thus the UMI is the last word of the read
name, e.g.::

    @HISEQ:87:00000000_AATT

where ``AATT`` is the UMI sequence.

If you have used an alternative method which does not separate the
read id and UMI with a "_", such as bcl2fastq which uses ":", you can
specify the separator with the option ``--umi-separator=<sep>``,
replacing <sep> with e.g ":".

Alternatively, if your UMIs are encoded in a tag, you can specify this
by setting the option --extract-umi-method=tag and set the tag name
with the --umi-tag option. For example, if your UMIs are encoded in
the 'UM' tag, provide the following options:
``--extract-umi-method=tag`` ``--umi-tag=UM``

Finally, if you have used umis to extract the UMI +/- cell barcode,
you can specify ``--extract-umi-method=umis``

The start position of a read is considered to be the start of its alignment
minus any soft clipped bases. A read aligned at position 500 with
cigar 2S98M will be assumed to start at position 498.]]></token>

    <!-- Form section for UMI grouping: grouping method, edit-distance and
         soft-clip thresholds, and the spliced/read-length switches. The
         @UMI_GROUPING_OPTIONS@ token reads these parameters through $umi. -->
    <xml name="umi_grouping_options_macro">
        <section name="umi" title="UMI grouping options">
            <param argument="--method" type="select" label="Method used to identify PCR duplicates within reads" help="All methods start by identifying the reads with the same mapping position">
                <option value="unique">Reads group share the exact same UMI</option>
                <option value="percentile">Reads group share the exact same UMI. UMIs with counts less than 1% of the median counts for UMIs at the same position are ignored</option>
                <option value="cluster">Identify clusters based on hamming distance</option>
                <option value="adjacency">Identify clusters based on hamming distance and resolve networks by using the node counts</option>
                <option value="directional">Identify clusters based on distance and counts, restrict network expansion by threshold</option>
            </param>
            <param argument="--edit-distance-threshold" type="integer" value="1" label="Edit distance threshold" help="For the adjacency and cluster methods the threshold for the edit distance to connect two UMIs in the network can be increased. The default value of 1 works best unless the UMI is very long (&gt;14bp)" />
            <param argument="--spliced-is-unique" type="boolean" truevalue="--spliced-is-unique" falsevalue="" label="Spliced reads are unique" help="Causes two reads that start in the same position on the same strand and having the same UMI to be considered unique if one is spliced and the other is not. (Uses the 'N' cigar operation to test for splicing)" />
            <param argument="--soft-clip-threshold" type="integer" value="4" label="Soft clip threshold" help="Mappers that soft clip, will sometimes do so rather than mapping a spliced read if there is only a small overhang over the exon junction. By setting this option, you can treat reads with at least this many bases soft-clipped at the 3' end as spliced" />
            <param argument="--read-length" type="boolean" truevalue="--read-length" falsevalue="" label="Use the read length as as a criterion when deduping" />
        </section>
    </xml>
    <token name="@UMI_GROUPING_OPTIONS@"><![CDATA[
        ## Command-line part for the "umi" section. The two boolean parameters
        ## expand to their flag when checked and to nothing otherwise.
        --method $umi.method
        --edit-distance-threshold $umi.edit_distance_threshold
        $umi.spliced_is_unique
        --soft-clip-threshold $umi.soft_clip_threshold
        $umi.read_length
    ]]></token>
    <token name="@UMI_GROUPING_HELP@"><![CDATA[
UMI grouping options

Grouping Method

What method to use to identify group of reads with the same (or
similar) UMI(s)?

All methods start by identifying the reads with the same mapping position.

The simplest methods, unique and percentile, group reads with
the exact same UMI. The network-based methods, cluster, adjacency and
directional, build networks where nodes are UMIs and edges connect UMIs
with an edit distance <= threshold (usually 1). The groups of reads
are then defined from the network in a method-specific manner. For all
the network-based methods, each read group is equivalent to one read
count for the gene.

- unique
    Reads group share the exact same UMI

- percentile
    Reads group share the exact same UMI. UMIs with counts < 1% of the
    median counts for UMIs at the same position are ignored.

- cluster
    Identify clusters of connected UMIs (based on hamming distance
    threshold). Each network is a read group

- adjacency
    Cluster UMIs as above. For each cluster, select the node (UMI)
    with the highest counts. Visit all nodes one edge away. If all
    nodes have been visited, stop. Otherwise, repeat with remaining
    nodes until all nodes have been visited. Each step
    defines a read group.

- directional (default)
    Identify clusters of connected UMIs (based on hamming distance
    threshold) and umi A counts >= (2* umi B counts) - 1. Each
    network is a read group.]]></token>


    <!-- Form section with the SAM/BAM handling options. The @SAMBAM_OPTIONS@
         token reads these parameters through $sambam. -->
    <xml name="sambam_options_macro">
        <section name="sambam" title="SAM/BAM options">
            <param argument="--mapping-quality" type="integer" value="0" label="Minimum mapping quality for a read to be retained"/>
            <param argument="--unmapped-reads" type="select" label="How to handle unmapped reads">
                <option value="discard">discard</option>
                <option value="use">use</option>
                <option value="correct">correct</option>
            </param>
            <param argument="--chimeric-pairs" type="select" optional="true" label="How to handle chimeric read pairs (default: use)">
                <option value="discard">discard</option>
                <option value="use">use</option>
                <option value="correct">correct</option>
            </param>
            <param argument="--unpaired-reads" type="select" optional="true" label="How to handle unpaired reads (default: use)">
                <option value="discard">discard</option>
                <option value="use">use</option>
                <option value="correct">correct</option>
            </param>
            <param argument="--ignore-umi" type="boolean" truevalue="--ignore-umi" falsevalue="" label="Ignore UMI and dedup only on position"/>
            <param argument="--ignore-tlen" type="boolean" truevalue="--ignore-tlen" falsevalue="" label="Dedup paired end reads based solely on read1" help="whether or not the template length is the same"/>
            <param argument="--chrom" type="text" value="" label="Consider only chromosome" help="If a value is given only a single chromosome with the given name is considered"/>
            <param argument="--subset" type="float" min="0.0" max="1.0" value="1.0" label="Only consider a random selection of the reads" />
            <!--in-sam is set automatically-->
            <param argument="--paired" type="boolean" truevalue="--paired" falsevalue="" label="BAM is paired end" help="This will also force the use of the template length to determine reads with the same mapping coordinates" />
        </section>
    </xml>
    <token name="@SAMBAM_OPTIONS@"><![CDATA[
        ## Command-line part for the "sambam" section. Optional selects and the
        ## chromosome filter are only passed when set; the boolean parameters
        ## expand to their flag when checked and to nothing otherwise.
        --mapping-quality $sambam.mapping_quality
        --unmapped-reads $sambam.unmapped_reads
        #if $sambam.chimeric_pairs
            --chimeric-pairs $sambam.chimeric_pairs
        #end if
        #if $sambam.unpaired_reads
            --unpaired-reads $sambam.unpaired_reads
        #end if
        $sambam.ignore_umi
        $sambam.ignore_tlen
        #if str($sambam.chrom) != ''
            --chrom '$sambam.chrom'
        #end if
        --subset $sambam.subset
        $sambam.paired
    ]]></token>
    <!-- per-gene is hard coded in count,
         hence we need a specialized macro here.
         TODO: count used XF as the default for gene-tag; it is now set explicitly for the tests, but we could just as well parametrize the macro and set tool-specific defaults. -->

    <!-- Single-cell options plus the per-gene switch: expands sc_options_macro
         and passes the extra parameter in as the expansion content. -->
    <xml name="fullsc_options_macro">
        <expand macro="sc_options_macro">
            <param argument="--per-gene" type="boolean" truevalue="--per-gene" falsevalue="" label="Deduplicate per gene"
                help="Must combine with either --gene-tag or --per-contig. As for --per-contig except with this option you can align to a reference transcriptome with more than one transcript per gene. You need to also provide a map of genes to transcripts. This will also add a metacontig ('MC') tag to the output BAM file" />
        </expand>
    </xml>
    <token name="@FULLSC_OPTIONS@"><![CDATA[
        ## Command-line part matching fullsc_options_macro: the shared
        ## single-cell options plus the per-gene switch.
        ## NOTE(review): assumes the per-gene parameter ends up inside the
        ## "sc" section when fullsc_options_macro is expanded - confirm.
        @SC_OPTIONS@
        $sc.per_gene
    ]]></token>
    <!-- Form section with the single-cell RNA-Seq options. The yield receives
         the extra parameter that fullsc_options_macro passes in; it expands to
         nothing when the macro is used without content. The @SC_OPTIONS@ token
         reads these parameters through $sc. -->
    <xml name="sc_options_macro">
        <section name="sc" title="Single-cell RNA-Seq options">
            <yield />
            <param argument="--gene-tag" type="text" optional="true" label="Deduplicate by this gene tag" help="As --per-gene except here the gene information is encoded in the bam read tag specified so you do not need to supply the mapping file">
                <expand macro="sanitize_tag" />
            </param>
            <param argument="--assigned-status-tag" type="text" optional="true" label="Bam tag describing whether read is assigned to a gene" help="By default, this is set as the same tag as --gene-tag">
                <expand macro="sanitize_tag" />
            </param>
            <param argument="--skip-tags-regex" name="skip_tags_regex" type="text" label="Skip any reads where the gene matches this tag" value="" >
                <expand macro="barcode_sanitizer" />
            </param>
            <param argument="--per-contig" type="boolean" truevalue="--per-contig" falsevalue="" label="Deduplicate per contig" help="Field 3 in BAM; RNAME. All reads with the same contig will be considered to have the same alignment position. This is useful if your library prep generates PCR duplicates with non identical alignment positions such as CEL-Seq. In this case, you would align to a reference transcriptome with one transcript per gene" />
            <param argument="--gene-transcript-map" type="data" format="tabular" optional="true" label="Tabular file mapping genes to transripts" />
            <param argument="--per-cell" name="per_cell" type="boolean" truevalue="--per-cell" falsevalue="" label="Group reads only if they have the same cell barcode" />
        </section>
    </xml>
    <token name="@SC_OPTIONS@"><![CDATA[
            ## Command-line part for the "sc" section. Text and dataset
            ## options are only passed when set; the boolean parameters expand
            ## to their flag when checked and to nothing otherwise.
            #if str($sc.gene_tag) != "":
                --gene-tag '$sc.gene_tag'
            #end if
            #if str($sc.assigned_status_tag) != "":
                --assigned-status-tag '$sc.assigned_status_tag'
            #end if
            #if str($sc.skip_tags_regex) != "":
                --skip-tags-regex '$sc.skip_tags_regex'
            #end if
            $sc.per_contig
            #if $sc.gene_transcript_map:
                --gene-transcript-map '$sc.gene_transcript_map'
            #end if
            $sc.per_cell
    ]]></token>

    <!-- Form section with the options shared by group and dedup. The
         @GROUPDEDUP_OPTIONS@ token reads these parameters through $gd. -->
    <xml name="groupdedup_options_macro">
        <section name="gd" title="group/dedup specific options">
            <param argument="--buffer-whole-contig" type="boolean" truevalue="--buffer-whole-contig" falsevalue="" label="Read whole contig before outputting bundles" help="Guarantees that no reads are missed, but increases memory usage" />
            <!-- TODO this option is hidden on the CLI. Should we expose it? -->
            <param argument="--whole-contig" type="boolean" truevalue="--whole-contig" falsevalue="" label="Consider all alignments to a single contig together" help="This is useful if you have aligned to a transcriptome multi-fasta" />
            <param argument="--multimapping-detection-method" type="select" optional="true" label="BAM Tag indicating multimapping " help="Some aligners identify multimapping using bam tags. Setting this option to NH, X0 or XT will use these tags when selecting the best read amongst reads with the same position and umi">
                <option value="NH">NH</option>
                <option value="X0">X0</option>
                <option value="XT">XT</option>
            </param>
        </section>
    </xml>
    <token name="@GROUPDEDUP_OPTIONS@"><![CDATA[
        ## Command-line part for the "gd" section. The boolean parameters
        ## expand to their flag when checked; the optional select is only
        ## passed when a tag was chosen.
        $gd.buffer_whole_contig
        $gd.whole_contig
        #if $gd.multimapping_detection_method
            --multimapping-detection-method $gd.multimapping_detection_method
        #end if
    ]]></token>
    <!-- Checkbox that lets the user request a log file. -->
    <xml name="log_input_macro">
        <param argument="--log" type="boolean" label="Output log?" truevalue="--log" falsevalue="" help="Choose if you want to generate a text file containing logging information" />
    </xml>
    <!-- Log file output, labelled with the tool name and the input description.
         The filter keeps it out of the history unless the log checkbox is set. -->
    <xml name="log_output_macro">
        <data name="out_log" format="txt" label="${tool.name} on ${on_string}: logfile" >
            <filter>log</filter>
        </data>
    </xml>
    <token name="@LOG@"><![CDATA[
        #if $log:
        #end if