view umi-tools_group.xml @ 2:4325a02ea7a3 draft

planemo upload commit 8da5246c32d60a49e6b6b9027c9adc0a31d4bc5a
author iuc
date Sun, 25 Feb 2018 13:07:29 -0500
parents f73f13641bb6
children a24f5b991320
line wrap: on
line source

<tool id="umi_tools_group" name="UMI-tools group" version="@VERSION@.0">
    <!-- One-line summary of the tool: it groups aligned reads by UMI (input is SAM/BAM, not FASTQ). -->
    <description>Group reads based on their UMI</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <!-- samtools is required in addition to the shared requirements because the command runs samtools sort. -->
    <expand macro="requirements">
        <requirement type="package" version="1.6">samtools</requirement>
    </expand>
    <command detect_errors="exit_code"><![CDATA[
        ## SAM input is passed to umi_tools directly. BAM input is symlinked
        ## together with its index so the .bai sits next to the file that is
        ## handed to umi_tools.
        #if $input.is_of_type("sam"):
            #set $input_file = $input
        #else:
            ln -sf '${input}' 'input.bam' &&
            ln -sf '$input.metadata.bam_index' 'input.bam.bai' &&
            #set $input_file = 'input.bam'
        #end if

        umi_tools group
            --random-seed 0
            --extract-umi-method $extract_umi_method
            ## The tag name is only meaningful for tag extraction and the
            ## separator only for read_id extraction. An empty separator is
            ## not forwarded so that umi_tools falls back to its own default.
            #if str($extract_umi_method) == 'tag':
                --umi-tag '$umi_tag'
            #elif str($umi_separator) != '':
                --umi-separator '$umi_separator'
            #end if
            --method $method --edit-distance-threshold $edit_distance_threshold
            $paired $spliced_is_unique --soft-clip-threshold $soft_clip_threshold
            $read_length $whole_contig --subset $subset $per_contig $per_gene
            #if $gene_transcript_map:
                --gene-transcript-map '$gene_transcript_map'
            #end if
            #if len(str($gene_tag)) > 0:
                --gene-tag '$gene_tag'
            #end if
            ## The flatfile is only requested when the group_output checkbox is set.
            #if $group_output:
                --group-out '$group_out'
            #end if
            #if $input.is_of_type("sam"):
                --in-sam
            #end if
            --output-bam
            -I '$input_file' -S grouped.bam &&
            ## Sort the tagged BAM into the final output dataset.
            samtools sort grouped.bam -@ \${GALAXY_SLOTS:-1} -o '$output' -O BAM
    ]]></command>
    <inputs>
        <param name="input" type="data" format="sam,bam" label="Reads to group in SAM or BAM format" />
        <param name="extract_umi_method" argument="--extract-umi-method" type="select" label="How are the UMIs encoded in the read?">
            <option value="read_id" selected="True">Read ID</option>
            <option value="tag">Tag</option>
        </param>
        <param name="group_output" argument="--group-out" type="boolean" truevalue="--group-out" falsevalue="" label="Output a flatfile describing the read groups" />
        <param name="umi_separator" argument="--umi-separator" type="text" value="_" label="Separator between read id and UMI." help="Character separating the read ID from the UMI, e.g. ':' for read names written by bcl2fastq." />
        <param name="umi_tag" argument="--umi-tag" type="text" label="Tag which contains UMI." help="Name of the BAM tag holding the UMI; relevant for the Tag extraction method." />
        <param argument="--method" type="select" label="Method used to identify PCR duplicates within reads." help="All methods start by identifying the reads with the same mapping position">
            <option value="unique">Reads group share the exact same UMI</option>
            <option value="cluster">Identify clusters based on hamming distance</option>
            <option value="directional">Identify clusters based on distance and counts, restrict network expansion by threshold</option>
        </param>
        <param name="edit_distance_threshold" argument="--edit-distance-threshold" type="integer" value="1" label="Edit distance threshold" help="For the adjacency and cluster methods the threshold for the edit distance to connect two UMIs in the network can be increased. The default value of 1 works best unless the UMI is very long (&gt;14bp)" />
        <param argument="--paired" type="boolean" truevalue="--paired" falsevalue="" label="BAM is paired end" help="This will also force the use of the template length to determine reads with the same mapping coordinates." />
        <param name="spliced_is_unique" argument="--spliced-is-unique" type="boolean" truevalue="--spliced-is-unique" falsevalue="" label="Spliced reads are unique" help="Causes two reads that start in the same position on the same strand and having the same UMI to be considered unique if one is spliced and the other is not. (Uses the 'N' cigar operation to test for splicing)" />
        <param name="soft_clip_threshold" argument="--soft-clip-threshold" type="integer" value="4" label="Soft clip threshold" help="Mappers that soft clip, will sometimes do so rather than mapping a spliced read if there is only a small overhang over the exon junction. By setting this option, you can treat reads with at least this many bases soft-clipped at the 3' end as spliced." />
        <param name="read_length" argument="--read-length" type="boolean" truevalue="--read-length" falsevalue="" label="Use the read length as a criterion when deduping" />
        <param name="whole_contig" argument="--whole-contig" type="boolean" truevalue="--whole-contig" falsevalue="" label="Consider all alignments to a single contig together" help="This is useful if you have aligned to a transcriptome multi-fasta" />
        <param argument="--subset" type="float" min="0.0" max="1.0" value="1.0" label="Only consider a random selection of the reads" />
        <!-- NOTE(review): this parameter is never referenced in the command template, so ticking it has no effect.
             Upstream umi_tools appears to expect a contig name for this option rather than a bare flag;
             confirm before wiring it into the command. -->
        <param argument="--chrom" type="boolean" truevalue="--chrom" falsevalue="" label="Only consider a single chromosome" />
        <param name="per_contig" argument="--per-contig" type="boolean" truevalue="--per-contig" falsevalue="" label="Deduplicate per contig" help="Field 3 in BAM; RNAME. All reads with the same contig will be considered to have the same alignment position. This is useful if your library prep generates PCR duplicates with non identical alignment positions such as CEL-Seq. In this case, you would align to a reference transcriptome with one transcript per gene" />
        <param name="per_gene" argument="--per-gene" type="boolean" truevalue="--per-gene" falsevalue="" label="Deduplicate per gene" help="As above except with this option you can align to a reference transcriptome with more than one transcript per gene. You need to also provide a map of genes to transcripts. This will also add a metacontig ('MC') tag to the output BAM file." />
        <param name="gene_transcript_map" argument="--gene-transcript-map" type="data" format="tabular" optional="True" label="Tabular file mapping genes to transcripts" />
        <param name="gene_tag" argument="--gene-tag" type="text" optional="True" label="Deduplicate by this gene tag" help="As --per-gene except here the gene information is encoded in the bam read tag specified so you do not need to supply the mapping file." />
    </inputs>
    <outputs>
        <!-- Sorted BAM carrying the read group tags; always produced. -->
        <data format="bam" name="output" />
        <!-- Flatfile describing the read groups. The filter must reference the group_output
             input parameter; the output's own name is not available in the filter context. -->
        <data format="tabular" name="group_out">
            <filter>group_output</filter>
        </data>
    </outputs>
    <tests>
        <!-- Paired-end input, exact UMI matching, UMI taken from the read ID. -->
        <test>
            <param name="input" value="group_in2.bam" ftype="bam" />
            <param name="extract_umi_method" value="read_id" />
            <param name="paired" value="True" />
            <param name="method" value="unique" />
            <output name="output" file="group_out2.bam" ftype="bam" sort="True" />
        </test>
        <!-- Exact UMI matching with the read group flatfile requested. -->
        <test>
            <param name="input" value="group_in3.bam" ftype="bam" />
            <param name="extract_umi_method" value="read_id" />
            <param name="group_output" value="True" />
            <param name="method" value="unique" />
            <output name="group_out" file="group_out3.tab" />
            <output name="output" file="group_out3.bam" ftype="bam" sort="True" />
        </test>
        <!-- UMI taken from the BX tag. group_output has to be enabled here as well,
             otherwise no flatfile is written for the group_out comparison. -->
        <test>
            <param name="input" value="group_in4.bam" ftype="bam" />
            <param name="extract_umi_method" value="tag" />
            <param name="umi_tag" value="BX" />
            <param name="group_output" value="True" />
            <param name="method" value="unique" />
            <output name="group_out" file="group_out4.tab" />
            <output name="output" file="group_out4.bam" ftype="bam" sort="True" />
        </test>
        <!-- Cluster method, UMI taken from the read ID. -->
        <test>
            <param name="input" value="group_in5.bam" ftype="bam" />
            <param name="extract_umi_method" value="read_id" />
            <param name="umi_tag" value="BX" />
            <param name="method" value="cluster" />
            <output name="output" file="group_out5.bam" ftype="bam" sort="True" />
        </test>
        <!-- Directional method, UMI taken from the read ID. -->
        <test>
            <param name="input" value="group_in6.bam" ftype="bam" />
            <param name="extract_umi_method" value="read_id" />
            <param name="umi_tag" value="BX" />
            <param name="method" value="directional" />
            <output name="output" file="group_out6.bam" ftype="bam" sort="True" />
        </test>
    </tests>
    <help><![CDATA[
umi_tools group - Group reads based on their UMI
================================================

Purpose
-------

The purpose of this command is to identify groups of reads based on
their genomic coordinate and UMI. It is assumed that the FASTQ files
were processed with umi_tools extract before mapping and thus the UMI is
the last word of the read name. e.g:

@HISEQ:87:00000000_AATT

where AATT is the UMI sequence.

If you have used an alternative method which does not separate the
read id and UMI with a "_", such as bcl2fastq which uses ":", you can
specify the separator with the option "--umi-separator=<sep>",
replacing <sep> with e.g ":".

Alternatively, if your UMIs are encoded in a tag, you can specify this
by setting the option --extract-umi-method=tag and set the tag name
with the --umi-tag option. For example, if your UMIs are encoded in
the 'UM' tag, provide the following options:
"--extract-umi-method=tag --umi-tag=UM"

By default, reads are considered identical if they have the same start
coordinate, are on the same strand, and have the same UMI. Optionally,
splicing status can be considered (see below).

The start position of a read is considered to be the start of its alignment
minus any soft clipped bases. A read aligned at position 500 with
cigar 2S98M will be assumed to start at position 498.

Methods
-------

group can be run with multiple methods to identify group of reads with
the same (or similar) UMI(s). All methods start by identifying the
reads with the same mapping position.

The simplest method, "unique", groups reads with the exact same
UMI. The network-based methods, "cluster", "adjacency" and
"directional", build networks where nodes are UMIs and edges connect
UMIs with an edit distance <= threshold (usually 1). The groups of
reads are then defined from the network in a method-specific manner.

Note that the "percentile" method used with the dedup command is not
available with group. This is because this method does not group
similar UMIs as per the network methods. Instead it applies a
threshold for inclusion of the UMI in the output and excluded UMIs are
not assigned to a "true" UMI.

  "unique"
      Reads group share the exact same UMI

  "cluster"
      Identify clusters of connected UMIs (based on hamming distance
      threshold). Each network is a read group

  "directional"
      Identify clusters of connected UMIs (based on hamming distance
      threshold) and umi A counts >= (2* umi B counts) - 1. Each
      network is a read group.

The group command can be used to create two types of outfile: a tagged
BAM or a flatfile describing the read groups

To generate the tagged-BAM file, use the option --output-bam and
provide a filename with the -S option. Alternatively, if you do not
provide a filename, the bam file will be outputted to the stdout. If
you have provided the --log/-L option to send the logging output
elsewhere, you can pipe the output from the group command directly to
e.g samtools sort like so:

``umi_tools group -I inf.bam --group-out=grouped.tsv --output-bam --log=group.log --paired | samtools sort - -o grouped_sorted.bam``

The tagged-BAM file will have two tags per read:

 - UG = Unique_id.
    0-indexed unique id number for each group of reads with the same genomic position and UMI or UMIs inferred to be from the same true UMI + errors

 - BX = Final UMI.
     The inferred true UMI for the group

To generate the flatfile describing the read groups, include the
--group-out=<filename> option. The columns of the read groups file are
below. The first five columns relate to the read. The final 3 columns
relate to the group.

  - read_id
      read identifier

  - contig
      alignment contig

  - position
      Alignment position. Note that this position is not the start position of the read in the BAM file but the start of the read taking into account the read strand and cigar

  - umi
      The read UMI

  - umi_count
      The number of times this UMI is observed for reads at the same position

  - final_umi
      The inferred true UMI for the group

  - final_umi_count
      The total number of reads within the group

  - unique_id
      The unique id for the group


Options
-------

--extract-umi-method (choice)
      How are the UMIs encoded in the read?

      Options are:

      - "read_id" (default)
            UMIs contained at the end of the read separated as
            specified with --umi-separator option

      - "tag"
            UMIs contained in a tag, see --umi-tag option

--umi-separator (string)
      Separator between read id and UMI. See --extract-umi-method above

--umi-tag (string)
      Tag which contains UMI. See --extract-umi-method above

--method (choice, string)
      Method used to identify PCR duplicates within reads. All methods
      start by identifying the reads with the same mapping position

      Options are:

      - "unique"
          Reads group share the exact same UMI

      - "cluster"
          Identify clusters of connected UMIs (based on edit distance
          threshold). Each network is a read group

      - "directional"
          Identify clusters of connected UMIs (based on edit distance
          threshold) and umi A counts >= (2* umi B counts) - 1. Each
          network is a read group.

--edit-distance-threshold (int)
       For the adjacency and cluster methods the threshold for the
       edit distance to connect two UMIs in the network can be
       increased. The default value of 1 works best unless the UMI is
       very long (>14bp)

--paired
       BAM is paired end - output both read pairs. This will also
       force the use of the template length to determine reads with
       the same mapping coordinates.

--spliced-is-unique
       Causes two reads that start in the same position on the same
       strand and having the same UMI to be considered unique if one is
       spliced and the other is not. (Uses the 'N' cigar operation to test
       for splicing)

--soft-clip-threshold (int)
       Mappers that soft clip, will sometimes do so rather than mapping a
       spliced read if there is only a small overhang over the exon
       junction. By setting this option, you can treat reads with at least
       this many bases soft-clipped at the 3' end as spliced.

--multimapping-detection-method (string, choice)
       If the sam/bam contains tags to identify multimapping reads, you can
       specify for use when selecting the best read at a given loci.
       Supported tags are "NH", "X0" and "XT". If not specified, the read
       with the highest mapping quality will be selected

--read-length
      Use the read length as a criterion when deduping, e.g. for sRNA-Seq

--whole-contig
      Consider all alignments to a single contig together. This is useful if
      you have aligned to a transcriptome multi-fasta

--subset (float, [0-1])
      Only consider a fraction of the reads, chosen at random. This is useful
      for doing saturation analyses.

--chrom
      Only consider a single chromosome. This is useful for debugging purposes

--per-contig (string)
      Deduplicate per contig (field 3 in BAM; RNAME).
      All reads with the same contig will be
      considered to have the same alignment position. This is useful
      if your library prep generates PCR duplicates with non identical
      alignment positions such as CEL-Seq. In this case, you would
      align to a reference transcriptome with one transcript per gene

--per-gene (string)
      Deduplicate per gene. As above except with this option you can
      align to a reference transcriptome with more than one transcript
      per gene. You need to also provide --gene-transcript-map option.
      This will also add a metacontig ('MC') tag to the reads if used
      in conjunction with --output-bam

--gene-transcript-map (string)
      File mapping genes to transcripts (tab separated), e.g:

      gene1   transcript1
      gene1   transcript2
      gene2   transcript3

--gene-tag (string)
      Deduplicate per gene. As per --per-gene except here the gene
      information is encoded in the bam read tag specified so you do
      not need to supply --gene-transcript-map

--group-out (string, filename)
      Output a flatfile describing the read groups

--output-bam (string, filename)
      Output a tagged bam file to stdout or -S <filename>

-i, --in-sam/-o, --out-sam
      By default, inputs are assumed to be in BAM format and output are output
      in BAM format. Use these options to specify the use of SAM format for
      inputs or outputs.

-I    (string, filename) input file name
      The input file must be sorted and indexed.

-S    (string, filename) output file name

-L    (string, filename) log file name

Usage
-----
    umi_tools group -I infile.bam --output-bam -S grouped.bam -L group.log --

    ]]></help>
    <expand macro="citations" />
</tool>