diff umi-tools_dedup.xml @ 12:4098ab380097 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/umi_tools commit bf6a3aa532e8f9d122da4c1e39f3e256ae587b79"
author iuc
date Mon, 13 Sep 2021 14:51:31 +0000
parents 7fa28eb10fed
children
line wrap: on
line diff
--- a/umi-tools_dedup.xml	Wed Feb 10 19:30:35 2021 +0000
+++ b/umi-tools_dedup.xml	Mon Sep 13 14:51:31 2021 +0000
@@ -1,128 +1,157 @@
-<tool id="umi_tools_dedup" name="UMI-tools deduplicate" version="@VERSION@+galaxy1">
+<tool id="umi_tools_dedup" name="UMI-tools deduplicate" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
     <description>Extract UMI from fastq files</description>
+    <expand macro="bio_tools"/>
     <macros>
         <import>macros.xml</import>
     </macros>
     <expand macro="requirements">
-        <requirement type="package" version="1.9">samtools</requirement>
+        <requirement type="package" version="1.12">samtools</requirement>
     </expand>
     <command detect_errors="exit_code"><![CDATA[
-        #if $input.is_of_type("sam"):
-            #set $input_file = $input
-        #else:
-            ln -sf '${input}' 'input.bam' &&
-            ln -sf '$input.metadata.bam_index' 'input.bam.bai' &&
-            #set $input_file = 'input.bam'
-        #end if
+        @LINK_SAM_BAM_INPUT@
 
+        echo $input.ext &&
         umi_tools dedup
-            '$output_stats_bool'
-            --random-seed 0
-            --extract-umi-method $extract_umi_method
-            #if str($extract_umi_method) != 'read_id':
-                --umi-separator '$umi_separator' --umi-tag '$umi_tag'
+            #if $output_stats_bool
+                --output-stats=stats_outputs
             #end if
-            --method $method --edit-distance-threshold $edit_distance_threshold
-            $paired $spliced_is_unique --soft-clip-threshold $soft_clip_threshold
-            $read_length $whole_contig --subset $subset $per_contig $per_gene
-            #if $gene_transcript_map:
-                --gene-transcript-map '$gene_transcript_map'
-            #end if
-            #if len(str($gene_tag)) > 0:
-                --gene-tag '$gene_tag'
-            #end if
-            #if $input.is_of_type("sam"):
-                --in-sam
-            #end if
-            -I '$input_file' -S deduped.bam &&
-            samtools sort deduped.bam -@ \${GALAXY_SLOTS:-1} -T "\${TMPDIR:-.}" -o '$output' -O BAM
+            @GROUPDEDUP_OPTIONS@
+            @BARCODE_OPTIONS@
+            @UMI_GROUPING_OPTIONS@
+            @SAMBAM_OPTIONS@
+            @FULLSC_OPTIONS@
+            @ADVANCED_OPTIONS@
+            -I '$input_file' -S deduped.bam
+            ## TODO using samtools sort is a workaround, for the following error that appears when Galaxy
+            ## compares the generated file with the one in test-data
+            ## `Converting history BAM to SAM failed: 'samtools returned with error 1: stdout=None, stderr=[main_samview] fail to read the header from "/tmp/tmpd8o61jykdedup_out6.bam".\n'. Will compare BAM files`
+            ## problem seems to be the BAM file generated with pysam
+            ## may be dropped in the future
+            --no-sort-output
+            @LOG@
+            && samtools sort --no-PG deduped.bam -@ \${GALAXY_SLOTS:-1} -T "\${TMPDIR:-.}" -o '$output' -O BAM
+
     ]]></command>
     <inputs>
         <param name="input" type="data" format="sam,bam" label="Reads to deduplicate in SAM or BAM format" />
-        <param name="extract_umi_method" argument="--extract-umi-method" type="select">
-            <option value="read_id" selected="True">Read ID</option>
-            <option value="tag">Tag</option>
-        </param>
-        <param name="umi_separator" argument="--umi-separator" type="text" label="Separator between read id and UMI." help="Ignored unless extracting by tag" />
-        <param name="umi_tag" argument="--umi-tag" type="text" label="Tag which contains UMI." />
-        <param argument="--method" type="select" label="Method used to identify PCR duplicates within reads." help="All methods start by identifying the reads with the same mapping position">
-            <option value="unique">Reads group share the exact same UMI</option>
-            <option value="percentile">Reads group share the exact same UMI. UMIs with counts less than 1% of the median counts for UMIs at the same position are ignored</option>
-            <option value="cluster">Identify clusters based on hamming distance</option>
-            <option value="adjacency">Identify clusters based on hamming distance and resolve networks by using the node counts</option>
-            <option value="directional">Identify clusters based on distance and counts, restrict network expansion by threshold</option>
-        </param>
-        <param name="edit_distance_threshold" argument="--edit-distance-threshold" type="integer" value="1" label="Edit distance threshold" help="For the adjacency and cluster methods the threshold for the edit distance to connect two UMIs in the network can be increased. The default value of 1 works best unless the UMI is very long (&gt;14bp)" />
-        <param argument="--paired" type="boolean" truevalue="--paired" falsevalue="" label="BAM is paired end" help="This will also force the use of the template length to determine reads with the same mapping coordinates." />
-        <param name="spliced_is_unique" argument="--spliced-is-unique" type="boolean" truevalue="--spliced-is-unique" falsevalue="" label="Spliced reads are unique" help="Causes two reads that start in the same position on the same strand and having the same UMI to be considered unique if one is spliced and the other is not. (Uses the 'N' cigar operation to test for splicing)" />
-        <param name="soft_clip_threshold" argument="--soft-clip-threshold" type="integer" value="4" label="Soft clip threshold" help="Mappers that soft clip, will sometimes do so rather than mapping a spliced read if there is only a small overhang over the exon junction. By setting this option, you can treat reads with at least this many bases soft-clipped at the 3' end as spliced." />
-        <param name="read_length" argument="--read-length" type="boolean" truevalue="--read-length" falsevalue="" label="Use the read length as as a criterion when deduping" />
-        <param name="whole_contig" argument="--whole-contig" type="boolean" truevalue="--whole-contig" falsevalue="" label="Consider all alignments to a single contig together" help="This is useful if you have aligned to a transcriptome multi-fasta" />
-        <param argument="--subset" type="float" min="0.0" max="1.0" value="1.0" label="Only consider a random selection of the reads" />
-        <param argument="--chrom" type="boolean" truevalue="--chrom" falsevalue="" label="Only consider a single chromosome" />
-        <param name="per_contig" argument="--per-contig" type="boolean" truevalue="--per-contig" falsevalue="" label="Deduplicate per contig" help="Field 3 in BAM; RNAME. All reads with the same contig will be considered to have the same alignment position. This is useful if your library prep generates PCR duplicates with non identical alignment positions such as CEL-Seq. In this case, you would align to a reference transcriptome with one transcript per gene" />
-        <param name="per_gene" argument="--per-gene" type="boolean" truevalue="--per-gene" falsevalue="" label="Deduplicate per gene" help="As above except with this option you can align to a reference transcriptome with more than one transcript per gene. You need to also provide a map of genes to transcripts. This will also add a metacontig ('MC') tag to the output BAM file." />
-        <param name="gene_transcript_map" argument="--gene-transcript-map" type="data" format="tabular" optional="True" label="Tabular file mapping genes to transripts" />
-        <param name="gene_tag" argument="--gene-tag" type="text" optional="True" label="Deduplicate by this gene tag" help="As --per-gene except here the gene information is encoded in the bam read tag specified so you do not need to supply the mapping file." />
-        <param name="output_stats_bool" type="boolean" truevalue="--output-stats=stats_outputs" falsevalue="" checked="false" label="Output UMI related statistics files?"/>
+        <param name="output_stats_bool" type="boolean" checked="false" label="Output UMI related statistics files?"/>
+        <expand macro="groupdedup_options_macro"/>
+        <expand macro="barcode_options_macro"/>
+        <expand macro="umi_grouping_options_macro"/>
+        <expand macro="sambam_options_macro"/>
+        <expand macro="fullsc_options_macro"/>
+        <expand macro="advanced_options_macro"/>
+        <expand macro="log_input_macro"/>
     </inputs>
     <outputs>
         <data format="bam" name="output" />
-        <collection name="output_stats" type="list" label="UMI_tools dedup stats">
+        <collection name="output_stats" type="list" label="${tool.name} on ${on_string} stats">
             <filter>output_stats_bool</filter>
             <data name="edit_distance" format="tabular" from_work_dir="stats_outputs_edit_distance.tsv"/>
             <data name="per_umi" format="tabular" from_work_dir="stats_outputs_per_umi.tsv"/>
             <data name="per_umi_per_position" format="tabular" from_work_dir="stats_outputs_per_umi_per_position.tsv"/>
         </collection>
+        <expand macro="log_output_macro"/>
     </outputs>
     <tests>
         <test expect_num_outputs="1">
             <param name="input" value="group_in1.sam" ftype="sam" />
-            <param name="extract_umi_method" value="read_id" />
-            <param name="method" value="unique" />
-            <output name="output" file="dedup_out1.bam" ftype="bam" sort="True"/>
+            <section name="advanced">
+                <param name="random_seed" value="0" />
+            </section>
+            <conditional name="bc">
+                <param name="extract_umi_method" value="read_id" />
+            </conditional>
+            <section name="umi">
+                <param name="method" value="unique" />
+            </section>
+            <output name="output" file="dedup_out1.bam" ftype="bam" lines_diff="2"/><!--lines_diff won't be needed in later versions since umitools use \-\-no-PG internally -->
         </test>
         <test expect_num_outputs="1">
-            <param name="input" value="group_in2.bam" ftype="bam" />
-            <param name="extract_umi_method" value="read_id" />
-            <param name="paired" value="True" />
-            <param name="method" value="unique" />
-            <output name="output" file="dedup_out2.bam" ftype="bam" sort="True" />
+            <param name="input" value="group_in2.sam" ftype="sam" />
+            <section name="advanced">
+                <param name="random_seed" value="0" />
+            </section>
+            <conditional name="bc">
+                <param name="extract_umi_method" value="read_id" />
+            </conditional>
+            <section name="sambam">
+                <param name="paired" value="true" />
+            </section>
+            <section name="umi">
+                <param name="method" value="unique" />
+            </section>
+            <output name="output" file="dedup_out2.bam" ftype="bam" lines_diff="2" />
         </test>
         <test expect_num_outputs="1">
             <param name="input" value="group_in3.bam" ftype="bam" />
-            <param name="extract_umi_method" value="read_id" />
-            <param name="method" value="unique" />
-            <output name="output" file="dedup_out3.bam" ftype="bam" sort="True" />
+            <section name="advanced">
+                <param name="random_seed" value="0" />
+            </section>
+            <conditional name="bc">
+                <param name="extract_umi_method" value="read_id" />
+            </conditional>
+            <section name="umi">
+                <param name="method" value="unique" />
+            </section>
+            <output name="output" file="dedup_out3.bam" ftype="bam" lines_diff="2" />
         </test>
         <test expect_num_outputs="1">
             <param name="input" value="group_in4.bam" ftype="bam" />
-            <param name="extract_umi_method" value="tag" />
-            <param name="umi_tag" value="BX" />
-            <param name="method" value="unique" />
-            <output name="output" file="dedup_out4.bam" ftype="bam" sort="True" />
+            <section name="advanced">
+                <param name="random_seed" value="0" />
+            </section>
+            <conditional name="bc">
+                <param name="extract_umi_method" value="tag" />
+                <param name="umi_tag" value="BX" />
+            </conditional>
+            <section name="umi">
+                <param name="method" value="unique" />
+            </section>
+            <output name="output" file="dedup_out4.bam" ftype="bam" lines_diff="2"/>
         </test>
         <test expect_num_outputs="1">
             <param name="input" value="group_in5.bam" ftype="bam" />
-            <param name="extract_umi_method" value="read_id" />
-            <param name="umi_tag" value="BX" />
-            <param name="method" value="cluster" />
-            <output name="output" file="dedup_out5.bam" ftype="bam" sort="True" />
+            <section name="advanced">
+                <param name="random_seed" value="0" />
+            </section>
+            <conditional name="bc">
+                <param name="extract_umi_method" value="read_id" />
+                <param name="umi_tag" value="BX" />
+            </conditional>
+            <section name="umi">
+                <param name="method" value="cluster" />
+            </section>
+            <output name="output" file="dedup_out5.bam" ftype="bam" lines_diff="2"/>
         </test>
         <test expect_num_outputs="1">
             <param name="input" value="group_in6.bam" ftype="bam" />
-            <param name="extract_umi_method" value="read_id" />
-            <param name="umi_tag" value="BX" />
-            <param name="method" value="directional" />
-            <output name="output" file="dedup_out6.bam" ftype="bam" sort="True" />
+            <section name="advanced">
+                <param name="random_seed" value="0" />
+            </section>
+            <conditional name="bc">
+                <param name="extract_umi_method" value="read_id" />
+                <param name="umi_tag" value="BX" />
+            </conditional>
+            <section name="umi">
+                <param name="method" value="directional" />
+            </section>
+            <output name="output" file="dedup_out6.bam" ftype="bam" lines_diff="2"/>
         </test>
         <test expect_num_outputs="5">
             <param name="input" value="group_in6.bam" ftype="bam" />
-            <param name="extract_umi_method" value="read_id" />
-            <param name="umi_tag" value="BX" />
-            <param name="method" value="directional" />
+            <section name="advanced">
+                <param name="random_seed" value="0" />
+            </section>
+            <conditional name="bc">
+                <param name="extract_umi_method" value="read_id" />
+                <param name="umi_tag" value="BX" />
+            </conditional>
+            <section name="umi">
+                <param name="method" value="directional" />
+            </section>
             <param name="output_stats_bool" value="true"/>
-            <output name="output" file="dedup_out6.bam" ftype="bam" sort="True" />
+            <output name="output" file="dedup_out6.bam" ftype="bam" lines_diff="2"/>
             <output_collection name="output_stats">
                 <element name="edit_distance" file="stats_outputs_edit_distance.tsv" />
                 <element name="per_umi" file="stats_outputs_per_umi.tsv" />
@@ -131,183 +160,100 @@
         </test>
     </tests>
     <help><![CDATA[
-umi_tools dedup - Deduplicate reads based on their UMI
-======================================================
+umi_tools dedup - Deduplicate reads based on their UMI and mapping coordinates
+==============================================================================
 
 Purpose
 -------
 
 The purpose of this command is to deduplicate BAM files based on the first
-mapping co-ordinate and the UMI attached to the read. It is assumed that the
-FASTQ files were processed with extract_umi.py before mapping and thus the UMI
-is the last word of the read name. e.g:
-
-@HISEQ:87:00000000_AATT
-
-where AATT is the UMI sequeuence.
+mapping co-ordinate and the UMI attached to the read. 
 
-If you have used an alternative method which does not separate the
-read id and UMI with a "_", such as bcl2fastq which uses ":", you can
-specify the separator with the option "--umi-separator=<sep>",
-replacing <sep> with e.g ":".
+@BARCODE_HELP@
 
-Alternatively, if your UMIs are encoded in a tag, you can specify this
-by setting the option --extract-umi-method=tag and set the tag name
-with the --umi-tag option. For example, if your UMIs are encoded in
-the 'UM' tag, provide the following options:
-"--extract-umi-method=tag --umi-tag=UM"
+@UMI_GROUPING_HELP@
 
-The start postion of a read is considered to be the start of its alignment
-minus any soft clipped bases. A read aligned at position 500 with
-cigar 2S98M will be assumed to start at postion 498.
-
-
-Methods
--------
+Selecting the representative read
+---------------------------------
+For every group of duplicate reads, a single representative read is
+retained. The following criteria are applied to select the read that
+will be retained from a group of duplicated reads:
 
-dedup can be run with multiple methods to identify groups of reads with
-the same (or similar) UMI(s). All methods start by identifying the
-reads with the same mapping position.
+1. The read with the lowest number of mapping coordinates (see
+   ``--multimapping-detection-method`` option)
 
-The simpliest method, "unique", groups reads with the exact same
-UMI. The network-based methods, "cluster", "adjacency" and
-"directional", build networks where nodes are UMIs and edges connect
-UMIs with an edit distance <= threshold (usually 1). The groups of
-reads are then defined from the network in a method-specific manner.
+2. The read with the highest mapping quality. Note that this is not
+   the read sequencing quality and that if two reads have the same
+   mapping quality then one will be picked at random regardless of the
+   read quality.
 
-  "unique"
-      Reads group share the exact same UMI
+Otherwise a read is chosen at random.
 
-  "percentile"
-      Reads group share the exact same UMI. UMIs with counts < 1% of the
-      median counts for UMIs at the same position are ignored.
-
-  "cluster"
-      Identify clusters of connected UMIs (based on hamming distance
-      threshold). Each network is a read group
+Optional statistics output
+--------------------------
 
-  "adjacency"
-      Cluster UMIs as above. For each cluster, select the node(UMI)
-      with the highest counts. Visit all nodes one edge away. If all
-      nodes have been visted, stop. Otherise, repeat with remaining
-      nodes until all nodes have been visted. Each step
-      defines a read group.
+One can use the edit distance between UMIs at the same position as a
+quality control for the deduplication process by comparing with
+a null expectation of random sampling. For the random sampling, the
+observed frequency of UMIs is used to more reasonably model the null
+expectation.
 
-  "directional" (default)
-      Identify clusters of connected UMIs (based on hamming distance
-      threshold) and umi A counts >= (2* umi B counts) - 1. Each
-      network is a read group.
+Use the option ``Output UMI related statistics files?`` to generate the stats output files:
 
-Options
--------
-
---extract-umi-method (choice)
-      How are the UMIs encoded in the read?
-
-      Options are:
-
-      - "read_id" (default)
-            UMIs contained at the end of the read separated as
-            specified with --umi-separator option
-
-      - "tag"
-            UMIs contained in a tag, see --umi-tag option
-
---umi-separator (string)
-      Separator between read id and UMI. See --extract-umi-method above
-
---umi-tag (string)
-      Tag which contains UMI. See --extract-umi-method above
+edit_distance
+  Reports the (binned) average edit distance between the UMIs at each
+  position. Positions with a single UMI are reported separately.  The
+  edit distances are reported pre- and post-deduplication alongside
+  the null expectation from random sampling of UMIs from the UMIs
+  observed across all positions. Note that separate null
+  distributions are reported since the null depends on the observed
+  frequency of each UMI which is different pre- and
+  post-deduplication. The post-deduplication values should be closer to
+  their respective null than the pre-deduplication vs null comparison.
 
---edit-distance-threshold (int)
-       For the adjacency and cluster methods the threshold for the
-       edit distance to connect two UMIs in the network can be
-       increased. The default value of 1 works best unless the UMI is
-       very long (>14bp)
-
---paired
-       BAM is paired end - output both read pairs. This will also
-       force the use of the template length to determine reads with
-       the same mapping coordinates.
-
---spliced-is-unique
-       Causes two reads that start in the same position on the same
-       strand and having the same UMI to be considered unique if one is
-       spliced and the other is not. (Uses the 'N' cigar operation to test
-       for splicing)
+In addition, this option will trigger reporting of further summary
+statistics for the UMIs which may be informative for selecting the
+optimal deduplication method or debugging.
 
---soft-clip-threshold (int)
-       Mappers that soft clip, will sometimes do so rather than mapping a
-       spliced read if there is only a small overhang over the exon
-       junction. By setting this option, you can treat reads with at least
-       this many bases soft-clipped at the 3' end as spliced.
-
---multimapping-detection-method (string, choice)
-       If the sam/bam contains tags to identify multimapping reads, you can
-       specify for use when selecting the best read at a given loci.
-       Supported tags are "NH", "X0" and "XT". If not specified, the read
-       with the highest mapping quality will be selected
+Each unique UMI sequence may be observed [0-many] times at multiple
+positions in the BAM. The following files report the distribution for
+the frequencies of each UMI.
 
---read-length
-      Use the read length as as a criteria when deduping, for e.g sRNA-Seq
-
---whole-contig
-      Consider all alignments to a single contig together. This is useful if
-      you have aligned to a transcriptome multi-fasta
-
---subset (float, [0-1])
-      Only consider a fraction of the reads, chosen at random. This is useful
-      for doing saturation analyses.
-
---chrom
-      Only consider a single chromosome. This is useful for debugging purposes
+per_umi_per_position
+  The `_stats_per_umi_per_position.tsv` file simply tabulates the
+  counts for unique combinations of UMI and position. E.g if prior to
+  deduplication, we have two positions in the BAM (POSa, POSb), at
+  POSa we have observed 2*UMIa, 1*UMIb and at POSb: 1*UMIc, 3*UMId,
+  then the stats file is populated thus:
 
---per-contig (string)
-      Deduplicate per contig (field 3 in BAM; RNAME).
-      All reads with the same contig will be
-      considered to have the same alignment position. This is useful
-      if your library prep generates PCR duplicates with non identical
-      alignment positions such as CEL-Seq. In this case, you would
-      align to a reference transcriptome with one transcript per gene
-
---per-gene (string)
-      Deduplicate per gene. As above except with this option you can
-      align to a reference transcriptome with more than one transcript
-      per gene. You need to also provide --gene-transcript-map option.
-      This will also add a metacontig ('MC') tag to the reads if used
-      in conjunction with --output-bam
-
---gene-transcript-map (string)
-      File mapping genes to transripts (tab separated), e.g:
-
-      gene1   transcript1
-      gene1   transcript2
-      gene2   transcript3
+  ====== =============
+  counts instances_pre
+  ------ -------------
+  1      2
+  2      1
+  3      1
+  ====== =============
+  
+  If post deduplication, UMIb is grouped with UMIa such that POSa:
+  3*UMIa, then the `instances_post` column is populated thus:
 
---gene-tag (string)
-      Deduplicate per gene. As per --per-gene except here the gene
-      information is encoded in the bam read tag specified so you do
-      not need to supply --gene-transcript-map
-
---output-bam (string, filename)
-      Output a tagged bam file to stdout or -S <filename>
-
--i, --in-sam/-o, --out-sam
-      By default, inputs are assumed to be in BAM format and output are output
-      in BAM format. Use these options to specify the use of SAM format for
-      inputs or outputs.
+  ====== ============= ==============
+  counts instances_pre instances_post
+  ------ ------------- --------------
+  1      2             1
+  2      1             0
+  3      1             2
+  ====== ============= ==============
 
--I    (string, filename) input file name
-      The input file must be sorted and indexed.
-
--S    (string, filename) output file name
+per_umi
+  The `_stats_per_umi.tsv` table provides UMI-level summary
+  statistics. Keeping in mind that each unique UMI sequence can be
+  observed [0-many] times across multiple positions in the BAM, it reports:
 
--L    (string, filename) log file name
+  :times_observed: How many positions the UMI was observed at
+  :total_counts: The total number of times the UMI was observed across all positions
+  :median_counts: The median for the distribution of how often the UMI was observed at each position (excluding zeros)
 
-Usage
------
-    umi_tools dedup -I infile.bam -S grouped.bam --
-
-    ]]></help>
+  Hence, whenever times_observed=1, total_counts==median_counts.]]></help>
     <expand macro="citations" />
 </tool>