Mercurial > repos > jjohnson > fgbio_call_molecular_consensus_reads

--- a/fgbio_call_molecular_consensus_reads.xml	Sun Feb 21 23:40:53 2021 +0000
+++ b/fgbio_call_molecular_consensus_reads.xml	Wed Feb 24 13:02:26 2021 +0000
@@ -24,8 +24,8 @@
             --min-input-base-quality=$filter_options.min_input_base_quality
         #end if

-        #if $bam_options.read-name-prefix
-            --read-name-prefix='$bam_options.read-name-prefix'
+        #if $bam_options.read_name_prefix
+            --read-name-prefix='$bam_options.read_name_prefix'
         #end if
         #if $bam_options.tag
             --tag=$bam_options.tag
@@ -41,20 +41,18 @@
         #end if
     ]]></command>
     <inputs>
-        <param name="input" type="data" format="bam" label="Fastq files corresponding to each sequencing read"/>
+        <param name="input" type="data" format="unsorted.bam,bam" label="BAM TemplateCoordinate sorted by fgbio SortBam"/>
         <param argument="--min-reads" type="integer" value="" min="1" label="Minimum number of reads to produce a consensus base" help="Default: 1"/>
-
         <section name="filter_options" title="Optional Filter Settings" expanded="false">
-            <param argument="--max-reads" type="integer" value="" min="1" label="Maximum number of reads to to use when building a consensus"
+            <param argument="--max-reads" type="integer" value="" min="1" optional="true" label="Maximum number of reads to use when building a consensus"
                    help="If more than this many reads are present in a tag family, the family is randomly downsampled to exactly max-reads reads."/>
-            <param argument="--error-rate-pre-umi" type="integer" value="" min="1" label="Phred-scaled error rate for an error prior to the UMIs being integrated" help="Default: 45"/>
-            <param argument="--error-rate-post-umi" type="integer" value="" min="1" label="Phred-scaled error rate for an error post the UMIs being integrated" help="Default: 40"/>
-            <param argument="--min-input-base-quality" type="integer" value="" min="1" label="Ignore bases in raw reads that have Q below this value" help="Default: 10"/>
+            <param argument="--error-rate-pre-umi" type="integer" value="" min="1" optional="true" label="Phred-scaled error rate for an error prior to the UMIs being integrated" help="Default: 45"/>
+            <param argument="--error-rate-post-umi" type="integer" value="" min="1" optional="true" label="Phred-scaled error rate for an error post the UMIs being integrated" help="Default: 40"/>
+            <param argument="--min-input-base-quality" type="integer" value="" min="1" optional="true" label="Ignore bases in raw reads that have Q below this value" help="Default: 10"/>
         </section>
-
         <section name="bam_options" title="BAM Settings" expanded="false">
-            <param argument="--read-name-prefix" type="text" value="" label="Prefix for all consensus read names"/>
-            <param argument="--tag" type="text" value="" label="The SAM attribute with the unique molecule tag" help="Default: MI">
+            <param argument="--read-name-prefix" type="text" value="" optional="true" label="Prefix for all consensus read names"/>
+            <param argument="--tag" type="text" value="" optional="true" label="The SAM attribute with the unique molecule tag" help="Default: MI">
                 <expand macro="sam_tag_validator"/>
             </param>
             <param argument="--read-group-id" type="text" value="" optional="true" label="The new read group ID for all the consensus reads" help="Default: A"/>
@@ -64,13 +62,16 @@
             </param>
         </section>
         <expand macro="sam_sort_order" />
-        <param argument="output_rejects" type="boolean" truevalue="true" falsevalue="false" checked="false" label="Output tag family size counts"/>
+        <param name="output_rejects" type="boolean" truevalue="true" falsevalue="false" checked="false" label="Output BAM of rejected reads"/> <!-- explicit name: "output_rejects" is a Galaxy-side toggle, not a command-line argument -->
     </inputs>
     <outputs>
-        <data name="rejects" format="bam" >
+        <data name="rejects" format="unsorted.bam" >
             <filter>output_rejects == True</filter>
+            <expand macro="sort_order_change_format" />
         </data>
-        <data name="output" format="bam" />
+        <data name="output" format="unsorted.bam" >
+            <expand macro="sort_order_change_format" />
+        </data>
     </outputs>
     <help><![CDATA[
 **fgbio CallMolecularConsensusReads**
@@ -80,12 +81,14 @@
 Reads with the same unique molecular tag are examined base-by-base to assess the likelihood of each base in the source molecule. The likelihood model is as follows:

   1. First, the base qualities are adjusted. The base qualities are assumed to represent the probability of a sequencing error (i.e. the sequencer observed the wrong base present on the cluster/flowcell/well). The base quality scores are converted to probabilities incorporating a probability representing the chance of an error from the time the unique molecular tags were integrated to just prior to sequencing. The resulting probability is the error rate of all processes from right after integrating the molecular tag through to the end of sequencing.
-  2. Next, a consensus sequence is called for all reads with the same unique molecular tag base-by-base. For a given base position in the reads, the likelihoods that an A, C, G, or T is the base for the underlying source molecule respectively are computed by multiplying the likelihood of each read observing the base position being considered. The probability of error (from 1.) is used when the observed base does not match the hypothesized base for the underlying source molecule, while one minus that probability is used otherwise. The computed likelihoods are normalized by dividing them by the sum of all four likelihoods to produce a posterior probability, namely the probability that the source molecule was an A, C, G, or T from just after integrating molecular tag through to sequencing, given the observations. The base with the maximum posterior probability as the consensus call, and the posterior probability is used as its raw base quality.
-  3. Finally, the consensus raw base quality is modified by incorporating the probability of an error prior to integrating the unique molecular tags. Therefore, the probability used for the final consensus base quality is the posterior probability of the source molecule having the consensus base given the observed reads with the same molecular tag, all the way from sample extraction and through sample and library preparation, through preparing the library for sequencing (e.g. amplification, target selection), and finally, through sequencing.
-This tool assumes that reads with the same tag are grouped together (consecutive in the file). Also, this tool calls each end of a pair independently, and does not jointly call bases that overlap within a pair. Insertion or deletion errors in the reads are not considered in the consensus model.
+
+  2. Next, a consensus sequence is called for all reads with the same unique molecular tag base-by-base. For a given base position in the reads, the likelihoods that an A, C, G, or T is the base for the underlying source molecule respectively are computed by multiplying the likelihood of each read observing the base position being considered. The probability of error (from 1.) is used when the observed base does not match the hypothesized base for the underlying source molecule, while one minus that probability is used otherwise. The computed likelihoods are normalized by dividing them by the sum of all four likelihoods to produce a posterior probability, namely the probability that the source molecule was an A, C, G, or T from just after integrating the molecular tag through to sequencing, given the observations. The base with the maximum posterior probability is used as the consensus call, and the posterior probability is used as its raw base quality.
+
+  3. Finally, the consensus raw base quality is modified by incorporating the probability of an error prior to integrating the unique molecular tags. Therefore, the probability used for the final consensus base quality is the posterior probability of the source molecule having the consensus base given the observed reads with the same molecular tag, all the way from sample extraction and through sample and library preparation, through preparing the library for sequencing (e.g. amplification, target selection), and finally, through sequencing.  This tool assumes that reads with the same tag are grouped together (consecutive in the file). Also, this tool calls each end of a pair independently, and does not jointly call bases that overlap within a pair. Insertion or deletion errors in the reads are not considered in the consensus model.

 Particular attention should be paid to setting the --min-reads parameter as this can have a dramatic effect on both results and runtime. For libraries with low duplication rates (e.g. 100-300X exome libraries) in which it is desirable to retain singleton reads while making consensus reads from sets of duplicates, --min-reads=1 is appropriate. For libraries with high duplication rates where it is desirable to only produce consensus reads supported by 2+ reads to allow error correction, --min-reads=2 or higher is appropriate. After generation, consensus reads can be further filtered using the FilterConsensusReads tool. As such it is always safe to run with --min-reads=1 and filter later, but filtering at this step can improve performance significantly.

+
 Consensus reads have a number of additional optional tags set in the resulting BAM file. The tags break down into those that are single-valued per read:

   - consensus depth      [cD] (int)  : the maximum depth of raw reads at any point in the consensus read
@@ -99,8 +102,6 @@

 The per base depths and errors are both capped at 32,767. In all cases no-calls (Ns) and bases below the --min-input-base-quality are not counted in tag value calculations.

-
-
 http://fulcrumgenomics.github.io/fgbio/tools/latest/CallMolecularConsensusReads.html
     ]]></help>
     <expand macro="citations" />
--- a/macros.xml	Sun Feb 21 23:40:53 2021 +0000
+++ b/macros.xml	Wed Feb 24 13:02:26 2021 +0000
@@ -17,22 +17,33 @@
     </xml>
     <xml name="sam_sort_order">
         <param argument="--sort-order" type="select" optional="true" label="Sort BAM by">
+            <option value="TemplateCoordinate">TemplateCoordinate</option>
             <option value="Coordinate">Coordinate</option>
             <option value="Queryname">Queryname</option>
             <option value="Random">Random</option>
             <option value="RandomQuery">RandomQuery</option>
         </param>
     </xml>
+
+    <xml name="sort_order_change_format"> <!-- Sets the output datatype from the chosen sort order; each "when" value must match a sam_sort_order option value exactly (case-sensitive) -->
+        <change_format>
+            <when input="sort_order" value="Coordinate" format="bam" />
+            <when input="sort_order" value="TemplateCoordinate" format="bam" /> <!-- NOTE(review): confirm "bam" is right here; if Galaxy's bam datatype requires coordinate-sorted data this may need to be unsorted.bam -->
+            <when input="sort_order" value="Queryname" format="unsorted.bam" />
+            <when input="sort_order" value="Random" format="unsorted.bam" />
+            <when input="sort_order" value="RandomQuery" format="unsorted.bam" />
+        </change_format>
+    </xml>

     <token name="@READ_STRUCTURES_HELP@"><![CDATA[
 **Read Structures**

 Read structures are made up of <number><operator> pairs much like the CIGAR string in BAM files. Four kinds of operators are recognized:

-    T identifies a template read
-    B identifies a sample barcode read
-    M identifies a unique molecular index read
-    S identifies a set of bases that should be skipped or ignored
+ -  T identifies a template read
+ -  B identifies a sample barcode read
+ -  M identifies a unique molecular index read
+ -  S identifies a set of bases that should be skipped or ignored

 The last <number><operator> pair may be specified using a + sign instead of number to denote “all remaining bases”. This is useful if, e.g., fastqs have been trimmed and contain reads of varying length. For example to convert a paired-end run with an index read and where the first 5 bases of R1 are a UMI and the second five bases are monotemplate you might specify:
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml.bak	Wed Feb 24 13:02:26 2021 +0000
@@ -0,0 +1,56 @@
+<macros>
+    <token name="@TOOL_VERSION@">1.3.0</token>
+    <token name="@VERSION_SUFFIX@">0</token>
+    <xml name="requirements">
+        <requirements>
+            <requirement type="package" version="@TOOL_VERSION@">fgbio</requirement>
+            <yield/>
+        </requirements>
+    </xml>
+    <token name="@READ_STRUCTURE_PATTERN@">(([1-9][0-9]*[TBMS])*([+]|[1-9][0-9]*)[TBMS])</token>
+    <token name="@READ_STRUCTURES_PATTERN@">@READ_STRUCTURE_PATTERN@(\s@READ_STRUCTURE_PATTERN@)*</token>
+    <xml name="read_structures_validator"> <!-- Checks that the value is one or more read structures, each separated by a single whitespace character -->
+            <validator type="regex" message="Enter one or more read structures separated by single spaces, e.g. 5M5S+T +T +B">^@READ_STRUCTURES_PATTERN@$</validator>
+    </xml>
+    <xml name="sam_tag_validator"> <!-- SAM optional-field tags are two characters: a letter followed by a letter or digit -->
+            <validator type="regex" message="A SAM tag must be exactly two characters: a letter followed by a letter or digit (e.g. MI, RX)">^[A-Za-z][A-Za-z0-9]$</validator>
+    </xml>
+    <xml name="sam_sort_order">
+        <param argument="--sort-order" type="select" optional="true" label="Sort BAM by">
+            <option value="Coordinate">Coordinate</option>
+            <option value="Queryname">Queryname</option>
+            <option value="Random">Random</option>
+            <option value="RandomQuery">RandomQuery</option>
+        </param>
+    </xml>
+
+    <token name="@READ_STRUCTURES_HELP@"><![CDATA[
+**Read Structures**
+
+Read structures are made up of <number><operator> pairs much like the CIGAR string in BAM files. Four kinds of operators are recognized:
+
+    - T identifies a template read
+    - B identifies a sample barcode read
+    - M identifies a unique molecular index read
+    - S identifies a set of bases that should be skipped or ignored
+
+The last <number><operator> pair may be specified using a + sign instead of number to denote “all remaining bases”. This is useful if, e.g., fastqs have been trimmed and contain reads of varying length. For example to convert a paired-end run with an index read and where the first 5 bases of R1 are a UMI and the second five bases are monotemplate you might specify:
+
+::
+
+    --input r1.fq r2.fq i1.fq --read-structures 5M5S+T +T +B
+
+Alternatively, if you know your reads are of fixed length, you could specify:
+
+::
+
+    --input r1.fq r2.fq i1.fq --read-structures 5M5S65T 75T 8B
+
+
+]]></token>
+    <xml name="citations">
+        <citations>
+            <yield />
+        </citations>
+    </xml>
+</macros>