Mercurial > repos > gbcs-embl-heidelberg > je_markdupes

diff je-markdupes.xml @ 0:d39a96961423 draft
Initial upload
author: gbcs-embl-heidelberg
date: Wed, 25 Nov 2015 12:36:12 -0500
children: 4ccf1406832d
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/je-markdupes.xml	Wed Nov 25 12:36:12 2015 -0500
@@ -0,0 +1,408 @@
+<tool id="je_markdupes" name="Je-MarkDuplicates" version="1.0">
+    <description>to filter BAM files for read duplicates taking UMIs into account</description>
+    <!-- Pulls in macros.xml; the barcode_option_cmd token and the barcode_config_file macro used further
+         down are presumably defined there (not visible in this file, confirm in macros.xml). -->
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <!-- Any exit code of 1 or above is classified as fatal. -->
+    <stdio>
+        <exit_code range="1:" level="fatal" description="Tool exception" />
+    </stdio>
+    <!-- Echoes a fixed version string; the je executable itself is not queried. -->
+    <version_command>echo '1.0'</version_command>
+    <command interpreter="bash">
+<![CDATA[
+    je markdupes
+
+    ## Standard Picard MarkDuplicates options; every option is passed as a KEY=VALUE argument
+    INPUT="${inputFile}"
+    OUTPUT="${outFile}"
+
+    METRICS_FILE="${metrics_file}"
+
+    REMOVE_DUPLICATES="${remove_duplicates}"
+    ASSUME_SORTED="${assume_sorted}"
+
+    ## One COMMENT argument is emitted per entry of the optional 'comments' repeat
+    #for $element in $adv_options.comments:
+        COMMENT="${element.comment}"
+    #end for
+
+    DUPLICATE_SCORING_STRATEGY="${adv_options.duplicate_scoring_strategy}"
+
+    ## The user-supplied regex is shell-escaped with pipes.quote before it reaches the command line
+    #import pipes
+    READ_NAME_REGEX=${ pipes.quote( str( $adv_options.read_name_regex ) ) or "''" }
+    OPTICAL_DUPLICATE_PIXEL_DISTANCE="${adv_options.optical_duplicate_pixel_distance}"
+
+    VALIDATION_STRINGENCY="${adv_options.validation_stringency}"
+    QUIET=true
+    VERBOSITY=ERROR
+
+    ## Je Markdupes Specific
+    ## Free-text values are double-quoted so that each one stays a single shell argument
+    MM=${MM}
+    #if str($MAX_N) != "":
+        MAX_N="${MAX_N}"
+    #end if
+    ## Forward the header separator chosen in the form; an emptied field leaves the tool default in place
+    #if str($SPLIT) != "":
+        SPLIT="${SPLIT}"
+    #end if
+    @barcode_option_cmd@
+
+    ## One SLOTS argument per non-empty entry of the 'repeat_slots' repeat
+    #for $option in $repeat_slots
+        #if str($option.SLOTS) != "":
+            SLOTS="${option.SLOTS}"
+        #end if
+    #end for
+
+    ## Trimming arguments are only emitted when header trimming is switched on
+    #if str($trim_conditional.T) == "true":
+        T=${trim_conditional.T}
+        #for $option in $trim_conditional.repeat_tslots
+            #if str($option.TSLOTS) != "":
+                TSLOTS="${option.TSLOTS}"
+            #end if
+        #end for
+    #end if
+]]>
+    </command>
+    <!-- The config file definition is produced by expanding the barcode_config_file macro. -->
+    <configfiles>
+        <expand macro="barcode_config_file"/>
+    </configfiles>
+
+    <inputs>
+        <!-- Input alignment dataset plus the two boolean switches (REMOVE_DUPLICATES, ASSUME_SORTED). -->
+        <param name="inputFile" type="data" format="bam,sam"
+            label="Select SAM/BAM dataset" help="If empty, upload or import a SAM/BAM dataset"/>
+        <param name="remove_duplicates" type="boolean"
+            help="REMOVE_DUPLICATES; default=False" label="If true do not write duplicates to the output file
+            instead of writing them with appropriate flags set"/>
+        <param name="assume_sorted" type="boolean" checked="true" truevalue="true" falsevalue="false"
+            label="Assume the input file is already sorted" help="ASSUME_SORTED; default=True"/>
+        <!-- Source of the predefined UMI list: a history dataset, pasted text, or none at all. -->
+        <conditional name="barcodes">
+            <param name="barcode_list_type_con" type="select" label="Do you have a predefined list of UMIs">
+                <option value="file" selected="true">A one column txt file from the history</option>
+                <option value="text">Paste the UMI list in a text field</option>
+                <option value="no_barcodes">No predefined list</option>
+            </param>
+
+            <when value="file">
+                <param name="BARCODE_FILE" type="data" format="tabular,txt" label="UMI file"
+                    help="BARCODE_FILE. Pre-defined list of Unique Molecular Identifiers that can be expected.
+                    Format: one column text file, one UMI per line. All UMIs MUST have the same length."/>
+            </when>
+
+            <when value="text">
+                <!-- This variant takes pasted text, so label and help describe a list rather than a file.
+                     NOTE(review): inside an XML attribute "\n" is a literal backslash followed by "n", not a
+                     line break; check how the barcode_config_file macro treats this placeholder value. -->
+                <param name="barcode_text" type="text" area="true" size="10x30"
+                    value="barcode\n" label="UMI list"
+                    help="BARCODE_FILE. Pre-defined list of Unique Molecular Identifiers that can be expected.
+                    Format: one UMI per line. All UMIs MUST have the same length.">
+                    <!-- Every printable character is declared valid and the default character mapping is cleared. -->
+                    <sanitizer>
+                        <valid initial="string.printable"></valid>
+                        <mapping initial="none"/>
+                    </sanitizer>
+                </param>
+            </when>
+            <when value="no_barcodes"/>
+        </conditional>
+        <!-- UMI lookup settings: token position(s) in the read name, mismatch tolerance, N limit, split character. -->
+        <repeat name="repeat_slots" title="Unique Molecular Identifier location" min="1">
+            <param name="SLOTS" type="text" value="-1"
+                label="Where to find the UMIs in the read name"
+                help="SLOTS. The last position is considered by default (-1). See help below."/>
+        </repeat>
+        <param name="MM" type="integer" min="0" value="1" help="MISMATCHES"
+            label="Number of maximum mismatches to consider two Unique Molecular Identifiers (UMIs) similar"/>
+        <param name="MAX_N" type="text" value=""
+              label="Maximum number of Ns a UMI can contain"
+              help="MAX_NUMBER_OF_N. Above this value, reads are placed in a 'undefined' group.
+              Default value is the MISMATCHES number."/>
+        <param name="SPLIT" type="text" value=":"
+            label="Character to split up the header" help="SPLIT"/>
+        <!-- Header trimming: the TSLOTS repeat is only offered when T is set to "true". -->
+        <conditional name="trim_conditional">
+            <param name="T" type="select" help="TRIM_HEADERS"
+                label="Should barcode information be removed from read names in the output BAM">
+                <option value="true">Yes</option>
+                <option value="false" selected="true">No</option>
+            </param>
+            <when value="true">
+                <repeat name="repeat_tslots" title="Unique Molecular Identifier location for trimming" min="1">
+                    <param name="TSLOTS" type="text" value="-1"
+                        help="TSLOTS. Value for SLOTS is considered by default. See help below"
+                        label="Where to find the UMIs in the read name that should be removed from the header"/>
+                </repeat>
+            </when>
+            <when value="false"/>
+        </conditional>
+        <section name="adv_options" title="Advanced Options" expanded="false">
+            <!-- Free-text comments for the output dataset; zero or more entries. -->
+            <repeat name="comments" title="Comment" min="0" help="You can provide multiple comments">
+              <param name="comment" type="text" label="Add this comment to BAM dataset"/>
+            </repeat>
+
+            <param name="duplicate_scoring_strategy" type="select" label="The scoring strategy for choosing the
+                non-duplicate among candidates" help="DUPLICATE_SCORING_STRATEGY; default=SUM_OF_BASE_QUALITIES">
+              <option value="SUM_OF_BASE_QUALITIES">SUM_OF_BASE_QUALITIES</option>
+              <option value="TOTAL_MAPPED_REFERENCE_LENGTH">TOTAL_MAPPED_REFERENCE_LENGTH</option>
+            </param>
+
+            <!-- NOTE(review): the default value ends in ".*." and the last "." may be sentence punctuation copied
+                 from the tool's usage text ("Default value: ... .*. This option ..."); confirm against the tool's
+                 actual default before changing it. The sanitizer declares every printable character valid. -->
+            <param name="read_name_regex" type="text" value="[a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*."
+                label="Regular expression that can be used to parse read names in the incoming SAM/BAM dataset"
+                help="READ_NAME_REGEX; Read names are parsed to extract three variables: tile/region, x coordinate and
+                y coordinate. These values are used to estimate the rate of optical duplication in order to give a more
+                accurate estimated library size. See help below for more info;
+                default=[a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*.">
+              <sanitizer>
+                <valid initial="string.printable"/>
+              </sanitizer>
+            </param>
+            <param name="optical_duplicate_pixel_distance" type="integer" value="100" min="0" max="500"
+                label="The maximum offset between two duplicate clusters in order to consider them optical duplicates"
+                help="OPTICAL_DUPLICATE_PIXEL_DISTANCE; default=100"/>
+
+            <param name="validation_stringency" type="select" label="Select validation stringency"
+                help="Setting stringency to SILENT can improve performance when processing a BAM file in which
+                variable-length data (read, qualities, tags) do not otherwise need to be decoded.">
+              <option value="LENIENT" selected="true">Lenient</option>
+              <option value="SILENT">Silent</option>
+              <option value="STRICT">Strict</option>
+            </param>
+        </section>
+    </inputs>
+    <!-- Two datasets per run: the BAM written to OUTPUT and the text file written to METRICS_FILE. -->
+    <outputs>
+        <data format="bam" name="outFile" label="${tool.name} on ${on_string}: Je-MarkDuplicates BAM output"/>
+        <data format="txt" name="metrics_file" label="${tool.name} on ${on_string}: Je-MarkDuplicates metrics"/>
+    </outputs>
+
+    <tests>
+        <test>
+          <!-- Runs the tool on the BAM test file with a UMI list taken from a file, two SLOTS entries
+               (-1 and -2), MM=2, MAX_N=1, one comment and duplicate removal enabled. The advanced options
+               are set explicitly to the same values as their form defaults. -->
+          <param name="inputFile" value="markdupes_DNase_sorted.bam" ftype="bam"/>
+          <param name="barcode_list_type_con" value="file"/>
+          <param name="BARCODE_FILE" value="markdupes_umis.txt" ftype="txt"/>
+          <param name="repeat_slots_0|SLOTS" value="-1"/>
+          <param name="repeat_slots_1|SLOTS" value="-2"/>
+          <param name="MM" value="2"/>
+          <param name="MAX_N" value="1"/>
+          <param name="comment" value="test-run"/>
+          <param name="assume_sorted" value="True"/>
+          <param name="remove_duplicates" value="True"/>
+          <param name="read_name_regex" value="[a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*."/>
+          <param name="optical_duplicate_pixel_distance" value="100"/>
+          <param name="duplicate_scoring_strategy" value="SUM_OF_BASE_QUALITIES"/>
+          <param name="validation_stringency" value="LENIENT"/>
+          <!-- Both outputs are compared with reference files; lines_diff gives the tolerated number of
+               differing lines for each comparison. -->
+          <output name="outFile" file="markdupes_DNase_sorted_marked.bam" ftype="bam" lines_diff="2"/>
+          <output name="metrics_file" file="markdupes_metrics.txt" ftype="txt" lines_diff="4"/>
+        </test>
+    </tests>
+
+
+  <help>
+<![CDATA[
+**What it does**
+
+Je MarkDupes: Examines aligned records in the supplied SAM or BAM file to locate duplicate molecules taking into account
+molecular barcodes (Unique Molecular Identifiers or UMIs) found in read header.
+All records are then either written to the output file with the duplicate records flagged or trashed.
+
+Input file is a SAM or BAM file.
+
+Author: Charles Girardot  (charles.girardot@embl.de).
+
+Wrapper by: Jelle Scholtalbers (jelle.scholtalbers@embl.de).
+
+------
+
+**Know what you are doing**
+
+.. class:: warningmark
+
+  You will want to read the `documentation`__.
+
+  .. __: http://gbcs.embl.de/portal/Je
+
+------
+
+**Parameter list**
+
+This is an exhaustive list of options::
+
+  INPUT=String
+  I=String
+
+    One or more input SAM or BAM files to analyze. Must be coordinate sorted.
+
+    Default value: null. This option may be specified 0 or more times.
+
+  OUTPUT=File
+  O=File
+
+    The output file to write marked records to
+
+    Required.
+
+  MISMATCHES=Integer
+  MM=Integer
+
+    Number of MisMatches (inclusive) to still consider two Unique Molecular Identifiers
+    (UMIs) the same i.e. this option buffers for sequencing errors.
+    Indeed, in case of a sequencing error, 2 duplicate reads would not be considered
+    duplicates anymore.
+    Note that N are not considered mismatches during comparison ie ATTNGG and NTTANG are seen
+    as the same barcode and these two reads would be flagged duplicates.
+    This option takes a single value even when several barcodes are present (see SLOTS).
+    Note that when declaring several barcodes (see SLOTS) AND providing a predefined set
+    of barcodes (see BC option), the MM value is applicable in each lookup. When a predefined
+    set of barcodes is NOT given, the different barcodes (SLOTS) are concatenated first and
+    the MM value is therefore considered *overall* as the concatenated code is seen as a
+    unique code.
+    MM=null is like MM=0
+    Use the minimum Hamming distance of the original barcode set (if applicable).
+
+    Required.
+
+  MAX_NUMBER_OF_N=Integer
+  MAX_N=Integer
+
+    Maximum number of Ns a molecular code can contain (inclusive). Above this value, reads
+    are placed in a UNDEF group.
+    More precisely, these 'too degenerate' codes will not:
+    	 * be compared to the list of predefined codes [predefined code list situation ie BC
+    option given] nor
+    	 * be considered as a potential independent code [no predefined code list situation ie
+    BC option not given]
+    Default value is the MISMATCHES number.
+    Note that when declaring several barcodes (see SLOTS) AND providing a predefined set
+    of barcodes (see BC option), the MAX_N value is applicable to each barcode. When a
+    predefined set
+    of barcodes is NOT given, the different barcodes (SLOTS) are concatenated first and the
+    MAX_N value
+    is therefore considered *overall*.
+
+    Default value: null.
+
+
+  SLOTS=Integer
+  SLOTS=Integer
+
+    Where to find the UMIs (and only the UMIs) in the read name once read name has been
+    tokenized using the SPLIT character (e.g. ':').
+    By default, the UMI is considered to be found at the end of the read header i.e. after
+    the last ':'. Use this option to indicate other or additional UMI positions (e.g.
+    multiple UMIs present in read header).
+    IMPORTANT: counting starts at 1 and negative numbers can be used to start counting from
+    the end.
+    For example, consider the following read name that lists 3 different barcodes in the end:
+      HISEQ:44:C6KC0ANXX:8:2112:20670:79594:CGATGTTT:GATCCTAG:AAGGTACG
+    to indicate that the three barcodes are molecular codes, use
+      SLOTS=-1 SLOTS=-2 SLOTS=-3
+    if only the 2 last ones should be considered (the third one being a sample encoding
+    barcode), use
+      SLOTS=-1 SLOTS=-2
+
+    Default value: null. This option may be specified 0 or more times.
+
+  BARCODE_FILE=File
+  BC=File
+
+    Pre-defined list of UMIs that can be expected. Format: one column text file, one barcode
+    per line. All UMIs MUST have the same length.
+
+    Default value: null.
+
+  TRIM_HEADERS=Boolean
+  T=Boolean
+
+    Should barcode information be removed from read names in the output BAM?
+
+    Default value: false. This option can be set to 'null' to clear the default value.
+    Possible values: {true, false}
+
+  TSLOTS=Integer
+  TSLOTS=Integer
+
+    Where to find *all* barcode(s) (i.e. sample encoding and UMIs) in the read name once it has
+    been tokenized using the SPLIT character (e.g. ':').
+    This option is only considered when TRIM_HEADERS=true. When TSLOTS is omitted while
+    TRIM_HEADERS=true, the values of SLOTS apply.
+    IMPORTANT: counting starts at 1 and negative numbers can be used to start counting from
+    the end.
+    See SLOTS help for examples.
+
+    Default value: null. This option may be specified 0 or more times.
+
+  SPLIT_CHAR=String
+  SPLIT=String
+
+    Character to use to split up the read header line, default is ':'.
+
+    Default value: ':'. This option can be set to 'null' to clear the default value.
+
+  INPUT=String
+  I=String
+
+    One or more input SAM or BAM files to analyze. Must be coordinate sorted.
+
+    Default value: null. This option may be specified 0 or more times.
+
+  OUTPUT=File
+  O=File
+
+    The output file to write marked records to. Required.
+
+  METRICS_FILE=File
+  M=File
+
+    File to write duplication metrics to. Required.
+
+  COMMENT=String
+  CO=String
+
+    Comment(s) to include in the output file's header.
+
+    Default value: null. This option may be specified 0 or more times.
+
+  REMOVE_DUPLICATES=Boolean
+
+    If true do not write duplicates to the output file instead of writing them with
+    appropriate flags set.
+
+    Default value: false. This option can be set to 'null' to clear
+    the default value.
+    Possible values: {true, false}
+
+  ASSUME_SORTED=Boolean
+  AS=Boolean
+
+    If true, assume that the input file is coordinate sorted even if the header says
+    otherwise.
+
+    Default value: false. This option can be set to 'null' to clear the default
+    value.
+    Possible values: {true, false}
+
+  DUPLICATE_SCORING_STRATEGY=ScoringStrategy
+  DS=ScoringStrategy
+
+    The scoring strategy for choosing the non-duplicate among candidates.
+
+    Default value: SUM_OF_BASE_QUALITIES. This option can be set to 'null' to clear the default value.
+    Possible values: {SUM_OF_BASE_QUALITIES, TOTAL_MAPPED_REFERENCE_LENGTH}
+
+  READ_NAME_REGEX=String
+
+    Regular expression that can be used to parse read names in the incoming SAM file. Read
+    names are parsed to extract three variables: tile/region, x coordinate and y coordinate.
+    These values are used to estimate the rate of optical duplication in order to give a more
+    accurate estimated library size. Set this option to null to disable optical duplicate
+    detection. The regular expression should contain three capture groups for the three
+    variables, in order. It must match the entire read name. Note that if the default regex
+    is specified, a regex match is not actually done, but instead the read name  is split on
+    colon character. For 5 element names, the 3rd, 4th and 5th elements are assumed to be
+    tile, x and y values. For 7 element names (CASAVA 1.8), the 5th, 6th, and 7th elements
+    are assumed to be tile, x and y values.
+
+    Default value:
+    [a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*. This option can be set to 'null' to
+    clear the default value.
+
+  OPTICAL_DUPLICATE_PIXEL_DISTANCE=Integer
+
+    The maximum offset between two duplicate clusters in order to consider them optical
+    duplicates. This should usually be set to some fairly small number (e.g. 5-10 pixels)
+    unless using later versions of the Illumina pipeline that multiply pixel values by 10, in
+    which case 50-100 is more normal.
+
+    Default value: 100. This option can be set to 'null'
+    to clear the default value.
+
+]]>
+  </help>
+
+</tool>
author	gbcs-embl-heidelberg
date	Wed, 25 Nov 2015 12:36:12 -0500
parents
children	4ccf1406832d