diff tools/gatk/table_recalibration.xml @ 0:9071e359b9a3

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:37:19 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/gatk/table_recalibration.xml	Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,400 @@
+<tool id="gatk_table_recalibration" name="Table Recalibration" version="0.0.1">
+  <description>on BAM files</description>
+  <command interpreter="python">gatk_wrapper.py
+   --stdout "${output_log}"
+   -d "-I" "${reference_source.input_bam}" "${reference_source.input_bam.ext}" "gatk_input"
+   -d "" "${reference_source.input_bam.metadata.bam_index}" "bam_index" "gatk_input" ##hardcode galaxy ext type as bam_index
+   -p 'java 
+    -jar "${GALAXY_DATA_INDEX_DIR}/shared/jars/gatk/GenomeAnalysisTK.jar"
+    -T "TableRecalibration"
+    -o "${output_bam}"
+    -et "NO_ET" ##ET no phone home
+    ##-log "${output_log}" ##don't use this to log to file, instead directly capture stdout
+    #if $reference_source.reference_source_selector != "history":
+        -R "${reference_source.ref_file.fields.path}"
+    #end if
+    --recal_file "${input_recal}"
+    --disable_bam_indexing
+   '
+    ##start standard gatk options
+    #if $gatk_param_type.gatk_param_type_selector == "advanced":
+        #for $sample_metadata in $gatk_param_type.sample_metadata:
+            -p '--sample_metadata "${sample_metadata.sample_metadata_file}"'
+        #end for
+        ## Emit one -p argument per Read Filter repeat entry: the --read_filter option naming the selected
+        ## filter type, followed by one --NAME "VALUE" pair for every other item of the selected conditional case.
+        ## __current_case__ and read_filter_type_selector are excluded from that inner loop
+        ## (the selector is already emitted as the --read_filter value).
+        #for $read_filter in $gatk_param_type.read_filter:
+            -p '--read_filter "${read_filter.read_filter_type.read_filter_type_selector}"
+            ###raise Exception( str( dir( $read_filter ) ) )
+            #for $name, $param in $read_filter.read_filter_type.iteritems():
+                #if $name not in [ "__current_case__", "read_filter_type_selector" ]:
+                    --${name} "${param}"
+                #end if
+            #end for
+            '
+        #end for
+        #if str( $gatk_param_type.input_intervals ) != "None":
+            -d "-L" "${gatk_param_type.input_intervals}" "${gatk_param_type.input_intervals.ext}" "input_intervals"
+        #end if
+        #if str( $gatk_param_type.input_exclude_intervals ) != "None":
+            -d "-XL" "${gatk_param_type.input_exclude_intervals}" "${gatk_param_type.input_exclude_intervals.ext}" "input_intervals"
+        #end if
+        ## rod_binding_names maps each binding name to a zero-based count of how often it has been used so far.
+        #set $rod_binding_names = dict()
+        #for $rod_binding in $gatk_param_type.rod_bind:
+            ## The binding name is the user-supplied custom_rod_name for the custom type, otherwise the selector value itself.
+            #if str( $rod_binding.rod_bind_type.rod_bind_type_selector ) == 'custom':
+                #set $rod_bind_name = $rod_binding.rod_bind_type.custom_rod_name
+            #else
+                #set $rod_bind_name = $rod_binding.rod_bind_type.rod_bind_type_selector
+            #end if
+            ## Default of -1 plus 1 gives 0 on first use; the counter is appended to the last -d argument below
+            ## so that repeated bindings sharing a name still get distinct values there.
+            #set $rod_binding_names[$rod_bind_name] = $rod_binding_names.get( $rod_bind_name, -1 ) + 1
+            ## NOTE(review): %(file_type)s is emitted literally here; presumably gatk_wrapper.py substitutes it - confirm.
+            -d "-B:${rod_bind_name},%(file_type)s" "${rod_binding.rod_bind_type.input_rod}" "${rod_binding.rod_bind_type.input_rod.ext}" "input_${rod_bind_name}_${rod_binding_names[$rod_bind_name]}"
+            ## rodToIntervalTrackName is a boolean whose falsevalue is the empty string, so this branch runs only when it is checked.
+            #if str( $rod_binding.rod_bind_type.rodToIntervalTrackName ):
+                -p '--rodToIntervalTrackName "${rod_bind_name}"'
+            #end if
+        #end for
+        -p '--BTI_merge_rule "${gatk_param_type.BTI_merge_rule}"'
+        #if str( $gatk_param_type.input_dbsnp_rod ) != "None":
+            -d "-D" "${gatk_param_type.input_dbsnp_rod}" "${gatk_param_type.input_dbsnp_rod.ext}" "dbsnp_rod"
+        #end if
+        -p '--downsampling_type "${gatk_param_type.downsampling_type.downsampling_type_selector}"'
+        #if str( $gatk_param_type.downsampling_type.downsampling_type_selector ) != "NONE":
+            -p '--${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_type_selector} "${gatk_param_type.downsampling_type.downsample_to_type.downsample_to_value}"'
+        #end if
+        -p '
+        --baq "${gatk_param_type.baq}"
+        --baqGapOpenPenalty "${gatk_param_type.baq_gap_open_penalty}"
+        ${gatk_param_type.use_original_qualities}
+        --defaultBaseQualities "${gatk_param_type.default_base_qualities}"
+        --validation_strictness "${gatk_param_type.validation_strictness}"
+        --interval_merging "${gatk_param_type.interval_merging}"
+        '
+        #if str( $gatk_param_type.read_group_black_list ) != "None":
+            -d "-read_group_black_list" "${gatk_param_type.read_group_black_list}" "txt" "input_read_group_black_list"
+        #end if
+    #end if
+    #if str( $reference_source.reference_source_selector ) == "history":
+        -d "-R" "${reference_source.ref_file}" "${reference_source.ref_file.ext}" "gatk_input"
+    #end if
+    ##end standard gatk options
+    
+    ##start analysis specific options
+    #if $analysis_param_type.analysis_param_type_selector == "advanced":
+        -p '
+        #if $analysis_param_type.default_read_group_type.default_read_group_type_selector == "set":
+            --default_read_group "${analysis_param_type.default_read_group_type.default_read_group}"
+        #end if
+        #if str( $analysis_param_type.default_platform ) != "default":
+            --default_platform "${analysis_param_type.default_platform}"
+        #end if
+        #if str( $analysis_param_type.force_read_group_type.force_read_group_type_selector ) == "set":
+            --force_read_group "${analysis_param_type.force_read_group_type.force_read_group}"
+        #end if
+        #if str( $analysis_param_type.force_platform ) != "default":
+            --force_platform "${analysis_param_type.force_platform}"
+        #end if
+        ${analysis_param_type.exception_if_no_tile}
+        #if str( $analysis_param_type.solid_options_type.solid_options_type_selector ) == "set":
+            #if str( $analysis_param_type.solid_options_type.solid_recal_mode ) != "default":
+                --solid_recal_mode "${analysis_param_type.solid_options_type.solid_recal_mode}" 
+            #end if
+            #if str( $analysis_param_type.solid_options_type.solid_nocall_strategy ) != "default":
+                --solid_nocall_strategy "${analysis_param_type.solid_options_type.solid_nocall_strategy}" 
+            #end if
+        #end if
+        ${analysis_param_type.simplify_bam}
+        --preserve_qscores_less_than "${analysis_param_type.preserve_qscores_less_than}"
+        --smoothing "${analysis_param_type.smoothing}"
+        --max_quality_score "${analysis_param_type.max_quality_score}"
+        --window_size_nqs "${analysis_param_type.window_size_nqs}"
+        --homopolymer_nback "${analysis_param_type.homopolymer_nback}"
+        ${analysis_param_type.do_not_write_original_quals}
+        '
+    #end if
+  </command>
+  <inputs>
+    <param name="input_recal" type="data" format="csv" label="Covariates table recalibration file" />
+    <conditional name="reference_source">
+      <param name="reference_source_selector" type="select" label="Choose the source for the reference list">
+        <option value="cached">Locally cached</option>
+        <option value="history">History</option>
+      </param>
+      <when value="cached">
+        <param name="input_bam" type="data" format="bam" label="BAM file">
+          <validator type="unspecified_build" />
+          <validator type="dataset_metadata_in_file" filename="picard_index.loc" metadata_name="dbkey" metadata_column="1" message="Sequences are not currently available for the specified build." /> <!-- fixme!!! this needs to be a select -->
+        </param>
+        <param name="ref_file" type="select" label="Using reference genome">
+          <options from_data_table="picard_indexes">
+            <filter type="data_meta" key="dbkey" ref="input_bam" column="dbkey"/>
+          </options>
+        </param>
+      </when>
+      <when value="history"> <!-- FIX ME!!!! -->
+        <param name="input_bam" type="data" format="bam" label="BAM file" />
+        <param name="ref_file" type="data" format="fasta" label="Using reference file" />
+      </when>
+    </conditional>
+    
+    <conditional name="gatk_param_type">
+      <param name="gatk_param_type_selector" type="select" label="Basic or Advanced GATK options">
+        <option value="basic" selected="True">Basic</option>
+        <option value="advanced">Advanced</option>
+      </param>
+      <when value="basic">
+        <!-- Do nothing here -->
+      </when>
+      <when value="advanced">
+        <repeat name="sample_metadata" title="Sample Metadata">
+            <param name="sample_metadata_file" type="data" format="txt" label="Sample file(s) in JSON format" />
+        </repeat>
+        <repeat name="read_filter" title="Read Filter">
+            <conditional name="read_filter_type">
+		      <param name="read_filter_type_selector" type="select" label="Read Filter Type">
+		        <option value="MaxReadLength" selected="True">MaxReadLength</option>
+		        <option value="ZeroMappingQualityRead">ZeroMappingQualityRead</option>
+		      </param>
+	          <when value="ZeroMappingQualityRead">
+	              <!-- no extra options -->
+	          </when>
+	          <when value="MaxReadLength">
+	              <param name="maxReadLength" type="integer" value="76" label="Max Read Length"/>
+	          </when>
+            </conditional>
+        </repeat>
+        <param name="input_intervals" type="data" format="picard_interval_list" optional="True" label="A list of genomic intervals over which to operate" />
+        <param name="input_exclude_intervals" type="data" format="picard_interval_list" optional="True" label="A list of genomic intervals to exclude from processing" />
+        <repeat name="rod_bind" title="Binding for reference-ordered data">
+            <conditional name="rod_bind_type">
+		      <param name="rod_bind_type_selector" type="select" label="Binding Type">
+		        <option value="snps" selected="True">SNPs</option>
+		        <option value="indels">INDELs</option>
+		        <option value="custom">Custom</option>
+		      </param>
+	          <when value="snps">
+	              <param name="input_rod" type="data" format="vcf,gatk_dbsnp,bed" label="ROD file" />
+	              <param name="rodToIntervalTrackName" type="boolean" truevalue="--rodToIntervalTrackName" falsevalue="" label="Use ROD as interval List (-BTI, --rodToIntervalTrackName)" help="Only one ROD may have this option specified" />
+	          </when>
+	          <when value="indels">
+	              <param name="input_rod" type="data" format="vcf,gatk_dbsnp,bed" label="ROD file" />
+	              <param name="rodToIntervalTrackName" type="boolean" truevalue="--rodToIntervalTrackName" falsevalue="" label="Use ROD as interval List (-BTI, --rodToIntervalTrackName)" help="Only one ROD may have this option specified" />
+	          </when>
+	          <when value="custom">
+	              <param name="custom_rod_name" type="text" value="Unknown" label="ROD Name"/>
+	              <param name="input_rod" type="data" format="vcf,gatk_dbsnp,bed" label="ROD file" />
+	              <param name="rodToIntervalTrackName" type="boolean" truevalue="--rodToIntervalTrackName" falsevalue="" label="Use ROD as interval List (-BTI, --rodToIntervalTrackName)" help="Only one ROD may have this option specified" />
+	          </when>
+            </conditional>
+        </repeat>
+        <param name="BTI_merge_rule" type="select" label="BTI merge rule">
+          <option value="UNION" selected="True">UNION</option>
+          <option value="INTERSECTION">INTERSECTION</option>
+        </param>
+        <param name="input_dbsnp_rod" type="data" format="gatk_dbsnp" optional="True" label="dbSNP reference ordered data (ROD)" />
+        <conditional name="downsampling_type">
+          <param name="downsampling_type_selector" type="select" label="Type of reads downsampling to employ at a given locus" help="Downsampling Type">
+            <option value="NONE" selected="True">NONE</option>
+            <option value="ALL_READS">ALL_READS</option>
+            <option value="BY_SAMPLE">BY_SAMPLE</option>
+          </param>
+          <when value="NONE">
+	          <!-- no more options here -->
+	      </when>
+          <when value="ALL_READS">
+	          <conditional name="downsample_to_type">
+	              <param name="downsample_to_type_selector" type="select" label="Type of reads downsampling to employ at a given locus" help="Downsampling Type">
+	                  <option value="downsample_to_fraction" selected="True">Downsample by Fraction</option>
+	                  <option value="downsample_to_coverage">Downsample by Coverage</option>
+	              </param>
+	              <when value="downsample_to_fraction">
+	                  <param name="downsample_to_value" type="float" label="Fraction [0.0-1.0] of reads to downsample to" value="0.1"/>
+	              </when>
+	              <when value="downsample_to_coverage">
+	                  <param name="downsample_to_value" type="integer" label="Coverage to downsample to at any given locus" value="0"/>
+	              </when>
+	          </conditional>
+	      </when>
+          <when value="BY_SAMPLE">
+	          <conditional name="downsample_to_type">
+	              <param name="downsample_to_type_selector" type="select" label="Type of reads downsampling to employ at a given locus" help="Downsampling Type">
+	                  <option value="downsample_to_fraction" selected="True">Downsample by Fraction</option>
+	                  <option value="downsample_to_coverage">Downsample by Coverage</option>
+	              </param>
+	              <when value="downsample_to_fraction">
+	                  <param name="downsample_to_value" type="float" label="Fraction [0.0-1.0] of reads to downsample to" value="0.1"/>
+	              </when>
+	              <when value="downsample_to_coverage">
+	                  <param name="downsample_to_value" type="integer" label="Coverage to downsample to at any given locus" value="0"/>
+	              </when>
+	          </conditional>
+	      </when>
+        </conditional>
+        <param name="baq" type="select" label="Type of BAQ calculation to apply in the engine">
+          <option value="OFF" selected="True">OFF</option>
+          <option value="CALCULATE_AS_NECESSARY">CALCULATE_AS_NECESSARY</option>
+          <option value="RECALCULATE">RECALCULATE</option>
+        </param>
+        <param name="baq_gap_open_penalty" type="integer" label="BAQ gap open penalty (Phred Scaled)" value="40" help="Default value is 40. 30 is perhaps better for whole genome call sets."/>
+        <param name="use_original_qualities" type="boolean" truevalue="--useOriginalQualities" falsevalue="" label="Use the original base quality scores from the OQ tag" />
+        <param name="default_base_qualities" type="integer" label="Value to be used for all base quality scores, when some are missing" value="-1"/>
+        <param name="validation_strictness" type="select" label="How strict should we be with validation">
+          <option value="STRICT" selected="True">STRICT</option>
+          <option value="LENIENT">LENIENT</option>
+          <option value="SILENT">SILENT</option>
+        </param>
+        <param name="interval_merging" type="select" label="Interval merging rule">
+          <option value="ALL" selected="True">ALL</option>
+          <option value="OVERLAPPING_ONLY">OVERLAPPING_ONLY</option>
+        </param>
+        <param name="read_group_black_list" type="data" format="txt" optional="True" label="Read group black list" />
+      </when>
+    </conditional>
+    
+    
+    <conditional name="analysis_param_type">
+      <param name="analysis_param_type_selector" type="select" label="Basic or Advanced Analysis options">
+        <option value="basic" selected="True">Basic</option>
+        <option value="advanced">Advanced</option>
+      </param>
+      <when value="basic">
+        <!-- Do nothing here -->
+      </when>
+      <when value="advanced">
+        <conditional name="default_read_group_type">
+          <param name="default_read_group_type_selector" type="select" label="Set default Read Group">
+            <option value="default" selected="True">Don't Set</option>
+            <option value="set">Set</option>
+          </param>
+          <when value="default">
+            <!-- do nothing here -->
+          </when>
+          <when value="set">
+            <param name="default_read_group" type="text" value="Unknown" label="If a read has no read group then default to the provided String"/>
+          </when>
+        </conditional>
+        <param name="default_platform" type="select" label="Set default Platform">
+          <option value="default" selected="True">Don't Set</option>
+          <option value="illumina">illumina</option>
+          <option value="454">454</option>
+          <option value="solid">solid</option>
+        </param>
+        <conditional name="force_read_group_type">
+          <param name="force_read_group_type_selector" type="select" label="Force Read Group">
+            <option value="default" selected="True">Don't Force</option>
+            <option value="set">Force</option>
+          </param>
+          <when value="default">
+            <!-- do nothing here -->
+          </when>
+          <when value="set">
+            <param name="force_read_group" type="text" value="Unknown" label="If provided, the read group ID of EVERY read will be forced to be the provided String."/>
+          </when>
+        </conditional>
+        <param name="force_platform" type="select" label="Force Platform">
+          <option value="default" selected="True">Don't Force</option>
+          <option value="illumina">illumina</option>
+          <option value="454">454</option>
+          <option value="solid">solid</option>
+        </param>
+        <param name="exception_if_no_tile" type="boolean" checked="False" truevalue="--exception_if_no_tile" falsevalue="" label="Throw an exception when no tile can be found"/>
+        <conditional name="solid_options_type">
+          <param name="solid_options_type_selector" type="select" label="Set SOLiD specific options">
+            <option value="default" selected="True">Don't Set</option>
+            <option value="set">Set</option>
+          </param>
+          <when value="default">
+            <!-- do nothing here -->
+          </when>
+          <when value="set">
+            <param name="solid_recal_mode" type="select" label="How should we recalibrate solid bases in which the reference was inserted">
+              <option value="default" selected="True">Don't set</option>
+              <option value="DO_NOTHING">DO_NOTHING</option>
+              <option value="SET_Q_ZERO">SET_Q_ZERO</option>
+              <option value="SET_Q_ZERO_BASE_N">SET_Q_ZERO_BASE_N</option>
+              <option value="REMOVE_REF_BIAS">REMOVE_REF_BIAS</option>
+            </param>
+            <param name="solid_nocall_strategy" type="select" label="Behavior of the recalibrator when it encounters no calls">
+              <option value="default" selected="True">Don't set</option>
+              <option value="THROW_EXCEPTION">THROW_EXCEPTION</option>
+              <option value="LEAVE_READ_UNRECALIBRATED">LEAVE_READ_UNRECALIBRATED</option>
+              <option value="PURGE_READ">PURGE_READ</option>
+            </param>
+          </when>
+        </conditional>
+        <param name="simplify_bam" type="boolean" checked="False" truevalue="-simplifyBAM" falsevalue="" label="Simplify BAM"/>
+        <param name="window_size_nqs" type="integer" value="5" label="Window size used by MinimumNQSCovariate"/>
+        <param name="homopolymer_nback" type="integer" value="7" label="Number of previous bases to look at in HomopolymerCovariate" />
+        <param name="preserve_qscores_less_than" type="integer" value="5" label="Bases with quality scores less than this threshold won't be recalibrated"/>
+        <param name="smoothing" type="integer" value="1" label="smoothing"/>
+        <param name="max_quality_score" type="integer" value="50" label="Max quality score"/>
+        <param name="do_not_write_original_quals" type="boolean" checked="False" truevalue="--doNotWriteOriginalQuals" falsevalue="" label="Do Not Write Original Quality tag"/>
+      </when>
+    </conditional>
+  </inputs>
+  <outputs>
+    <data format="bam" name="output_bam" label="${tool.name} on ${on_string} (BAM)" />
+    <data format="txt" name="output_log" label="${tool.name} on ${on_string} (log)" />
+  </outputs>
+  <tests>
+      <test>
+          <param name="input_recal" value="gatk/gatk_count_covariates/gatk_count_covariates_out_1.csv" ftype="csv" /> 
+          <param name="reference_source_selector" value="history" />
+          <param name="ref_file" value="phiX.fasta" ftype="fasta" />
+          <param name="input_bam" value="gatk/gatk_indel_realigner/gatk_indel_realigner_out_1.bam" ftype="bam" />
+          <param name="gatk_param_type_selector" value="basic" />
+          <param name="analysis_param_type_selector" value="basic" />
+          <output name="output_bam" file="gatk/gatk_table_recalibration/gatk_table_recalibration_out_1.bam" ftype="bam" lines_diff="2" />
+          <output name="output_log" file="gatk/gatk_table_recalibration/gatk_table_recalibration_out_1.log.contains" compare="contains" />
+      </test>
+  </tests>
+  <help>
+**What it does**
+
+     This walker is designed to work as the second pass in a two-pass processing step, doing a by-read traversal.  For 
+     each base in each read this walker calculates various user-specified covariates (such as read group, reported 
+     quality score, cycle, and dinuc). Using these values as a key in a large hashmap, the walker calculates an empirical 
+     base quality score and overwrites the quality score currently in the read. This walker then outputs a new bam file 
+     with these updated (recalibrated) reads.  Note: This walker expects as input the recalibration table file generated 
+     previously by CovariateCounterWalker. Note: This walker is designed to be used in conjunction with 
+     CovariateCounterWalker.
+
+------
+
+Please cite the website "http://addlink.here" as well as:
+
+Add citation here 2011.
+
+------
+
+**Input formats**
+
+GenomeAnalysisTK: TableRecalibration accepts an aligned BAM file and a recalibration CSV file as input.
+
+------
+
+**Outputs**
+
+The output is in BAM format, see http://addlink.here for more details.
+
+-------
+
+**Settings**::
+
+ default_read_group                           If a read has no read group then default to the provided String.
+ default_platform                                If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid.
+ force_read_group                               If provided, the read group ID of EVERY read will be forced to be the provided String. This is useful to collapse all data into a single read group.
+ force_platform                                    If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.
+ window_size_nqs                                 The window size used by MinimumNQSCovariate for its calculation
+ homopolymer_nback                           The number of previous bases to look at in HomopolymerCovariate
+ exception_if_no_tile                               If provided, TileCovariate will throw an exception when no tile can be found. The default behavior is to use tile = -1
+ solid_recal_mode                             How should we recalibrate solid bases in which the reference was inserted? Options = DO_NOTHING, SET_Q_ZERO, SET_Q_ZERO_BASE_N, or REMOVE_REF_BIAS (DO_NOTHING|SET_Q_ZERO|SET_Q_ZERO_BASE_N|REMOVE_REF_BIAS)
+ solid_nocall_strategy   Defines the behavior of the recalibrator when it encounters no calls in the color space. Options = THROW_EXCEPTION, LEAVE_READ_UNRECALIBRATED, or PURGE_READ (THROW_EXCEPTION|LEAVE_READ_UNRECALIBRATED|PURGE_READ)
+ recal_file                                     Filename for the input covariates table recalibration .csv file
+ out                                                           The output BAM file
+ bam_compression                            Compression level to use for writing BAM files
+ disable_bam_indexing                                                   Turn off on-the-fly creation of indices for output BAM files.
+ simplifyBAM                                               If provided, output BAM files will be simplified to include just key reads for downstream variation discovery analyses (removing duplicates, PF-, non-primary reads), as well as stripping all extended tags from the kept reads except the read group identifier
+ preserve_qscores_less_than            Bases with quality scores less than this threshold won't be recalibrated, default=5. In general it's unsafe to change quality scores below 5, since base callers use these values to indicate random or bad bases
+ smoothing                                              Number of imaginary counts to add to each bin in order to smooth out bins with few data points, default=1
+ max_quality_score                            The integer value at which to cap the quality scores, default=50
+ doNotWriteOriginalQuals                                         If true, we will not write the original quality (OQ) tag for each read
+
+  </help>
+</tool>