# HG changeset patch
# User devteam
# Date 1418774601 18000
# Node ID 3d4f1fa26f0ef80dbf33feeaf5d72dc5773dd672
# Parent ab1f60c26526673de0731231d41be23918a3d47f
Uploaded
diff -r ab1f60c26526 -r 3d4f1fa26f0e picard_AddCommentsToBam.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/picard_AddCommentsToBam.xml Tue Dec 16 19:03:21 2014 -0500
@@ -0,0 +1,79 @@
+
+ add comments to BAM dataset
+ picard
+
+
+ picard_macros.xml
+
+
+
+ @java_options@
+ java -jar \$JAVA_JAR_PATH/picard.jar
+ AddCommentsToBam
+ INPUT="${inputFile}"
+ OUTPUT="${outFile}"
+ #for $element in $comments:
+ COMMENT="${element.comment}"
+ #end for
+ QUIET=true
+ VERBOSITY=ERROR
+ VALIDATION_STRINGENCY=${validation_stringency}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+**Purpose**
+
+Adds one or more comments (@CO) to the header of a specified BAM dataset.
+
+@dataset_collections@
+
+@description@
+
+ COMMENT=String
+ C=String Comments to add to the BAM file This option may be specified 0 or more times.
+
+@more_info@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff -r ab1f60c26526 -r 3d4f1fa26f0e picard_AddOrReplaceReadGroups.xml
--- a/picard_AddOrReplaceReadGroups.xml Fri Feb 21 12:07:49 2014 -0500
+++ b/picard_AddOrReplaceReadGroups.xml Tue Dec 16 19:03:21 2014 -0500
@@ -1,96 +1,81 @@
-
- picard
-
- picard_wrapper.py
- --input="${inputFile}"
- --rg-lb="${rglb}"
- --rg-pl="${rgpl}"
- --rg-pu="${rgpu}"
- --rg-sm="${rgsm}"
- --rg-id="${rgid}"
- --rg-opts="${readGroupOpts.rgOpts}"
- #if $readGroupOpts.rgOpts == "full"
- --rg-cn="${readGroupOpts.rgcn}"
- --rg-ds="${readGroupOpts.rgds}"
+
+ add or replace read group information
+
+ picard
+
+
+
+ picard_macros.xml
+
+
+
+ @java_options@
+ java -jar \$JAVA_JAR_PATH/picard.jar
+ AddOrReplaceReadGroups
+ INPUT="${inputFile}"
+ RGLB="${rglb}"
+ RGPL="${rgpl}"
+ RGPU="${rgpu}"
+ RGSM="${rgsm}"
+ RGID="${rgid}"
+
+ #if str( $rgcn):
+ RGCN="${rgcn}"
#end if
- --output-format="${outputFormat}"
- --output="${outFile}"
- -j "\$JAVA_JAR_PATH/AddOrReplaceReadGroups.jar"
- --tmpdir "${__new_file_path__}"
+
+ #if str( $rgds):
+ RGDS="${rgds}"
+ #end if
+
+ #if str( $rgpi):
+ RGPI="${rgpi}"
+ #end if
+
+ #if str( $rgdt):
+ RGDT="${rgdt}"
+ #end if
+
+ VALIDATION_STRINGENCY="${validation_stringency}"
+ QUIET=true
+ VERBOSITY=ERROR
+ OUTPUT="${outFile}"
+
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
-
-
-
+
+
+
+
+
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
@@ -101,94 +86,51 @@
Add or Replace Read Groups in an input BAM or SAM file.
-**Read Groups are Important!**
+@dataset_collections@
-Many downstream analysis tools (such as GATK, for example) require BAM datasets to contain read groups. Even if you are not going to use GATK, setting read groups correctly from the start will simplify your life greatly. Below we provide an explanation of read groups fields taken from GATK FAQ webpage:
+@RG@
-.. csv-table::
- :header-rows: 1
-
- Tag,Importance,Definition,Meaning
- "ID","Required","Read group identifier. Each @RG line must have a unique ID. The value of ID is used in the RG tags of alignment records. Must be unique among all read groups in header section. Read group IDs may be modified when merging SAM files in order to handle collisions.","Ideally, this should be a globally unique identify across all sequencing data in the world, such as the Illumina flowcell + lane name and number. Will be referenced by each read with the RG:Z field, allowing tools to determine the read group information associated with each read, including the sample from which the read came. Also, a read group is effectively treated as a separate run of the NGS instrument in tools like base quality score recalibration (a GATK component) -- all reads within a read group are assumed to come from the same instrument run and to therefore share the same error model."
- "SM","Sample. Use pool name where a pool is being sequenced.","Required. As important as ID.","The name of the sample sequenced in this read group. GATK tools treat all read groups with the same SM value as containing sequencing data for the same sample. Therefore it's critical that the SM field be correctly specified, especially when using multi-sample tools like the Unified Genotyper (a GATK component)."
- "PL","Platform/technology used to produce the read. Valid values: ILLUMINA, SOLID, LS454, HELICOS and PACBIO.","Important. Not currently used in the GATK, but was in the past, and may return. The only way to known the sequencing technology used to generate the sequencing data","It's a good idea to use this field."
- "LB","DNA preparation library identify","Essential for MarkDuplicates","MarkDuplicates uses the LB field to determine which read groups might contain molecular duplicates, in case the same DNA library was sequenced on multiple lanes."
+@description@
-**Example of Read Group usage**
-
-Support we have a trio of samples: MOM, DAD, and KID. Each has two DNA libraries prepared, one with 400 bp inserts and another with 200 bp inserts. Each of these libraries is run on two lanes of an illumina hiseq, requiring 3 x 2 x 2 = 12 lanes of data. When the data come off the sequencer, we would create 12 BAM files, with the following @RG fields in the header::
+ INPUT=File
+ I=File Input file (bam or sam). Required.
- Dad's data:
- @RG ID:FLOWCELL1.LANE1 PL:illumina LB:LIB-DAD-1 SM:DAD PI:200
- @RG ID:FLOWCELL1.LANE2 PL:illumina LB:LIB-DAD-1 SM:DAD PI:200
- @RG ID:FLOWCELL1.LANE3 PL:illumina LB:LIB-DAD-2 SM:DAD PI:400
- @RG ID:FLOWCELL1.LANE4 PL:illumina LB:LIB-DAD-2 SM:DAD PI:400
-
- Mom's data:
- @RG ID:FLOWCELL1.LANE5 PL:illumina LB:LIB-MOM-1 SM:MOM PI:200
- @RG ID:FLOWCELL1.LANE6 PL:illumina LB:LIB-MOM-1 SM:MOM PI:200
- @RG ID:FLOWCELL1.LANE7 PL:illumina LB:LIB-MOM-2 SM:MOM PI:400
- @RG ID:FLOWCELL1.LANE8 PL:illumina LB:LIB-MOM-2 SM:MOM PI:400
-
- Kid's data:
- @RG ID:FLOWCELL2.LANE1 PL:illumina LB:LIB-KID-1 SM:KID PI:200
- @RG ID:FLOWCELL2.LANE2 PL:illumina LB:LIB-KID-1 SM:KID PI:200
- @RG ID:FLOWCELL2.LANE3 PL:illumina LB:LIB-KID-2 SM:KID PI:400
- @RG ID:FLOWCELL2.LANE4 PL:illumina LB:LIB-KID-2 SM:KID PI:400
+ OUTPUT=File
+ O=File Output file (bam or sam). Required.
-Note the hierarchical relationship between read groups (unique for each lane) to libraries (sequenced on two lanes) and samples (across four lanes, two lanes for each library).
-
-**Picard documentation**
+ SORT_ORDER=SortOrder
+ SO=SortOrder Optional sort order to output in. If not supplied OUTPUT is in the same order as INPUT.
+ Default value: null. Possible values: {unsorted, queryname, coordinate}
-This is a Galaxy wrapper for AddOrReplaceReadGroups, a part of the external package Picard-tools_.
-
- .. _Picard-tools: http://www.google.com/search?q=picard+samtools
+ RGID=String
+ ID=String Read Group ID Default value: 1. This option can be set to 'null' to clear the default
+ value.
-------
-
-.. class:: infomark
-
-**Inputs, outputs, and parameters**
-
-Either a sam file or a bam file must be supplied. If a bam file is used, it must
-be coordinate-sorted. Galaxy currently coordinate-sorts all bam files.
-
-The output file is either bam (the default) or sam, according to user selection,
-and contains the same information as the input file except for the appropraite
-additional (or modified) read group tags. Bam is recommended since it is smaller.
-
-From the Picard documentation.
-
-AddOrReplaceReadGroups REQUIRED parameters::
-
- Option (Type) Description
+ RGLB=String
+ LB=String Read Group Library Required.
- RGLB=String Read Group Library
- RGPL=String Read Group platform (e.g. illumina, solid)
- RGPU=String Read Group platform unit (eg. run barcode)
- RGSM=String Read Group sample name
- RGID=String Read Group ID; Default value: null (empty)
+ RGPL=String
+ PL=String Read Group platform (e.g. illumina, solid) Required.
-AddOrReplaceReadGroups OPTIONAL parameters::
+ RGPU=String
+ PU=String Read Group platform unit (eg. run barcode) Required.
- Option (Type) Description
-
- RGCN=String Read Group sequencing center name; Default value: null (empty)
- RGDS=String Read Group description Default value: null (empty)
+ RGSM=String
+ SM=String Read Group sample name Required.
-One parameter that Picard's AddOrReplaceReadGroups offers that is automatically
-set by Galaxy is the SORT_ORDER, which is set to coordinate.
+ RGCN=String
+ CN=String Read Group sequencing center name Default value: null.
-.. class:: warningmark
-
-**Warning on SAM/BAM quality**
+ RGDS=String
+ DS=String Read Group description Default value: null.
-Many SAM/BAM files produced externally and uploaded to Galaxy do not fully conform to SAM/BAM specifications. Galaxy deals with this by using the **LENIENT**
-flag when it runs Picard, which allows reads to be discarded if they're empty or don't map. This appears
-to be the only way to deal with SAM/BAM that cannot be parsed.
+ RGDT=Iso8601Date
+ DT=Iso8601Date Read Group run date Default value: null.
+ RGPI=Integer
+ PI=Integer Read Group predicted insert size Default value: null.
-
+@more_info@
@@ -203,3 +145,4 @@
+
diff -r ab1f60c26526 -r 3d4f1fa26f0e picard_BamIndexStats.xml
--- a/picard_BamIndexStats.xml Fri Feb 21 12:07:49 2014 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,118 +0,0 @@
-
- picard
-
- picard_wrapper.py
- --input "${input_file}"
- --bai-file "${input_file.metadata.bam_index}"
- -t "${htmlfile}"
- -d "${htmlfile.files_path}"
- -j "\$JAVA_JAR_PATH/BamIndexStats.jar"
- --tmpdir "${__new_file_path__}"
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-.. class:: infomark
-
-**Purpose**
-
-Generate Bam Index Stats for a provided BAM file.
-
-**Picard documentation**
-
-This is a Galaxy wrapper for BamIndexStats, a part of the external package Picard-tools_.
-
- .. _Picard-tools: http://www.google.com/search?q=picard+samtools
-
-------
-
-.. class:: infomark
-
-**Inputs and outputs**
-
-The only input is the BAM file you wish to obtain statistics for, which is required.
-Note that it must be coordinate-sorted. Galaxy currently coordinate-sorts all BAM files.
-
-This tool outputs an HTML file that contains links to the actual metrics results, as well
-as a log file with info on the exact command run.
-
-.. class:: warningmark
-
-**Warning on SAM/BAM quality**
-
-Many SAM/BAM files produced externally and uploaded to Galaxy do not fully conform to SAM/BAM specifications. Galaxy deals with this by using the **LENIENT**
-flag when it runs Picard, which allows reads to be discarded if they're empty or don't map. This appears
-to be the only way to deal with SAM/BAM that cannot be parsed.
-
-------
-
-**Example**
-
-Given a BAM file created from the following::
-
- @HD VN:1.0 SO:coordinate
- @SQ SN:chr1 LN:101
- @SQ SN:chr7 LN:404
- @SQ SN:chr8 LN:202
- @SQ SN:chr10 LN:303
- @SQ SN:chr14 LN:505
- @RG ID:0 SM:Hi,Mom!
- @RG ID:1 SM:samplesample DS:ClearDescription
- @PG ID:1 PN:Hey! VN:2.0
- @CO Just a generic comment to make the header longer
- read1 83 chr7 1 255 101M = 302 201 CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN )'.*.+2,))II'I*/)-I*-)I.-)I)I),/-II..)./.,.).*II,I.II-)III0*IIIIIIII/32/,01460II/6/*0*/2/283//36868/I RG:Z:0
- read2 89 chr7 1 255 101M * 0 0 CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN )'.*.+2,))II'I*/)-I*-)I.-)I)I),/-II..)./.,.).*II,I.II-)III0*IIIIIIII/32/,01460II/6/*0*/2/283//36868/I RG:Z:0
- read3 83 chr7 1 255 101M = 302 201 CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN )'.*.+2,))II'I*/)-I*-)I.-)I)I),/-II..)./.,.).*II,I.II-)III0*IIIIIIII/32/,01460II/6/*0*/2/283//36868/I RG:Z:0
- read4 147 chr7 16 255 101M = 21 -96 CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN )'.*.+2,))II'I*/)-I*-)I.-)I)I),/-II..)./.,.).*II,I.II-)III0*IIIIIIII/32/,01460II/6/*0*/2/283//36868/I RG:Z:0
- read5 99 chr7 21 255 101M = 16 96 CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN )'.*.+2,))II'I*/)-I*-)I.-)I)I),/-II..)./.,.).*II,I.II-)III0*IIIIIIII/32/,01460II/6/*0*/2/283//36868/I RG:Z:0
- read6 163 chr7 302 255 101M = 1 -201 NCGCGGCATCNCGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA I/15445666651/566666553+2/14/I/555512+3/)-'/-I-'*+))*''13+3)'//++''/'))/3+I*5++)I'2+I+/*I-II*)I-./1'1 RG:Z:0
- read7 163 chr7 302 255 10M1D10M5I76M = 1 -201 NCGCGGCATCNCGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA I/15445666651/566666553+2/14/I/555512+3/)-'/-I-'*+))*''13+3)'//++''/'))/3+I*5++)I'2+I+/*I-II*)I-./1'1 RG:Z:0
- read8 165 * 0 0 * chr7 1 0 NCGCGGCATCNCGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA I/15445666651/566666553+2/14/I/555512+3/)-'/-I-'*+))*''13+3)'//++''/'))/3+I*5++)I'2+I+/*I-II*)I-./1'1 RG:Z:0
-
-The following metrics file will be produced::
-
- chr1 length= 101 Aligned= 0 Unaligned= 0
- chr7 length= 404 Aligned= 7 Unaligned= 0
- chr8 length= 202 Aligned= 0 Unaligned= 0
- chr10 length= 303 Aligned= 0 Unaligned= 0
- chr14 length= 505 Aligned= 0 Unaligned= 0
- NoCoordinateCount= 1
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
diff -r ab1f60c26526 -r 3d4f1fa26f0e picard_BedToIntervalList.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/picard_BedToIntervalList.xml Tue Dec 16 19:03:21 2014 -0500
@@ -0,0 +1,117 @@
+
+ convert coordinate data into picard interval list format
+ picard
+
+
+ picard_macros.xml
+
+
+
+ @java_options@
+
+ #set $picard_dict = "localref.dict"
+ #set $ref_fasta = "localref.fa" ## This is done because Picard expects a .fa extension
+
+ ln -s "${reference_source.ref_file}" "${ref_fasta}" &&
+
+ #if str( $reference_source.reference_source_selector ) == "history":
+
+ java -jar \$JAVA_JAR_PATH/picard.jar CreateSequenceDictionary REFERENCE="${ref_fasta}" OUTPUT="${picard_dict}"
+ QUIET=true
+ VERBOSITY=ERROR
+
+ &&
+
+ #else:
+
+ #set $ref_fasta = str( $reference_source.ref_file.fields.path ) ## getting path of reference fasta file (must end with .fa)
+ #set $picard_dict=$ref_fasta[:-2]+"dict" ## replacing .fa with .dict
+
+ #end if
+
+ java -jar \$JAVA_JAR_PATH/picard.jar
+ BedToIntervalList
+ INPUT="${inputFile}"
+ OUTPUT="${outFile}"
+
+ SEQUENCE_DICTIONARY="${picard_dict}"
+ QUIET=true
+ VERBOSITY=ERROR
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+**Purpose**
+
+Convert coordinate data (such as BED or Galaxy Interval) into Picard Interval Format.
+
+@dataset_collections@
+
+@description@
+
+ SEQUENCE_DICTIONARY=File
+ SD=File The sequence dictionary. You can either use a dictionary pre-cached
+ on this instance of Galaxy, or create one on the fly from a FASTA
+ file uploaded to history (right pane of the interface).
+
+
+@more_info@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff -r ab1f60c26526 -r 3d4f1fa26f0e picard_CleanSam.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/picard_CleanSam.xml Tue Dec 16 19:03:21 2014 -0500
@@ -0,0 +1,63 @@
+
+
+ perform SAM/BAM grooming
+
+
+ picard
+
+
+
+ picard_macros.xml
+
+
+
+ @java_options@
+ java -jar \$JAVA_JAR_PATH/picard.jar
+ CleanSam
+ INPUT="${inputFile}"
+ OUTPUT="${outFile}"
+ QUIET=true
+ VERBOSITY=ERROR
+ VALIDATION_STRINGENCY=${validation_stringency}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+**Purpose**
+
+Read SAM/BAM and perform various fix-ups. Currently, the only fix-ups are:
+
+ 1. to soft-clip an alignment that hangs off the end of its reference sequence.
+ 2. to set MAPQ to 0 if a read is unmapped.
+
+@dataset_collections@
+
+@more_info@
+
+
+
\ No newline at end of file
diff -r ab1f60c26526 -r 3d4f1fa26f0e picard_CollectAlignmentSummaryMetrics.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/picard_CollectAlignmentSummaryMetrics.xml Tue Dec 16 19:03:21 2014 -0500
@@ -0,0 +1,137 @@
+
+ writes a file containing summary alignment metrics
+
+ picard
+
+
+
+ picard_macros.xml
+
+
+
+ @java_options@
+ ##set up input files
+
+ #set $reference_fasta_filename = "localref.fa"
+
+ #if str( $reference_source.reference_source_selector ) == "history":
+ ln -s "${reference_source.ref_file}" "${reference_fasta_filename}" &&
+ #else:
+ #set $reference_fasta_filename = str( $reference_source.ref_file.fields.path )
+ #end if
+
+ java -jar \$JAVA_JAR_PATH/picard.jar
+ CollectAlignmentSummaryMetrics
+ INPUT="${inputFile}"
+ OUTPUT="${outFile}"
+ MAX_INSERT_SIZE=${maxinsert}
+ #for $sequence in $adapters:
+ ADAPTER_SEQUENCE="${sequence.adapter}"
+ #end for
+ METRIC_ACCUMULATION_LEVEL="${metric_accumulation_level}"
+ IS_BISULFITE_SEQUENCED="${bisulphite}"
+
+ REFERENCE_SEQUENCE="${reference_fasta_filename}"
+
+ ASSUME_SORTED="${assume_sorted}"
+
+ VALIDATION_STRINGENCY="${validation_stringency}"
+ QUIET=true
+ VERBOSITY=ERROR
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+**Purpose**
+
+Reads a SAM or BAM file and writes a file containing summary alignment metrics.
+
+@dataset_collections@
+
+@description@
+
+ MAX_INSERT_SIZE=Integer Paired end reads above this insert size will be considered chimeric along with
+ inter-chromosomal pairs. Default value: 100000.
+
+ ADAPTER_SEQUENCE=String List of adapter sequences to use when processing the alignment metrics This option may
+ be specified 0 or more times.
+
+ METRIC_ACCUMULATION_LEVEL=MetricAccumulationLevel
+ LEVEL=MetricAccumulationLevel The level(s) at which to accumulate metrics. Possible values: {ALL_READS, SAMPLE,
+ LIBRARY, READ_GROUP} This option may be specified 0 or more times.
+
+ IS_BISULFITE_SEQUENCED=Boolean
+ BS=Boolean Whether the SAM or BAM file consists of bisulfite sequenced reads.
+
+
+ REFERENCE_SEQUENCE=File
+ R=File Reference sequence fasta Default value: null.
+
+ ASSUME_SORTED=Boolean
+ AS=Boolean If true (default), then the sort order in the header file will be ignored.
+
+@more_info@
+
+
+
+
+
diff -r ab1f60c26526 -r 3d4f1fa26f0e picard_CollectBaseDistributionByCycle.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/picard_CollectBaseDistributionByCycle.xml Tue Dec 16 19:03:21 2014 -0500
@@ -0,0 +1,116 @@
+
+ charts the nucleotide distribution per cycle in a SAM or BAM dataset
+
+ picard
+
+
+
+ picard_macros.xml
+
+
+
+ @java_options@
+ ##set up input files
+
+ #set $reference_fasta_filename = "localref.fa"
+
+ #if str( $reference_source.reference_source_selector ) == "history":
+ ln -s "${reference_source.ref_file}" "${reference_fasta_filename}" &&
+ #else:
+ #set $reference_fasta_filename = str( $reference_source.ref_file.fields.path )
+ #end if
+
+ java -jar \$JAVA_JAR_PATH/picard.jar
+ CollectBaseDistributionByCycle
+ INPUT="${inputFile}"
+ OUTPUT="${outFile}"
+ CHART_OUTPUT="${pdfFile}"
+ ALIGNED_READS_ONLY="${aligned_reads_only}"
+ PF_READS_ONLY="${pf_reads_only}"
+ REFERENCE_SEQUENCE="${reference_fasta_filename}"
+ ASSUME_SORTED="${assume_sorted}"
+
+ VALIDATION_STRINGENCY="${validation_stringency}"
+ QUIET=true
+ VERBOSITY=ERROR
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+**Purpose**
+
+Program to chart the nucleotide distribution per cycle in a SAM or BAM file.
+
+@dataset_collections@
+
+@description@
+
+ ALIGNED_READS_ONLY=Boolean If set to true, calculate the base distribution over aligned reads only. Default value:
+ false. This option can be set to 'null' to clear the default value. Possible values:
+ {true, false}
+
+ PF_READS_ONLY=Boolean If set to true calculate the base distribution over PF reads only. Default value: false.
+ This option can be set to 'null' to clear the default value. Possible values: {true,
+ false}
+
+ REFERENCE_SEQUENCE=File
+ R=File Reference sequence fasta Default value: null.
+
+ ASSUME_SORTED=Boolean
+ AS=Boolean If true (default), then the sort order in the header file will be ignored. Default value: true.
+
+@more_info@
+
+
+
+
+
diff -r ab1f60c26526 -r 3d4f1fa26f0e picard_CollectGcBiasMetrics.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/picard_CollectGcBiasMetrics.xml Tue Dec 16 19:03:21 2014 -0500
@@ -0,0 +1,117 @@
+
+ charts the GC bias metrics
+
+ picard
+
+
+
+ picard_macros.xml
+
+
+
+ @java_options@
+ ##set up input files
+
+ #set $reference_fasta_filename = "localref.fa"
+
+ #if str( $reference_source.reference_source_selector ) == "history":
+ ln -s "${reference_source.ref_file}" "${reference_fasta_filename}" &&
+ #else:
+ #set $reference_fasta_filename = str( $reference_source.ref_file.fields.path )
+ #end if
+
+ java -jar \$JAVA_JAR_PATH/picard.jar
+ CollectGcBiasMetrics
+ INPUT="${inputFile}"
+ OUTPUT="${outFile}"
+ CHART_OUTPUT="${pdfFile}"
+ SUMMARY_OUTPUT="${summaryFile}"
+ WINDOW_SIZE="${window_size}"
+ MINIMUM_GENOME_FRACTION="${minimum_genome_fraction}"
+ IS_BISULFITE_SEQUENCED="${is_bisulfite_sequenced}"
+ REFERENCE_SEQUENCE="${reference_fasta_filename}"
+ ASSUME_SORTED="${assume_sorted}"
+
+ VALIDATION_STRINGENCY="${validation_stringency}"
+ QUIET=true
+ VERBOSITY=ERROR
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+**Purpose**
+
+Program to collect GC bias metrics and chart them for the reads in a SAM or BAM file.
+
+@dataset_collections@
+
+@description@
+
+ ALIGNED_READS_ONLY=Boolean If set to true, calculate the base distribution over aligned reads only. Default value:
+ false. Possible values: {true, false}
+
+ PF_READS_ONLY=Boolean If set to true calculate the base distribution over PF reads only. Default value: false.
+ This option can be set to 'null' to clear the default value. Possible values: {true,
+ false}
+
+ ASSUME_SORTED=Boolean
+ AS=Boolean If true (default), then the sort order in the header file will be ignored. Default: True
+
+@more_info@
+
+
+
+
+
diff -r ab1f60c26526 -r 3d4f1fa26f0e picard_CollectInsertSizeMetrics.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/picard_CollectInsertSizeMetrics.xml Tue Dec 16 19:03:21 2014 -0500
@@ -0,0 +1,141 @@
+
+ plots distribution of insert sizes
+
+ picard
+
+
+
+ picard_macros.xml
+
+
+
+ @java_options@
+ ##set up input files
+
+ #set $reference_fasta_filename = "localref.fa"
+
+ #if str( $reference_source.reference_source_selector ) == "history":
+ ln -s "${reference_source.ref_file}" "${reference_fasta_filename}" &&
+ #else:
+ #set $reference_fasta_filename = str( $reference_source.ref_file.fields.path )
+ #end if
+
+ java -jar \$JAVA_JAR_PATH/picard.jar
+ CollectInsertSizeMetrics
+ INPUT="${inputFile}"
+ OUTPUT="${outFile}"
+ HISTOGRAM_FILE="${histFile}"
+ DEVIATIONS="${deviations}"
+
+ #if str( $hist_width ):
+ HISTOGRAM_WIDTH="${hist_width}"
+ #end if
+
+ MINIMUM_PCT="${min_pct}"
+ REFERENCE_SEQUENCE="${reference_fasta_filename}"
+ ASSUME_SORTED="${assume_sorted}"
+ METRIC_ACCUMULATION_LEVEL="${metric_accumulation_level}"
+
+ VALIDATION_STRINGENCY="${validation_stringency}"
+ QUIET=true
+ VERBOSITY=ERROR
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+**Purpose**
+
+Reads a SAM or BAM dataset and writes a file containing metrics about the statistical distribution of insert size (excluding duplicates) and generates a Histogram plot.
+
+@dataset_collections@
+
+@description@
+
+
+ DEVIATIONS=Double Generate mean, sd and plots by trimming the data down to MEDIAN +
+ DEVIATIONS*MEDIAN_ABSOLUTE_DEVIATION. This is done because insert size data typically
+ includes enough anomalous values from chimeras and other artifacts to make the mean and
+ sd grossly misleading regarding the real distribution. Default value: 10.0.
+
+ HISTOGRAM_WIDTH=Integer
+ W=Integer Explicitly sets the Histogram width, overriding automatic truncation of Histogram tail.
+ Also, when calculating mean and standard deviation, only bins <= Histogram_WIDTH will be
+ included. Default value: not set.
+
+ MINIMUM_PCT=Float
+ M=Float When generating the Histogram, discard any data categories (out of FR, TANDEM, RF) that
+ have fewer than this percentage of overall reads. (Range: 0 to 1). Default value: 0.05.
+
+ METRIC_ACCUMULATION_LEVEL=MetricAccumulationLevel
+ LEVEL=MetricAccumulationLevel The level(s) at which to accumulate metrics. Possible values: {ALL_READS, SAMPLE,
+ LIBRARY, READ_GROUP} This option may be specified 0 or more times.
+
+ ASSUME_SORTED=Boolean
+ AS=Boolean If true (default), then the sort order in the header file will be ignored. Default
+ value: true. This option can be set to 'null' to clear the default value. Possible
+ values: {true, false}
+
+@more_info@
+
+
+
+
+
diff -r ab1f60c26526 -r 3d4f1fa26f0e picard_CollectRnaSeqMetrics.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/picard_CollectRnaSeqMetrics.xml Tue Dec 16 19:03:21 2014 -0500
@@ -0,0 +1,204 @@
+
+ collect metrics about the alignment of RNA to various functional classes of loci in the genome
+
+ picard
+
+
+
+ picard_macros.xml
+
+
+
+
+
+ ## Set up input files
+
+ ## Reference sequences
+
+ #set $reference_fasta_filename = "localref.fa"
+
+ #if str( $reference_source.reference_source_selector ) == "history":
+ ln -s "${reference_source.ref_file}" "${reference_fasta_filename}" &&
+ #else:
+ #set $reference_fasta_filename = str( $reference_source.ref_file.fields.path )
+ #end if
+
+ ## refFlat data
+ ## The awk line below converts a file obtained from UCSC as specified in the tool help to refFlat format
+
+ grep -v '^#' ${refFlat} | awk '{print $11"\t"$1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10}' > refFlat.tab &&
+
+ ## Start picard command
+
+ @java_options@
+ java -jar \$JAVA_JAR_PATH/picard.jar
+ CollectRnaSeqMetrics
+ REF_FLAT=refFlat.tab
+
+ #if str( $ribosomal_intervals ) != "None":
+ RIBOSOMAL_INTERVALS="${ribosomal_intervals}"
+ #end if
+
+ STRAND_SPECIFICITY="${strand_specificity}"
+ MINIMUM_LENGTH="${minimum_length}"
+ CHART_OUTPUT="${pdfFile}"
+
+ #for $sequence_to_ignore in $ignore_list:
+ IGNORE_SEQUENCE="${sequence_to_ignore.sequence}"
+ #end for
+
+ RRNA_FRAGMENT_PERCENTAGE="${rrna_fragment_percentage}"
+ METRIC_ACCUMULATION_LEVEL="${metric_accumulation_level}"
+ INPUT="${inputFile}"
+ OUTPUT="${outFile}"
+ REFERENCE_SEQUENCE="${reference_fasta_filename}"
+ ASSUME_SORTED="${assume_sorted}"
+
+ QUIET=true
+ VERBOSITY=ERROR
+ VALIDATION_STRINGENCY=${validation_stringency}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+**Purpose**
+
+Collects metrics about the alignment of RNA to various functional classes of loci in the genome: coding, intronic, UTR, intergenic, ribosomal.
+
+@dataset_collections@
+
+-----
+
+.. class:: warningmark
+
+**Obtaining gene annotations in refFlat format**
+
+This tool requires gene annotations in refFlat_ format. These data can be obtained from UCSC table browser directly through Galaxy by following these steps:
+
+ 1. Click on **Get Data** in the upper part of left pane of Galaxy interface
+ 2. Click on **UCSC Main** link
+ 3. Set your genome and dataset of interest. It **must** be the same genome build against which you have mapped the reads contained in the BAM file you are analyzing
+ 4. In the **output format** field choose **selected fields from primary and related tables**
+ 5. Click **get output** button
+ 6. In the first table presented at the top of the page select (using checkboxes) first 11 fields:
+ name
+ chrom
+ strand
+ txStart
+ txEnd
+ cdsStart
+ cdsEnd
+ exonCount
+ exonStarts
+ exonEnds
+ proteinId
+ 7. Click **done with selection**
+ 8. Click **Send query to Galaxy**
+ 9. A new dataset will appear in the current Galaxy history
+ 10. Use this dataset as the input for **Gene annotations in refFlat form** dropdown of this tool
+
+.. _refFlat: http://genome.ucsc.edu/goldenPath/gbdDescriptionsOld.html#RefFlat
+
+@description@
+
+ REF_FLAT=File Gene annotations in refFlat form. Format described here:
+ http://genome.ucsc.edu/goldenPath/gbdDescriptionsOld.html#RefFlat Required.
+
+ RIBOSOMAL_INTERVALS=File Location of rRNA sequences in genome, in interval_list format. If not specified no bases
+ will be identified as being ribosomal. Format described here:
+ http://picard.sourceforge.net/javadoc/net/sf/picard/util/IntervalList.html and can be
+ generated from BED datasets using Galaxy's wrapper for picard_BedToIntervalList tool
+
+ STRAND_SPECIFICITY=StrandSpecificity
+ STRAND=StrandSpecificity For strand-specific library prep. For unpaired reads, use FIRST_READ_TRANSCRIPTION_STRAND
+ if the reads are expected to be on the transcription strand. Required. Possible values:
+ {NONE, FIRST_READ_TRANSCRIPTION_STRAND, SECOND_READ_TRANSCRIPTION_STRAND}
+
+ MINIMUM_LENGTH=Integer When calculating coverage based values (e.g. CV of coverage) only use transcripts of this
+ length or greater. Default value: 500.
+
+ IGNORE_SEQUENCE=String If a read maps to a sequence specified with this option, all the bases in the read are
+ counted as ignored bases.
+
+ RRNA_FRAGMENT_PERCENTAGE=Double
+ This percentage of the length of a fragment must overlap one of the ribosomal intervals
+ for a read or read pair in order to be considered rRNA. Default value: 0.8.
+
+ METRIC_ACCUMULATION_LEVEL=MetricAccumulationLevel
+ LEVEL=MetricAccumulationLevel The level(s) at which to accumulate metrics. Possible values: {ALL_READS, SAMPLE,
+ LIBRARY, READ_GROUP} This option may be specified 0 or more times.
+
+ ASSUME_SORTED=Boolean
+ AS=Boolean If true (default), then the sort order in the header file will be ignored. Default
+ value: true. Possible values: {true, false}
+
+@more_info@
+
+
+
diff -r ab1f60c26526 -r 3d4f1fa26f0e picard_CollectWgsMetrics.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/picard_CollectWgsMetrics.xml Tue Dec 16 19:03:21 2014 -0500
@@ -0,0 +1,111 @@
+
+ compute metrics for evaluating whole genome sequencing experiments
+
+ picard
+
+
+
+ picard_macros.xml
+
+
+
+ @java_options@
+ ##set up input files
+
+ #set $reference_fasta_filename = "localref.fa"
+
+ #if str( $reference_source.reference_source_selector ) == "history":
+ ln -s "${reference_source.ref_file}" "${reference_fasta_filename}" &&
+ #else:
+ #set $reference_fasta_filename = str( $reference_source.ref_file.fields.path )
+ #end if
+
+ java -jar \$JAVA_JAR_PATH/picard.jar
+ CollectWgsMetrics
+ INPUT="${inputFile}"
+ OUTPUT="${outFile}"
+ REFERENCE_SEQUENCE="${reference_fasta_filename}"
+ MINIMUM_MAPPING_QUALITY="${minimum_mapping_quality}"
+ MINIMUM_BASE_QUALITY="${minimum_base_quality}"
+ COVERAGE_CAP="${coverage_cap}"
+
+ VALIDATION_STRINGENCY="${validation_stringency}"
+ QUIET=true
+ VERBOSITY=ERROR
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+**Purpose**
+
+Computes a number of metrics that are useful for evaluating coverage and performance of whole genome sequencing experiments.
+
+@dataset_collections@
+
+@description@
+
+ MINIMUM_MAPPING_QUALITY=Integer
+ MQ=Integer Minimum mapping quality for a read to contribute coverage. Default value: 20.
+
+ MINIMUM_BASE_QUALITY=Integer
+ Q=Integer Minimum base quality for a base to contribute coverage. Default value: 20.
+
+ COVERAGE_CAP=Integer
+ CAP=Integer Treat bases with coverage exceeding this value as if they had coverage at this value.
+ Default value: 250.
+
+@more_info@
+
+
+
+
+
diff -r ab1f60c26526 -r 3d4f1fa26f0e picard_DownsampleSam.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/picard_DownsampleSam.xml Tue Dec 16 19:03:21 2014 -0500
@@ -0,0 +1,79 @@
+
+ Downsample a file to retain a subset of the reads
+
+ picard
+
+
+
+ picard_macros.xml
+
+
+
+ @java_options@
+ java -jar
+ \$JAVA_JAR_PATH/picard.jar
+ DownsampleSam
+ INPUT="${inputFile}"
+ OUTPUT="${outFile}"
+ PROBABILITY=${probability}
+ RANDOM_SEED=${seed}
+ QUIET=true
+ VERBOSITY=ERROR
+ VALIDATION_STRINGENCY=${validation_stringency}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+**Purpose**
+
+Randomly down-sample a SAM or BAM file to retain a random subset of the reads. Mate-pairs are either both kept or both discarded. Reads marked as not primary alignments are all discarded. Each read is given a probability P of being retained - results with the exact same input in the same order and with the same value for RANDOM_SEED will produce the same results.
+
+@dataset_collections@
+
+@description@
+
+ INPUT=File
+ I=File The input SAM or BAM file to downsample. Required.
+
+ OUTPUT=File
+ O=File The output, downsampled, SAM or BAM file to write. Required.
+
+ RANDOM_SEED=Long
+ R=Long Random seed to use if reproducibility is desired. Setting to null will cause multiple
+ invocations to produce different results.
+
+ PROBABILITY=Double
+ P=Double The probability of keeping any individual read, between 0 and 1.
+
+
+
+@more_info@
+
+
diff -r ab1f60c26526 -r 3d4f1fa26f0e picard_EstimateLibraryComplexity.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/picard_EstimateLibraryComplexity.xml Tue Dec 16 19:03:21 2014 -0500
@@ -0,0 +1,135 @@
+
+ assess sequence library complexity from read sequences
+
+ picard
+
+
+
+ picard_macros.xml
+
+
+
+ @java_options@
+
+ java -jar \$JAVA_JAR_PATH/picard.jar
+ EstimateLibraryComplexity
+
+ INPUT="${inputFile}"
+ OUTPUT="${outFile}"
+
+ MIN_IDENTICAL_BASES="${min_identical_bases}"
+ MAX_DIFF_RATE="${max_diff_rate}"
+ MIN_MEAN_QUALITY="${min_mean_quality}"
+ MAX_GROUP_RATIO="${max_group_ratio}"
+ READ_NAME_REGEX="${read_name_regex}"
+ OPTICAL_DUPLICATE_PIXEL_DISTANCE="${optical_duplicate_pixel_distance}"
+
+ VALIDATION_STRINGENCY="${validation_stringency}"
+ QUIET=true
+ VERBOSITY=ERROR
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**Purpose**
+
+Attempts to estimate library complexity from sequence of read pairs alone. Does so by sorting all reads by the first N bases (5 by default)
+of each read and then comparing reads with the first N bases identical to each other for duplicates. Reads are considered to be duplicates
+if they match each other with no gaps and an overall mismatch rate less than or equal to MAX_DIFF_RATE (0.03 by default).
+
+Reads of poor quality are filtered out so as to provide a more accurate estimate. The filtering removes reads with any no-calls in the first
+N bases or with a mean base quality lower than MIN_MEAN_QUALITY across either the first or second read.
+
+Unpaired reads are ignored in this computation.
+The algorithm attempts to detect optical duplicates separately from PCR duplicates and excludes these in the calculation of library size.
+
+Also, since there is no alignment to screen out technical reads one further filter is applied on the data. After examining all reads a Histogram
+is built of [#reads in duplicate set -> #of duplicate sets]; all bins that contain exactly one duplicate set are then removed from the Histogram
+as outliers before library size is estimated.
+
+@dataset_collections@
+
+@description@
+
+ MIN_IDENTICAL_BASES=Integer The minimum number of bases at the starts of reads that must be identical for reads to be
+ grouped together for duplicate detection. In effect total_reads / 4^max_id_bases reads
+ will be compared at a time, so lower numbers will produce more accurate results but
+ consume exponentially more memory and CPU. Default value: 5.
+
+ MAX_DIFF_RATE=Double The maximum rate of differences between two reads to call them identical. Default value:
+ 0.03.
+
+ MIN_MEAN_QUALITY=Integer The minimum mean quality of the bases in a read pair for the read to be analyzed. Reads
+ with lower average quality are filtered out and not considered in any calculations.
+ Default value: 20.
+
+ MAX_GROUP_RATIO=Integer Do not process self-similar groups that are this many times over the mean expected group
+ size. I.e. if the input contains 10m read pairs and MIN_IDENTICAL_BASES is set to 5, then
+ the mean expected group size would be approximately 10 reads. Default value: 500.
+
+ READ_NAME_REGEX=String Regular expression that can be used to parse read names in the incoming SAM file. Read
+ names are parsed to extract three variables: tile/region, x coordinate and y coordinate.
+ These values are used to estimate the rate of optical duplication in order to give a more
+ accurate estimated library size. Set this option to null to disable optical duplicate
+ detection. The regular expression should contain three capture groups for the three
+ variables, in order. It must match the entire read name. Note that if the default regex
+ is specified, a regex match is not actually done, but instead the read name is split on
+ colon character. For 5 element names, the 3rd, 4th and 5th elements are assumed to be
+ tile, x and y values. For 7 element names (CASAVA 1.8), the 5th, 6th, and 7th elements
+ are assumed to be tile, x and y values. Default value:
+ [a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*.
+
+ OPTICAL_DUPLICATE_PIXEL_DISTANCE=Integer
+ The maximum offset between two duplicate clusters in order to consider them optical
+ duplicates. This should usually be set to some fairly small number (e.g. 5-10 pixels)
+ unless using later versions of the Illumina pipeline that multiply pixel values by 10, in
+ which case 50-100 is more normal. Default value: 100.
+
+
+@more_info@
+
+
+
+
+
diff -r ab1f60c26526 -r 3d4f1fa26f0e picard_FastqToSam.xml
--- a/picard_FastqToSam.xml Fri Feb 21 12:07:49 2014 -0500
+++ b/picard_FastqToSam.xml Tue Dec 16 19:03:21 2014 -0500
@@ -1,145 +1,230 @@
-
- creates an unaligned BAM file
- picard
-
- java -XX:DefaultMaxRAMFraction=1 -XX:+UseParallelGC
- -jar "\$JAVA_JAR_PATH/FastqToSam.jar"
- FASTQ="${input_fastq1}"
- #if str( $input_fastq2) != "None":
- FASTQ2="${input_fastq2}"
+
+ convert Fastq data into unaligned BAM
+
+ picard
+
+
+
+ picard_macros.xml
+
+
+
+ @java_options@
+
+ java -jar \$JAVA_JAR_PATH/picard.jar
+ FastqToSam
+
+ #if str( $input_type.input_type_selector ) == "se":
+ FASTQ="${input_type.fastq}"
+ #elif str( $input_type.input_type_selector ) == "pe":
+ FASTQ="${input_type.fastq}"
+ FASTQ2="${input_type.fastq2}"
+ #else
+ FASTQ="${input_type.fastq.forward}"
+ FASTQ2="${input_type.fastq.reverse}"
#end if
- QUALITY_FORMAT="${ dict( fastqsanger='Standard', fastqcssanger='Standard', fastqillumina='Illumina', fastqsolexa='Solexa' )[ $input_fastq1.ext ] }" ##Solexa, Illumina, Standard
- OUTPUT="${output_bam}"
+
+ QUALITY_FORMAT="${quality_format}"
+ OUTPUT="${outFile}"
READ_GROUP_NAME="${read_group_name}"
- SAMPLE_NAME="${sample_name}"
- #if $param_type.param_type_selector == "advanced":
- #if str( $param_type.library_name ) != "":
- LIBRARY_NAME="${param_type.library_name}"
- #end if
- #if str( $param_type.platform_unit ) != "":
- PLATFORM_UNIT="${param_type.platform_unit}"
- #end if
- #if str( $param_type.platform ) != "":
- PLATFORM="${param_type.platform}"
- #end if
- #if str( $param_type.sequencing_center ) != "":
- SEQUENCING_CENTER="${param_type.sequencing_center}"
- #end if
- #if str( $param_type.predicted_insert_size ) != "":
- PREDICTED_INSERT_SIZE="${param_type.predicted_insert_size}"
- #end if
- #if str( $param_type.description.value ) != "":
- DESCRIPTION="${param_type.description}"
- #end if
- #if str( $param_type.run_date ) != "":
- RUN_DATE="${param_type.run_date}"
- #end if
- #if str( $param_type.min_q ) != "":
- MIN_Q="${param_type.min_q}"
- #end if
- #if str( $param_type.max_q ) != "":
- MAX_Q="${param_type.max_q}"
- #end if
- SORT_ORDER="${param_type.sort_order}"
- #else:
- SORT_ORDER=coordinate ##unsorted, queryname, coordinate; always use coordinate
+ SAMPLE_NAME="${sample_name}"
+
+ #if str( $library_name ):
+ LIBRARY_NAME="${library_name}"
+ #end if
+
+ #if str( $platform_unit ):
+ PLATFORM_UNIT="${platform_unit}"
+ #end if
+
+ #if str( $platform ):
+ PLATFORM="${platform}"
+ #end if
+
+ #if str( $sequencing_center ):
+ SEQUENCING_CENTER="${sequencing_center}"
+ #end if
+
+ #if str( $predicted_insert_size ):
+ PREDICTED_INSERT_SIZE="${predicted_insert_size}"
#end if
- 2>&1
- || echo "Error running Picard FastqToSAM" >&2
+
+ #if str( $comment ):
+ COMMENT="${comment}"
+ #end if
+
+ #if str( $description ):
+ DESCRIPTION="${description}"
+ #end if
+
+ #if str( $run_date ):
+ RUN_DATE="${run_date}"
+ #end if
+
+ MIN_Q="${min_q}"
+ MAX_Q="${max_q}"
+ STRIP_UNPAIRED_MATE_NUMBER="${strip_unpairied_mate_number}"
+ ALLOW_AND_IGNORE_EMPTY_LINES="${allow_and_ignore_empty_lines}"
+
+ SORT_ORDER=coordinate
+ VALIDATION_STRINGENCY="${validation_stringency}"
+ QUIET=true
+ VERBOSITY=ERROR
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
-
-
+
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
+
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-**What it does**
+
+.. class:: infomark
+
+**Purpose**
+
+Converts Fastq datasets (single end or paired end) into an unaligned BAM dataset.
+
+@dataset_collections@
+
+@RG@
+
+@description@
-Picard: FastqToSam converts FASTQ files to unaligned BAM files.
+ FASTQ=File
+ F1=File Input fastq file for single end data, or first read in paired end
+ data. Required.
+
+ FASTQ2=File
+ F2=File Input fastq file for the second read of paired end data (if used).
+
+ QUALITY_FORMAT=FastqQualityFormat
+ V=FastqQualityFormat A value describing how the quality values are encoded in the fastq. Either Solexa for
+ pre-pipeline 1.3 style scores (solexa scaling + 66), Illumina for pipeline 1.3 and above
+ (phred scaling + 64) or Standard for phred scaled scores with a character shift of 33.
+ If this value is not specified, the quality format will be detected automatically.
+ Default value: null. Possible values: {Solexa, Illumina, Standard}
-------
+ READ_GROUP_NAME=String
+ RG=String Read group name Default value: A.
+
+ SAMPLE_NAME=String
+ SM=String Sample name to insert into the read group header Required.
+
+ LIBRARY_NAME=String
+ LB=String The library name to place into the LB attribute in the read group header.
+
+ PLATFORM_UNIT=String
+ PU=String The platform unit (often run_barcode.lane) to insert into the read group header.
+
+ PLATFORM=String
+ PL=String The platform type (e.g. illumina, solid) to insert into the read group header.
+
+ SEQUENCING_CENTER=String
+ CN=String The sequencing center from which the data originated.
+
+ PREDICTED_INSERT_SIZE=Integer
+ PI=Integer Predicted median insert size, to insert into the read group header.
+
+ COMMENT=String
+ CO=String Comment to include in the merged output file's header.
+
+ DESCRIPTION=String
+ DS=String Inserted into the read group header.
+
+ RUN_DATE=Iso8601Date
+ DT=Iso8601Date Date the run was produced, to insert into the read group header.
+
+ MIN_Q=Integer Minimum quality allowed in the input fastq. An exception will be thrown if a quality is
+ less than this value. Default value: 0.
+
+ MAX_Q=Integer Maximum quality allowed in the input fastq. An exception will be thrown if a quality is
+ greater than this value. Default value: 93.
+
+ STRIP_UNPAIRED_MATE_NUMBER=Boolean
+ If true and this is an unpaired fastq any occurrence of '/1' will be removed from the end
+ of a read name. Default value: false. Possible values: {true, false}
+
+ ALLOW_AND_IGNORE_EMPTY_LINES=Boolean
+ Allow (and ignore) empty lines Default value: false. Possible values: {true, false}
+
-Please cite the website "http://picard.sourceforge.net".
+@more_info@
-------
+
+
-**Input formats**
-
-FastqToSam accepts FASTQ input files. If using paired-end data, you should select two FASTQ files.
-
-------
-
-**Outputs**
-
-The output is in BAM format, see http://samtools.sourceforge.net for more details.
-
--------
-
-**FastqToSam settings**
-
-This is list of FastqToSam options::
-
- READ_GROUP_NAME=String Read group name Default value: A. This option can be set to 'null' to clear the default value.
- SAMPLE_NAME=String Sample name to insert into the read group header Required.
- LIBRARY_NAME=String The library name to place into the LB attribute in the read group header Default value: null.
- PLATFORM_UNIT=String The platform unit (often run_barcode.lane) to insert into the read group header Default value: null.
- PLATFORM=String The platform type (e.g. illumina, solid) to insert into the read group header Default value: null.
- SEQUENCING_CENTER=String The sequencing center from which the data originated Default value: null.
- PREDICTED_INSERT_SIZE=Integer Predicted median insert size, to insert into the read group header Default value: null.
- DESCRIPTION=String Inserted into the read group header Default value: null.
-
-
diff -r ab1f60c26526 -r 3d4f1fa26f0e picard_FilterSamReads.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/picard_FilterSamReads.xml Tue Dec 16 19:03:21 2014 -0500
@@ -0,0 +1,134 @@
+
+ include or exclude aligned and unaligned reads and read lists
+
+ picard
+
+
+
+ picard_macros.xml
+
+
+
+ @java_options@
+
+ ##Sam Sorting is performed here because FilterSamReads requires input to be in query-sorted order
+
+ java -jar \$JAVA_JAR_PATH/picard.jar
+ SortSam
+ INPUT="${inputFile}"
+ OUTPUT=query_sorted_bam.bam
+ SORT_ORDER=queryname
+ VALIDATION_STRINGENCY=LENIENT
+ QUIET=true
+ VERBOSITY=ERROR
+
+ &&
+
+ java -jar \$JAVA_JAR_PATH/picard.jar
+ FilterSamReads
+ INPUT=query_sorted_bam.bam
+ FILTER="${filter_type.filter}"
+
+ #if ( str( $filter_type.filter ) == "includeReadList" or str( $filter_type.filter ) == "excludeReadList" ):
+ READ_LIST_FILE="${filter_type.read_list_file}"
+ #end if
+
+ OUTPUT="${outFile}"
+ SORT_ORDER=coordinate
+ VALIDATION_STRINGENCY="${validation_stringency}"
+ QUIET=true
+ VERBOSITY=ERROR
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**Purpose**
+
+Produces a new SAM or BAM dataset by including or excluding aligned reads, or reads named in a read list, from the input dataset.
+
+------
+
+.. class:: warningmark
+
+**Warning on using this tool on BWA-MEM output**
+
+This tool will likely fail on BAM datasets generated by BWA MEM as it generates partial read alignments.
+
+@dataset_collections@
+
+@description@
+
+ FILTER=Filter Filter. Required. Possible values:
+ includeAligned [OUTPUT SAM/BAM will contain aligned
+ reads only. (Note that *both* first and
+ second of paired reads must be aligned to be included
+ in the OUTPUT SAM or BAM)],
+
+ excludeAligned [OUTPUT SAM/BAM will contain un-mapped reads only.
+ (Note that *both* first and second of pair must be aligned to be
+ excluded from the OUTPUT SAM or BAM)]
+
+ includeReadList [OUTPUT SAM/BAM will contain reads
+ that are supplied in the READ_LIST_FILE file]
+
+ excludeReadList [OUTPUT bam will contain
+ reads that are *not* supplied in the READ_LIST_FILE file]}
+
+ READ_LIST_FILE=File
+ RLF=File Read List File containing reads that will be included or excluded from the OUTPUT SAM or
+ BAM file. Default value: null.
+
+@more_info@
+
+
+
+
+
diff -r ab1f60c26526 -r 3d4f1fa26f0e picard_FixMateInformation.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/picard_FixMateInformation.xml Tue Dec 16 19:03:21 2014 -0500
@@ -0,0 +1,85 @@
+
+ ensure that all mate-pair information is in sync between each read and its mate pair
+
+ picard
+
+
+
+ picard_macros.xml
+
+
+
+ @java_options@
+
+ java -jar \$JAVA_JAR_PATH/picard.jar
+ FixMateInformation
+ INPUT="${inputFile}"
+ OUTPUT="${outFile}"
+ ASSUME_SORTED=${assume_sorted}
+ ADD_MATE_CIGAR=${add_mate_cigar}
+
+ SORT_ORDER=coordinate
+ VALIDATION_STRINGENCY="${validation_stringency}"
+ QUIET=true
+ VERBOSITY=ERROR
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**Purpose**
+
+Ensure that all mate-pair information is in sync between each read and its mate pair. Reads marked with the secondary alignment flag are written to the output file unchanged.
+
+------
+
+.. class:: warningmark
+
+**Warning on using ASSUME_SORTED option**
+
+Datasets imported into Galaxy are automatically coordinate sorted. So use this option (set it to True) only if you are sure that this is necessary. If you are not sure - a good rule of thumb
+is to assume that the BAM you are working with is coordinate sorted.
+
+@dataset_collections@
+
+@description@
+
+ ASSUME_SORTED=Boolean
+ AS=Boolean If true, assume that the input file is queryname sorted, even if the header says
+ otherwise. Default value: false.
+
+ ADD_MATE_CIGAR=Boolean
+ MC=Boolean Adds the mate CIGAR tag (MC) if true, does not if false. Default value: true.
+
+@more_info@
+
+
+
+
+
diff -r ab1f60c26526 -r 3d4f1fa26f0e picard_MarkDuplicates.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/picard_MarkDuplicates.xml Tue Dec 16 19:03:21 2014 -0500
@@ -0,0 +1,130 @@
+
+ examine aligned records in BAM datasets to locate duplicate molecules
+
+ picard
+
+
+
+ picard_macros.xml
+
+
+
+ @java_options@
+
+ java -jar \$JAVA_JAR_PATH/picard.jar
+ MarkDuplicates
+
+ INPUT="${inputFile}"
+ OUTPUT="${outFile}"
+
+ METRICS_FILE="${metrics_file}"
+ #for $element in $comments:
+ COMMENT="${element.comment}"
+ #end for
+ REMOVE_DUPLICATES="${remove_duplicates}"
+ ASSUME_SORTED="${assume_sorted}"
+
+ DUPLICATE_SCORING_STRATEGY="${duplicate_scoring_strategy}"
+
+ READ_NAME_REGEX="${read_name_regex}"
+ OPTICAL_DUPLICATE_PIXEL_DISTANCE="${optical_duplicate_pixel_distance}"
+
+ VALIDATION_STRINGENCY="${validation_stringency}"
+ QUIET=true
+ VERBOSITY=ERROR
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**Purpose**
+
+Examines aligned records in the supplied SAM or BAM dataset to locate duplicate molecules. All records are then written to the output file with the duplicate records flagged.
+
+@dataset_collections@
+
+@description@
+
+ COMMENT=String
+ CO=String Comment(s) to include in the output file's header. This option may be specified 0 or
+ more times.
+
+ REMOVE_DUPLICATES=Boolean If true do not write duplicates to the output file instead of writing them with
+ appropriate flags set. Default value: false.
+
+ READ_NAME_REGEX=String Regular expression that can be used to parse read names in the incoming SAM file. Read
+ names are parsed to extract three variables: tile/region, x coordinate and y coordinate.
+ These values are used to estimate the rate of optical duplication in order to give a more
+ accurate estimated library size. Set this option to null to disable optical duplicate
+ detection. The regular expression should contain three capture groups for the three
+ variables, in order. It must match the entire read name. Note that if the default regex
+ is specified, a regex match is not actually done, but instead the read name is split on
+ colon character. For 5 element names, the 3rd, 4th and 5th elements are assumed to be
+ tile, x and y values. For 7 element names (CASAVA 1.8), the 5th, 6th, and 7th elements
+ are assumed to be tile, x and y values. Default value:
+ [a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*.
+
+ DUPLICATE_SCORING_STRATEGY=ScoringStrategy
+ DS=ScoringStrategy The scoring strategy for choosing the non-duplicate among candidates. Default value:
+ SUM_OF_BASE_QUALITIES. Possible values: {SUM_OF_BASE_QUALITIES, TOTAL_MAPPED_REFERENCE_LENGTH}
+
+ OPTICAL_DUPLICATE_PIXEL_DISTANCE=Integer
+ The maximum offset between two duplicate clusters in order to consider them optical
+ duplicates. This should usually be set to some fairly small number (e.g. 5-10 pixels)
+ unless using later versions of the Illumina pipeline that multiply pixel values by 10, in
+ which case 50-100 is more normal. Default value: 100.
+
+@more_info@
+
+
+
+
+
diff -r ab1f60c26526 -r 3d4f1fa26f0e picard_MarkDuplicatesWithMateCigar.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/picard_MarkDuplicatesWithMateCigar.xml Tue Dec 16 19:03:21 2014 -0500
@@ -0,0 +1,165 @@
+
+ examine aligned records in BAM datasets to locate duplicate molecules
+
+ picard
+
+
+
+ picard_macros.xml
+
+
+
+ @java_options@
+
+ java -jar \$JAVA_JAR_PATH/picard.jar
+ MarkDuplicatesWithMateCigar
+
+ INPUT="${inputFile}"
+ OUTPUT="${outFile}"
+
+ METRICS_FILE="${metrics_file}"
+ #for $element in $comments:
+ COMMENT="${element.comment}"
+ #end for
+
+ MINIMUM_DISTANCE="${minimum_distance}"
+ SKIP_PAIRS_WITH_NO_MATE_CIGAR="${skip_pairs_with_no_mate_cigar}"
+
+
+ REMOVE_DUPLICATES="${remove_duplicates}"
+ ASSUME_SORTED="${assume_sorted}"
+
+ DUPLICATE_SCORING_STRATEGY="${duplicate_scoring_strategy}"
+
+ READ_NAME_REGEX="${read_name_regex}"
+ OPTICAL_DUPLICATE_PIXEL_DISTANCE="${optical_duplicate_pixel_distance}"
+
+
+ BLOCK_SIZE=100000
+ VALIDATION_STRINGENCY="${validation_stringency}"
+ QUIET=true
+ VERBOSITY=ERROR
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**Purpose**
+
+Examines aligned records in the supplied SAM or BAM dataset to locate duplicate molecules. All records are then written to the output file with the duplicate records flagged.
+
+------
+
+.. class:: warningmark
+
+On the difference between **MarkDuplicates** and **picard_MarkDuplicatesWithMateCigar**
+
+From Samtools Announce MailingList_:
+
+This tool can replace MarkDuplicates if the input SAM/BAM has Mate CIGAR (MC) optional tags pre-computed
+(see the tools RevertOriginalBaseQualitiesAndAddMateCigar and FixMateInformation). This allows the new tool
+to perform a streaming duplicate marking routine (i.e. a single-pass). This tool cannot be used with
+alignments that have large gaps or reference skips, which happens frequently in RNA-seq data.
+
+.. _MailingList: http://sourceforge.net/p/samtools/mailman/message/32910359/
+
+@dataset_collections@
+
+@description@
+
+ MINIMUM_DISTANCE=Integer The minimum distance to buffer records to account for clipping on the 5' end of the
+ records. Set this number to -1 to use twice the first read's read length (or 100,
+ whichever is smaller). Default value: -1. This option can be set to 'null' to clear the
+ default value.
+
+ SKIP_PAIRS_WITH_NO_MATE_CIGAR=Boolean
+ Skip record pairs with no mate cigar and include them in the output. Default value:
+ true. This option can be set to 'null' to clear the default value. Possible values:
+ {true, false}
+
+ COMMENT=String
+ CO=String Comment(s) to include in the output file's header. This option may be specified 0 or
+ more times.
+
+ REMOVE_DUPLICATES=Boolean If true do not write duplicates to the output file instead of writing them with
+ appropriate flags set. Default value: false.
+
+ READ_NAME_REGEX=String Regular expression that can be used to parse read names in the incoming SAM file. Read
+ names are parsed to extract three variables: tile/region, x coordinate and y coordinate.
+ These values are used to estimate the rate of optical duplication in order to give a more
+ accurate estimated library size. Set this option to null to disable optical duplicate
+ detection. The regular expression should contain three capture groups for the three
+ variables, in order. It must match the entire read name. Note that if the default regex
+ is specified, a regex match is not actually done, but instead the read name is split on
+ colon character. For 5 element names, the 3rd, 4th and 5th elements are assumed to be
+ tile, x and y values. For 7 element names (CASAVA 1.8), the 5th, 6th, and 7th elements
+ are assumed to be tile, x and y values. Default value:
+ [a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*.
+
+ DUPLICATE_SCORING_STRATEGY=ScoringStrategy
+ DS=ScoringStrategy The scoring strategy for choosing the non-duplicate among candidates. Default value:
+ TOTAL_MAPPED_REFERENCE_LENGTH. Possible values: {SUM_OF_BASE_QUALITIES, TOTAL_MAPPED_REFERENCE_LENGTH}
+
+ OPTICAL_DUPLICATE_PIXEL_DISTANCE=Integer
+ The maximum offset between two duplicate clusters in order to consider them optical
+ duplicates. This should usually be set to some fairly small number (e.g. 5-10 pixels)
+ unless using later versions of the Illumina pipeline that multiply pixel values by 10, in
+ which case 50-100 is more normal. Default value: 100.
+
+@more_info@
+
+
+
+
+
diff -r ab1f60c26526 -r 3d4f1fa26f0e picard_MeanQualityByCycle.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/picard_MeanQualityByCycle.xml Tue Dec 16 19:03:21 2014 -0500
@@ -0,0 +1,113 @@
+
+ chart distribution of base qualities
+
+ picard
+
+
+
+ picard_macros.xml
+
+
+
+ @java_options@
+ ##set up input files
+
+ #set $reference_fasta_filename = "localref.fa"
+
+ #if str( $reference_source.reference_source_selector ) == "history":
+ ln -s "${reference_source.ref_file}" "${reference_fasta_filename}" &&
+ #else:
+ #set $reference_fasta_filename = str( $reference_source.ref_file.fields.path )
+ #end if
+
+ java -jar \$JAVA_JAR_PATH/picard.jar
+ MeanQualityByCycle
+ INPUT="${inputFile}"
+ OUTPUT="${outFile}"
+ CHART_OUTPUT="${pdfFile}"
+ REFERENCE_SEQUENCE="${reference_fasta_filename}"
+ ALIGNED_READS_ONLY="${aligned_reads_only}"
+ PF_READS_ONLY="${pf_reads_only}"
+
+ ASSUME_SORTED="${assume_sorted}"
+
+ VALIDATION_STRINGENCY="${validation_stringency}"
+ QUIET=true
+ VERBOSITY=ERROR
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+**Purpose**
+
+Program to chart the distribution of base qualities by cycle within reads supplied in a SAM or BAM dataset.
+
+@dataset_collections@
+
+@description@
+
+ ALIGNED_READS_ONLY=Boolean If set to true, calculate the base distribution over aligned reads only. Default value:
+ false. Possible values: {true, false}
+
+ PF_READS_ONLY=Boolean If set to true calculate the base distribution over PF reads only. Default value: false.
+ This option can be set to 'null' to clear the default value. Possible values: {true,
+ false}
+
+ ASSUME_SORTED=Boolean
+ AS=Boolean If true (default), then the sort order in the header file will be ignored. Default: True
+
+@more_info@
+
+
+
+
+
diff -r ab1f60c26526 -r 3d4f1fa26f0e picard_MergeBamAlignment.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/picard_MergeBamAlignment.xml Tue Dec 16 19:03:21 2014 -0500
@@ -0,0 +1,329 @@
+
+ merge alignment data with additional info stored in an unmapped BAM dataset
+ picard
+
+
+ picard_macros.xml
+
+
+
+ @java_options@
+ #set $picard_dict = "localref.dict"
+ #set $ref_fasta = "localref.fa" ## This is done because picard "likes" .fa extension
+
+ ln -s "${reference_source.ref_file}" "${ref_fasta}" &&
+
+ #if str( $reference_source.reference_source_selector ) == "history":
+
+ java -jar \$JAVA_JAR_PATH/picard.jar CreateSequenceDictionary REFERENCE="${ref_fasta}" OUTPUT="${picard_dict}"
+ QUIET=true
+ VERBOSITY=ERROR
+
+ &&
+
+ #else:
+
+ #set $ref_fasta = str( $reference_source.ref_file.fields.path )
+
+ #end if
+
+ java -jar \$JAVA_JAR_PATH/picard.jar
+ MergeBamAlignment
+ UNMAPPED_BAM="${unmapped_bam}"
+
+ PAIRED_RUN=true ##This argument is ignored and will be removed. Required. Possible values: {true, false}
+ ## Emit one aligned-input argument per repeat entry; the option name used depends on the selector value.
+ #if str( $aligned_or_read1_and_read2.aligned_or_read1_and_read2_selector ) == "paired_one_file":
+ #for $dataset in $aligned_or_read1_and_read2.aligned_bams:
+ ALIGNED_BAM="${dataset.aligned_bam}"
+ #end for
+ #elif str( $aligned_or_read1_and_read2.aligned_or_read1_and_read2_selector ) == "paired_two_files":
+ #for $dataset in $aligned_or_read1_and_read2.read1_aligned_bams:
+ READ1_ALIGNED_BAM="${dataset.read1_aligned_bam}"
+ #end for
+ #for $dataset in $aligned_or_read1_and_read2.read2_aligned_bams:
+ READ2_ALIGNED_BAM="${dataset.read2_aligned_bam}" ## NOTE(review): previously read1_aligned_bam (copy-paste); confirm the param name inside the read2_aligned_bams repeat
+ #end for
+ #else
+ #for $dataset in $aligned_or_read1_and_read2.read1_aligned_bams:
+ READ1_ALIGNED_BAM="${dataset.read1_aligned_bam}"
+ #end for
+ #end if
+
+ OUTPUT="${outFile}"
+ REFERENCE_SEQUENCE="${ref_fasta}"
+
+ CLIP_ADAPTERS="${clip_adapters}"
+ IS_BISULFITE_SEQUENCE="${is_bisulfite_sequence}"
+ ALIGNED_READS_ONLY="${aligned_reads_only}"
+ MAX_INSERTIONS_OR_DELETIONS="${max_insertions_or_deletions}"
+
+ #for $attribute in $attributes_to_retain:
+ ATTRIBUTES_TO_RETAIN="${$attribute.attribute}"
+ #end for
+
+ #for $attribute in $attributes_to_remove:
+ ATTRIBUTES_TO_REMOVE="${$attribute.attribute}"
+ #end for
+
+ READ1_TRIM="${read1_trim}"
+ READ2_TRIM="${read2_trim}"
+
+ #if str( $orientations ) != "None":
+ #for $orientation in str( $orientations ).split(','): ## See trello card https://trello.com/c/9nW02Zhd
+ EXPECTED_ORIENTATIONS="${orientation}"
+ #end for
+ #end if
+
+ ALIGNER_PROPER_PAIR_FLAGS="${aligner_proper_pair_flags}"
+ PRIMARY_ALIGNMENT_STRATEGY="${primary_alignment_strategy}"
+ CLIP_OVERLAPPING_READS="${clip_overlapping_reads}"
+ INCLUDE_SECONDARY_ALIGNMENTS="${include_secondary_alignments}"
+ ADD_MATE_CIGAR="${add_mate_cigar}"
+
+ VALIDATION_STRINGENCY="${validation_stringency}"
+
+ SORT_ORDER=coordinate
+ QUIET=true
+ VERBOSITY=ERROR
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+**Purpose**
+
+Merges alignment data from a SAM or BAM dataset with additional data stored in an unmapped BAM dataset and produces a third SAM or BAM dataset of aligned and unaligned reads.
+
+@dataset_collections@
+
+@description@
+
+ UNMAPPED_BAM=File
+ UNMAPPED=File Original SAM or BAM file of unmapped reads, which must be in queryname order. Required.
+
+ ALIGNED_BAM=File
+ ALIGNED=File SAM or BAM file(s) with alignment data. This option may be specified 0 or more times.
+ Cannot be used in conjunction with option(s) READ1_ALIGNED_BAM (R1_ALIGNED)
+ READ2_ALIGNED_BAM (R2_ALIGNED)
+
+ READ1_ALIGNED_BAM=File
+ R1_ALIGNED=File SAM or BAM file(s) with alignment data from the first read of a pair. This option may be
+ specified 0 or more times. Cannot be used in conjunction with option(s) ALIGNED_BAM
+ (ALIGNED)
+
+ READ2_ALIGNED_BAM=File
+ R2_ALIGNED=File SAM or BAM file(s) with alignment data from the second read of a pair. This option may
+ be specified 0 or more times. Cannot be used in conjunction with option(s) ALIGNED_BAM
+ (ALIGNED)
+
+ PAIRED_RUN=Boolean
+ PE=Boolean This argument is ignored and will be removed. Required. Possible values: {true, false}
+
+ JUMP_SIZE=Integer
+ JUMP=Integer The expected jump size (required if this is a jumping library). Deprecated. Use
+ EXPECTED_ORIENTATIONS instead Default value: null. Cannot be used in conjunction with
+ option(s) EXPECTED_ORIENTATIONS (ORIENTATIONS)
+
+ CLIP_ADAPTERS=Boolean Whether to clip adapters where identified. Default value: true. Possible values: {true, false}
+
+ IS_BISULFITE_SEQUENCE=Boolean Whether the lane is bisulfite sequence (used when calculating the NM tag). Default value:
+ false. Possible values: {true, false}
+
+ ALIGNED_READS_ONLY=Boolean Whether to output only aligned reads. Default value: false. Possible values: {true, false}
+
+ MAX_INSERTIONS_OR_DELETIONS=Integer
+ MAX_GAPS=Integer The maximum number of insertions or deletions permitted for an alignment to be included.
+ Alignments with more than this many insertions or deletions will be ignored. Set to -1 to
+ allow any number of insertions or deletions. Default value: 1.
+
+ ATTRIBUTES_TO_RETAIN=String Reserved alignment attributes (tags starting with X, Y, or Z) that should be brought over
+ from the alignment data when merging. This option may be specified 0 or more times.
+
+ ATTRIBUTES_TO_REMOVE=String Attributes from the alignment record that should be removed when merging. This overrides
+ ATTRIBUTES_TO_RETAIN if they share common tags. This option may be specified 0 or more
+ times.
+
+ READ1_TRIM=Integer
+ R1_TRIM=Integer The number of bases trimmed from the beginning of read 1 prior to alignment Default
+ value: 0.
+
+ READ2_TRIM=Integer
+ R2_TRIM=Integer The number of bases trimmed from the beginning of read 2 prior to alignment Default
+ value: 0.
+
+ EXPECTED_ORIENTATIONS=PairOrientation
+ ORIENTATIONS=PairOrientation The expected orientation of proper read pairs. Replaces JUMP_SIZE Possible values: {FR,
+ RF, TANDEM} This option may be specified 0 or more times. Cannot be used in conjunction
+ with option(s) JUMP_SIZE (JUMP)
+
+ ALIGNER_PROPER_PAIR_FLAGS=Boolean
+ Use the aligner's idea of what a proper pair is rather than computing in this program.
+ Default value: false. Possible values: {true, false}
+
+ SORT_ORDER=SortOrder
+ SO=SortOrder The order in which the merged reads should be output. Default value: coordinate.
+ Possible values: {unsorted, queryname, coordinate}
+
+ PRIMARY_ALIGNMENT_STRATEGY=PrimaryAlignmentStrategy
+ Strategy for selecting primary alignment when the aligner has provided more than one
+ alignment for a pair or fragment, and none are marked as primary, more than one is marked
+ as primary, or the primary alignment is filtered out for some reason. BestMapq expects
+ that multiple alignments will be correlated with HI tag, and prefers the pair of
+ alignments with the largest MAPQ, in the absence of a primary selected by the aligner.
+ EarliestFragment prefers the alignment which maps the earliest base in the read. Note
+ that EarliestFragment may not be used for paired reads. BestEndMapq is appropriate for
+ cases in which the aligner is not pair-aware, and does not output the HI tag. It simply
+ picks the alignment for each end with the highest MAPQ, and makes those alignments
+ primary, regardless of whether the two alignments make sense together. MostDistant is also
+ for a non-pair-aware aligner, and picks the alignment pair with the largest insert size.
+ If all alignments would be chimeric, it picks the alignments for each end with the best
+ MAPQ. For all algorithms, ties are resolved arbitrarily. Default value: BestMapq.
+ Possible values: {BestMapq, EarliestFragment, BestEndMapq, MostDistant}
+
+ CLIP_OVERLAPPING_READS=Boolean For paired reads, soft clip the 3' end of each read if necessary so that it does not
+ extend past the 5' end of its mate. Default value: true. Possible values: {true, false}
+
+ INCLUDE_SECONDARY_ALIGNMENTS=Boolean
+ If false, do not write secondary alignments to output. Default value: true.
+ Possible values: {true, false}
+
+ ADD_MATE_CIGAR=Boolean
+ MC=Boolean Adds the mate CIGAR tag (MC) if true, does not if false. Possible values: {true, false}
+
+
+
+
+@more_info@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff -r ab1f60c26526 -r 3d4f1fa26f0e picard_MergeSamFiles.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/picard_MergeSamFiles.xml Tue Dec 16 19:03:21 2014 -0500
@@ -0,0 +1,96 @@
+
+ merges multiple SAM/BAM datasets into one
+
+ picard
+
+
+
+ picard_macros.xml
+
+
+
+ @java_options@
+
+ java -jar \$JAVA_JAR_PATH/picard.jar
+ MergeSamFiles
+
+ #for $element in $inputFile:
+ INPUT="${element}"
+ #end for
+
+ OUTPUT="${outFile}"
+ MERGE_SEQUENCE_DICTIONARIES="${merge_sequence_dictionaries}"
+
+ ASSUME_SORTED="${assume_sorted}"
+ #for $element in $comments:
+ COMMENT="${element.comment}"
+ #end for
+
+ USE_THREADING=true
+ SORT_ORDER=coordinate
+ VALIDATION_STRINGENCY="${validation_stringency}"
+ QUIET=true
+ VERBOSITY=ERROR
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**Purpose**
+
+Merges multiple SAM/BAM datasets into one.
+
+@dataset_collections@
+
+@description@
+
+ ASSUME_SORTED=Boolean
+ AS=Boolean If true, assume that the input files are in the same sort order as the requested output
+ sort order, even if their headers say otherwise. Default value: false. This option can
+ be set to 'null' to clear the default value. Possible values: {true, false}
+
+ MERGE_SEQUENCE_DICTIONARIES=Boolean
+ MSD=Boolean Merge the sequence dictionaries Default value: false. This option can be set to 'null'
+ to clear the default value. Possible values: {true, false}
+
+ COMMENT=String
+ CO=String Comment(s) to include in the merged output file's header. This option may be specified 0
+ or more times.
+
+
+@more_info@
+
+
+
+
+
diff -r ab1f60c26526 -r 3d4f1fa26f0e picard_NormalizeFasta.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/picard_NormalizeFasta.xml Tue Dec 16 19:03:21 2014 -0500
@@ -0,0 +1,74 @@
+
+ normalize fasta datasets
+
+ picard
+
+
+
+ picard_macros.xml
+
+
+
+ @java_options@
+
+
+ ## Two lines below are due to the fact that picard likes fasta files to have extension .fa
+ #set $fasta_file="local_fasta.fa"
+ ln -s "${inputFile}" "${fasta_file}" &&
+
+ java -jar \$JAVA_JAR_PATH/picard.jar
+ NormalizeFasta
+
+ INPUT="${fasta_file}"
+ OUTPUT="${outFile}"
+ LINE_LENGTH="${line_length}"
+ TRUNCATE_SEQUENCE_NAMES_AT_WHITESPACE="${truncate_sequence_names_at_whitespaces}"
+
+ QUIET=true
+ VERBOSITY=ERROR
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**Purpose**
+
+Takes any dataset that conforms to the fasta format and normalizes it so that all lines of sequence except the last line per named sequence are of the same length.
+
+@dataset_collections@
+
+@description@
+
+ LINE_LENGTH=Integer The line length to be used for the output fasta file. Default value: 100.
+
+ TRUNCATE_SEQUENCE_NAMES_AT_WHITESPACE=Boolean
+ Truncate sequence names at first whitespace. Default value: false. Possible values: {true, false}
+
+@more_info@
+
+
+
+
+
diff -r ab1f60c26526 -r 3d4f1fa26f0e picard_QualityScoreDistribution.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/picard_QualityScoreDistribution.xml Tue Dec 16 19:03:21 2014 -0500
@@ -0,0 +1,118 @@
+
+ chart quality score distribution
+
+ picard
+
+
+
+ picard_macros.xml
+
+
+
+ @java_options@
+ ##set up input files
+
+ #set $reference_fasta_filename = "localref.fa"
+
+ #if str( $reference_source.reference_source_selector ) == "history":
+ ln -s "${reference_source.ref_file}" "${reference_fasta_filename}" &&
+ #else:
+ #set $reference_fasta_filename = str( $reference_source.ref_file.fields.path )
+ #end if
+
+ java -jar \$JAVA_JAR_PATH/picard.jar
+ QualityScoreDistribution
+ INPUT="${inputFile}"
+ OUTPUT="${outFile}"
+ CHART_OUTPUT="${pdfFile}"
+ REFERENCE_SEQUENCE="${reference_fasta_filename}"
+ ALIGNED_READS_ONLY="${aligned_reads_only}"
+ PF_READS_ONLY="${pf_reads_only}"
+ INCLUDE_NO_CALLS="${include_no_calls}"
+
+ ASSUME_SORTED="${assume_sorted}"
+
+ VALIDATION_STRINGENCY="${validation_stringency}"
+ QUIET=true
+ VERBOSITY=ERROR
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+**Purpose**
+
+Program to chart quality score distributions in a SAM or BAM dataset.
+
+@dataset_collections@
+
+@description@
+
+ ALIGNED_READS_ONLY=Boolean If set to true, calculate the base distribution over aligned reads only. Default value:
+ false. Possible values: {true, false}
+
+ PF_READS_ONLY=Boolean If set to true calculate the base distribution over PF reads only. Default value: false.
+ Possible values: {true, false}
+
+ INCLUDE_NO_CALLS=Boolean If set to true, include quality for no-call bases in the distribution. Default value:
+ false. Possible values: {true, false}
+
+ ASSUME_SORTED=Boolean
+ AS=Boolean If true (default), then the sort order in the header file will be ignored. Default: True
+
+@more_info@
+
+
+
+
+
diff -r ab1f60c26526 -r 3d4f1fa26f0e picard_ReorderSam.xml
--- a/picard_ReorderSam.xml Fri Feb 21 12:07:49 2014 -0500
+++ b/picard_ReorderSam.xml Tue Dec 16 19:03:21 2014 -0500
@@ -1,155 +1,120 @@
-
- picard
-
- picard_wrapper.py
- --input="${inputFile}"
- #if $source.indexSource == "built-in"
- --ref="${source.ref.fields.path}"
- #else
- --ref-file="${refFile}"
- --species-name="${source.speciesName}"
- --build-name="${source.buildName}"
- --trunc-names="${source.truncateSeqNames}"
- #end if
- --allow-inc-dict-concord="${allowIncDictConcord}"
- --allow-contig-len-discord="${allowContigLenDiscord}"
- --output-format="${outputFormat}"
- --output="${outFile}"
- --tmpdir "${__new_file_path__}"
- -j "\$JAVA_JAR_PATH/ReorderSam.jar"
+
+ reorder reads to match ordering in reference sequences
+ picard
+
+
+ picard_macros.xml
+
+
+
+ @java_options@
+ #set $picard_dict = "localref.dict"
+ #set $ref_fasta = "localref.fa" ## This is done because picards "likes" .fa extension
+
+ ln -s "${reference_source.ref_file}" "${ref_fasta}" &&
+
+ #if str( $reference_source.reference_source_selector ) == "history":
+
+ java -jar \$JAVA_JAR_PATH/picard.jar CreateSequenceDictionary REFERENCE="${ref_fasta}" OUTPUT="${picard_dict}"
+ QUIET=true
+ VERBOSITY=ERROR
+
+ &&
+
+ #else:
+
+ #set $ref_fasta = str( $reference_source.ref_file.fields.path )
+
+ #end if
+
+ java -jar \$JAVA_JAR_PATH/picard.jar
+ ReorderSam
+ INPUT="${inputFile}"
+ OUTPUT="${outFile}"
+ REFERENCE="${ref_fasta}"
+ ALLOW_INCOMPLETE_DICT_CONCORDANCE="${allow_incomplete_dict_concordance}"
+ ALLOW_CONTIG_LENGTH_DISCORDANCE="${allow_contig_length_discordance}"
+
+ VALIDATION_STRINGENCY="${validation_stringency}"
+ QUIET=true
+ VERBOSITY=ERROR
+
+
-
-
-
-
+
+
+
+
-
-
-
+
+
+
+
+
+
+
-
-
-
-
-
+
+
-
-
-
+
+
+
+
+
+
-
-
-
-
-
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
.. class:: infomark
**Purpose**
-Reorder SAM/BAM to match contig ordering in a particular reference file. Note that this is
-not the same as sorting as done by the SortSam tool, which sorts by either coordinate
-values or query name. The ordering in ReorderSam is based on exact name matching of
-contigs/chromosomes. Reads that are mapped to a contig that is not in the new reference file are
-not included in the output.
-
-**Picard documentation**
-
-This is a Galaxy wrapper for ReorderSam, a part of the external package Picard-tools_.
-
- .. _Picard-tools: http://www.google.com/search?q=picard+samtools
-
-------
-
-.. class:: infomark
+ReorderSam reorders reads in a SAM/BAM file to match the contig ordering in a provided reference file, as determined by exact name matching of contigs. Reads mapped to contigs absent in the new reference are dropped.
-**Inputs, outputs, and parameters**
-
-For the file that needs to be reordered, either a sam file or a bam file must be supplied.
-If a bam file is used, it must be coordinate-sorted. A reference file is also required,
-so either a fasta file should be supplied or a built-in reference can be selected.
+@dataset_collections@
-The output contains the same reads as the input file but the reads have been rearranged so
-they appear in the same order as the provided reference file. The tool will output either
-bam (the default) or sam, according to user selection. Bam is recommended since it is smaller.
-
-The only extra parameters that can be set are flags for allowing incomplete dict concordance
-and allowing contig length discordance. If incomplete dict concordance is allowed, only a
-partial overlap of the bam contigs with the new reference sequence contigs is required. By
-default it is off, requiring a corresponding contig in the new reference for each read contig.
-If contig length discordance is allowed, contig names that are the same between a read and the
-new reference contig are allowed even if they have different lengths. This is usually not a
-good idea, unless you know exactly what you're doing. It's off by default.
+----
.. class:: warningmark
-**Warning on SAM/BAM quality**
+Not to be confused with **SortSam**.
+
+@description@
-Many SAM/BAM files produced externally and uploaded to Galaxy do not fully conform to SAM/BAM specifications. Galaxy deals with this by using the **LENIENT**
-flag when it runs Picard, which allows reads to be discarded if they're empty or don't map. This appears
-to be the only way to deal with SAM/BAM that cannot be parsed.
+ ALLOW_INCOMPLETE_DICT_CONCORDANCE=Boolean
+ S=Boolean If true, then allows only a partial overlap of the BAM contigs with the new reference
+ sequence contigs. By default, this tool requires a corresponding contig in the new
+ reference for each read contig Default value: false. Possible values: {true, false}
+
+ ALLOW_CONTIG_LENGTH_DISCORDANCE=Boolean
+ U=Boolean If true, then permits mapping from a read contig to a new reference contig with the same
+ name but a different length. Highly dangerous, only use if you know what you are doing.
+ Default value: false. Possible values: {true, false}
-
+@more_info@
@@ -164,3 +129,4 @@
+
diff -r ab1f60c26526 -r 3d4f1fa26f0e picard_ReplaceSamHeader.xml
--- a/picard_ReplaceSamHeader.xml Fri Feb 21 12:07:49 2014 -0500
+++ b/picard_ReplaceSamHeader.xml Tue Dec 16 19:03:21 2014 -0500
@@ -1,115 +1,67 @@
-
- picard
-
- picard_wrapper.py
- --input "${inputFile}"
- -o "${outFile}"
- --header-file "${headerFile}"
- --output-format "${outputFormat}"
- -j "\$JAVA_JAR_PATH/ReplaceSamHeader.jar"
- --tmpdir "${__new_file_path__}"
+
+ replace header in a SAM/BAM dataset
+
+ picard
+
+
+
+ picard_macros.xml
+
+
+
+ @java_options@
+
+
+ ## Two lines below are due to the fact that picard likes fasta files to have extension .fa
+ #set $fasta_file="local_fasta.fa"
+ ln -s "${inputFile}" "${fasta_file}" &&
+
+ java -jar \$JAVA_JAR_PATH/picard.jar
+ ReplaceSamHeader
+
+ INPUT="${inputFile}"
+ HEADER="${header}"
+ OUTPUT="${outFile}"
+
+ QUIET=true
+ VERBOSITY=ERROR
+
-
-
-
-
+
+
+
+
-
-
-
-
-
+
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
-
-.. class:: infomark
-
**Purpose**
-Replace Sam Header with the header from another sam file. The tool does not do any
-significant validation, so it's up to the user to make sure that the elements in
-the header are relevant and that the new header has all the required things.
-
-Replace the SAMFileHeader in a SAM file with the given header. Validation is
-minimal. It is up to the user to ensure that all the elements referred to in the
-SAMRecords are present in the new header. Sort order of the two input files must
-be the same.
-
-**Picard documentation**
-
-This is a Galaxy wrapper for ReplaceSamHeader, a part of the external package Picard-tools_.
-
- .. _Picard-tools: http://www.google.com/search?q=picard+samtools
-
-------
-
-.. class:: infomark
+Replace the SAMFileHeader in a SAM/BAM dataset with the given header. Validation is minimal. It is up to the user to ensure that all the elements referred to in the SAMRecords are present in the new header. Sort order of the two input datasets must be the same.
+@dataset_collections@
-**Inputs and outputs**
-
-Either a sam file or a bam file is required as the file whose header will be replaced.
-The header file is also required and can also be either sam or bam (it does not have
-to be the same type as the other file). In both cases, if a bam file is used, it must
-be coordinate-sorted. Galaxy currently coordinate-sorts all bam files.
-
-The tool will output either bam (the default) or sam. Bam is recommended since it is smaller.
+@description@
-.. class:: warningmark
-
-**Warning on SAM/BAM quality**
+ HEADER=File SAM file from which SAMFileHeader will be read. Required.
-Many SAM/BAM files produced externally and uploaded to Galaxy do not fully conform to SAM/BAM specifications. Galaxy deals with this by using the **LENIENT**
-flag when it runs Picard, which allows reads to be discarded if they're empty or don't map. This appears
-to be the only way to deal with SAM/BAM that cannot be parsed.
-
-
+@more_info@
-
-
-
-
-
-
-
-
-
-
diff -r ab1f60c26526 -r 3d4f1fa26f0e picard_RevertOriginalBaseQualitiesAndAddMateCigar.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/picard_RevertOriginalBaseQualitiesAndAddMateCigar.xml Tue Dec 16 19:03:21 2014 -0500
@@ -0,0 +1,79 @@
+
+ revert the original base qualities and add the mate cigar tag
+
+ picard
+
+
+
+ picard_macros.xml
+
+
+
+ @java_options@
+
+ java -jar \$JAVA_JAR_PATH/picard.jar
+ RevertOriginalBaseQualitiesAndAddMateCigar
+
+ INPUT="${inputFile}"
+ OUTPUT="${outFile}"
+
+ RESTORE_ORIGINAL_QUALITIES="${restore_original_qualities}"
+ MAX_RECORDS_TO_EXAMINE="${max_records_to_examine}"
+
+ SORT_ORDER=coordinate
+ VALIDATION_STRINGENCY="${validation_stringency}"
+ QUIET=true
+ VERBOSITY=ERROR
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**Purpose**
+
+Reverts the original base qualities and adds the mate cigar tag to SAM or BAM datasets.
+
+@dataset_collections@
+
+@description@
+
+ RESTORE_ORIGINAL_QUALITIES=Boolean
+ OQ=Boolean True to restore original qualities from the OQ field to the QUAL field if available.
+ Default value: true. Possible values: {true, false}
+
+ MAX_RECORDS_TO_EXAMINE=Integer The maximum number of records to examine to determine if we can exit early and not
+ output, given that there are no original base qualities (if we are to restore) and mate
+ cigars exist. Set to 0 to never skip the file. Default value: 10000.
+
+@more_info@
+
+
+
+
+
diff -r ab1f60c26526 -r 3d4f1fa26f0e picard_RevertSam.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/picard_RevertSam.xml Tue Dec 16 19:03:21 2014 -0500
@@ -0,0 +1,140 @@
+
+ revert SAM/BAM datasets to a previous state
+
+ picard
+
+
+
+ picard_macros.xml
+
+
+
+ @java_options@
+
+ java -jar \$JAVA_JAR_PATH/picard.jar
+ RevertSam
+
+ INPUT="${inputFile}"
+ OUTPUT="${outFile}"
+
+ RESTORE_ORIGINAL_QUALITIES="${restore_original_qualities}"
+ REMOVE_DUPLICATE_INFORMATION="${remove_duplicate_information}"
+ REMOVE_ALIGNMENT_INFORMATION="${remove_alignment_information}"
+
+ #for $attribute_to_clear in $attributes_to_clear:
+ ATTRIBUTE_TO_CLEAR="${attribute_to_clear.attribute}"
+ #end for
+
+ SANITIZE="${sanitize}"
+ MAX_DISCARD_FRACTION="${max_discard_fraction}"
+ SAMPLE_ALIAS="${sample_alias}"
+ LIBRARY_NAME="${library_name}"
+
+ SORT_ORDER="${sort_order}"
+ VALIDATION_STRINGENCY="${validation_stringency}"
+ QUIET=true
+ VERBOSITY=ERROR
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**Purpose**
+
+Reverts SAM or BAM files to a previous state by removing certain types of information and/or substituting in the original quality scores when available.
+
+@dataset_collections@
+
+@description@
+
+ SORT_ORDER=SortOrder
+ SO=SortOrder The sort order to create the reverted output file with. Default value: queryname.
+ Possible values: {unsorted, queryname, coordinate}
+
+ RESTORE_ORIGINAL_QUALITIES=Boolean
+ OQ=Boolean True to restore original qualities from the OQ field to the QUAL field if available.
+ Default value: true. Possible values: {true, false}
+
+ REMOVE_DUPLICATE_INFORMATION=Boolean
+ Remove duplicate read flags from all reads. Note that if this is true and
+ REMOVE_ALIGNMENT_INFORMATION==false, the output may have the unusual but sometimes
+ desirable trait of having unmapped reads that are marked as duplicates. Default value:
+ true. Possible values: {true, false}
+
+ REMOVE_ALIGNMENT_INFORMATION=Boolean
+ Remove all alignment information from the file. Default value: true. Possible values: {true, false}
+
+ ATTRIBUTE_TO_CLEAR=String When removing alignment information, the set of optional tags to remove. This option may
+ be specified 0 or more times.
+
+ SANITIZE=Boolean WARNING: This option is potentially destructive. If enabled will discard reads in order
+ to produce a consistent output BAM. Reads discarded include (but are not limited to)
+ paired reads with missing mates, duplicated records, records with mismatches in length of
+ bases and qualities. This option can only be enabled if the output sort order is
+ queryname and will always cause sorting to occur. Possible values: {true, false}
+
+ MAX_DISCARD_FRACTION=Double If SANITIZE=true and higher than MAX_DISCARD_FRACTION reads are discarded due to
+ sanitization then the program will exit with an Exception instead of exiting cleanly.
+ Output BAM will still be valid. Default value: 0.01.
+
+ SAMPLE_ALIAS=String
+ ALIAS=String The sample alias to use in the reverted output file. This will override the existing
+ sample alias in the file and is used only if all the read groups in the input file have
+ the same sample alias Default value: null.
+
+ LIBRARY_NAME=String
+ LIB=String The library name to use in the reverted output file. This will override the existing
+ library name in the file and is used only if all the read groups in the input file have
+ the same library name. Default value: null.
+
+@more_info@
+
+
+
+
+
diff -r ab1f60c26526 -r 3d4f1fa26f0e picard_SamToFastq.xml
--- a/picard_SamToFastq.xml Fri Feb 21 12:07:49 2014 -0500
+++ b/picard_SamToFastq.xml Tue Dec 16 19:03:21 2014 -0500
@@ -1,189 +1,200 @@
-
- creates a FASTQ file
- picard
-
- picard_SamToFastq_wrapper.py
- -p '
- java -XX:DefaultMaxRAMFraction=1 -XX:+UseParallelGC
- -jar "\$JAVA_JAR_PATH/SamToFastq.jar"
- INPUT="${input_sam}"
- VALIDATION_STRINGENCY="LENIENT"
+
+ extract reads and qualities from SAM/BAM dataset and convert to fastq
+
+ picard
+
+
+
+ picard_macros.xml
+
+
+
+
+ echo "BAM" > $report && ## This is necessary for output dataset detection (see output tags below)
+
+ @java_options@
+
+ java -jar \$JAVA_JAR_PATH/picard.jar
+ SamToFastq
+
+ INPUT="${inputFile}"
+
+ #if str( $output_per_rg ) == "true":
+ OUTPUT_PER_RG=true
+ OUTPUT_DIR=.
+ #elif str( $output_per_rg ) == "false" and str( $interleave ) == "false":
+ FASTQ=READ1.fastq
+ SECOND_END_FASTQ=READ2.fastq
+ UNPAIRED_FASTQ=UNPAIRED_READS.fastq
+ #elif str( $output_per_rg ) == "false" and str( $interleave ) == "true":
+ FASTQ=INTERLEAVED.fastq
+ #end if
+
RE_REVERSE="${re_reverse}"
+ INTERLEAVE="${interleave}"
INCLUDE_NON_PF_READS="${include_non_pf_reads}"
- #if str( $clipping_attribute ):
- CLIPPING_ATTRIBUTE="${clipping_attribute}"
- #end if
- #if str( $clipping_action ):
- CLIPPING_ACTION="${clipping_action}"
+ CLIPPING_ATTRIBUTE="${clipping_attribute}"
+ CLIPPING_ACTION="${clipping_action}"
+ READ1_TRIM="${read1_trim}"
+
+ #if int($read1_max_bases_to_write) > -1:
+ READ1_MAX_BASES_TO_WRITE="${read1_max_bases_to_write}"
#end if
- #if str( $read1_trim ):
- READ1_TRIM="${read1_trim}"
+
+ READ2_TRIM="${read2_trim}"
+
+ #if int($read2_max_bases_to_write) > -1:
+ READ2_MAX_BASES_TO_WRITE="${read2_max_bases_to_write}"
#end if
- #if str( $read1_max_bases_to_write ):
- READ1_MAX_BASES_TO_WRITE="${read1_max_bases_to_write}"
- #end if
+
INCLUDE_NON_PRIMARY_ALIGNMENTS="${include_non_primary_alignments}"
- #if str( $output_per_read_group_selector ) == 'per_sam_file':
- ##OUTPUT_PER_RG=false
- FASTQ="${output_fastq1}"
-
- #if str( $single_paired_end_type.single_paired_end_type_selector ) == 'paired':
- SECOND_END_FASTQ="${output_fastq2}"
- #if str( $single_paired_end_type.read2_trim ):
- READ2_TRIM="${single_paired_end_type.read2_trim}"
- #end if
- #if str( $single_paired_end_type.read2_max_bases_to_write ):
- READ2_MAX_BASES_TO_WRITE="${single_paired_end_type.read2_max_bases_to_write}"
- #end if
- #end if
- '
- #else:
- OUTPUT_PER_RG=true
- #if str( $single_paired_end_type.single_paired_end_type_selector ) == 'paired':
- '
- --read_group_file_2 "${output_fastq2}"
- --file_id_2 "${output_fastq2.id}"
- -p '
- #if str( $single_paired_end_type.read2_trim ):
- READ2_TRIM="${single_paired_end_type.read2_trim}"
- #end if
- #if str( $single_paired_end_type.read2_max_bases_to_write ):
- READ2_MAX_BASES_TO_WRITE="${single_paired_end_type.read2_max_bases_to_write}"
- #end if
- #end if
- '
- --read_group_file_1 "${output_fastq1}"
- --new_files_path "${__new_file_path__}"
- --file_id_1 "${output_fastq1.id}"
- #end if
+
+ VALIDATION_STRINGENCY="${validation_stringency}"
+ QUIET=true
+ VERBOSITY=ERROR
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
- single_paired_end_type['single_paired_end_type_selector'] == 'paired'
+
+
+
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-**What it does**
-
-Picard: SamToFastq converts SAM files to FASTQ files.
-Extracts read sequences and qualities from the input SAM/BAM file and writes them into the output file in Sanger fastq format. In the RC mode (default is True), if the read is aligned and the alignment is to the reverse strand on the genome, the read's sequence from input SAM file will be reverse-complemented prior to writing it to fastq in order restore correctly the original read sequence as it was generated by the sequencer.
+**Purpose**
-------
-
-Please cite the website "http://picard.sourceforge.net".
+Extracts read sequences and qualities from the input SAM/BAM dataset and outputs them in Sanger fastq format. In the RE_REVERSE=True mode (default behavior), if the read is aligned and the alignment is to the reverse strand on the genome, the read's sequence from the input SAM/BAM dataset will be reverse-complemented prior to writing it to fastq in order to correctly restore the original read sequence as it was generated by the sequencer.
-------
+-----
+.. class:: warningmark
-**Input formats**
+**DANGER: Multiple Outputs**
-FastqToSam accepts SAM input files, see http://samtools.sourceforge.net for more details.
+Generating per readgroup fastq (setting **OUTPUT_PER_RG** to True) may produce very large numbers of outputs. Know what you are doing!
-------
+@dataset_collections@
-**Outputs**
+@description@
-The output is in FASTQ format. If using Paired end data, 2 fastq files are created.
-
--------
-
-**FastqToSam settings**
-
-This is list of SamToFastq options::
+ FASTQ=File
+ F=File Output fastq file (single-end fastq or, if paired, first end of the pair fastq).
+ Required. Cannot be used in conjunction with option(s) OUTPUT_PER_RG (OPRG)
+
+ SECOND_END_FASTQ=File
+ F2=File Output fastq file (if paired, second end of the pair fastq). Default value: null.
+ Cannot be used in conjunction with option(s) OUTPUT_PER_RG (OPRG)
+
+ UNPAIRED_FASTQ=File
+ FU=File Output fastq file for unpaired reads; may only be provided in paired-fastq mode Default
+ value: null. Cannot be used in conjunction with option(s) OUTPUT_PER_RG (OPRG)
+
+ OUTPUT_PER_RG=Boolean
+ OPRG=Boolean Output a fastq file per read group (two fastq files per read group if the group is
+ paired). Default value: false. Possible values: {true, false} Cannot be used in
+ conjunction with option(s) SECOND_END_FASTQ (F2) UNPAIRED_FASTQ (FU) FASTQ (F)
+
+ OUTPUT_DIR=File
+ ODIR=File Directory in which to output the fastq file(s). Used only when OUTPUT_PER_RG is true.
+ Default value: null.
+
+ RE_REVERSE=Boolean
+ RC=Boolean Re-reverse bases and qualities of reads with negative strand flag set before writing them
+ to fastq Default value: true. Possible values: {true, false}
+
+ INTERLEAVE=Boolean
+ INTER=Boolean Will generate an interleaved fastq if paired, each line will have /1 or /2 to describe
+ which end it came from Default value: false. Possible values: {true, false}
+
+ INCLUDE_NON_PF_READS=Boolean
+ NON_PF=Boolean Include non-PF reads from the SAM file into the output FASTQ files. PF means 'passes
+ filtering'. Reads whose 'not passing quality controls' flag is set are non-PF reads.
+ Default value: false. Possible values: {true, false}
- INPUT=File Input SAM/BAM file to extract reads from Required.
- FASTQ=File Output fastq file (single-end fastq or, if paired, first end of the pair fastq). Required. Cannot be used in conjuction with option(s) OUTPUT_PER_RG (OPRG)
- SECOND_END_FASTQ=File Output fastq file (if paired, second end of the pair fastq). Default value: null. Cannot be used in conjuction with option(s) OUTPUT_PER_RG (OPRG)
- OUTPUT_PER_RG=Boolean Output a fastq file per read group (two fastq files per read group if the group is paired). Default value: false. This option can be set to 'null' to clear the default value. Possible values: {true, false} Cannot be used in conjuction with option(s) SECOND_END_FASTQ (F2) FASTQ (F)
- OUTPUT_DIR=File Directory in which to output the fastq file(s). Used only when OUTPUT_PER_RG is true. Default value: null.
- RE_REVERSE=Boolean Re-reverse bases and qualities of reads with negative strand flag set before writing them to fastq Default value: true. This option can be set to 'null' to clear the default value. Possible values: {true, false}
- INCLUDE_NON_PF_READS=Boolean Include non-PF reads from the SAM file into the output FASTQ files. Default value: false. This option can be set to 'null' to clear the default value. Possible values: {true, false}
- CLIPPING_ATTRIBUTE=String The attribute that stores the position at which the SAM record should be clipped Default value: null.
- CLIPPING_ACTION=String The action that should be taken with clipped reads: 'X' means the reads and qualities should be trimmed at the clipped position; 'N' means the bases should be changed to Ns in the clipped region; and any integer means that the base qualities should be set to that value in the clipped region. Default value: null.
- READ1_TRIM=Integer The number of bases to trim from the beginning of read 1. Default value: 0. This option can be set to 'null' to clear the default value.
- READ1_MAX_BASES_TO_WRITE=Integer The maximum number of bases to write from read 1 after trimming. If there are fewer than this many bases left after trimming, all will be written. If this value is null then all bases left after trimming will be written. Default value: null.
- READ2_TRIM=Integer The number of bases to trim from the beginning of read 2. Default value: 0. This option can be set to 'null' to clear the default value.
- READ2_MAX_BASES_TO_WRITE=Integer The maximum number of bases to write from read 2 after trimming. If there are fewer than this many bases left after trimming, all will be written. If this value is null then all bases left after trimming will be written. Default value: null.
- INCLUDE_NON_PRIMARY_ALIGNMENTS=Boolean If true, include non-primary alignments in the output. Support of non-primary alignments in SamToFastq is not comprehensive, so there may be exceptions if this is set to true and there are paired reads with non-primary alignments. Default value: false. This option can be set to 'null' to clear the default value. Possible values: {true, false}
-
+ CLIPPING_ATTRIBUTE=String
+ CLIP_ATTR=String The attribute that stores the position at which the SAM record should be clipped Default
+ value: null.
+
+ CLIPPING_ACTION=String
+ CLIP_ACT=String The action that should be taken with clipped reads: 'X' means the reads and qualities
+ should be trimmed at the clipped position; 'N' means the bases should be changed to Ns in
+ the clipped region; and any integer means that the base qualities should be set to that
+ value in the clipped region. Default value: null.
+
+ READ1_TRIM=Integer
+ R1_TRIM=Integer The number of bases to trim from the beginning of read 1. Default value: 0.
+
+ READ1_MAX_BASES_TO_WRITE=Integer
+ R1_MAX_BASES=Integer The maximum number of bases to write from read 1 after trimming. If there are fewer than
+ this many bases left after trimming, all will be written. If this value is null then all
+ bases left after trimming will be written. Default value: null.
+
+ READ2_TRIM=Integer
+ R2_TRIM=Integer The number of bases to trim from the beginning of read 2. Default value: 0.
+
+ READ2_MAX_BASES_TO_WRITE=Integer
+ R2_MAX_BASES=Integer The maximum number of bases to write from read 2 after trimming. If there are fewer than
+ this many bases left after trimming, all will be written. If this value is null then all
+ bases left after trimming will be written. Default value: null.
+
+ INCLUDE_NON_PRIMARY_ALIGNMENTS=Boolean
+ If true, include non-primary alignments in the output. Support of non-primary alignments
+ in SamToFastq is not comprehensive, so there may be exceptions if this is set to true and
+ there are paired reads with non-primary alignments. Default value: false.
+ Possible values: {true, false}
+
+@more_info@
+
+
diff -r ab1f60c26526 -r 3d4f1fa26f0e picard_SamToFastq_wrapper.py
--- a/picard_SamToFastq_wrapper.py Fri Feb 21 12:07:49 2014 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,93 +0,0 @@
-#!/usr/bin/env python
-#Dan Blankenberg
-
-"""
-A wrapper script for running the Picard SamToFastq command. Allows parsing read groups into separate files.
-"""
-
-import sys, optparse, os, tempfile, subprocess, shutil
-
-CHUNK_SIZE = 2**20 #1mb
-
-
-def cleanup_before_exit( tmp_dir ):
- if tmp_dir and os.path.exists( tmp_dir ):
- shutil.rmtree( tmp_dir )
-
-def open_file_from_option( filename, mode = 'rb' ):
- if filename:
- return open( filename, mode = mode )
- return None
-
-def __main__():
- #Parse Command Line
- parser = optparse.OptionParser()
- parser.add_option( '-p', '--pass_through', dest='pass_through_options', action='append', type="string", help='These options are passed through directly to PICARD, without any modification.' )
- parser.add_option( '-1', '--read_group_file_1', dest='read_group_file_1', action='store', type="string", default=None, help='Read Group 1 output file, when using multiple readgroups' )
- parser.add_option( '-2', '--read_group_file_2', dest='read_group_file_2', action='store', type="string", default=None, help='Read Group 2 output file, when using multiple readgroups and paired end' )
- parser.add_option( '', '--stdout', dest='stdout', action='store', type="string", default=None, help='If specified, the output of stdout will be written to this file.' )
- parser.add_option( '', '--stderr', dest='stderr', action='store', type="string", default=None, help='If specified, the output of stderr will be written to this file.' )
- parser.add_option( '-n', '--new_files_path', dest='new_files_path', action='store', type="string", default=None, help='new_files_path')
- parser.add_option( '-i', '--file_id_1', dest='file_id_1', action='store', type="string", default=None, help='file_id_1')
- parser.add_option( '-f', '--file_id_2', dest='file_id_2', action='store', type="string", default=None, help='file_id_2')
- (options, args) = parser.parse_args()
-
- tmp_dir = tempfile.mkdtemp( prefix='tmp-picard-' )
- if options.pass_through_options:
- cmd = ' '.join( options.pass_through_options )
- else:
- cmd = ''
- if options.new_files_path is not None:
- print 'Creating FASTQ files by Read Group'
- assert None not in [ options.read_group_file_1, options.new_files_path, options.file_id_1 ], 'When using read group aware, you need to specify --read_group_file_1, --read_group_file_2 (when paired end), --new_files_path, and --file_id'
- cmd = '%s OUTPUT_DIR="%s"' % ( cmd, tmp_dir)
- #set up stdout and stderr output options
- stdout = open_file_from_option( options.stdout, mode = 'wb' )
- if stdout is None:
- stdout = sys.stdout
- stderr = open_file_from_option( options.stderr, mode = 'wb' )
- #if no stderr file is specified, we'll use our own
- if stderr is None:
- stderr = tempfile.NamedTemporaryFile( prefix="picard-stderr-", dir=tmp_dir )
-
- proc = subprocess.Popen( args=cmd, stdout=stdout, stderr=stderr, shell=True, cwd=tmp_dir )
- return_code = proc.wait()
-
- if return_code:
- stderr_target = sys.stderr
- else:
- stderr_target = sys.stdout
- stderr.flush()
- stderr.seek(0)
- while True:
- chunk = stderr.read( CHUNK_SIZE )
- if chunk:
- stderr_target.write( chunk )
- else:
- break
- stderr.close()
- #if rg aware, put files where they belong
- if options.new_files_path is not None:
- fastq_1_name = options.read_group_file_1
- fastq_2_name = options.read_group_file_2
- file_id_1 = options.file_id_1
- file_id_2 = options.file_id_2
- if file_id_2 is None:
- file_id_2 = file_id_1
- for filename in sorted( os.listdir( tmp_dir ) ):
- if filename.endswith( '_1.fastq' ):
- if fastq_1_name:
- shutil.move( os.path.join( tmp_dir, filename ), fastq_1_name )
- fastq_1_name = None
- else:
- shutil.move( os.path.join( tmp_dir, filename ), os.path.join( options.new_files_path, 'primary_%s_%s - 1_visible_fastqsanger' % ( file_id_1, filename[:-len( '_1.fastq' )] ) ) )
- elif filename.endswith( '_2.fastq' ):
- if fastq_2_name:
- shutil.move( os.path.join( tmp_dir, filename ), fastq_2_name )
- fastq_2_name = None
- else:
- shutil.move( os.path.join( tmp_dir, filename ), os.path.join( options.new_files_path, 'primary_%s_%s - 2_visible_fastqsanger' % ( file_id_2, filename[:-len( '_2.fastq' )] ) ) )
-
- cleanup_before_exit( tmp_dir )
-
-if __name__=="__main__": __main__()
diff -r ab1f60c26526 -r 3d4f1fa26f0e picard_SortSam.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/picard_SortSam.xml Tue Dec 16 19:03:21 2014 -0500
@@ -0,0 +1,70 @@
+
+
+ sort SAM/BAM dataset
+
+
+ picard
+
+
+
+ picard_macros.xml
+
+
+
+ @java_options@
+ java -jar \$JAVA_JAR_PATH/picard.jar
+ SortSam
+ INPUT="${inputFile}"
+ OUTPUT="${outFile}"
+ SORT_ORDER="${sort_order}"
+ QUIET=true
+ VERBOSITY=ERROR
+ VALIDATION_STRINGENCY=${validation_stringency}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+**Purpose**
+
+Sorts the input SAM or BAM.
+
+@dataset_collections@
+
+@description@
+
+ SORT_ORDER=SortOrder
+ SO=SortOrder Sort order of output file Required. Possible values: {unsorted, queryname, coordinate}
+
+ @more_info@
+
+
+
\ No newline at end of file
diff -r ab1f60c26526 -r 3d4f1fa26f0e picard_ValidateSamFile.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/picard_ValidateSamFile.xml Tue Dec 16 19:03:21 2014 -0500
@@ -0,0 +1,230 @@
+
+ assess validity of SAM/BAM dataset
+
+ picard
+
+
+
+ picard_macros.xml
+
+
+
+
+ ##set the maximum number of open files to the hard maximum, or to 4096 on a Mac (a Mac gives 'unlimited' as the output of the `ulimit -Hn` command)
+
+ [ `ulimit -Hn` = unlimited ] && ulimit -Sn 4096 || ulimit -Sn `ulimit -Hn`
+
+ &&
+
+ ##set up input files
+
+ #set $reference_fasta_filename = "localref.fa"
+
+ #if str( $reference_source.reference_source_selector ) == "history":
+ ln -s "${reference_source.ref_file}" "${reference_fasta_filename}" &&
+ #else:
+ #set $reference_fasta_filename = str( $reference_source.ref_file.fields.path )
+ #end if
+
+ @java_options@
+
+ java -jar \$JAVA_JAR_PATH/picard.jar
+ ValidateSamFile
+
+ INPUT="${inputFile}"
+ OUTPUT="${outFile}"
+ MODE="${mode}"
+
+ #if str( $ignore ) != "None":
+ #for $element in str( $ignore ).split(','): ## See trello card https://trello.com/c/9nW02Zhd
+ IGNORE="${element}"
+ #end for
+ #end if
+
+ MAX_OUTPUT="${max_output}"
+ REFERENCE_SEQUENCE="${reference_fasta_filename}"
+ IGNORE_WARNINGS="${ignore_warnings}"
+ IS_BISULFITE_SEQUENCED="${is_bisulfite_sequenced}"
+ MAX_OPEN_TEMP_FILES=`ulimit -Sn`
+
+ VERBOSITY=ERROR
+ QUIET=true
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+**Purpose**
+
+Reads a SAM/BAM dataset and reports on its validity.
+
+@dataset_collections@
+
+@description@
+
+ MODE=Mode
+ M=Mode Mode of output Default value: VERBOSE. This option can be set to 'null' to clear the
+ default value. Possible values: {VERBOSE, SUMMARY}
+
+ IGNORE=Type List of validation error types to ignore. Possible values: {INVALID_QUALITY_FORMAT,
+ INVALID_FLAG_PROPER_PAIR, INVALID_FLAG_MATE_UNMAPPED, MISMATCH_FLAG_MATE_UNMAPPED,
+ INVALID_FLAG_MATE_NEG_STRAND, MISMATCH_FLAG_MATE_NEG_STRAND, INVALID_FLAG_FIRST_OF_PAIR,
+ INVALID_FLAG_SECOND_OF_PAIR, PAIRED_READ_NOT_MARKED_AS_FIRST_OR_SECOND,
+ INVALID_FLAG_NOT_PRIM_ALIGNMENT, INVALID_FLAG_SUPPLEMENTARY_ALIGNMENT,
+ INVALID_FLAG_READ_UNMAPPED, INVALID_INSERT_SIZE, INVALID_MAPPING_QUALITY, INVALID_CIGAR,
+ ADJACENT_INDEL_IN_CIGAR, INVALID_MATE_REF_INDEX, MISMATCH_MATE_REF_INDEX,
+ INVALID_REFERENCE_INDEX, INVALID_ALIGNMENT_START, MISMATCH_MATE_ALIGNMENT_START,
+ MATE_FIELD_MISMATCH, INVALID_TAG_NM, MISSING_TAG_NM, MISSING_HEADER,
+ MISSING_SEQUENCE_DICTIONARY, MISSING_READ_GROUP, RECORD_OUT_OF_ORDER,
+ READ_GROUP_NOT_FOUND, RECORD_MISSING_READ_GROUP, INVALID_INDEXING_BIN,
+ MISSING_VERSION_NUMBER, INVALID_VERSION_NUMBER, TRUNCATED_FILE,
+ MISMATCH_READ_LENGTH_AND_QUALS_LENGTH, EMPTY_READ, CIGAR_MAPS_OFF_REFERENCE,
+ MISMATCH_READ_LENGTH_AND_E2_LENGTH, MISMATCH_READ_LENGTH_AND_U2_LENGTH,
+ E2_BASE_EQUALS_PRIMARY_BASE, BAM_FILE_MISSING_TERMINATOR_BLOCK, UNRECOGNIZED_HEADER_TYPE,
+ POORLY_FORMATTED_HEADER_TAG, HEADER_TAG_MULTIPLY_DEFINED,
+ HEADER_RECORD_MISSING_REQUIRED_TAG, INVALID_DATE_STRING, TAG_VALUE_TOO_LARGE,
+ INVALID_INDEX_FILE_POINTER, INVALID_PREDICTED_MEDIAN_INSERT_SIZE,
+ DUPLICATE_READ_GROUP_ID, MISSING_PLATFORM_VALUE, INVALID_PLATFORM_VALUE,
+ DUPLICATE_PROGRAM_GROUP_ID, MATE_NOT_FOUND, MATES_ARE_SAME_END,
+ MISMATCH_MATE_CIGAR_STRING, MATE_CIGAR_STRING_INVALID_PRESENCE} This option may be
+ specified 0 or more times.
+
+ MAX_OUTPUT=Integer
+ MO=Integer The maximum number of lines output in verbose mode Default value: 100. This option can
+ be set to 'null' to clear the default value.
+
+ REFERENCE_SEQUENCE=File
+ R=File Reference sequence file, the NM tag check will be skipped if this is missing Default
+ value: null.
+
+ IGNORE_WARNINGS=Boolean If true, only report errors and ignore warnings. Default value: false. This option can
+ be set to 'null' to clear the default value. Possible values: {true, false}
+
+ VALIDATE_INDEX=Boolean If true and input is a BAM file with an index file, also validates the index. Default
+ value: true. This option can be set to 'null' to clear the default value. Possible
+ values: {true, false}
+
+ IS_BISULFITE_SEQUENCED=Boolean
+ BISULFITE=Boolean Whether the SAM or BAM file consists of bisulfite sequenced reads. If so, C->T is not
+ counted as an error in computing the value of the NM tag. Default value: false. This
+ option can be set to 'null' to clear the default value. Possible values: {true, false}
+
+@more_info@
+
+
+
+
+
diff -r ab1f60c26526 -r 3d4f1fa26f0e picard_macros.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/picard_macros.xml Tue Dec 16 19:03:21 2014 -0500
@@ -0,0 +1,85 @@
+
+
+
+
+
+
+
+
+
+
+ _JAVA_OPTIONS=\${_JAVA_OPTIONS:-'-Xmx2048m -Xms256m'} &&
+ export _JAVA_OPTIONS &&
+
+
+
+------
+
+**Additional information**
+
+Additional information about Picard tools is available from the Picard web site at http://broadinstitute.github.io/picard/.
+
+
+
+
+------
+
+**Inputs, outputs, and parameters**
+
+Either a SAM file or a BAM file must be supplied. Galaxy automatically coordinate-sorts all uploaded BAM files.
+
+From the Picard documentation (http://broadinstitute.github.io/picard/)::
+
+
+
+-------
+
+**Read Groups are Important!**
+
+Setting read groups correctly from the start will simplify your life greatly because you can merge multiple BAM files into one, significantly reducing the number of analysis steps. Below we provide an explanation of the read group fields, taken from the GATK FAQ webpage:
+
+.. csv-table::
+ :header-rows: 1
+
+ Tag,Importance,Definition,Meaning
+ "ID","Required","Read group identifier. Each @RG line must have a unique ID. The value of ID is used in the RG tags of alignment records. Must be unique among all read groups in header section. Read group IDs may be modified when merging SAM files in order to handle collisions.","Ideally, this should be a globally unique identifier across all sequencing data in the world, such as the Illumina flowcell + lane name and number. Will be referenced by each read with the RG:Z field, allowing tools to determine the read group information associated with each read, including the sample from which the read came. Also, a read group is effectively treated as a separate run of the NGS instrument in tools like base quality score recalibration (a GATK component) -- all reads within a read group are assumed to come from the same instrument run and to therefore share the same error model."
+ "SM","Required. As important as ID.","Sample. Use pool name where a pool is being sequenced.","The name of the sample sequenced in this read group. GATK tools treat all read groups with the same SM value as containing sequencing data for the same sample. Therefore it's critical that the SM field be correctly specified, especially when using multi-sample tools like the Unified Genotyper (a GATK component)."
+ "PL","Important. Not currently used in the GATK, but was in the past, and may return. The only way to know the sequencing technology used to generate the sequencing data.","Platform/technology used to produce the read. Valid values: ILLUMINA, SOLID, LS454, HELICOS and PACBIO.","It's a good idea to use this field."
+ "LB","Essential for MarkDuplicates","DNA preparation library identifier","MarkDuplicates uses the LB field to determine which read groups might contain molecular duplicates, in case the same DNA library was sequenced on multiple lanes."
+
+**Example of Read Group usage**
+
+Suppose we have a trio of samples: MOM, DAD, and KID. Each has two DNA libraries prepared, one with 400 bp inserts and another with 200 bp inserts. Each of these libraries is run on two lanes of an Illumina HiSeq, requiring 3 x 2 x 2 = 12 lanes of data. When the data come off the sequencer, we would create 12 BAM files, with the following @RG fields in the header::
+
+ Dad's data:
+ @RG ID:FLOWCELL1.LANE1 PL:illumina LB:LIB-DAD-1 SM:DAD PI:200
+ @RG ID:FLOWCELL1.LANE2 PL:illumina LB:LIB-DAD-1 SM:DAD PI:200
+ @RG ID:FLOWCELL1.LANE3 PL:illumina LB:LIB-DAD-2 SM:DAD PI:400
+ @RG ID:FLOWCELL1.LANE4 PL:illumina LB:LIB-DAD-2 SM:DAD PI:400
+
+ Mom's data:
+ @RG ID:FLOWCELL1.LANE5 PL:illumina LB:LIB-MOM-1 SM:MOM PI:200
+ @RG ID:FLOWCELL1.LANE6 PL:illumina LB:LIB-MOM-1 SM:MOM PI:200
+ @RG ID:FLOWCELL1.LANE7 PL:illumina LB:LIB-MOM-2 SM:MOM PI:400
+ @RG ID:FLOWCELL1.LANE8 PL:illumina LB:LIB-MOM-2 SM:MOM PI:400
+
+ Kid's data:
+ @RG ID:FLOWCELL2.LANE1 PL:illumina LB:LIB-KID-1 SM:KID PI:200
+ @RG ID:FLOWCELL2.LANE2 PL:illumina LB:LIB-KID-1 SM:KID PI:200
+ @RG ID:FLOWCELL2.LANE3 PL:illumina LB:LIB-KID-2 SM:KID PI:400
+ @RG ID:FLOWCELL2.LANE4 PL:illumina LB:LIB-KID-2 SM:KID PI:400
+
+Note the hierarchical relationship of read groups (unique for each lane) to libraries (sequenced on two lanes) and samples (across four lanes, two lanes for each library).
+
+
+------
+
+**Dataset collections - processing large numbers of datasets at once**
+
+This will be added shortly
+
+
+
+
+
+
diff -r ab1f60c26526 -r 3d4f1fa26f0e picard_wrapper.py
--- a/picard_wrapper.py Fri Feb 21 12:07:49 2014 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,776 +0,0 @@
-#!/usr/bin/env python
-"""
-Originally written by Kelly Vincent
-pretty output and additional picard wrappers by Ross Lazarus for rgenetics
-Runs all available wrapped Picard tools.
-usage: picard_wrapper.py [options]
-code Ross wrote licensed under the LGPL
-see http://www.gnu.org/copyleft/lesser.html
-"""
-
-import optparse, os, sys, subprocess, tempfile, shutil, time, logging
-
-galhtmlprefix = """
-
-
-
-
-
-
-
-
-
-
-"""
-galhtmlattr = """Galaxy tool %s run at %s """
-galhtmlpostfix = """
\n"""
-
-
-def stop_err( msg ):
- sys.stderr.write( '%s\n' % msg )
- sys.exit()
-
-
-def timenow():
- """return current time as a string
- """
- return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time()))
-
-
-class PicardBase():
- """
- simple base class with some utilities for Picard
- adapted and merged with Kelly Vincent's code april 2011 Ross
- lots of changes...
- """
-
- def __init__(self, opts=None,arg0=None):
- """ common stuff needed at init for a picard tool
- """
- assert opts <> None, 'PicardBase needs opts at init'
- self.opts = opts
- if self.opts.outdir == None:
- self.opts.outdir = os.getcwd() # fixmate has no html file eg so use temp dir
- assert self.opts.outdir <> None,'## PicardBase needs a temp directory if no output directory passed in'
- self.picname = self.baseName(opts.jar)
- if self.picname.startswith('picard'):
- self.picname = opts.picard_cmd # special case for some tools like replaceheader?
- self.progname = self.baseName(arg0)
- self.version = '0.002'
- self.delme = [] # list of files to destroy
- self.title = opts.title
- self.inputfile = opts.input
- try:
- os.makedirs(opts.outdir)
- except:
- pass
- try:
- os.makedirs(opts.tmpdir)
- except:
- pass
- self.log_filename = os.path.join(self.opts.outdir,'%s.log' % self.picname)
- self.metricsOut = os.path.join(opts.outdir,'%s.metrics.txt' % self.picname)
- self.setLogging(logfname=self.log_filename)
-
- def baseName(self,name=None):
- return os.path.splitext(os.path.basename(name))[0]
-
- def setLogging(self,logfname="picard_wrapper.log"):
- """setup a logger
- """
- logging.basicConfig(level=logging.INFO,
- filename=logfname,
- filemode='a')
-
-
- def readLarge(self,fname=None):
- """ read a potentially huge file.
- """
- try:
- # get stderr, allowing for case where it's very large
- tmp = open( fname, 'rb' )
- s = ''
- buffsize = 1048576
- try:
- while True:
- more = tmp.read( buffsize )
- if len(more) > 0:
- s += more
- else:
- break
- except OverflowError:
- pass
- tmp.close()
- except Exception, e:
- stop_err( 'Read Large Exception : %s' % str( e ) )
- return s
-
- def runCL(self,cl=None,output_dir=None):
- """ construct and run a command line
- we have galaxy's temp path as opt.temp_dir so don't really need isolation
- sometimes stdout is needed as the output - ugly hacks to deal with potentially vast artifacts
- """
- assert cl <> None, 'PicardBase runCL needs a command line as cl'
- if output_dir == None:
- output_dir = self.opts.outdir
- if type(cl) == type([]):
- cl = ' '.join(cl)
- fd,templog = tempfile.mkstemp(dir=output_dir,suffix='rgtempRun.txt')
- tlf = open(templog,'wb')
- fd,temperr = tempfile.mkstemp(dir=output_dir,suffix='rgtempErr.txt')
- tef = open(temperr,'wb')
- process = subprocess.Popen(cl, shell=True, stderr=tef, stdout=tlf, cwd=output_dir)
- rval = process.wait()
- tlf.close()
- tef.close()
- stderrs = self.readLarge(temperr)
- stdouts = self.readLarge(templog)
- if rval > 0:
- s = '## executing %s returned status %d and stderr: \n%s\n' % (cl,rval,stderrs)
- stdouts = '%s\n%s' % (stdouts,stderrs)
- else:
- s = '## executing %s returned status %d and nothing on stderr\n' % (cl,rval)
- logging.info(s)
- os.unlink(templog) # always
- os.unlink(temperr) # always
- return s, stdouts, rval # sometimes s is an output
-
- def runPic(self, jar, cl):
- """
- cl should be everything after the jar file name in the command
- """
- runme = ['java -Xmx%s' % self.opts.maxjheap]
- runme.append(" -Djava.io.tmpdir='%s' " % self.opts.tmpdir)
- runme.append('-jar %s' % jar)
- runme += cl
- s,stdouts,rval = self.runCL(cl=runme, output_dir=self.opts.outdir)
- return stdouts,rval
-
- def samToBam(self,infile=None,outdir=None):
- """
- use samtools view to convert sam to bam
- """
- fd,tempbam = tempfile.mkstemp(dir=outdir,suffix='rgutilsTemp.bam')
- cl = ['samtools view -h -b -S -o ',tempbam,infile]
- tlog,stdouts,rval = self.runCL(cl,outdir)
- return tlog,tempbam,rval
-
- def sortSam(self, infile=None,outfile=None,outdir=None):
- """
- """
- print '## sortSam got infile=%s,outfile=%s,outdir=%s' % (infile,outfile,outdir)
- cl = ['samtools sort',infile,outfile]
- tlog,stdouts,rval = self.runCL(cl,outdir)
- return tlog
-
- def cleanup(self):
- for fname in self.delme:
- try:
- os.unlink(fname)
- except:
- pass
-
- def prettyPicout(self,transpose,maxrows):
- """organize picard outpouts into a report html page
- """
- res = []
- try:
- r = open(self.metricsOut,'r').readlines()
- except:
- r = []
- if len(r) > 0:
- res.append('Picard on line resources
\n')
- if transpose:
- res.append('Picard output (transposed to make it easier to see)\n')
- else:
- res.append('Picard output\n')
- res.append('
\n')
- dat = []
- heads = []
- lastr = len(r) - 1
- # special case for estimate library complexity hist
- thist = False
- for i,row in enumerate(r):
- if row.strip() > '':
- srow = row.split('\t')
- if row.startswith('#'):
- heads.append(row.strip()) # want strings
- else:
- dat.append(srow) # want lists
- if row.startswith('## HISTOGRAM'):
- thist = True
- if len(heads) > 0:
- hres = ['
%s
' % (i % 2,x) for i,x in enumerate(heads)]
- res += hres
- heads = []
- if len(dat) > 0:
- if transpose and not thist:
- tdat = map(None,*dat) # transpose an arbitrary list of lists
- tdat = ['
%s
%s
\n' % ((i+len(heads)) % 2,x[0],x[1]) for i,x in enumerate(tdat)]
- else:
- tdat = ['\t'.join(x).strip() for x in dat] # back to strings :(
- tdat = ['
%s
\n' % ((i+len(heads)) % 2,x) for i,x in enumerate(tdat)]
- res += tdat
- dat = []
- res.append('
\n')
- return res
-
- def fixPicardOutputs(self,transpose,maxloglines):
- """
- picard produces long hard to read tab header files
- make them available but present them transposed for readability
- """
- logging.shutdown()
- self.cleanup() # remove temp files stored in delme
- rstyle=""""""
- res = [rstyle,]
- res.append(galhtmlprefix % self.progname)
- res.append(galhtmlattr % (self.picname,timenow()))
- flist = [x for x in os.listdir(self.opts.outdir) if not x.startswith('.')]
- pdflist = [x for x in flist if os.path.splitext(x)[-1].lower() == '.pdf']
- if len(pdflist) > 0: # assumes all pdfs come with thumbnail .jpgs
- for p in pdflist:
- pbase = os.path.splitext(p)[0] # removes .pdf
- imghref = '%s.jpg' % pbase
- mimghref = '%s-0.jpg' % pbase # multiple pages pdf -> multiple thumbnails without asking!
- if mimghref in flist:
- imghref=mimghref # only one for thumbnail...it's a multi page pdf
- res.append('
')
- if llen > maxloglines:
- rlog.append('\n## WARNING - %d log lines truncated - %s contains entire output' % (llen - maxloglines,self.log_filename,self.log_filename))
- res += rlog
- else:
- res.append("### Odd, Picard left no log file %s - must have really barfed badly?\n" % self.log_filename)
- res.append('The freely available Picard software \n')
- res.append( 'generated all outputs reported here running as a Galaxy tool')
- res.append(galhtmlpostfix)
- outf = open(self.opts.htmlout,'w')
- outf.write(''.join(res))
- outf.write('\n')
- outf.close()
-
- def makePicInterval(self,inbed=None,outf=None):
- """
- picard wants bait and target files to have the same header length as the incoming bam/sam
- a meaningful (ie accurate) representation will fail because of this - so this hack
- it would be far better to be able to supply the original bed untouched
- Additional checking added Ross Lazarus Dec 2011 to deal with two 'bug' reports on the list
- """
- assert inbed <> None
- bed = open(inbed,'r').readlines()
- sbed = [x.split('\t') for x in bed] # lengths MUST be 5
- lens = [len(x) for x in sbed]
- strands = [x[3] for x in sbed if not x[3] in ['+','-']]
- maxl = max(lens)
- minl = min(lens)
- e = []
- if maxl <> minl:
- e.append("## Input error: Inconsistent field count in %s - please read the documentation on bait/target format requirements, fix and try again" % inbed)
- if maxl <> 5:
- e.append("## Input error: %d fields found in %s, 5 required - please read the warning and documentation on bait/target format requirements, fix and try again" % (maxl,inbed))
- if len(strands) > 0:
- e.append("## Input error: Fourth column in %s is not the required strand (+ or -) - please read the warning and documentation on bait/target format requirements, fix and try again" % (inbed))
- if len(e) > 0: # write to stderr and quit
- print >> sys.stderr, '\n'.join(e)
- sys.exit(1)
- thead = os.path.join(self.opts.outdir,'tempSamHead.txt')
- if self.opts.datatype == 'sam':
- cl = ['samtools view -H -S',self.opts.input,'>',thead]
- else:
- cl = ['samtools view -H',self.opts.input,'>',thead]
- self.runCL(cl=cl,output_dir=self.opts.outdir)
- head = open(thead,'r').readlines()
- s = '## got %d rows of header\n' % (len(head))
- logging.info(s)
- o = open(outf,'w')
- o.write(''.join(head))
- o.write(''.join(bed))
- o.close()
- return outf
-
- def cleanSam(self, insam=None, newsam=None, picardErrors=[],outformat=None):
- """
- interesting problem - if paired, must remove mate pair of errors too or we have a new set of errors after cleaning - missing mate pairs!
- Do the work of removing all the error sequences
- pysam is cool
- infile = pysam.Samfile( "-", "r" )
- outfile = pysam.Samfile( "-", "w", template = infile )
- for s in infile: outfile.write(s)
-
- errors from ValidateSameFile.jar look like
- WARNING: Record 32, Read name SRR006041.1202260, NM tag (nucleotide differences) is missing
- ERROR: Record 33, Read name SRR006041.1042721, Empty sequence dictionary.
- ERROR: Record 33, Read name SRR006041.1042721, RG ID on SAMRecord not found in header: SRR006041
-
- """
- assert os.path.isfile(insam), 'rgPicardValidate cleansam needs an input sam file - cannot find %s' % insam
- assert newsam <> None, 'rgPicardValidate cleansam needs an output new sam file path'
- removeNames = [x.split(',')[1].replace(' Read name ','') for x in picardErrors if len(x.split(',')) > 2]
- remDict = dict(zip(removeNames,range(len(removeNames))))
- infile = pysam.Samfile(insam,'rb')
- info = 'found %d error sequences in picardErrors, %d unique' % (len(removeNames),len(remDict))
- if len(removeNames) > 0:
- outfile = pysam.Samfile(newsam,'wb',template=infile) # template must be an open file
- i = 0
- j = 0
- for row in infile:
- dropme = remDict.get(row.qname,None) # keep if None
- if not dropme:
- outfile.write(row)
- j += 1
- else: # discard
- i += 1
- info = '%s\n%s' % (info, 'Discarded %d lines writing %d to %s from %s' % (i,j,newsam,insam))
- outfile.close()
- infile.close()
- else: # we really want a nullop or a simple pointer copy
- infile.close()
- if newsam:
- shutil.copy(insam,newsam)
- logging.info(info)
-
-
-
-def __main__():
- doFix = False # tools returning htmlfile don't need this
- doTranspose = True # default
- maxloglines = 100 # default
- #Parse Command Line
- op = optparse.OptionParser()
- # All tools
- op.add_option('-i', '--input', dest='input', help='Input SAM or BAM file' )
- op.add_option('-e', '--inputext', default=None)
- op.add_option('-o', '--output', default=None)
- op.add_option('-n', '--title', default="Pick a Picard Tool")
- op.add_option('-t', '--htmlout', default=None)
- op.add_option('-d', '--outdir', default=None)
- op.add_option('-x', '--maxjheap', default='4g')
- op.add_option('-b', '--bisulphite', default='false')
- op.add_option('-s', '--sortorder', default='query')
- op.add_option('','--tmpdir', default='/tmp')
- op.add_option('-j','--jar',default='')
- op.add_option('','--picard-cmd',default=None)
- # Many tools
- op.add_option( '', '--output-format', dest='output_format', help='Output format' )
- op.add_option( '', '--bai-file', dest='bai_file', help='The path to the index file for the input bam file' )
- op.add_option( '', '--ref', dest='ref', help='Built-in reference with fasta and dict file', default=None )
- # CreateSequenceDictionary
- op.add_option( '', '--ref-file', dest='ref_file', help='Fasta to use as reference', default=None )
- op.add_option( '', '--species-name', dest='species_name', help='Species name to use in creating dict file from fasta file' )
- op.add_option( '', '--build-name', dest='build_name', help='Name of genome assembly to use in creating dict file from fasta file' )
- op.add_option( '', '--trunc-names', dest='trunc_names', help='Truncate sequence names at first whitespace from fasta file' )
- # MarkDuplicates
- op.add_option( '', '--remdups', default='true', help='Remove duplicates from output file' )
- op.add_option( '', '--optdupdist', default="100", help='Maximum pixels between two identical sequences in order to consider them optical duplicates.' )
- # CollectInsertSizeMetrics
- op.add_option('', '--taillimit', default="0")
- op.add_option('', '--histwidth', default="0")
- op.add_option('', '--minpct', default="0.01")
- op.add_option('', '--malevel', default='')
- op.add_option('', '--deviations', default="0.0")
- # CollectAlignmentSummaryMetrics
- op.add_option('', '--maxinsert', default="20")
- op.add_option('', '--adaptors', default='')
- # FixMateInformation and validate
- # CollectGcBiasMetrics
- op.add_option('', '--windowsize', default='100')
- op.add_option('', '--mingenomefrac', default='0.00001')
- # AddOrReplaceReadGroups
- op.add_option( '', '--rg-opts', dest='rg_opts', help='Specify extra (optional) arguments with full, otherwise preSet' )
- op.add_option( '', '--rg-lb', dest='rg_library', help='Read Group Library' )
- op.add_option( '', '--rg-pl', dest='rg_platform', help='Read Group platform (e.g. illumina, solid)' )
- op.add_option( '', '--rg-pu', dest='rg_plat_unit', help='Read Group platform unit (eg. run barcode) ' )
- op.add_option( '', '--rg-sm', dest='rg_sample', help='Read Group sample name' )
- op.add_option( '', '--rg-id', dest='rg_id', help='Read Group ID' )
- op.add_option( '', '--rg-cn', dest='rg_seq_center', help='Read Group sequencing center name' )
- op.add_option( '', '--rg-ds', dest='rg_desc', help='Read Group description' )
- # ReorderSam
- op.add_option( '', '--allow-inc-dict-concord', dest='allow_inc_dict_concord', help='Allow incomplete dict concordance' )
- op.add_option( '', '--allow-contig-len-discord', dest='allow_contig_len_discord', help='Allow contig length discordance' )
- # ReplaceSamHeader
- op.add_option( '', '--header-file', dest='header_file', help='sam or bam file from which header will be read' )
-
- op.add_option('','--assumesorted', default='true')
- op.add_option('','--readregex', default="[a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*")
- #estimatelibrarycomplexity
- op.add_option('','--minid', default="5")
- op.add_option('','--maxdiff', default="0.03")
- op.add_option('','--minmeanq', default="20")
- #hsmetrics
- op.add_option('','--baitbed', default=None)
- op.add_option('','--targetbed', default=None)
- #validate
- op.add_option('','--ignoreflags', action='append', type="string")
- op.add_option('','--maxerrors', default=None)
- op.add_option('','--datatype', default=None)
- op.add_option('','--bamout', default=None)
- op.add_option('','--samout', default=None)
-
- opts, args = op.parse_args()
- opts.sortme = opts.assumesorted == 'false'
- assert opts.input <> None
- # need to add
- # instance that does all the work
- pic = PicardBase(opts,sys.argv[0])
-
- tmp_dir = opts.outdir
- haveTempout = False # we use this where sam output is an option
- rval = 0
- stdouts = 'Not run yet'
- # set ref and dict files to use (create if necessary)
- ref_file_name = opts.ref
- if opts.ref_file <> None:
- csd = 'CreateSequenceDictionary'
- realjarpath = os.path.split(opts.jar)[0]
- jarpath = os.path.join(realjarpath,'%s.jar' % csd) # for refseq
- tmp_ref_fd, tmp_ref_name = tempfile.mkstemp( dir=opts.tmpdir , prefix = pic.picname)
- ref_file_name = '%s.fasta' % tmp_ref_name
- # build dict
- dict_file_name = '%s.dict' % tmp_ref_name
- os.symlink( opts.ref_file, ref_file_name )
- cl = ['REFERENCE=%s' % ref_file_name]
- cl.append('OUTPUT=%s' % dict_file_name)
- cl.append('URI=%s' % os.path.basename( opts.ref_file ))
- cl.append('TRUNCATE_NAMES_AT_WHITESPACE=%s' % opts.trunc_names)
- if opts.species_name:
- cl.append('SPECIES=%s' % opts.species_name)
- if opts.build_name:
- cl.append('GENOME_ASSEMBLY=%s' % opts.build_name)
- pic.delme.append(dict_file_name)
- pic.delme.append(ref_file_name)
- pic.delme.append(tmp_ref_name)
- stdouts,rval = pic.runPic(jarpath, cl)
- # run relevant command(s)
-
- # define temporary output
- # if output is sam, it must have that extension, otherwise bam will be produced
- # specify sam or bam file with extension
- if opts.output_format == 'sam':
- suff = '.sam'
- else:
- suff = ''
- tmp_fd, tempout = tempfile.mkstemp( dir=opts.tmpdir, suffix=suff )
-
- cl = ['VALIDATION_STRINGENCY=LENIENT',]
-
- if pic.picname == 'AddOrReplaceReadGroups':
- # sort order to match Galaxy's default
- cl.append('SORT_ORDER=coordinate')
- # input
- cl.append('INPUT=%s' % opts.input)
- # outputs
- cl.append('OUTPUT=%s' % tempout)
- # required read groups
- cl.append('RGLB="%s"' % opts.rg_library)
- cl.append('RGPL="%s"' % opts.rg_platform)
- cl.append('RGPU="%s"' % opts.rg_plat_unit)
- cl.append('RGSM="%s"' % opts.rg_sample)
- if opts.rg_id:
- cl.append('RGID="%s"' % opts.rg_id)
- # optional read groups
- if opts.rg_seq_center:
- cl.append('RGCN="%s"' % opts.rg_seq_center)
- if opts.rg_desc:
- cl.append('RGDS="%s"' % opts.rg_desc)
- stdouts,rval = pic.runPic(opts.jar, cl)
- haveTempout = True
-
- elif pic.picname == 'BamIndexStats':
- tmp_fd, tmp_name = tempfile.mkstemp( dir=tmp_dir )
- tmp_bam_name = '%s.bam' % tmp_name
- tmp_bai_name = '%s.bai' % tmp_bam_name
- os.symlink( opts.input, tmp_bam_name )
- os.symlink( opts.bai_file, tmp_bai_name )
- cl.append('INPUT=%s' % ( tmp_bam_name ))
- pic.delme.append(tmp_bam_name)
- pic.delme.append(tmp_bai_name)
- pic.delme.append(tmp_name)
- stdouts,rval = pic.runPic( opts.jar, cl )
- f = open(pic.metricsOut,'a')
- f.write(stdouts) # got this on stdout from runCl
- f.write('\n')
- f.close()
- doTranspose = False # but not transposed
-
- elif pic.picname == 'EstimateLibraryComplexity':
- cl.append('I=%s' % opts.input)
- cl.append('O=%s' % pic.metricsOut)
- if float(opts.minid) > 0:
- cl.append('MIN_IDENTICAL_BASES=%s' % opts.minid)
- if float(opts.maxdiff) > 0.0:
- cl.append('MAX_DIFF_RATE=%s' % opts.maxdiff)
- if float(opts.minmeanq) > 0:
- cl.append('MIN_MEAN_QUALITY=%s' % opts.minmeanq)
- if opts.readregex > '':
- cl.append('READ_NAME_REGEX="%s"' % opts.readregex)
- if float(opts.optdupdist) > 0:
- cl.append('OPTICAL_DUPLICATE_PIXEL_DISTANCE=%s' % opts.optdupdist)
- stdouts,rval = pic.runPic(opts.jar, cl)
-
- elif pic.picname == 'CollectAlignmentSummaryMetrics':
- # Why do we do this fakefasta thing?
- # Because we need NO fai to be available or picard barfs unless it matches the input data.
- # why? Dunno Seems to work without complaining if the .bai file is AWOL....
- fakefasta = os.path.join(opts.outdir,'%s_fake.fasta' % os.path.basename(ref_file_name))
- try:
- os.symlink(ref_file_name,fakefasta)
- except:
- s = '## unable to symlink %s to %s - different devices? Will shutil.copy'
- info = s
- shutil.copy(ref_file_name,fakefasta)
- pic.delme.append(fakefasta)
- cl.append('ASSUME_SORTED=true')
- adaptlist = opts.adaptors.split(',')
- adaptorseqs = ['ADAPTER_SEQUENCE=%s' % x for x in adaptlist]
- cl += adaptorseqs
- cl.append('IS_BISULFITE_SEQUENCED=%s' % opts.bisulphite)
- cl.append('MAX_INSERT_SIZE=%s' % opts.maxinsert)
- cl.append('OUTPUT=%s' % pic.metricsOut)
- cl.append('R=%s' % fakefasta)
- cl.append('TMP_DIR=%s' % opts.tmpdir)
- if not opts.assumesorted.lower() == 'true': # we need to sort input
- sortedfile = '%s.sorted' % os.path.basename(opts.input)
- if opts.datatype == 'sam': # need to work with a bam
- tlog,tempbam,trval = pic.samToBam(opts.input,opts.outdir)
- pic.delme.append(tempbam)
- try:
- tlog = pic.sortSam(tempbam,sortedfile,opts.outdir)
- except:
- print '## exception on sorting sam file %s' % opts.input
- else: # is already bam
- try:
- tlog = pic.sortSam(opts.input,sortedfile,opts.outdir)
- except : # bug - [bam_sort_core] not being ignored - TODO fixme
- print '## exception %s on sorting bam file %s' % (sys.exc_info()[0],opts.input)
- cl.append('INPUT=%s.bam' % os.path.abspath(os.path.join(opts.outdir,sortedfile)))
- pic.delme.append(os.path.join(opts.outdir,sortedfile))
- else:
- cl.append('INPUT=%s' % os.path.abspath(opts.input))
- stdouts,rval = pic.runPic(opts.jar, cl)
-
-
- elif pic.picname == 'CollectGcBiasMetrics':
- assert os.path.isfile(ref_file_name),'PicardGC needs a reference sequence - cannot read %s' % ref_file_name
- # sigh. Why do we do this fakefasta thing? Because we need NO fai to be available or picard barfs unless it has the same length as the input data.
- # why? Dunno
- fakefasta = os.path.join(opts.outdir,'%s_fake.fasta' % os.path.basename(ref_file_name))
- try:
- os.symlink(ref_file_name,fakefasta)
- except:
- s = '## unable to symlink %s to %s - different devices? May need to replace with shutil.copy'
- info = s
- shutil.copy(ref_file_name,fakefasta)
- pic.delme.append(fakefasta)
- x = 'rgPicardGCBiasMetrics'
- pdfname = '%s.pdf' % x
- jpgname = '%s.jpg' % x
- tempout = os.path.join(opts.outdir,'rgPicardGCBiasMetrics.out')
- temppdf = os.path.join(opts.outdir,pdfname)
- cl.append('R=%s' % fakefasta)
- cl.append('WINDOW_SIZE=%s' % opts.windowsize)
- cl.append('MINIMUM_GENOME_FRACTION=%s' % opts.mingenomefrac)
- cl.append('INPUT=%s' % opts.input)
- cl.append('OUTPUT=%s' % tempout)
- cl.append('TMP_DIR=%s' % opts.tmpdir)
- cl.append('CHART_OUTPUT=%s' % temppdf)
- cl.append('SUMMARY_OUTPUT=%s' % pic.metricsOut)
- stdouts,rval = pic.runPic(opts.jar, cl)
- if os.path.isfile(temppdf):
- cl2 = ['convert','-resize x400',temppdf,os.path.join(opts.outdir,jpgname)] # make the jpg for fixPicardOutputs to find
- s,stdouts,rval = pic.runCL(cl=cl2,output_dir=opts.outdir)
- else:
- s='### runGC: Unable to find pdf %s - please check the log for the causal problem\n' % temppdf
- lf = open(pic.log_filename,'a')
- lf.write(s)
- lf.write('\n')
- lf.close()
-
- elif pic.picname == 'CollectInsertSizeMetrics':
- """
- picard_wrapper.py -i "$input_file" -n "$out_prefix" --tmpdir "${__new_file_path__}" --deviations "$deviations"
- --histwidth "$histWidth" --minpct "$minPct" --malevel "$malevel"
- -j "${GALAXY_DATA_INDEX_DIR}/shared/jars/picard/CollectInsertSizeMetrics.jar" -d "$html_file.files_path" -t "$html_file"
-
- """
- isPDF = 'InsertSizeHist.pdf'
- pdfpath = os.path.join(opts.outdir,isPDF)
- histpdf = 'InsertSizeHist.pdf'
- cl.append('I=%s' % opts.input)
- cl.append('O=%s' % pic.metricsOut)
- cl.append('HISTOGRAM_FILE=%s' % histpdf)
- #if opts.taillimit <> '0': # this was deprecated although still mentioned in the docs at 1.56
- # cl.append('TAIL_LIMIT=%s' % opts.taillimit)
- if opts.histwidth <> '0':
- cl.append('HISTOGRAM_WIDTH=%s' % opts.histwidth)
- if float( opts.minpct) > 0.0:
- cl.append('MINIMUM_PCT=%s' % opts.minpct)
- if float(opts.deviations) > 0.0:
- cl.append('DEVIATIONS=%s' % opts.deviations)
- if opts.malevel:
- malists = opts.malevel.split(',')
- malist = ['METRIC_ACCUMULATION_LEVEL=%s' % x for x in malists]
- cl += malist
- stdouts,rval = pic.runPic(opts.jar, cl)
- if os.path.exists(pdfpath): # automake thumbnail - will be added to html
- cl2 = ['mogrify', '-format jpg -resize x400 %s' % pdfpath]
- pic.runCL(cl=cl2,output_dir=opts.outdir)
- else:
- s = 'Unable to find expected pdf file %s \n' % pdfpath
- s += 'This always happens if single ended data was provided to this tool,\n'
- s += 'so please double check that your input data really is paired-end NGS data. \n'
- s += 'If your input was paired data this may be a bug worth reporting to the galaxy-bugs list\n '
- logging.info(s)
- if len(stdouts) > 0:
- logging.info(stdouts)
-
- elif pic.picname == 'MarkDuplicates':
- # assume sorted even if header says otherwise
- cl.append('ASSUME_SORTED=%s' % (opts.assumesorted))
- # input
- cl.append('INPUT=%s' % opts.input)
- # outputs
- cl.append('OUTPUT=%s' % opts.output)
- cl.append('METRICS_FILE=%s' % pic.metricsOut )
- # remove or mark duplicates
- cl.append('REMOVE_DUPLICATES=%s' % opts.remdups)
- # the regular expression to be used to parse reads in incoming SAM file
- cl.append('READ_NAME_REGEX="%s"' % opts.readregex)
- # maximum offset between two duplicate clusters
- cl.append('OPTICAL_DUPLICATE_PIXEL_DISTANCE=%s' % opts.optdupdist)
- stdouts,rval = pic.runPic(opts.jar, cl)
-
- elif pic.picname == 'FixMateInformation':
- cl.append('I=%s' % opts.input)
- cl.append('O=%s' % tempout)
- cl.append('SORT_ORDER=%s' % opts.sortorder)
- stdouts,rval = pic.runPic(opts.jar,cl)
- haveTempout = True
-
- elif pic.picname == 'ReorderSam':
- # input
- cl.append('INPUT=%s' % opts.input)
- # output
- cl.append('OUTPUT=%s' % tempout)
- # reference
- cl.append('REFERENCE=%s' % ref_file_name)
- # incomplete dict concordance
- if opts.allow_inc_dict_concord == 'true':
- cl.append('ALLOW_INCOMPLETE_DICT_CONCORDANCE=true')
- # contig length discordance
- if opts.allow_contig_len_discord == 'true':
- cl.append('ALLOW_CONTIG_LENGTH_DISCORDANCE=true')
- stdouts,rval = pic.runPic(opts.jar, cl)
- haveTempout = True
-
- elif pic.picname == 'ReplaceSamHeader':
- cl.append('INPUT=%s' % opts.input)
- cl.append('OUTPUT=%s' % tempout)
- cl.append('HEADER=%s' % opts.header_file)
- stdouts,rval = pic.runPic(opts.jar, cl)
- haveTempout = True
-
- elif pic.picname == 'CalculateHsMetrics':
- maxloglines = 100
- baitfname = os.path.join(opts.outdir,'rgPicardHsMetrics.bait')
- targetfname = os.path.join(opts.outdir,'rgPicardHsMetrics.target')
- baitf = pic.makePicInterval(opts.baitbed,baitfname)
- if opts.targetbed == opts.baitbed: # same file sometimes
- targetf = baitf
- else:
- targetf = pic.makePicInterval(opts.targetbed,targetfname)
- cl.append('BAIT_INTERVALS=%s' % baitf)
- cl.append('TARGET_INTERVALS=%s' % targetf)
- cl.append('INPUT=%s' % os.path.abspath(opts.input))
- cl.append('OUTPUT=%s' % pic.metricsOut)
- cl.append('TMP_DIR=%s' % opts.tmpdir)
- stdouts,rval = pic.runPic(opts.jar,cl)
-
- elif pic.picname == 'ValidateSamFile':
- import pysam
- doTranspose = False
- sortedfile = os.path.join(opts.outdir,'rgValidate.sorted')
- stf = open(pic.log_filename,'w')
- tlog = None
- if opts.datatype == 'sam': # need to work with a bam
- tlog,tempbam,rval = pic.samToBam(opts.input,opts.outdir)
- try:
- tlog = pic.sortSam(tempbam,sortedfile,opts.outdir)
- except:
- print '## exception on sorting sam file %s' % opts.input
- else: # is already bam
- try:
- tlog = pic.sortSam(opts.input,sortedfile,opts.outdir)
- except: # bug - [bam_sort_core] not being ignored - TODO fixme
- print '## exception on sorting bam file %s' % opts.input
- if tlog:
- print '##tlog=',tlog
- stf.write(tlog)
- stf.write('\n')
- sortedfile = '%s.bam' % sortedfile # samtools does that
- cl.append('O=%s' % pic.metricsOut)
- cl.append('TMP_DIR=%s' % opts.tmpdir)
- cl.append('I=%s' % sortedfile)
- opts.maxerrors = '99999999'
- cl.append('MAX_OUTPUT=%s' % opts.maxerrors)
- if opts.ignoreflags[0] <> 'None': # picard error values to ignore
- igs = ['IGNORE=%s' % x for x in opts.ignoreflags if x <> 'None']
- cl.append(' '.join(igs))
- if opts.bisulphite.lower() <> 'false':
- cl.append('IS_BISULFITE_SEQUENCED=true')
- if opts.ref <> None or opts.ref_file <> None:
- cl.append('R=%s' % ref_file_name)
- stdouts,rval = pic.runPic(opts.jar,cl)
- if opts.datatype == 'sam':
- pic.delme.append(tempbam)
- newsam = opts.output
- outformat = 'bam'
- pe = open(pic.metricsOut,'r').readlines()
- pic.cleanSam(insam=sortedfile, newsam=newsam, picardErrors=pe,outformat=outformat)
- pic.delme.append(sortedfile) # not wanted
- stf.close()
- pic.cleanup()
- else:
- print >> sys.stderr,'picard.py got an unknown tool name - %s' % pic.picname
- sys.exit(1)
- if haveTempout:
- # Some Picard tools produced a potentially intermediate bam file.
- # Either just move to final location or create sam
- if os.path.exists(tempout):
- shutil.move(tempout, os.path.abspath(opts.output))
- if opts.htmlout <> None or doFix: # return a pretty html page
- pic.fixPicardOutputs(transpose=doTranspose,maxloglines=maxloglines)
- if rval <> 0:
- print >> sys.stderr, '## exit code=%d; stdout=%s' % (rval,stdouts)
- # signal failure
-if __name__=="__main__": __main__()
-
diff -r ab1f60c26526 -r 3d4f1fa26f0e rgPicardASMetrics.xml
--- a/rgPicardASMetrics.xml Fri Feb 21 12:07:49 2014 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,162 +0,0 @@
-
-
- picard_wrapper.py -i "${input_file}" -d "${html_file.files_path}" -t "${html_file}"
- --assumesorted "${sorted}" -b "${bisulphite}" --adaptors "${adaptors}" --maxinsert "${maxinsert}" -n "${out_prefix}" --datatype "${input_file.ext}"
- -j \$JAVA_JAR_PATH/CollectAlignmentSummaryMetrics.jar --tmpdir "${__new_file_path__}"
-#if $genomeSource.refGenomeSource == "history":
- --ref-file "${genomeSource.ownFile}"
-#else
- --ref "${genomeSource.index.fields.path}"
-#end if
-
- picard
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-.. class:: infomark
-
-**Summary**
-
-This Galaxy tool uses Picard to report high-level measures of alignment based on a provided sam or bam file.
-
-**Picard documentation**
-
-This is a Galaxy wrapper for CollectAlignmentSummaryMetrics, a part of the external package Picard-tools_.
-
- .. _Picard-tools: http://www.google.com/search?q=picard+samtools
-
------
-
-.. class:: infomark
-
-**Syntax**
-
-- **Input** - SAM/BAM format aligned short read data in your current history
-- **Title** - the title to use for all output files from this job - use it for high level metadata
-- **Reference Genome** - Galaxy (and Picard) needs to know which genomic reference was used to generate alignemnts within the input SAM/BAM dataset. Here you have three choices:
-
- - *Assigned data genome/build* - a genome specified for this dataset. If you your SAM/BAM dataset has an assigned reference genome it will be displayed below this dropdown. If it does not -> use one of the following two options.
- - *Select a different built-in genome* - this option will list all reference genomes presently cached at this instance of Galaxy.
- - *Select a reference genome from history* - alternatively you can upload your own version of reference genome into your history and use it with this option. This is however not advisable with large human-sized genomes. If your genome is large contact Galaxy team using "Help" link at the top of the interface and provide exact details on where we can download sequences you would like to use as the refenece. We will then install them as a part of locally cached genomic references.
-
-- **Assume Sorted** - saves sorting time - but only if true!
-- **Bisulphite data** - see Picard documentation http://picard.sourceforge.net/command-line-overview.shtml#CollectAlignmentSummaryMetrics
-- **Maximum acceptable insertion length** - see Picard documentation at http://picard.sourceforge.net/command-line-overview.shtml#CollectAlignmentSummaryMetrics
-
------
-
-.. class:: infomark
-
-**Inputs, outputs, and parameters**
-
-The Picard documentation (reformatted for Galaxy) says:
-
-.. csv-table::
- :header-rows: 1
-
- Option,Description
- "INPUT=File","SAM or BAM file Required."
- "OUTPUT=File","File to write insert size metrics to Required."
- "REFERENCE_SEQUENCE=File","Reference sequence file Required."
- "ASSUME_SORTED=Boolean","If true (default), unsorted SAM/BAM files will be considerd coordinate sorted "
- "MAX_INSERT_SIZE=Integer","Paired end reads above this insert size will be considered chimeric along with inter-chromosomal pairs. Default value: 100000."
- "ADAPTER_SEQUENCE=String","This option may be specified 0 or more times. "
- "IS_BISULFITE_SEQUENCED=Boolean","Whether the SAM or BAM file consists of bisulfite sequenced reads. Default value: false. "
- "CREATE_MD5_FILE=Boolean","Whether to create an MD5 digest for any BAM files created."
-
-The output produced by the tool has the following columns::
-
- 1. CATEGORY: One of either UNPAIRED (for a fragment run), FIRST_OF_PAIR when metrics are for only the first read in a paired run, SECOND_OF_PAIR when the metrics are for only the second read in a paired run or PAIR when the metrics are aggregeted for both first and second reads in a pair.
- 2. TOTAL_READS: The total number of reads including all PF and non-PF reads. When CATEGORY equals PAIR this value will be 2x the number of clusters.
- 3. PF_READS: The number of PF reads where PF is defined as passing Illumina's filter.
- 4. PCT_PF_READS: The percentage of reads that are PF (PF_READS / TOTAL_READS)
- 5. PF_NOISE_READS: The number of PF reads that are marked as noise reads. A noise read is one which is composed entirey of A bases and/or N bases. These reads are marked as they are usually artifactual and are of no use in downstream analysis.
- 6. PF_READS_ALIGNED: The number of PF reads that were aligned to the reference sequence. This includes reads that aligned with low quality (i.e. their alignments are ambiguous).
- 7. PCT_PF_READS_ALIGNED: The percentage of PF reads that aligned to the reference sequence. PF_READS_ALIGNED / PF_READS
- 8. PF_HQ_ALIGNED_READS: The number of PF reads that were aligned to the reference sequence with a mapping quality of Q20 or higher signifying that the aligner estimates a 1/100 (or smaller) chance that the alignment is wrong.
- 9. PF_HQ_ALIGNED_BASES: The number of bases aligned to the reference sequence in reads that were mapped at high quality. Will usually approximate PF_HQ_ALIGNED_READS * READ_LENGTH but may differ when either mixed read lengths are present or many reads are aligned with gaps.
- 10. PF_HQ_ALIGNED_Q20_BASES: The subest of PF_HQ_ALIGNED_BASES where the base call quality was Q20 or higher.
- 11. PF_HQ_MEDIAN_MISMATCHES: The median number of mismatches versus the reference sequence in reads that were aligned to the reference at high quality (i.e. PF_HQ_ALIGNED READS).
- 12. PF_HQ_ERROR_RATE: The percentage of bases that mismatch the reference in PF HQ aligned reads.
- 13. MEAN_READ_LENGTH: The mean read length of the set of reads examined. When looking at the data for a single lane with equal length reads this number is just the read length. When looking at data for merged lanes with differing read lengths this is the mean read length of all reads.
- 14. READS_ALIGNED_IN_PAIRS: The number of aligned reads who's mate pair was also aligned to the reference.
- 15. PCT_READS_ALIGNED_IN_PAIRS: The percentage of reads who's mate pair was also aligned to the reference. READS_ALIGNED_IN_PAIRS / PF_READS_ALIGNED
- 16. BAD_CYCLES: The number of instrument cycles in which 80% or more of base calls were no-calls.
- 17. STRAND_BALANCE: The number of PF reads aligned to the positive strand of the genome divided by the number of PF reads aligned to the genome.
- 18. PCT_CHIMERAS: The percentage of reads that map outside of a maximum insert size (usually 100kb) or that have the two ends mapping to different chromosomes.
- 19. PCT_ADAPTER: The percentage of PF reads that are unaligned and match to a known adapter sequence right from the start of the read.
-
-.. class:: warningmark
-
-**Warning on SAM/BAM quality**
-
-Many SAM/BAM files produced externally and uploaded to Galaxy do not fully conform to SAM/BAM specifications. Galaxy deals with this by using the **LENIENT**
-flag when it runs Picard, which allows reads to be discarded if they're empty or don't map. This appears
-to be the only way to deal with SAM/BAM that cannot be parsed.
-
-
-
-
-
diff -r ab1f60c26526 -r 3d4f1fa26f0e rgPicardFixMate.xml
--- a/rgPicardFixMate.xml Fri Feb 21 12:07:49 2014 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,107 +0,0 @@
-
- for paired data
-
- picard_wrapper.py -i "${input_file}" -o "${out_file}" --tmpdir "${__new_file_path__}" -n "${out_prefix}"
- --output-format "${outputFormat}" -j "\$JAVA_JAR_PATH/FixMateInformation.jar" --sortorder "${sortOrder}"
-
- picard
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-.. class:: infomark
-
-**Purpose**
-
-Ensure that all mate-pair information is in sync between each read and it's mate pair.
-
-**Picard documentation**
-
-This is a Galaxy wrapper for FixMateInformation, a part of the external package Picard-tools_.
-
- .. _Picard-tools: http://www.google.com/search?q=picard+samtools
-
-.. class:: warningmark
-
-**Useful for paired data only**
-
-Likely won't do anything helpful for single end sequence data
-Currently, Galaxy doesn't distinguish paired from single ended SAM/BAM so make sure
-the data you choose are valid (paired end) SAM or BAM data - unless you trust this
-tool not to harm your data.
-
------
-
-.. class:: infomark
-
-**Syntax**
-
-- **Input** - a paired read sam/bam format aligned short read data in your current history
-- **Sort order** - can be used to adjust the ordering of reads
-- **Title** - the title to use for all output files from this job - use it for high level metadata
-- **Output Format** - either SAM or compressed as BAM
-
------
-
-.. class:: infomark
-
-**Inputs, outputs, and parameters**
-
-.. csv-table::
-
- :header-rows: 1
-
- Option,Description
- "INPUT=File","The input file to fix. This option may be specified 0 or more times."
- "OUTPUT=File","The output file to write to"
- "SORT_ORDER=SortOrder","Optional sort order if the OUTPUT file should be sorted differently than the INPUT file. Default value: null. Possible values: {unsorted, queryname, coordinate}"
- "CREATE_MD5_FILE=Boolean","Whether to create an MD5 digest for any BAM files created. Default value: false"
-
-.. class:: warningmark
-
-**Warning on SAM/BAM quality**
-
-Many SAM/BAM files produced externally and uploaded to Galaxy do not fully conform to SAM/BAM specifications. Galaxy deals with this by using the **LENIENT**
-flag when it runs Picard, which allows reads to be discarded if they're empty or don't map. This appears
-to be the only way to deal with SAM/BAM that cannot be parsed.
-
-
-
-
-
-
diff -r ab1f60c26526 -r 3d4f1fa26f0e rgPicardGCBiasMetrics.xml
--- a/rgPicardGCBiasMetrics.xml Fri Feb 21 12:07:49 2014 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,150 +0,0 @@
-
-
- picard_wrapper.py -i "${input_file}" -d "${html_file.files_path}" -t "${html_file}"
- --windowsize "${windowsize}" --mingenomefrac "${mingenomefrac}" -n "${out_prefix}" --tmpdir "${__new_file_path__}"
- -j "\$JAVA_JAR_PATH/CollectGcBiasMetrics.jar"
-#if $genomeSource.refGenomeSource == "history":
- --ref-file "${genomeSource.ownFile}"
-#else:
- --ref "${genomeSource.index.fields.path}"
-#end if
-
- picard
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-.. class:: infomark
-
-**Summary**
-
-This Galaxy tool uses Picard to report detailed metrics about reads that fall within windows of a certain GC bin on the reference genome.
-
-**Picard documentation**
-
-This is a Galaxy wrapper for CollectGcBiasMetrics, a part of the external package Picard-tools_.
-
- .. _Picard-tools: http://www.google.com/search?q=picard+samtools
-
------
-
-.. class:: infomark
-
-**Syntax**
-
-- **Input** - SAM/BAM format aligned short read data in your current history
-- **Title** - the title to use for all output files from this job - use it for high level metadata
-- **Reference Genome** - Galaxy (and Picard) needs to know which genomic reference was used to generate alignemnts within the input SAM/BAM dataset. Here you have three choices:
-
- - *Assigned data genome/build* - a genome specified for this dataset. If you your SAM/BAM dataset has an assigned reference genome it will be displayed below this dropdown. If it does not -> use one of the following two options.
- - *Select a different built-in genome* - this option will list all reference genomes presently cached at this instance of Galaxy.
- - *Select a reference genome from history* - alternatively you can upload your own version of reference genome into your history and use it with this option. This is however not advisable with large human-sized genomes. If your genome is large contact Galaxy team using "Help" link at the top of the interface and provide exact details on where we can download sequences you would like to use as the refenece. We will then install them as a part of locally cached genomic references.
-
-- **Window Size** see Picard documentation http://picard.sourceforge.net/command-line-overview.shtml#CollectGCBiasMetrics
-- **Minimum Genome Fraction** See Picard documentation at http://picard.sourceforge.net/command-line-overview.shtml#CollectGCBiasMetrics
-
------
-
-.. class:: infomark
-
-**Inputs, outputs, and parameters**
-
-The Picard documentation (reformatted for Galaxy) says:
-
-.. csv-table::
- :header-rows: 1
-
- Option,Description
- "REFERENCE_SEQUENCE=File","The reference sequence fasta file. Required."
- "INPUT=File","The BAM or SAM file containing aligned reads. Required."
- "OUTPUT=File","The text file to write the metrics table to. Required."
- "CHART_OUTPUT=File","The PDF file to render the chart to. Required."
- "SUMMARY_OUTPUT=File","The text file to write summary metrics to. Default value: null."
- "WINDOW_SIZE=Integer","The size of windows on the genome that are used to bin reads. Default value: 100."
- "MINIMUM_GENOME_FRACTION=Double","For summary metrics, exclude GC windows that include less than this fraction of the genome. Default value: 1.0E-5."
- "CREATE_MD5_FILE=Boolean","Whether to create an MD5 digest for any BAM files created. Default value: false."
-
-The output produced by the tool has the following columns::
-
- 1. GC: The G+C content of the reference sequence represented by this bin. Values are from 0% to 100%
- 2. WINDOWS: The number of windows on the reference genome that have this G+C content.
- 3. READ_STARTS: The number of reads who's start position is at the start of a window of this GC.
- 4. MEAN_BASE_QUALITY: The mean quality (determined via the error rate) of all bases of all reads that are assigned to windows of this GC.
- 5. NORMALIZED_COVERAGE: The ration of "coverage" in this GC bin vs. the mean coverage of all GC bins. A number of 1 represents mean coverage, a number less than one represents lower than mean coverage (e.g. 0.5 means half as much coverage as average) while a number greater than one represents higher than mean coverage (e.g. 3.1 means this GC bin has 3.1 times more reads per window than average).
- 6. ERROR_BAR_WIDTH: The radius of error bars in this bin based on the number of observations made. For example if the normalized coverage is 0.75 and the error bar width is 0.1 then the error bars would be drawn from 0.65 to 0.85.
-
-.. class:: warningmark
-
-**Warning on SAM/BAM quality**
-
-Many SAM/BAM files produced externally and uploaded to Galaxy do not fully conform to SAM/BAM specifications. Galaxy deals with this by using the **LENIENT**
-flag when it runs Picard, which allows reads to be discarded if they're empty or don't map. This appears
-to be the only way to deal with SAM/BAM that cannot be parsed.
-
-
-
diff -r ab1f60c26526 -r 3d4f1fa26f0e rgPicardHsMetrics.xml
--- a/rgPicardHsMetrics.xml Fri Feb 21 12:07:49 2014 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,156 +0,0 @@
-
- for targeted resequencing data
-
-
- picard_wrapper.py -i "${input_file}" -d "${html_file.files_path}" -t "${html_file}" --datatype "${input_file.ext}"
- --baitbed "${bait_bed}" --targetbed "${target_bed}" -n "${out_prefix}" --tmpdir "${__new_file_path__}"
- -j "\$JAVA_JAR_PATH/CalculateHsMetrics.jar"
-
-
- picard
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-.. class:: infomark
-
-**Summary**
-
-Calculates a set of Hybrid Selection specific metrics from an aligned SAM or BAM file.
-
-.. class:: warnmark
-
-**WARNING about bait and target files**
-
-Picard is very fussy about the bait and target file format. If these are not exactly right, it will fail with an error something like:
-
-Exception in thread "main" net.sf.picard.PicardException: Invalid interval record contains 6 fields: chr1 45787123 45787316 CASO_22G_25063 1000 +
-
-If you see an error like that from this tool, please do NOT report it to any of the Galaxy mailing lists as it is not a bug!
-It means you must reformat your bait and target files. Galaxy cannot do that for you automatically unfortunately.
-
-The required definition is described in the documentation at http://www.broadinstitute.org/gsa/wiki/index.php/Built-in_command-line_arguments
-and the sample provided looks like this:
-
-chr1 1104841 1104940 + target_1
-chr1 1105283 1105599 + target_2
-chr1 1105712 1105860 + target_3
-chr1 1105960 1106119 + target_4
-
-So your bait and target files MUST have 5 columns with chr, start, end, strand and name tab delimited and in exactly that order.
-Note that the Picard mandated sam header described in the documentation linked above is automagically added by the tool in Galaxy.
-
-.. class:: infomark
-
-**Picard documentation**
-
-This is a Galaxy wrapper for CalculateHsMetrics.jar, a part of the external package Picard-tools_.
-
- .. _Picard-tools: http://www.google.com/search?q=picard+samtools
-
------
-
-.. class:: infomark
-
-**Inputs, outputs, and parameters**
-
-Picard documentation says (reformatted for Galaxy):
-
-Calculates a set of Hybrid Selection specific metrics from an aligned SAM or BAM file.
-
-.. csv-table::
- :header-rows: 1
-
- "Option", "Description"
- "BAIT_INTERVALS=File","An interval list file that contains the locations of the baits used. Required."
- "TARGET_INTERVALS=File","An interval list file that contains the locations of the targets. Required."
- "INPUT=File","An aligned SAM or BAM file. Required."
- "OUTPUT=File","The output file to write the metrics to. Required. Cannot be used in conjuction with option(s) METRICS_FILE (M)"
- "METRICS_FILE=File","Legacy synonym for OUTPUT, should not be used. Required. Cannot be used in conjuction with option(s) OUTPUT (O)"
- "CREATE_MD5_FILE=Boolean","Whether to create an MD5 digest for any BAM files created. Default value: false"
-
-HsMetrics
-
- The set of metrics captured that are specific to a hybrid selection analysis.
-
-Output Column Definitions::
-
- 1. BAIT_SET: The name of the bait set used in the hybrid selection.
- 2. GENOME_SIZE: The number of bases in the reference genome used for alignment.
- 3. BAIT_TERRITORY: The number of bases which have one or more baits on top of them.
- 4. TARGET_TERRITORY: The unique number of target bases in the experiment where target is usually exons etc.
- 5. BAIT_DESIGN_EFFICIENCY: Target terrirtoy / bait territory. 1 == perfectly efficient, 0.5 = half of baited bases are not target.
- 6. TOTAL_READS: The total number of reads in the SAM or BAM file examine.
- 7. PF_READS: The number of reads that pass the vendor's filter.
- 8. PF_UNIQUE_READS: The number of PF reads that are not marked as duplicates.
- 9. PCT_PF_READS: PF reads / total reads. The percent of reads passing filter.
- 10. PCT_PF_UQ_READS: PF Unique Reads / Total Reads.
- 11. PF_UQ_READS_ALIGNED: The number of PF unique reads that are aligned with mapping score > 0 to the reference genome.
- 12. PCT_PF_UQ_READS_ALIGNED: PF Reads Aligned / PF Reads.
- 13. PF_UQ_BASES_ALIGNED: The number of bases in the PF aligned reads that are mapped to a reference base. Accounts for clipping and gaps.
- 14. ON_BAIT_BASES: The number of PF aligned bases that mapped to a baited region of the genome.
- 15. NEAR_BAIT_BASES: The number of PF aligned bases that mapped to within a fixed interval of a baited region, but not on a baited region.
- 16. OFF_BAIT_BASES: The number of PF aligned bases that mapped to neither on or near a bait.
- 17. ON_TARGET_BASES: The number of PF aligned bases that mapped to a targetted region of the genome.
- 18. PCT_SELECTED_BASES: On+Near Bait Bases / PF Bases Aligned.
- 19. PCT_OFF_BAIT: The percentage of aligned PF bases that mapped neither on or near a bait.
- 20. ON_BAIT_VS_SELECTED: The percentage of on+near bait bases that are on as opposed to near.
- 21. MEAN_BAIT_COVERAGE: The mean coverage of all baits in the experiment.
- 22. MEAN_TARGET_COVERAGE: The mean coverage of targets that recieved at least coverage depth = 2 at one base.
- 23. PCT_USABLE_BASES_ON_BAIT: The number of aligned, de-duped, on-bait bases out of the PF bases available.
- 24. PCT_USABLE_BASES_ON_TARGET: The number of aligned, de-duped, on-target bases out of the PF bases available.
- 25. FOLD_ENRICHMENT: The fold by which the baited region has been amplified above genomic background.
- 26. ZERO_CVG_TARGETS_PCT: The number of targets that did not reach coverage=2 over any base.
- 27. FOLD_80_BASE_PENALTY: The fold over-coverage necessary to raise 80% of bases in "non-zero-cvg" targets to the mean coverage level in those targets.
- 28. PCT_TARGET_BASES_2X: The percentage of ALL target bases acheiving 2X or greater coverage.
- 29. PCT_TARGET_BASES_10X: The percentage of ALL target bases acheiving 10X or greater coverage.
- 30. PCT_TARGET_BASES_20X: The percentage of ALL target bases acheiving 20X or greater coverage.
- 31. PCT_TARGET_BASES_30X: The percentage of ALL target bases acheiving 30X or greater coverage.
- 32. HS_LIBRARY_SIZE: The estimated number of unique molecules in the selected part of the library.
- 33. HS_PENALTY_10X: The "hybrid selection penalty" incurred to get 80% of target bases to 10X. This metric should be interpreted as: if I have a design with 10 megabases of target, and want to get 10X coverage I need to sequence until PF_ALIGNED_BASES = 10^6 * 10 * HS_PENALTY_10X.
- 34. HS_PENALTY_20X: The "hybrid selection penalty" incurred to get 80% of target bases to 20X. This metric should be interpreted as: if I have a design with 10 megabases of target, and want to get 20X coverage I need to sequence until PF_ALIGNED_BASES = 10^6 * 20 * HS_PENALTY_20X.
- 35. HS_PENALTY_30X: The "hybrid selection penalty" incurred to get 80% of target bases to 10X. This metric should be interpreted as: if I have a design with 10 megabases of target, and want to get 30X coverage I need to sequence until PF_ALIGNED_BASES = 10^6 * 30 * HS_PENALTY_30X.
-
-.. class:: warningmark
-
-**Warning on SAM/BAM quality**
-
-Many SAM/BAM files produced externally and uploaded to Galaxy do not fully conform to SAM/BAM specifications. Galaxy deals with this by using the **LENIENT**
-flag when it runs Picard, which allows reads to be discarded if they're empty or don't map. This appears to be the only way to deal with SAM/BAM that cannot be parsed.
-
-
-
-
diff -r ab1f60c26526 -r 3d4f1fa26f0e rgPicardInsertSize.xml
--- a/rgPicardInsertSize.xml Fri Feb 21 12:07:49 2014 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,97 +0,0 @@
-
- for PAIRED data
- picard
-
- picard_wrapper.py -i "${input_file}" -n "${out_prefix}" --tmpdir "${__new_file_path__}" --deviations "${deviations}"
- --histwidth "${histWidth}" --minpct "${minPct}" --malevel "${malevel}"
- -j "\$JAVA_JAR_PATH/CollectInsertSizeMetrics.jar" -d "${html_file.files_path}" -t "${html_file}"
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-.. class:: infomark
-
-**Purpose**
-
-Reads a SAM or BAM file and describes the distribution
-of insert size (excluding duplicates) with metrics and a histogram plot.
-
-**Picard documentation**
-
-This is a Galaxy wrapper for CollectInsertSizeMetrics, a part of the external package Picard-tools_.
-
- .. _Picard-tools: http://www.google.com/search?q=picard+samtools
-
-.. class:: warningmark
-
-**Useful for paired data only**
-
-This tool works for paired data only and can be expected to fail for single end data.
-
------
-
-.. class:: infomark
-
-**Inputs, outputs, and parameters**
-
-Picard documentation says (reformatted for Galaxy):
-
-.. csv-table::
- :header-rows: 1
-
- Option,Description
- "INPUT=File","SAM or BAM file Required."
- "OUTPUT=File","File to write insert size metrics to Required."
- "HISTOGRAM_FILE=File","File to write insert size histogram chart to Required."
- "TAIL_LIMIT=Integer","When calculating mean and stdev stop when the bins in the tail of the distribution contain fewer than mode/TAIL_LIMIT items. This also limits how much data goes into each data category of the histogram."
- "HISTOGRAM_WIDTH=Integer","Explicitly sets the histogram width, overriding the TAIL_LIMIT option. Also, when calculating mean and stdev, only bins LE HISTOGRAM_WIDTH will be included. "
- "MINIMUM_PCT=Float","When generating the histogram, discard any data categories (out of FR, TANDEM, RF) that have fewer than this percentage of overall reads. (Range: 0 to 1) Default value: 0.01."
- "STOP_AFTER=Integer","Stop after processing N reads, mainly for debugging. Default value: 0."
- "CREATE_MD5_FILE=Boolean","Whether to create an MD5 digest for any BAM files created. Default value: false."
-
-.. class:: warningmark
-
-**Warning on SAM/BAM quality**
-
-Many SAM/BAM files produced externally and uploaded to Galaxy do not fully conform to SAM/BAM specifications. Galaxy deals with this by using the **LENIENT**
-flag when it runs Picard, which allows reads to be discarded if they're empty or don't map. This appears
-to be the only way to deal with SAM/BAM that cannot be parsed.
-
-
-
diff -r ab1f60c26526 -r 3d4f1fa26f0e rgPicardLibComplexity.xml
--- a/rgPicardLibComplexity.xml Fri Feb 21 12:07:49 2014 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,123 +0,0 @@
-
- picard
-
- picard_wrapper.py -i "${input_file}" -n "${out_prefix}" --tmpdir "${__new_file_path__}" --minid "${minIDbases}"
- --maxdiff "${maxDiff}" --minmeanq "${minMeanQ}" --readregex "${readRegex}" --optdupdist "${optDupeDist}"
- -j "\$JAVA_JAR_PATH/EstimateLibraryComplexity.jar" -d "${html_file.files_path}" -t "${html_file}"
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-.. class:: infomark
-
-**Purpose**
-
-Attempts to estimate library complexity from sequence alone.
-Does so by sorting all reads by the first N bases (5 by default) of each read and then
-comparing reads with the first N bases identical to each other for duplicates. Reads are considered to be
-duplicates if they match each other with no gaps and an overall mismatch rate less than or equal to MAX_DIFF_RATE (0.03 by default).
-
-Reads of poor quality are filtered out so as to provide a more accurate estimate.
-The filtering removes reads with any no-calls in the first N bases or with a mean base quality lower than
-MIN_MEAN_QUALITY across either the first or second read.
-
-The algorithm attempts to detect optical duplicates separately from PCR duplicates and excludes these in the
-calculation of library size. Also, since there is no alignment to screen out technical reads one
-further filter is applied on the data. After examining all reads a histogram is built of
-[#reads in duplicate set -> #of duplicate sets]; all bins that contain exactly one duplicate set are
-then removed from the histogram as outliers before library size is estimated.
-
-**Picard documentation**
-
-This is a Galaxy wrapper for EstimateLibraryComplexity, a part of the external package Picard-tools_.
-
- .. _Picard-tools: http://www.google.com/search?q=picard+samtools
-
------
-
-.. class:: infomark
-
-**Inputs, outputs, and parameters**
-
-Picard documentation says (reformatted for Galaxy):
-
-.. csv-table::
- :header-rows: 1
-
- Option Description
- "INPUT=File","One or more files to combine and estimate library complexity from. Reads can be mapped or unmapped. This option may be specified 0 or more times."
- "OUTPUT=File","Output file to writes per-library metrics to. Required."
- "MIN_IDENTICAL_BASES=Integer","The minimum number of bases at the starts of reads that must be identical for reads to be grouped together for duplicate detection. In effect total_reads / 4^max_id_bases reads will be compared at a time, so lower numbers will produce more accurate results but consume exponentially more memory and CPU. Default value: 5."
- "MAX_DIFF_RATE=Double","The maximum rate of differences between two reads to call them identical. Default value: 0.03. "
- "MIN_MEAN_QUALITY=Integer","The minimum mean quality of the bases in a read pair for the read to be analyzed. Reads with lower average quality are filtered out and not considered in any calculations. Default value: 20."
- "READ_NAME_REGEX=String","Regular expression that can be used to parse read names in the incoming SAM file. Read names are parsed to extract three variables: tile/region, x coordinate and y coordinate. These values are used to estimate the rate of optical duplication in order to give a more accurate estimated library size. The regular expression should contain three capture groups for the three variables, in order. Default value: [a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*. This option can be set to 'null' to clear the default value."
- "OPTICAL_DUPLICATE_PIXEL_DISTANCE=Integer","The maximum offset between two duplicte clusters in order to consider them optical duplicates. This should usually be set to some fairly small number (e.g. 5-10 pixels) unless using later versions of the Illumina pipeline that multiply pixel values by 10, in which case 50-100 is more normal. Default value: 100"
- "CREATE_MD5_FILE=Boolean","Whether to create an MD5 digest for any BAM files created. Default value: false. This option can be set to 'null' to clear the default value. "
-
-.. class:: warningmark
-
-**Warning on SAM/BAM quality**
-
-Many SAM/BAM files produced externally and uploaded to Galaxy do not fully conform to SAM/BAM specifications. Galaxy deals with this by using the **LENIENT**
-flag when it runs Picard, which allows reads to be discarded if they're empty or don't map. This appears
-to be the only way to deal with SAM/BAM that cannot be parsed.
-
-.. class:: infomark
-
-**Note on the Regular Expression**
-
-(from the Picard docs)
-This tool requires a valid regular expression to parse out the read names in the incoming SAM or BAM file.
-These values are used to estimate the rate of optical duplication in order to give a more accurate estimated library size.
-The regular expression should contain three capture groups for the three variables, in order.
-Default value: [a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*.
-
-
-
-
-
-
diff -r ab1f60c26526 -r 3d4f1fa26f0e rgPicardMarkDups.xml
--- a/rgPicardMarkDups.xml Fri Feb 21 12:07:49 2014 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,130 +0,0 @@
-
-
- picard_wrapper.py -i "${input_file}" -n "${out_prefix}" --tmpdir "${__new_file_path__}" -o "${out_file}"
- --remdups "${remDups}" --assumesorted "${assumeSorted}" --readregex "${readRegex}" --optdupdist "${optDupeDist}"
- -j "\$JAVA_JAR_PATH/MarkDuplicates.jar" -d "${html_file.files_path}" -t "${html_file}" -e "${input_file.ext}"
-
- picard
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-.. class:: infomark
-
-**Purpose**
-
-Marks all duplicate reads in a provided SAM or BAM file and either removes them or flags them.
-
-**Picard documentation**
-
-This is a Galaxy wrapper for MarkDuplicates, a part of the external package Picard-tools_.
-
- .. _Picard-tools: http://www.google.com/search?q=picard+samtools
-
------
-
-.. class:: infomark
-
-**Inputs, outputs, and parameters**
-
-Picard documentation says (reformatted for Galaxy):
-
-.. csv-table:: Mark Duplicates docs
- :header-rows: 1
-
- Option,Description
- "INPUT=File","The input SAM or BAM file to analyze. Must be coordinate sorted. Required."
- "OUTPUT=File","The output file to right marked records to Required."
- "METRICS_FILE=File","File to write duplication metrics to Required."
- "REMOVE_DUPLICATES=Boolean","If true do not write duplicates to the output file instead of writing them with appropriate flags set. Default value: false."
- "ASSUME_SORTED=Boolean","If true, assume that the input file is coordinate sorted, even if the header says otherwise. Default value: false."
- "MAX_SEQUENCES_FOR_DISK_READ_ENDS_MAP=Integer","This option is obsolete. ReadEnds will always be spilled to disk. Default value: 50000."
- "MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=Integer","Maximum number of file handles to keep open when spilling read ends to disk."
- "READ_NAME_REGEX=String","Regular expression that can be used to parse read names in the incoming SAM file. Read names are parsed to extract three variables: tile/region, x coordinate and y coordinate. "
- "OPTICAL_DUPLICATE_PIXEL_DISTANCE=Integer","The maximum offset between two duplicte clusters in order to consider them optical duplicates. This should usually be set to some fairly small number (e.g. 5-10 pixels) unless using later versions of the Illumina pipeline that multiply pixel values by 10, in which case 50-100 is more normal. Default value: 100"
-
-.. class:: warningmark
-
-**Warning on SAM/BAM quality**
-
-Many SAM/BAM files produced externally and uploaded to Galaxy do not fully conform to SAM/BAM specifications. Galaxy deals with this by using the **LENIENT**
-flag when it runs Picard, which allows reads to be discarded if they're empty or don't map. This appears
-to be the only way to deal with SAM/BAM that cannot be parsed.
-.. class:: infomark
-
-**Note on the Regular Expression**
-
-(from the Picard docs)
-This tool requires a valid regular expression to parse out the read names in the incoming SAM or BAM file. These values are used to estimate the rate of optical duplication in order to give a more accurate estimated library size. The regular expression should contain three capture groups for the three variables, in order. Default value: [a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).
-
-Examines aligned records in the supplied SAM or BAM file to locate duplicate molecules. All records are then written to the output file with the duplicate records flagged unless the remove duplicates option is selected. In some cases you may want to do this, but please only do this if you really understand what you are doing.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
diff -r ab1f60c26526 -r 3d4f1fa26f0e test-data/bfast_out1.sam
--- a/test-data/bfast_out1.sam Fri Feb 21 12:07:49 2014 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,23 +0,0 @@
-@HD VN:0.1.2 SO:unsorted GO:none
-@SQ SN:phiX174 LN:5386
-@PG ID:bfast VN:0.6.4d
-random_phiX_region_0 0 phiX174 553 255 50M * 0 0 TTGAGGCTTGCGTTTATGGTACGCTGGACTTTGTGGGATACCCTCGCTTT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ PG:Z:bfast AS:i:2500 NM:i:0 NH:i:1 IH:i:1 HI:i:1 MD:Z:50 XA:i:0
-random_phiX_region_1 0 phiX174 3693 255 50M * 0 0 GTTAGTGCTGAGGTTGACTTAGTTCATCAGCAAACGCAGAATCAGCGGTA ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ PG:Z:bfast AS:i:2500 NM:i:0 NH:i:1 IH:i:1 HI:i:1 MD:Z:50 XA:i:0
-random_phiX_region_2 0 phiX174 375 255 50M * 0 0 AATAGGTAAGAAATCATGAGTCAAGTTACTGAACAATCCGTACGTTTCCA ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ PG:Z:bfast AS:i:2500 NM:i:0 NH:i:1 IH:i:1 HI:i:1 MD:Z:50 XA:i:0
-random_phiX_region_3 0 phiX174 3168 255 50M * 0 0 GGTAAAGGACTTCTTGAAGGTACGTTGCAGGCTGGCACTTCTGCCGTTTC ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ PG:Z:bfast AS:i:2500 NM:i:0 NH:i:1 IH:i:1 HI:i:1 MD:Z:50 XA:i:0
-random_phiX_region_4 0 phiX174 5254 255 50M * 0 0 ACGCAAAAAGAGAGATGAGATTGAGGCTGGGAAAAGTTACTGTAGCCGAC ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ PG:Z:bfast AS:i:2500 NM:i:0 NH:i:1 IH:i:1 HI:i:1 MD:Z:50 XA:i:0
-random_phiX_region_5 0 phiX174 5066 255 50M * 0 0 AGCTAAAGAATGGAACAACTCACTAAAAACCAAGCTGTCGCTACTTCCCA ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ PG:Z:bfast AS:i:2500 NM:i:0 NH:i:1 IH:i:1 HI:i:1 MD:Z:50 XA:i:0
-random_phiX_region_6 0 phiX174 1226 255 50M * 0 0 CACGTTTATGGTGAACAGTGGATTAAGTTCATGAAGGATGGTGTTAATGC ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ PG:Z:bfast AS:i:2500 NM:i:0 NH:i:1 IH:i:1 HI:i:1 MD:Z:50 XA:i:0
-random_phiX_region_7 0 phiX174 1096 255 50M * 0 0 AACTACTCCGGTTATCGCTGGCGACTCCTTCGAGATGGACGCCGTTGGCG ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ PG:Z:bfast AS:i:2500 NM:i:0 NH:i:1 IH:i:1 HI:i:1 MD:Z:50 XA:i:0
-random_phiX_region_8 0 phiX174 535 255 50M * 0 0 CTCGTGCTCGTCGCTGCGTTGAGGCTTGCGTTTATGGTACGCTGGACTTT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ PG:Z:bfast AS:i:2500 NM:i:0 NH:i:1 IH:i:1 HI:i:1 MD:Z:50 XA:i:0
-random_phiX_region_9 0 phiX174 3669 255 50M * 0 0 CAAATCAAAGAAATGACTCGCAAGGTTAGTGCTGAGGTTGACTTAGTTCA ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ PG:Z:bfast AS:i:2500 NM:i:0 NH:i:1 IH:i:1 HI:i:1 MD:Z:50 XA:i:0
-random_phiX_region_10 0 phiX174 4887 255 50M * 0 0 TACAGTATGCCCATCGCAGTTCGCTACACGCAGGACGCTTTTTCACGTTC ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ PG:Z:bfast AS:i:2500 NM:i:0 NH:i:1 IH:i:1 HI:i:1 MD:Z:50 XA:i:0
-random_phiX_region_11 0 phiX174 1849 255 50M * 0 0 TATGTTTACTCTTGCGCTTGTTCGTTTTCCGCCTACTGCGACTAAAGAGA ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ PG:Z:bfast AS:i:2500 NM:i:0 NH:i:1 IH:i:1 HI:i:1 MD:Z:50 XA:i:0
-random_phiX_region_12 0 phiX174 4145 255 50M * 0 0 AGAGATTCTGTCTTTTCGTATGCAGGGCGTTGAGTTCGATAATGGTGATA ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ PG:Z:bfast AS:i:2500 NM:i:0 NH:i:1 IH:i:1 HI:i:1 MD:Z:50 XA:i:0
-random_phiX_region_13 0 phiX174 1853 255 50M * 0 0 TTTACTCTTGCGCTTGTTCGTTTTCCGCCTACTGCGACTAAAGAGATTCA ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ PG:Z:bfast AS:i:2500 NM:i:0 NH:i:1 IH:i:1 HI:i:1 MD:Z:50 XA:i:0
-random_phiX_region_14 0 phiX174 2800 255 50M * 0 0 CCGGGCAATAACGTTTATGTTGGTTTCATGGTTTGGTCTAACTTTACCGC ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ PG:Z:bfast AS:i:2300 NM:i:1 NH:i:1 IH:i:1 HI:i:1 MD:Z:11T38 XA:i:0
-random_phiX_region_15 0 phiX174 1910 255 50M * 0 0 AACGCTAAAGGTGCTTTGACTTATACCGATATTGCTGGCGACCCTGTTTT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ PG:Z:bfast AS:i:2500 NM:i:0 NH:i:1 IH:i:1 HI:i:1 MD:Z:50 XA:i:0
-random_phiX_region_16 0 phiX174 3366 255 50M * 0 0 GCCGGATTTGAGAATCAAAAAGAGCTTACTAAAATGCAACTGGACAATCA ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ PG:Z:bfast AS:i:2500 NM:i:0 NH:i:1 IH:i:1 HI:i:1 MD:Z:50 XA:i:0
-random_phiX_region_17 0 phiX174 2165 255 50M * 0 0 CATGATTATGACCAGTGTTTCCAGTCCGTTCAGTTGTTGCAGTGGAATAG ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ PG:Z:bfast AS:i:2500 NM:i:0 NH:i:1 IH:i:1 HI:i:1 MD:Z:50 XA:i:0
-random_phiX_region_18 0 phiX174 2051 255 50M * 0 0 TGGTATCGTTATGCGCCTTCGTATGTTTCTCCTGCTTATCACCTTCTTGA ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ PG:Z:bfast AS:i:2500 NM:i:0 NH:i:1 IH:i:1 HI:i:1 MD:Z:50 XA:i:0
-random_phiX_region_19 0 phiX174 5099 255 50M * 0 0 GCTGTCGCTACTTCCCAAGAAGCTGTTCAGAATCAGAATGAGCCGCAACT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ PG:Z:bfast AS:i:2500 NM:i:0 NH:i:1 IH:i:1 HI:i:1 MD:Z:50 XA:i:0
diff -r ab1f60c26526 -r 3d4f1fa26f0e test-data/bwa_wrapper_in2.fastqsanger
--- a/test-data/bwa_wrapper_in2.fastqsanger Fri Feb 21 12:07:49 2014 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,120 +0,0 @@
-@seq1/1
-GGACTCAGATAGTAATCC
-+
-II#IIIIIII$5+.(9II
-@seq2/1
-ATTCGACCTATCCTTGCG
-+
-IIIIIIIIIIIIIIIIII
-@seq3/1
-GTAACAAAGTTTGGATTG
-+
-IIIIIIIIIIIIIIIIII
-@seq4/1
-AGCCGCTCGTCTTTTATG
-+
-IIIIIIIIIIIIIIIIII
-@seq5/1
-CAGTTATATGGCTTTTGG
-+
-IIIIIIIIIIIIIIIIII
-@seq6/1
-AGGCGCTCGTCTTGGTAT
-+
-IIIIIIIIIIIIIIIIII
-@seq7/1
-TGTAGGTGGTCAACCAAT
-+
-IIIIIIIIIIIIIIIIII
-@seq8/1
-ACACCCGTCCTTTACGTC
-+
-IIIIIIIIIIIIIIIIII
-@seq9/1
-GCCGCTATTCAGGTTGTT
-+
-IIIIIIIIIIIIIIIIII
-@seq10/1
-ATTCTTTCTTTTCGTATC
-+
-IIIIIIIIIIIIIIIIII
-@seq11/1
-GCATTTCTACTCCTTCTC
-+
-II#IIIIIII$5+.(9II
-@seq12/1
-CGCGCTTCGATAAAAATG
-+
-IIIIIIIIIIIIIIIIII
-@seq13/1
-ATTTCTACTCTTTCTCAT
-+
-IIIIIIIIIIIIIIIIII
-@seq14/1
-CCCTTTTGAATGTCACGC
-+
-IIIIIIIIIIIIIIIIII
-@seq15/1
-CCAACTTACCAAGGTGGG
-+
-IIIIIIIIIIIIIIIIII
-@seq16/1
-TCAGGGTATTAAAAGAGA
-+
-IIIIIIIIIIIIIIIIII
-@seq17/1
-GTGATGTGCTTGCTACCG
-+
-IIIIIIIIIIIIIIIIII
-@seq18/1
-TCAATCCCCCATGCTTGG
-+
-IIIIIIIIIIIIIIIIII
-@seq19/1
-TTCCTGCGCTTAATGCTT
-+
-IIIIIIIIIIIIIIIIII
-@seq20/1
-CTTATTACCATTTCAACT
-+
-IIIIIIIIIIIIIIIIII
-@seq21/1
-CTGATACCAATAAAACCC
-+
-II#IIIIIII$5+.(9II
-@seq22/1
-AATCAAACTTACCAAGGG
-+
-IIIIIIIIIIIIIIIIII
-@seq23/1
-TGTGCTTCCCCAACTTGA
-+
-IIIIIIIIIIIIIIIIII
-@seq24/1
-TTTCTCAATCCCCAATGC
-+
-IIIIIIIIIIIIIIIIII
-@seq25/1
-TTGCTACTGACCGCTCTT
-+
-IIIIIIIIIIIIIIIIII
-@seq26/1
-CCGCGTGAAATTTCTATG
-+
-IIIIIIIIIIIIIIIIII
-@seq27/1
-CGCTAATCAAGTTGTTTC
-+
-IIIIIIIIIIIIIIIIII
-@seq28/1
-AAAGAGATTATTTGTCGG
-+
-IIIIIIIIIIIIIIIIII
-@seq29/1
-CAAATTAATGCGCGCTTC
-+
-IIIIIIIIIIIIIIIIII
-@seq30/1
-ATCCCCTATGCTTGGCTT
-+
-IIIIIIIIIIIIIIIIII
diff -r ab1f60c26526 -r 3d4f1fa26f0e test-data/bwa_wrapper_in3.fastqsanger
--- a/test-data/bwa_wrapper_in3.fastqsanger Fri Feb 21 12:07:49 2014 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,120 +0,0 @@
-@seq1/2
-ACGCTCCTTTAAAATATC
-+
-IIIII$%*$G$A31I&&B
-@seq2/2
-CAGCTCGAGAAGCTCTTA
-+
-IIIIIIIIIIIIIIIIII
-@seq3/2
-CTACTGACCGCTCTCGTG
-+
-IIIIIIIIIIIIIIIIII
-@seq4/2
-TAGGTGGTCAACCATTTT
-+
-IIIIIIIIIIIIIIIIII
-@seq5/2
-TTTCTATGTGGCTTAATA
-+
-IIIIIIIIIIIIIIIIII
-@seq6/2
-GTAGGTGGTCAACAATTT
-+
-IIIIIIIIIIIIIIIIII
-@seq7/2
-TTTAATTGCAGGGGCTTC
-+
-IIIIIIIIIIIIIIIIII
-@seq8/2
-ATGCGCTCTATTCTCTGG
-+
-IIIIIIIIIIIIIIIIII
-@seq9/2
-TTCTGTTGGTGCTGATAT
-+
-IIIIIIIIIIIIIIIIII
-@seq10/2
-AGGGCGTTGAGTTCGATA
-+
-IIIIIIIIIIIIIIIIII
-@seq11/2
-ATCCCCAATGCTTGGCTT
-+
-IIIII$%*$G$A31I&&B
-@seq12/2
-GGATTGGCGTTTCCAACC
-+
-IIIIIIIIIIIIIIIIII
-@seq13/2
-CCCCAATCCTTGCCTTCC
-+
-IIIAAIIIIIIIIIIIII
-@seq14/2
-TGATATTTTGACTTTGAG
-+
-IIIIIIIIIIIIIIIIII
-@seq15/2
-TTACGAAACGCGACGCCG
-+
-IIIIIIIIIIIIIIIIII
-@seq16/2
-TTATTTTTCTCCAGCCAC
-+
-IIIIIIIIIIIIIIIIII
-@seq17/2
-AAACAATACTTTAGGCAT
-+
-IIIIIIIIIIIIIIIIII
-@seq18/2
-CCGTTCCATAAGCAGATG
-+
-IIIIIIIIIIIIIIIIII
-@seq19/2
-GAGCGTCCTGGTGCTGAT
-+
-IIIIIIIIIIIIIIIIII
-@seq20/2
-ACTCCGGTTATCGCTGGC
-+
-IIIIIIIIIIIIIIIIII
-@seq21/2
-TAAGCATTTGGTTCAGGG
-+
-IIIII$%*$G$A31I&&B
-@seq22/2
-GTTACGACGCGACGCCGT
-+
-IIIIIIIIIIIIIIIIII
-@seq23/2
-TTTAATAACCCTATAGAC
-+
-IIIIIIIIIIIIIIIIII
-@seq24/2
-CTTGGCTTCCCTAAGCAG
-+
-IIIIIIIIIIIIIIIIII
-@seq25/2
-CGTGCTCGTTGCTGCGTT
-+
-IIIIIIIIIIIIIIIIII
-@seq26/2
-AAGGATGTTTTCCGTTCT
-+
-IIIIIIIIIIIIIIIIII
-@seq27/2
-TGTTTGGTGCTGATATTG
-+
-IIIIIIIIIIIIIIIIII
-@seq28/2
-TCCAGCCACTAAAGTGAG
-+
-IIIIIIIIIIIIIIIIII
-@seq29/2
-GATAATGATTGGGGTATC
-+
-IIIIIIIIIIIIIIIIII
-@seq30/2
-ACCATAAGCAGATGGATA
-+
-IIIIIIIIIIIIIIIIII
diff -r ab1f60c26526 -r 3d4f1fa26f0e test-data/bwa_wrapper_out3.sam
--- a/test-data/bwa_wrapper_out3.sam Fri Feb 21 12:07:49 2014 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,63 +0,0 @@
-@SQ SN:phiX174 LN:5386
-@RG ID:abcdefg LB:lib-mom-A PL:ILLUMINA SM:mom DS:descrip DT:2010-11-01 PI:400
-@PG ID:bwa PN:bwa VN:0.5.9-r16
-seq1 113 phiX174 340 37 18M = 322 -18 GGATTACTATCTGAGTCC II9(.+5$IIIIIII#II RG:Z:abcdefg XT:A:U NM:i:0 SM:i:37 AM:i:25 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:18
-seq1 177 phiX174 322 25 18M = 340 18 GATATTTTAAAGGAGCGT B&&I13A$G$*%$IIIII RG:Z:abcdefg XT:A:U NM:i:2 SM:i:25 AM:i:25 X0:i:1 X1:i:0 XM:i:2 XO:i:0 XG:i:0 MD:Z:2C8A6
-seq2 65 phiX174 141 37 18M = 159 18 ATTCGACCTATCCTTGCG IIIIIIIIIIIIIIIIII RG:Z:abcdefg XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:18
-seq2 129 phiX174 159 37 18M = 141 -18 CAGCTCGAGAAGCTCTTA IIIIIIIIIIIIIIIIII RG:Z:abcdefg XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:18
-seq3 65 phiX174 505 37 18M = 523 18 GTAACAAAGTTTGGATTG IIIIIIIIIIIIIIIIII RG:Z:abcdefg XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:18
-seq3 129 phiX174 523 37 18M = 505 -18 CTACTGACCGCTCTCGTG IIIIIIIIIIIIIIIIII RG:Z:abcdefg XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:18
-seq4 69 phiX174 945 0 * = 945 0 AGCCGCTCGTCTTTTATG IIIIIIIIIIIIIIIIII RG:Z:abcdefg
-seq4 137 phiX174 945 23 18M = 945 0 TAGGTGGTCAACCATTTT IIIIIIIIIIIIIIIIII RG:Z:abcdefg XT:A:U NM:i:1 SM:i:23 AM:i:0 X0:i:1 X1:i:1 XM:i:1 XO:i:0 XG:i:0 MD:Z:12A5 XA:Z:phiX174,+945,17M1S,2;
-seq5 65 phiX174 4985 37 18M = 5003 18 CAGTTATATGGCTTTTGG IIIIIIIIIIIIIIIIII RG:Z:abcdefg XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:13G4
-seq5 129 phiX174 5003 37 18M = 4985 -18 TTTCTATGTGGCTTAATA IIIIIIIIIIIIIIIIII RG:Z:abcdefg XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:13A4
-seq6 65 phiX174 925 37 11M1D7M = 944 19 AGGCGCTCGTCTTGGTAT IIIIIIIIIIIIIIIIII RG:Z:abcdefg XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:1 XG:i:1 MD:Z:11^T7
-seq6 129 phiX174 944 37 18M = 925 -19 GTAGGTGGTCAACAATTT IIIIIIIIIIIIIIIIII RG:Z:abcdefg XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:18
-seq7 65 phiX174 943 25 18M = 960 17 TGTAGGTGGTCAACCAAT IIIIIIIIIIIIIIIIII RG:Z:abcdefg XT:A:U NM:i:2 SM:i:25 AM:i:25 X0:i:1 X1:i:1 XM:i:2 XO:i:0 XG:i:0 MD:Z:14A1T1 XA:Z:phiX174,+943,13M1I4M,2;
-seq7 129 phiX174 960 37 18M = 943 -17 TTTAATTGCAGGGGCTTC IIIIIIIIIIIIIIIIII RG:Z:abcdefg XT:A:U NM:i:0 SM:i:37 AM:i:25 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:18
-seq8 69 phiX174 1715 0 * = 1715 0 ACACCCGTCCTTTACGTC IIIIIIIIIIIIIIIIII RG:Z:abcdefg
-seq8 137 phiX174 1715 37 18M = 1715 0 ATGCGCTCTATTCTCTGG IIIIIIIIIIIIIIIIII RG:Z:abcdefg XT:A:U NM:i:1 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:10A7
-seq9 65 phiX174 2596 37 18M = 2613 17 GCCGCTATTCAGGTTGTT IIIIIIIIIIIIIIIIII RG:Z:abcdefg XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:7A10
-seq9 129 phiX174 2613 37 18M = 2596 -17 TTCTGTTGGTGCTGATAT IIIIIIIIIIIIIIIIII RG:Z:abcdefg XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:18
-seq10 65 phiX174 4149 25 18M = 4168 19 ATTCTTTCTTTTCGTATC IIIIIIIIIIIIIIIIII RG:Z:abcdefg XT:A:U NM:i:2 SM:i:25 AM:i:25 X0:i:1 X1:i:0 XM:i:2 XO:i:0 XG:i:0 MD:Z:5G11G0
-seq10 129 phiX174 4168 37 18M = 4149 -19 AGGGCGTTGAGTTCGATA IIIIIIIIIIIIIIIIII RG:Z:abcdefg XT:A:U NM:i:0 SM:i:37 AM:i:25 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:18
-seq11 65 phiX174 4072 37 18M = 4091 19 GCATTTCTACTCCTTCTC II#IIIIIII$5+.(9II RG:Z:abcdefg XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:12T5
-seq11 129 phiX174 4091 37 18M = 4072 -19 ATCCCCAATGCTTGGCTT IIIII$%*$G$A31I&&B RG:Z:abcdefg XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:18
-seq12 65 phiX174 5349 37 18M = 5365 16 CGCGCTTCGATAAAAATG IIIIIIIIIIIIIIIIII RG:Z:abcdefg XT:A:U NM:i:0 SM:i:37 AM:i:25 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:18
-seq12 129 phiX174 5365 25 18M = 5349 -16 GGATTGGCGTTTCCAACC IIIIIIIIIIIIIIIIII RG:Z:abcdefg XT:A:U NM:i:2 SM:i:25 AM:i:25 X0:i:1 X1:i:0 XM:i:2 XO:i:0 XG:i:0 MD:Z:0T9A7
-seq13 65 phiX174 4074 37 18M = 4093 19 ATTTCTACTCTTTCTCAT IIIIIIIIIIIIIIIIII RG:Z:abcdefg XT:A:U NM:i:1 SM:i:37 AM:i:25 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:17A0
-seq13 129 phiX174 4093 25 18M = 4074 -19 CCCCAATCCTTGCCTTCC IIIAAIIIIIIIIIIIII RG:Z:abcdefg XT:A:U NM:i:2 SM:i:25 AM:i:25 X0:i:1 X1:i:0 XM:i:2 XO:i:0 XG:i:0 MD:Z:7G4G5
-seq14 65 phiX174 3998 37 18M = 4016 18 CCCTTTTGAATGTCACGC IIIIIIIIIIIIIIIIII RG:Z:abcdefg XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:5C12
-seq14 129 phiX174 4016 37 3M1D15M = 3998 -18 TGATATTTTGACTTTGAG IIIIIIIIIIIIIIIIII RG:Z:abcdefg XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:1 XG:i:1 MD:Z:3^T15
-seq15 65 phiX174 5198 37 18M = 5216 18 CCAACTTACCAAGGTGGG IIIIIIIIIIIIIIIIII RG:Z:abcdefg XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:13C4
-seq15 129 phiX174 5216 37 5M2I11M = 5198 -18 TTACGAAACGCGACGCCG IIIIIIIIIIIIIIIIII RG:Z:abcdefg XT:A:U NM:i:2 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:1 XG:i:2 MD:Z:16
-seq16 65 phiX174 2880 37 10M1I7M = 2897 17 TCAGGGTATTAAAAGAGA IIIIIIIIIIIIIIIIII RG:Z:abcdefg XT:A:U NM:i:2 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:5T11
-seq16 129 phiX174 2897 37 18M = 2880 -17 TTATTTTTCTCCAGCCAC IIIIIIIIIIIIIIIIII RG:Z:abcdefg XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:6G11
-seq17 65 phiX174 3034 37 18M = 3053 19 GTGATGTGCTTGCTACCG IIIIIIIIIIIIIIIIII RG:Z:abcdefg XT:A:U NM:i:0 SM:i:37 AM:i:25 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:18
-seq17 129 phiX174 3053 25 18M = 3034 -19 AAACAATACTTTAGGCAT IIIIIIIIIIIIIIIIII RG:Z:abcdefg XT:A:U NM:i:2 SM:i:25 AM:i:25 X0:i:1 X1:i:0 XM:i:2 XO:i:0 XG:i:0 MD:Z:0T9G7
-seq18 73 phiX174 4088 37 18M = 4088 0 TCAATCCCCCATGCTTGG IIIIIIIIIIIIIIIIII RG:Z:abcdefg XT:A:U NM:i:1 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:9A8
-seq18 133 phiX174 4088 0 * = 4088 0 CCGTTCCATAAGCAGATG IIIIIIIIIIIIIIIIII RG:Z:abcdefg
-seq19 65 phiX174 3304 37 18M = 3324 20 TTCCTGCGCTTAATGCTT IIIIIIIIIIIIIIIIII RG:Z:abcdefg XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:6A11
-seq19 129 phiX174 3324 37 18M = 3304 -20 GAGCGTCCTGGTGCTGAT IIIIIIIIIIIIIIIIII RG:Z:abcdefg XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:6G11
-seq20 65 phiX174 1082 37 18M = 1100 18 CTTATTACCATTTCAACT IIIIIIIIIIIIIIIIII RG:Z:abcdefg XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:18
-seq20 129 phiX174 1100 37 18M = 1082 -18 ACTCCGGTTATCGCTGGC IIIIIIIIIIIIIIIIII RG:Z:abcdefg XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:18
-seq21 65 phiX174 1344 23 18M = 1363 19 CTGATACCAATAAAACCC II#IIIIIII$5+.(9II RG:Z:abcdefg XT:A:U NM:i:1 SM:i:23 AM:i:23 X0:i:1 X1:i:1 XM:i:1 XO:i:0 XG:i:0 MD:Z:15T2 XA:Z:phiX174,+1344,15M1D3M,2;
-seq21 129 phiX174 1363 37 18M = 1344 -19 TAAGCATTTGGTTCAGGG IIIII$%*$G$A31I&&B RG:Z:abcdefg XT:A:U NM:i:1 SM:i:37 AM:i:23 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:10T7
-seq22 69 phiX174 5215 0 * = 5215 0 AATCAAACTTACCAAGGG IIIIIIIIIIIIIIIIII RG:Z:abcdefg
-seq22 137 phiX174 5215 37 18M = 5215 0 GTTACGACGCGACGCCGT IIIIIIIIIIIIIIIIII RG:Z:abcdefg XT:A:U NM:i:0 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:18
-seq23 65 phiX174 4289 37 18M = 4308 19 TGTGCTTCCCCAACTTGA IIIIIIIIIIIIIIIIII RG:Z:abcdefg XT:A:U NM:i:1 SM:i:37 AM:i:25 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:6C11
-seq23 129 phiX174 4308 25 18M = 4289 -19 TTTAATAACCCTATAGAC IIIIIIIIIIIIIIIIII RG:Z:abcdefg XT:A:U NM:i:2 SM:i:25 AM:i:25 X0:i:1 X1:i:0 XM:i:2 XO:i:0 XG:i:0 MD:Z:0A8A8
-seq24 65 phiX174 4084 37 18M = 4101 17 TTTCTCAATCCCCAATGC IIIIIIIIIIIIIIIIII RG:Z:abcdefg XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:18
-seq24 129 phiX174 4101 37 18M = 4084 -17 CTTGGCTTCCCTAAGCAG IIIIIIIIIIIIIIIIII RG:Z:abcdefg XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:10A7
-seq25 65 phiX174 520 37 18M = 537 17 TTGCTACTGACCGCTCTT IIIIIIIIIIIIIIIIII RG:Z:abcdefg XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:17C0
-seq25 129 phiX174 537 37 18M = 520 -17 CGTGCTCGTTGCTGCGTT IIIIIIIIIIIIIIIIII RG:Z:abcdefg XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:9C8
-seq26 65 phiX174 1976 37 18M = 1994 18 CCGCGTGAAATTTCTATG IIIIIIIIIIIIIIIIII RG:Z:abcdefg XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:18
-seq26 129 phiX174 1994 37 18M = 1976 -18 AAGGATGTTTTCCGTTCT IIIIIIIIIIIIIIIIII RG:Z:abcdefg XT:A:U NM:i:0 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:18
-seq27 65 phiX174 2598 37 18M = 2614 16 CGCTAATCAAGTTGTTTC IIIIIIIIIIIIIIIIII RG:Z:abcdefg XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:9G8
-seq27 129 phiX174 2614 37 3M1D15M = 2598 -16 TGTTTGGTGCTGATATTG IIIIIIIIIIIIIIIIII RG:Z:abcdefg XT:A:U NM:i:2 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:0 XO:i:1 XG:i:1 MD:Z:1C1^G15
-seq28 65 phiX174 2890 25 18M = 2906 16 AAAGAGATTATTTGTCGG IIIIIIIIIIIIIIIIII RG:Z:abcdefg XT:A:U NM:i:2 SM:i:25 AM:i:25 X0:i:1 X1:i:0 XM:i:2 XO:i:0 XG:i:0 MD:Z:16T0C0
-seq28 129 phiX174 2906 37 18M = 2890 -16 TCCAGCCACTAAAGTGAG IIIIIIIIIIIIIIIIII RG:Z:abcdefg XT:A:U NM:i:1 SM:i:37 AM:i:25 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:10T7
-seq29 73 phiX174 5339 37 18M = 5339 0 CAAATTAATGCGCGCTTC IIIIIIIIIIIIIIIIII RG:Z:abcdefg XT:A:U NM:i:1 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:6T11
-seq29 133 phiX174 5339 0 * = 5339 0 GATAATGATTGGGGTATC IIIIIIIIIIIIIIIIII RG:Z:abcdefg
-seq30 65 phiX174 4091 37 18M = 4108 17 ATCCCCTATGCTTGGCTT IIIIIIIIIIIIIIIIII RG:Z:abcdefg XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:6A11
-seq30 129 phiX174 4108 37 18M = 4091 -17 ACCATAAGCAGATGGATA IIIIIIIIIIIIIIIIII RG:Z:abcdefg XT:A:U NM:i:1 SM:i:37 AM:i:37 X0:i:1 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:0T17
diff -r ab1f60c26526 -r 3d4f1fa26f0e test-data/phiX.fasta
--- a/test-data/phiX.fasta Fri Feb 21 12:07:49 2014 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,79 +0,0 @@
->phiX174
-GAGTTTTATCGCTTCCATGACGCAGAAGTTAACACTTTCGGATATTTCTGATGAGTCGAAAAATTATCTT
-GATAAAGCAGGAATTACTACTGCTTGTTTACGAATTAAATCGAAGTGGACTGCTGGCGGAAAATGAGAAA
-ATTCGACCTATCCTTGCGCAGCTCGAGAAGCTCTTACTTTGCGACCTTTCGCCATCAACTAACGATTCTG
-TCAAAAACTGACGCGTTGGATGAGGAGAAGTGGCTTAATATGCTTGGCACGTTCGTCAAGGACTGGTTTA
-GATATGAGTCACATTTTGTTCATGGTAGAGATTCTCTTGTTGACATTTTAAAAGAGCGTGGATTACTATC
-TGAGTCCGATGCTGTTCAACCACTAATAGGTAAGAAATCATGAGTCAAGTTACTGAACAATCCGTACGTT
-TCCAGACCGCTTTGGCCTCTATTAAGCTCATTCAGGCTTCTGCCGTTTTGGATTTAACCGAAGATGATTT
-CGATTTTCTGACGAGTAACAAAGTTTGGATTGCTACTGACCGCTCTCGTGCTCGTCGCTGCGTTGAGGCT
-TGCGTTTATGGTACGCTGGACTTTGTGGGATACCCTCGCTTTCCTGCTCCTGTTGAGTTTATTGCTGCCG
-TCATTGCTTATTATGTTCATCCCGTCAACATTCAAACGGCCTGTCTCATCATGGAAGGCGCTGAATTTAC
-GGAAAACATTATTAATGGCGTCGAGCGTCCGGTTAAAGCCGCTGAATTGTTCGCGTTTACCTTGCGTGTA
-CGCGCAGGAAACACTGACGTTCTTACTGACGCAGAAGAAAACGTGCGTCAAAAATTACGTGCAGAAGGAG
-TGATGTAATGTCTAAAGGTAAAAAACGTTCTGGCGCTCGCCCTGGTCGTCCGCAGCCGTTGCGAGGTACT
-AAAGGCAAGCGTAAAGGCGCTCGTCTTTGGTATGTAGGTGGTCAACAATTTTAATTGCAGGGGCTTCGGC
-CCCTTACTTGAGGATAAATTATGTCTAATATTCAAACTGGCGCCGAGCGTATGCCGCATGACCTTTCCCA
-TCTTGGCTTCCTTGCTGGTCAGATTGGTCGTCTTATTACCATTTCAACTACTCCGGTTATCGCTGGCGAC
-TCCTTCGAGATGGACGCCGTTGGCGCTCTCCGTCTTTCTCCATTGCGTCGTGGCCTTGCTATTGACTCTA
-CTGTAGACATTTTTACTTTTTATGTCCCTCATCGTCACGTTTATGGTGAACAGTGGATTAAGTTCATGAA
-GGATGGTGTTAATGCCACTCCTCTCCCGACTGTTAACACTACTGGTTATATTGACCATGCCGCTTTTCTT
-GGCACGATTAACCCTGATACCAATAAAATCCCTAAGCATTTGTTTCAGGGTTATTTGAATATCTATAACA
-ACTATTTTAAAGCGCCGTGGATGCCTGACCGTACCGAGGCTAACCCTAATGAGCTTAATCAAGATGATGC
-TCGTTATGGTTTCCGTTGCTGCCATCTCAAAAACATTTGGACTGCTCCGCTTCCTCCTGAGACTGAGCTT
-TCTCGCCAAATGACGACTTCTACCACATCTATTGACATTATGGGTCTGCAAGCTGCTTATGCTAATTTGC
-ATACTGACCAAGAACGTGATTACTTCATGCAGCGTTACCGTGATGTTATTTCTTCATTTGGAGGTAAAAC
-CTCTTATGACGCTGACAACCGTCCTTTACTTGTCATGCGCTCTAATCTCTGGGCATCTGGCTATGATGTT
-GATGGAACTGACCAAACGTCGTTAGGCCAGTTTTCTGGTCGTGTTCAACAGACCTATAAACATTCTGTGC
-CGCGTTTCTTTGTTCCTGAGCATGGCACTATGTTTACTCTTGCGCTTGTTCGTTTTCCGCCTACTGCGAC
-TAAAGAGATTCAGTACCTTAACGCTAAAGGTGCTTTGACTTATACCGATATTGCTGGCGACCCTGTTTTG
-TATGGCAACTTGCCGCCGCGTGAAATTTCTATGAAGGATGTTTTCCGTTCTGGTGATTCGTCTAAGAAGT
-TTAAGATTGCTGAGGGTCAGTGGTATCGTTATGCGCCTTCGTATGTTTCTCCTGCTTATCACCTTCTTGA
-AGGCTTCCCATTCATTCAGGAACCGCCTTCTGGTGATTTGCAAGAACGCGTACTTATTCGCCACCATGAT
-TATGACCAGTGTTTCCAGTCCGTTCAGTTGTTGCAGTGGAATAGTCAGGTTAAATTTAATGTGACCGTTT
-ATCGCAATCTGCCGACCACTCGCGATTCAATCATGACTTCGTGATAAAAGATTGAGTGTGAGGTTATAAC
-GCCGAAGCGGTAAAAATTTTAATTTTTGCCGCTGAGGGGTTGACCAAGCGAAGCGCGGTAGGTTTTCTGC
-TTAGGAGTTTAATCATGTTTCAGACTTTTATTTCTCGCCATAATTCAAACTTTTTTTCTGATAAGCTGGT
-TCTCACTTCTGTTACTCCAGCTTCTTCGGCACCTGTTTTACAGACACCTAAAGCTACATCGTCAACGTTA
-TATTTTGATAGTTTGACGGTTAATGCTGGTAATGGTGGTTTTCTTCATTGCATTCAGATGGATACATCTG
-TCAACGCCGCTAATCAGGTTGTTTCTGTTGGTGCTGATATTGCTTTTGATGCCGACCCTAAATTTTTTGC
-CTGTTTGGTTCGCTTTGAGTCTTCTTCGGTTCCGACTACCCTCCCGACTGCCTATGATGTTTATCCTTTG
-AATGGTCGCCATGATGGTGGTTATTATACCGTCAAGGACTGTGTGACTATTGACGTCCTTCCCCGTACGC
-CGGGCAATAATGTTTATGTTGGTTTCATGGTTTGGTCTAACTTTACCGCTACTAAATGCCGCGGATTGGT
-TTCGCTGAATCAGGTTATTAAAGAGATTATTTGTCTCCAGCCACTTAAGTGAGGTGATTTATGTTTGGTG
-CTATTGCTGGCGGTATTGCTTCTGCTCTTGCTGGTGGCGCCATGTCTAAATTGTTTGGAGGCGGTCAAAA
-AGCCGCCTCCGGTGGCATTCAAGGTGATGTGCTTGCTACCGATAACAATACTGTAGGCATGGGTGATGCT
-GGTATTAAATCTGCCATTCAAGGCTCTAATGTTCCTAACCCTGATGAGGCCGCCCCTAGTTTTGTTTCTG
-GTGCTATGGCTAAAGCTGGTAAAGGACTTCTTGAAGGTACGTTGCAGGCTGGCACTTCTGCCGTTTCTGA
-TAAGTTGCTTGATTTGGTTGGACTTGGTGGCAAGTCTGCCGCTGATAAAGGAAAGGATACTCGTGATTAT
-CTTGCTGCTGCATTTCCTGAGCTTAATGCTTGGGAGCGTGCTGGTGCTGATGCTTCCTCTGCTGGTATGG
-TTGACGCCGGATTTGAGAATCAAAAAGAGCTTACTAAAATGCAACTGGACAATCAGAAAGAGATTGCCGA
-GATGCAAAATGAGACTCAAAAAGAGATTGCTGGCATTCAGTCGGCGACTTCACGCCAGAATACGAAAGAC
-CAGGTATATGCACAAAATGAGATGCTTGCTTATCAACAGAAGGAGTCTACTGCTCGCGTTGCGTCTATTA
-TGGAAAACACCAATCTTTCCAAGCAACAGCAGGTTTCCGAGATTATGCGCCAAATGCTTACTCAAGCTCA
-AACGGCTGGTCAGTATTTTACCAATGACCAAATCAAAGAAATGACTCGCAAGGTTAGTGCTGAGGTTGAC
-TTAGTTCATCAGCAAACGCAGAATCAGCGGTATGGCTCTTCTCATATTGGCGCTACTGCAAAGGATATTT
-CTAATGTCGTCACTGATGCTGCTTCTGGTGTGGTTGATATTTTTCATGGTATTGATAAAGCTGTTGCCGA
-TACTTGGAACAATTTCTGGAAAGACGGTAAAGCTGATGGTATTGGCTCTAATTTGTCTAGGAAATAACCG
-TCAGGATTGACACCCTCCCAATTGTATGTTTTCATGCCTCCAAATCTTGGAGGCTTTTTTATGGTTCGTT
-CTTATTACCCTTCTGAATGTCACGCTGATTATTTTGACTTTGAGCGTATCGAGGCTCTTAAACCTGCTAT
-TGAGGCTTGTGGCATTTCTACTCTTTCTCAATCCCCAATGCTTGGCTTCCATAAGCAGATGGATAACCGC
-ATCAAGCTCTTGGAAGAGATTCTGTCTTTTCGTATGCAGGGCGTTGAGTTCGATAATGGTGATATGTATG
-TTGACGGCCATAAGGCTGCTTCTGACGTTCGTGATGAGTTTGTATCTGTTACTGAGAAGTTAATGGATGA
-ATTGGCACAATGCTACAATGTGCTCCCCCAACTTGATATTAATAACACTATAGACCACCGCCCCGAAGGG
-GACGAAAAATGGTTTTTAGAGAACGAGAAGACGGTTACGCAGTTTTGCCGCAAGCTGGCTGCTGAACGCC
-CTCTTAAGGATATTCGCGATGAGTATAATTACCCCAAAAAGAAAGGTATTAAGGATGAGTGTTCAAGATT
-GCTGGAGGCCTCCACTATGAAATCGCGTAGAGGCTTTACTATTCAGCGTTTGATGAATGCAATGCGACAG
-GCTCATGCTGATGGTTGGTTTATCGTTTTTGACACTCTCACGTTGGCTGACGACCGATTAGAGGCGTTTT
-ATGATAATCCCAATGCTTTGCGTGACTATTTTCGTGATATTGGTCGTATGGTTCTTGCTGCCGAGGGTCG
-CAAGGCTAATGATTCACACGCCGACTGCTATCAGTATTTTTGTGTGCCTGAGTATGGTACAGCTAATGGC
-CGTCTTCATTTCCATGCGGTGCATTTTATGCGGACACTTCCTACAGGTAGCGTTGACCCTAATTTTGGTC
-GTCGGGTACGCAATCGCCGCCAGTTAAATAGCTTGCAAAATACGTGGCCTTATGGTTACAGTATGCCCAT
-CGCAGTTCGCTACACGCAGGACGCTTTTTCACGTTCTGGTTGGTTGTGGCCTGTTGATGCTAAAGGTGAG
-CCGCTTAAAGCTACCAGTTATATGGCTGTTGGTTTCTATGTGGCTAAATACGTTAACAAAAAGTCAGATA
-TGGACCTTGCTGCTAAAGGTCTAGGAGCTAAAGAATGGAACAACTCACTAAAAACCAAGCTGTCGCTACT
-TCCCAAGAAGCTGTTCAGAATCAGAATGAGCCGCAACTTCGGGATGAAAATGCTCACAATGACAAATCTG
-TCCACGGAGTGCTTAATCCAACTTACCAAGCTGGGTTACGACGCGACGCCGTTCAACCAGATATTGAAGC
-AGAACGCAAAAAGAGAGATGAGATTGAGGCTGGGAAAAGTTACTGTAGCCGACGTTTTGGCGGCGCAACC
-TGTGACGACAAATCTGCTCAAATTTATGCGCGCTTCGATAAAAATGATTGGCGTATCCAACCTGCA
-
diff -r ab1f60c26526 -r 3d4f1fa26f0e test-data/picard_ARRG.bam
Binary file test-data/picard_ARRG.bam has changed
diff -r ab1f60c26526 -r 3d4f1fa26f0e test-data/picard_ARRG_input1.bam
Binary file test-data/picard_ARRG_input1.bam has changed
diff -r ab1f60c26526 -r 3d4f1fa26f0e test-data/picard_ARRG_input1.sam
--- a/test-data/picard_ARRG_input1.sam Fri Feb 21 12:07:49 2014 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,25 +0,0 @@
-@HD VN:1.0 SO:coordinate
-@SQ SN:chr1 LN:10001
-@SQ SN:chr2 LN:100001
-@SQ SN:chr3 LN:10001
-@SQ SN:chr4 LN:1001
-@RG ID:rg1 SM:s1
-@RG ID:rg2 SM:s3
-bar:record:4 77 chr1 1 0 * * 0 0 AAAAAAAAAAAAA 1111111111111 RG:Z:rg1
-bar:record:6 77 chr1 1 0 * * 0 0 AAAAAAAAAAAAA 1111111111111 RG:Z:rg2
-bar:record:1 77 chr1 10 0 * * 0 0 AAAAAAAAAAAAA 1111111111111 RG:Z:rg1
-bar:record:3 77 chr1 10 0 * * 0 0 AAAAAAAAAAAAA 1111111111111 RG:Z:rg2
-bar:record:1 141 chr1 20 0 * * 0 0 CCCCCCCCCCCCC 2222222222222 RG:Z:rg1
-bar:record:7 77 chr1 20 0 * * 0 0 AAAAAAAAAAAAA 1111111111111 RG:Z:rg2
-bar:record:8 77 chr1 30 0 * * 0 0 AAAAAAAAAAAAA 1111111111111 RG:Z:rg2
-bar:record:4 141 chr1 40 0 * * 0 0 CCCCCCCCCCCCC 2222222222222 RG:Z:rg1
-bar:record:5 77 chr1 40 0 * * 0 0 AAAAAAAAAAAAA 1111111111111 RG:Z:rg2
-bar:record:6 141 chr1 50 0 * * 0 0 CCCCCCCCCCCCC 2222222222222 RG:Z:rg2
-bar:record:2 77 chr2 10 0 * * 0 0 AAAAAAAAAAAAA 1111111111111 RG:Z:rg1
-bar:record:2 141 chr2 30 0 * * 0 0 CCCCCCCCCCCCC 2222222222222 RG:Z:rg2
-bar:record:3 141 chr3 20 0 * * 0 0 CCCCCCCCCCCCC 2222222222222 RG:Z:rg1
-bar:record:8 141 chr3 20 0 * * 0 0 CCCCCCCCCCCCC 2222222222222 RG:Z:rg1
-bar:record:5 141 chr3 40 0 * * 0 0 CCCCCCCCCCCCC 2222222222222 RG:Z:rg1
-bar:record:9 77 chr4 10 0 * * 0 0 AAAAAAAAAAAAA 1111111111111 RG:Z:rg1
-bar:record:7 141 chr4 20 0 * * 0 0 CCCCCCCCCCCCC 2222222222222 RG:Z:rg1
-bar:record:9 141 chr4 60 0 * * 0 0 CCCCCCCCCCCCC 2222222222222 RG:Z:rg1
diff -r ab1f60c26526 -r 3d4f1fa26f0e test-data/picard_ARRG_input2.sam
--- a/test-data/picard_ARRG_input2.sam Fri Feb 21 12:07:49 2014 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,23 +0,0 @@
-@HD VN:1.0 SO:coordinate
-@SQ SN:chr1 LN:10001
-@SQ SN:chr2 LN:100001
-@SQ SN:chr3 LN:10001
-@SQ SN:chr4 LN:1001
-bar:record:4 77 chr1 1 0 * * 0 0 AAAAAAAAAAAAA 1111111111111
-bar:record:6 77 chr1 1 0 * * 0 0 AAAAAAAAAAAAA 1111111111111
-bar:record:1 77 chr1 10 0 * * 0 0 AAAAAAAAAAAAA 1111111111111
-bar:record:3 77 chr1 10 0 * * 0 0 AAAAAAAAAAAAA 1111111111111
-bar:record:1 141 chr1 20 0 * * 0 0 CCCCCCCCCCCCC 2222222222222
-bar:record:7 77 chr1 20 0 * * 0 0 AAAAAAAAAAAAA 1111111111111
-bar:record:8 77 chr1 30 0 * * 0 0 AAAAAAAAAAAAA 1111111111111
-bar:record:4 141 chr1 40 0 * * 0 0 CCCCCCCCCCCCC 2222222222222
-bar:record:5 77 chr1 40 0 * * 0 0 AAAAAAAAAAAAA 1111111111111
-bar:record:6 141 chr1 50 0 * * 0 0 CCCCCCCCCCCCC 2222222222222
-bar:record:2 77 chr2 10 0 * * 0 0 AAAAAAAAAAAAA 1111111111111
-bar:record:2 141 chr2 30 0 * * 0 0 CCCCCCCCCCCCC 2222222222222
-bar:record:3 141 chr3 20 0 * * 0 0 CCCCCCCCCCCCC 2222222222222
-bar:record:8 141 chr3 20 0 * * 0 0 CCCCCCCCCCCCC 2222222222222
-bar:record:5 141 chr3 40 0 * * 0 0 CCCCCCCCCCCCC 2222222222222
-bar:record:9 77 chr4 10 0 * * 0 0 AAAAAAAAAAAAA 1111111111111
-bar:record:7 141 chr4 20 0 * * 0 0 CCCCCCCCCCCCC 2222222222222
-bar:record:9 141 chr4 60 0 * * 0 0 CCCCCCCCCCCCC 2222222222222
diff -r ab1f60c26526 -r 3d4f1fa26f0e test-data/picard_ARRG_output1.sam
--- a/test-data/picard_ARRG_output1.sam Fri Feb 21 12:07:49 2014 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,24 +0,0 @@
-@HD VN:1.0 SO:coordinate
-@SQ SN:chr1 LN:10001
-@SQ SN:chr2 LN:100001
-@SQ SN:chr3 LN:10001
-@SQ SN:chr4 LN:1001
-@RG ID:one PL:illumina PU:peaewe LB:lib SM:sam1
-bar:record:4 77 chr1 1 0 * * 0 0 AAAAAAAAAAAAA 1111111111111 RG:Z:one
-bar:record:6 77 chr1 1 0 * * 0 0 AAAAAAAAAAAAA 1111111111111 RG:Z:one
-bar:record:1 77 chr1 10 0 * * 0 0 AAAAAAAAAAAAA 1111111111111 RG:Z:one
-bar:record:3 77 chr1 10 0 * * 0 0 AAAAAAAAAAAAA 1111111111111 RG:Z:one
-bar:record:1 141 chr1 20 0 * * 0 0 CCCCCCCCCCCCC 2222222222222 RG:Z:one
-bar:record:7 77 chr1 20 0 * * 0 0 AAAAAAAAAAAAA 1111111111111 RG:Z:one
-bar:record:8 77 chr1 30 0 * * 0 0 AAAAAAAAAAAAA 1111111111111 RG:Z:one
-bar:record:4 141 chr1 40 0 * * 0 0 CCCCCCCCCCCCC 2222222222222 RG:Z:one
-bar:record:5 77 chr1 40 0 * * 0 0 AAAAAAAAAAAAA 1111111111111 RG:Z:one
-bar:record:6 141 chr1 50 0 * * 0 0 CCCCCCCCCCCCC 2222222222222 RG:Z:one
-bar:record:2 77 chr2 10 0 * * 0 0 AAAAAAAAAAAAA 1111111111111 RG:Z:one
-bar:record:2 141 chr2 30 0 * * 0 0 CCCCCCCCCCCCC 2222222222222 RG:Z:one
-bar:record:3 141 chr3 20 0 * * 0 0 CCCCCCCCCCCCC 2222222222222 RG:Z:one
-bar:record:8 141 chr3 20 0 * * 0 0 CCCCCCCCCCCCC 2222222222222 RG:Z:one
-bar:record:5 141 chr3 40 0 * * 0 0 CCCCCCCCCCCCC 2222222222222 RG:Z:one
-bar:record:9 77 chr4 10 0 * * 0 0 AAAAAAAAAAAAA 1111111111111 RG:Z:one
-bar:record:7 141 chr4 20 0 * * 0 0 CCCCCCCCCCCCC 2222222222222 RG:Z:one
-bar:record:9 141 chr4 60 0 * * 0 0 CCCCCCCCCCCCC 2222222222222 RG:Z:one
diff -r ab1f60c26526 -r 3d4f1fa26f0e test-data/picard_ARRG_output2.bam
Binary file test-data/picard_ARRG_output2.bam has changed
diff -r ab1f60c26526 -r 3d4f1fa26f0e test-data/picard_ARRG_output2.sam
--- a/test-data/picard_ARRG_output2.sam Fri Feb 21 12:07:49 2014 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,24 +0,0 @@
-@HD VN:1.0 SO:coordinate
-@SQ SN:chr1 LN:10001
-@SQ SN:chr2 LN:100001
-@SQ SN:chr3 LN:10001
-@SQ SN:chr4 LN:1001
-@RG ID:M5 PL:IL PU:PLAT LB:LIB DS:description with spaces SM:smp CN:FamousCenter
-bar:record:4 77 chr1 1 0 * * 0 0 AAAAAAAAAAAAA 1111111111111 RG:Z:M5
-bar:record:6 77 chr1 1 0 * * 0 0 AAAAAAAAAAAAA 1111111111111 RG:Z:M5
-bar:record:1 77 chr1 10 0 * * 0 0 AAAAAAAAAAAAA 1111111111111 RG:Z:M5
-bar:record:3 77 chr1 10 0 * * 0 0 AAAAAAAAAAAAA 1111111111111 RG:Z:M5
-bar:record:1 141 chr1 20 0 * * 0 0 CCCCCCCCCCCCC 2222222222222 RG:Z:M5
-bar:record:7 77 chr1 20 0 * * 0 0 AAAAAAAAAAAAA 1111111111111 RG:Z:M5
-bar:record:8 77 chr1 30 0 * * 0 0 AAAAAAAAAAAAA 1111111111111 RG:Z:M5
-bar:record:4 141 chr1 40 0 * * 0 0 CCCCCCCCCCCCC 2222222222222 RG:Z:M5
-bar:record:5 77 chr1 40 0 * * 0 0 AAAAAAAAAAAAA 1111111111111 RG:Z:M5
-bar:record:6 141 chr1 50 0 * * 0 0 CCCCCCCCCCCCC 2222222222222 RG:Z:M5
-bar:record:2 77 chr2 10 0 * * 0 0 AAAAAAAAAAAAA 1111111111111 RG:Z:M5
-bar:record:2 141 chr2 30 0 * * 0 0 CCCCCCCCCCCCC 2222222222222 RG:Z:M5
-bar:record:3 141 chr3 20 0 * * 0 0 CCCCCCCCCCCCC 2222222222222 RG:Z:M5
-bar:record:8 141 chr3 20 0 * * 0 0 CCCCCCCCCCCCC 2222222222222 RG:Z:M5
-bar:record:5 141 chr3 40 0 * * 0 0 CCCCCCCCCCCCC 2222222222222 RG:Z:M5
-bar:record:9 77 chr4 10 0 * * 0 0 AAAAAAAAAAAAA 1111111111111 RG:Z:M5
-bar:record:7 141 chr4 20 0 * * 0 0 CCCCCCCCCCCCC 2222222222222 RG:Z:M5
-bar:record:9 141 chr4 60 0 * * 0 0 CCCCCCCCCCCCC 2222222222222 RG:Z:M5
diff -r ab1f60c26526 -r 3d4f1fa26f0e test-data/picard_ARRG_output3.bam
Binary file test-data/picard_ARRG_output3.bam has changed
diff -r ab1f60c26526 -r 3d4f1fa26f0e test-data/picard_ARRG_output3.bam.bai
Binary file test-data/picard_ARRG_output3.bam.bai has changed
diff -r ab1f60c26526 -r 3d4f1fa26f0e test-data/picard_ARRG_output3.sam
--- a/test-data/picard_ARRG_output3.sam Fri Feb 21 12:07:49 2014 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,24 +0,0 @@
-@HD VN:1.0 SO:coordinate
-@SQ SN:chr1 LN:10001
-@SQ SN:chr2 LN:100001
-@SQ SN:chr3 LN:10001
-@SQ SN:chr4 LN:1001
-@RG ID:M6 PL:IL PU:PLAT LB:LIB SM:smp1
-bar:record:4 77 chr1 1 0 * * 0 0 AAAAAAAAAAAAA 1111111111111 RG:Z:M6
-bar:record:6 77 chr1 1 0 * * 0 0 AAAAAAAAAAAAA 1111111111111 RG:Z:M6
-bar:record:1 77 chr1 10 0 * * 0 0 AAAAAAAAAAAAA 1111111111111 RG:Z:M6
-bar:record:3 77 chr1 10 0 * * 0 0 AAAAAAAAAAAAA 1111111111111 RG:Z:M6
-bar:record:1 141 chr1 20 0 * * 0 0 CCCCCCCCCCCCC 2222222222222 RG:Z:M6
-bar:record:7 77 chr1 20 0 * * 0 0 AAAAAAAAAAAAA 1111111111111 RG:Z:M6
-bar:record:8 77 chr1 30 0 * * 0 0 AAAAAAAAAAAAA 1111111111111 RG:Z:M6
-bar:record:4 141 chr1 40 0 * * 0 0 CCCCCCCCCCCCC 2222222222222 RG:Z:M6
-bar:record:5 77 chr1 40 0 * * 0 0 AAAAAAAAAAAAA 1111111111111 RG:Z:M6
-bar:record:6 141 chr1 50 0 * * 0 0 CCCCCCCCCCCCC 2222222222222 RG:Z:M6
-bar:record:2 77 chr2 10 0 * * 0 0 AAAAAAAAAAAAA 1111111111111 RG:Z:M6
-bar:record:2 141 chr2 30 0 * * 0 0 CCCCCCCCCCCCC 2222222222222 RG:Z:M6
-bar:record:3 141 chr3 20 0 * * 0 0 CCCCCCCCCCCCC 2222222222222 RG:Z:M6
-bar:record:8 141 chr3 20 0 * * 0 0 CCCCCCCCCCCCC 2222222222222 RG:Z:M6
-bar:record:5 141 chr3 40 0 * * 0 0 CCCCCCCCCCCCC 2222222222222 RG:Z:M6
-bar:record:9 77 chr4 10 0 * * 0 0 AAAAAAAAAAAAA 1111111111111 RG:Z:M6
-bar:record:7 141 chr4 20 0 * * 0 0 CCCCCCCCCCCCC 2222222222222 RG:Z:M6
-bar:record:9 141 chr4 60 0 * * 0 0 CCCCCCCCCCCCC 2222222222222 RG:Z:M6
diff -r ab1f60c26526 -r 3d4f1fa26f0e test-data/picard_ARRG_test1.bam
Binary file test-data/picard_ARRG_test1.bam has changed
diff -r ab1f60c26526 -r 3d4f1fa26f0e test-data/picard_AddCommentsToBam.bam
Binary file test-data/picard_AddCommentsToBam.bam has changed
diff -r ab1f60c26526 -r 3d4f1fa26f0e test-data/picard_AddCommentsToBam_test1.bam
Binary file test-data/picard_AddCommentsToBam_test1.bam has changed
diff -r ab1f60c26526 -r 3d4f1fa26f0e test-data/picard_BIS_input1.bam
Binary file test-data/picard_BIS_input1.bam has changed
diff -r ab1f60c26526 -r 3d4f1fa26f0e test-data/picard_BIS_input1.sam
--- a/test-data/picard_BIS_input1.sam Fri Feb 21 12:07:49 2014 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,18 +0,0 @@
-@HD VN:1.0 SO:coordinate
-@SQ SN:chr1 LN:101
-@SQ SN:chr7 LN:404
-@SQ SN:chr8 LN:202
-@SQ SN:chr10 LN:303
-@SQ SN:chr14 LN:505
-@RG ID:0 SM:Hi,Mom!
-@RG ID:1 SM:samplesample DS:ClearDescription
-@PG ID:1 PN:Hey! VN:2.0
-@CO Just a generic comment to make the header longer
-both_reads_align_clip_marked 83 chr7 1 255 101M = 302 201 CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN )'.*.+2,))&&'&*/)-&*-)&.-)&)&),/-&&..)./.,.).*&&,&.&&-)&&&0*&&&&&&&&/32/,01460&&/6/*0*/2/283//36868/& RG:Z:0
-both_reads_present_only_first_aligns 89 chr7 1 255 101M * 0 0 CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN )'.*.+2,))&&'&*/)-&*-)&.-)&)&),/-&&..)./.,.).*&&,&.&&-)&&&0*&&&&&&&&/32/,01460&&/6/*0*/2/283//36868/& RG:Z:0
-read_2_too_many_gaps 83 chr7 1 255 101M = 302 201 CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN )'.*.+2,))&&'&*/)-&*-)&.-)&)&),/-&&..)./.,.).*&&,&.&&-)&&&0*&&&&&&&&/32/,01460&&/6/*0*/2/283//36868/& RG:Z:0
-both_reads_align_clip_adapter 147 chr7 16 255 101M = 21 -96 CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN )'.*.+2,))&&'&*/)-&*-)&.-)&)&),/-&&..)./.,.).*&&,&.&&-)&&&0*&&&&&&&&/32/,01460&&/6/*0*/2/283//36868/& RG:Z:0
-both_reads_align_clip_adapter 99 chr7 21 255 101M = 16 96 CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN )'.*.+2,))&&'&*/)-&*-)&.-)&)&),/-&&..)./.,.).*&&,&.&&-)&&&0*&&&&&&&&/32/,01460&&/6/*0*/2/283//36868/& RG:Z:0
-both_reads_align_clip_marked 163 chr7 302 255 101M = 1 -201 NCGCGGCATCNCGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA &/15445666651/566666553+2/14/&/555512+3/)-'/-&-'*+))*''13+3)'//++''/'))/3+&*5++)&'2+&+/*&-&&*)&-./1'1 RG:Z:0
-read_2_too_many_gaps 163 chr7 302 255 10M1D10M5I76M = 1 -201 NCGCGGCATCNCGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA &/15445666651/566666553+2/14/&/555512+3/)-'/-&-'*+))*''13+3)'//++''/'))/3+&*5++)&'2+&+/*&-&&*)&-./1'1 RG:Z:0
-both_reads_present_only_first_aligns 165 * 0 0 * chr7 1 0 NCGCGGCATCNCGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA &/15445666651/566666553+2/14/&/555512+3/)-'/-&-'*+))*''13+3)'//++''/'))/3+&*5++)&'2+&+/*&-&&*)&-./1'1 RG:Z:0
diff -r ab1f60c26526 -r 3d4f1fa26f0e test-data/picard_BIS_output1.txt
--- a/test-data/picard_BIS_output1.txt Fri Feb 21 12:07:49 2014 -0500
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,39 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
-Galaxy tool BamIndexStats run at 12/05/2011 14:18:06 The following output files were created (click the filename to view/download a copy):