Mercurial > repos > halley > gatk_2_7
diff GenomeAnalysisTK-2.7-2-g6bda569/resources/PrintReads.java @ 0:1485d70afa12 draft default tip
Uploaded
author | halley |
---|---|
date | Tue, 15 Oct 2013 03:09:34 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/GenomeAnalysisTK-2.7-2-g6bda569/resources/PrintReads.java Tue Oct 15 03:09:34 2013 -0400 @@ -0,0 +1,277 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.gatk.walkers.readutils; + +import net.sf.samtools.SAMFileWriter; +import net.sf.samtools.SAMReadGroupRecord; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Hidden; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.CommandLineGATK; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; +import org.broadinstitute.sting.gatk.iterators.ReadTransformer; +import org.broadinstitute.sting.gatk.iterators.ReadTransformersMode; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.*; +import org.broadinstitute.sting.utils.SampleUtils; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.baq.BAQ; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.sting.utils.help.HelpConstants; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +import java.io.File; +import java.util.*; + +/** + * Renders, in SAM/BAM format, all reads from the input data set in the order in which they appear in the input file. + * + * <p> + * PrintReads can dynamically merge the contents of multiple input BAM files, resulting + * in merged output sorted in coordinate order. Can also optionally filter reads based on the + * --read_filter command line argument. + * </p> + * + * <p> + * Note that when PrintReads is used as part of the Base Quality Score Recalibration workflow, + * it takes the --BQSR engine argument, which is listed under Inherited Arguments > CommandLineGATK below. + * </p> + * + * <h3>Input</h3> + * <p> + * One or more bam files. + * </p> + * + * <h3>Output</h3> + * <p> + * A single processed bam file. + * </p> + * + * <h3>Examples</h3> + * <pre> + * java -Xmx2g -jar GenomeAnalysisTK.jar \ + * -R ref.fasta \ + * -T PrintReads \ + * -o output.bam \ + * -I input1.bam \ + * -I input2.bam \ + * --read_filter MappingQualityZero + * + * // Prints the first 2000 reads in the BAM file + * java -Xmx2g -jar GenomeAnalysisTK.jar \ + * -R ref.fasta \ + * -T PrintReads \ + * -o output.bam \ + * -I input.bam \ + * -n 2000 + * + * // Downsamples BAM file to 25% + * java -Xmx2g -jar GenomeAnalysisTK.jar \ + * -R ref.fasta \ + * -T PrintReads \ + * -o output.bam \ + * -I input.bam \ + * -dfrac 0.25 + * </pre> + * + */ +@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_DATA, extraDocs = {CommandLineGATK.class} ) +@ReadTransformersMode(ApplicationTime = ReadTransformer.ApplicationTime.HANDLED_IN_WALKER) +@BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = ReadTransformer.ApplicationTime.HANDLED_IN_WALKER) +@Requires({DataSource.READS, DataSource.REFERENCE}) +public class PrintReads extends ReadWalker<GATKSAMRecord, SAMFileWriter> implements NanoSchedulable { + + @Output(doc="Write output to this BAM filename instead of STDOUT") + StingSAMFileWriter out; + + @Argument(fullName = "readGroup", shortName = "readGroup", doc="Exclude all reads with this read group from the output", required = false) + String readGroup = null; + + /** + * For example, --platform ILLUMINA or --platform 454. + */ + @Argument(fullName = "platform", shortName = "platform", doc="Exclude all reads with this platform from the output", required = false) + String platform = null; + + /** + * Only prints the first n reads of the file + */ + @Argument(fullName = "number", shortName = "n", doc="Print the first n reads from the file, discarding the rest", required = false) + int nReadsToPrint = -1; + + /** + * Only reads from samples listed in the provided file(s) will be included in the output. + */ + @Argument(fullName="sample_file", shortName="sf", doc="File containing a list of samples (one per line). Can be specified multiple times", required=false) + public Set<File> sampleFile = new TreeSet<File>(); + + /** + * Only reads from the sample(s) will be included in the output. + */ + @Argument(fullName="sample_name", shortName="sn", doc="Sample name to be included in the analysis. Can be specified multiple times.", required=false) + public Set<String> sampleNames = new TreeSet<String>(); + + /** + * Erase all extra attributes in the read but keep the read group information + */ + @Argument(fullName="simplify", shortName="s", doc="Simplify all reads.", required=false) + public boolean simplifyReads = false; + + @Hidden + @Argument(fullName = "no_pg_tag", shortName = "npt", doc ="", required = false) + public boolean NO_PG_TAG = false; + + List<ReadTransformer> readTransformers = Collections.emptyList(); + private TreeSet<String> samplesToChoose = new TreeSet<String>(); + private boolean SAMPLES_SPECIFIED = false; + + public static final String PROGRAM_RECORD_NAME = "GATK PrintReads"; // The name that will go in the @PG tag + + Random random; + + + /** + * The initialize function. + */ + public void initialize() { + final GenomeAnalysisEngine toolkit = getToolkit(); + + if ( platform != null ) + platform = platform.toUpperCase(); + + if ( getToolkit() != null ) + readTransformers = getToolkit().getReadTransformers(); + + Collection<String> samplesFromFile; + if (!sampleFile.isEmpty()) { + samplesFromFile = SampleUtils.getSamplesFromFiles(sampleFile); + samplesToChoose.addAll(samplesFromFile); + } + + if (!sampleNames.isEmpty()) + samplesToChoose.addAll(sampleNames); + + if(!samplesToChoose.isEmpty()) { + SAMPLES_SPECIFIED = true; + } + + random = GenomeAnalysisEngine.getRandomGenerator(); + + final boolean preSorted = true; + if (getToolkit() != null && getToolkit().getArguments().BQSR_RECAL_FILE != null && !NO_PG_TAG ) { + Utils.setupWriter(out, toolkit, toolkit.getSAMFileHeader(), preSorted, this, PROGRAM_RECORD_NAME); + } + + } + + /** + * The reads filter function. + * + * @param ref the reference bases that correspond to our read, if a reference was provided + * @param read the read itself, as a GATKSAMRecord + * @return true if the read passes the filter, false if it doesn't + */ + public boolean filter(ReferenceContext ref, GATKSAMRecord read) { + // check the read group + if ( readGroup != null ) { + SAMReadGroupRecord myReadGroup = read.getReadGroup(); + if ( myReadGroup == null || !readGroup.equals(myReadGroup.getReadGroupId()) ) + return false; + } + + // check the platform + if ( platform != null ) { + SAMReadGroupRecord readGroup = read.getReadGroup(); + if ( readGroup == null ) + return false; + + Object readPlatformAttr = readGroup.getAttribute("PL"); + if ( readPlatformAttr == null || !readPlatformAttr.toString().toUpperCase().contains(platform)) + return false; + } + if (SAMPLES_SPECIFIED ) { + // user specified samples to select + // todo - should be case-agnostic but for simplicity and speed this is ignored. + // todo - can check at initialization intersection of requested samples and samples in BAM header to further speedup. + if (!samplesToChoose.contains(read.getReadGroup().getSample())) + return false; + } + + + // check if we've reached the output limit + if ( nReadsToPrint == 0 ) { + return false; // n == 0 means we've printed all we needed. + } + else if (nReadsToPrint > 0) { + nReadsToPrint--; // n > 0 means there are still reads to be printed. + } + + return true; + } + + /** + * The reads map function. + * + * @param ref the reference bases that correspond to our read, if a reference was provided + * @param readIn the read itself, as a GATKSAMRecord + * @return the read itself + */ + public GATKSAMRecord map( ReferenceContext ref, GATKSAMRecord readIn, RefMetaDataTracker metaDataTracker ) { + GATKSAMRecord workingRead = readIn; + + for ( final ReadTransformer transformer : readTransformers ) { + workingRead = transformer.apply(workingRead); + } + + if ( simplifyReads ) workingRead = workingRead.simplify(); + + return workingRead; + } + + /** + * reduceInit is called once before any calls to the map function. We use it here to setup the output + * bam file, if it was specified on the command line + * + * @return SAMFileWriter, set to the BAM output file if the command line option was set, null otherwise + */ + public SAMFileWriter reduceInit() { + return out; + } + + /** + * given a read and a output location, reduce by emitting the read + * + * @param read the read itself + * @param output the output source + * @return the SAMFileWriter, so that the next reduce can emit to the same source + */ + public SAMFileWriter reduce( GATKSAMRecord read, SAMFileWriter output ) { + output.addAlignment(read); + return output; + } +}