Mercurial > repos > halley > gatk_2_7
view GenomeAnalysisTK-2.7-2-g6bda569/resources/PrintReads.java @ 0:1485d70afa12 draft default tip
Uploaded
author | halley |
---|---|
date | Tue, 15 Oct 2013 03:09:34 -0400 |
parents | |
children |
line wrap: on
line source
/* * Copyright (c) 2012 The Broad Institute * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation * files (the "Software"), to deal in the Software without * restriction, including without limitation the rights to use, * copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following * conditions: * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR * THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ package org.broadinstitute.sting.gatk.walkers.readutils; import net.sf.samtools.SAMFileWriter; import net.sf.samtools.SAMReadGroupRecord; import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Hidden; import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; import org.broadinstitute.sting.gatk.iterators.ReadTransformer; import org.broadinstitute.sting.gatk.iterators.ReadTransformersMode; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.utils.help.HelpConstants; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.io.File; import java.util.*; /** * Renders, in SAM/BAM format, all reads from the input data set in the order in which they appear in the input file. * * <p> * PrintReads can dynamically merge the contents of multiple input BAM files, resulting * in merged output sorted in coordinate order. Can also optionally filter reads based on the * --read_filter command line argument. * </p> * * <p> * Note that when PrintReads is used as part of the Base Quality Score Recalibration workflow, * it takes the --BQSR engine argument, which is listed under Inherited Arguments > CommandLineGATK below. * </p> * * <h3>Input</h3> * <p> * One or more bam files. * </p> * * <h3>Output</h3> * <p> * A single processed bam file. * </p> * * <h3>Examples</h3> * <pre> * java -Xmx2g -jar GenomeAnalysisTK.jar \ * -R ref.fasta \ * -T PrintReads \ * -o output.bam \ * -I input1.bam \ * -I input2.bam \ * --read_filter MappingQualityZero * * // Prints the first 2000 reads in the BAM file * java -Xmx2g -jar GenomeAnalysisTK.jar \ * -R ref.fasta \ * -T PrintReads \ * -o output.bam \ * -I input.bam \ * -n 2000 * * // Downsamples BAM file to 25% * java -Xmx2g -jar GenomeAnalysisTK.jar \ * -R ref.fasta \ * -T PrintReads \ * -o output.bam \ * -I input.bam \ * -dfrac 0.25 * </pre> * */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_DATA, extraDocs = {CommandLineGATK.class} ) @ReadTransformersMode(ApplicationTime = ReadTransformer.ApplicationTime.HANDLED_IN_WALKER) @BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = ReadTransformer.ApplicationTime.HANDLED_IN_WALKER) @Requires({DataSource.READS, DataSource.REFERENCE}) public class PrintReads extends ReadWalker<GATKSAMRecord, SAMFileWriter> implements NanoSchedulable { @Output(doc="Write output to this BAM filename instead of STDOUT") StingSAMFileWriter out; @Argument(fullName = "readGroup", shortName = "readGroup", doc="Exclude all reads with this read group from the output", required = false) String readGroup = null; /** * For example, --platform ILLUMINA or --platform 454. */ @Argument(fullName = "platform", shortName = "platform", doc="Exclude all reads with this platform from the output", required = false) String platform = null; /** * Only prints the first n reads of the file */ @Argument(fullName = "number", shortName = "n", doc="Print the first n reads from the file, discarding the rest", required = false) int nReadsToPrint = -1; /** * Only reads from samples listed in the provided file(s) will be included in the output. */ @Argument(fullName="sample_file", shortName="sf", doc="File containing a list of samples (one per line). Can be specified multiple times", required=false) public Set<File> sampleFile = new TreeSet<File>(); /** * Only reads from the sample(s) will be included in the output. */ @Argument(fullName="sample_name", shortName="sn", doc="Sample name to be included in the analysis. Can be specified multiple times.", required=false) public Set<String> sampleNames = new TreeSet<String>(); /** * Erase all extra attributes in the read but keep the read group information */ @Argument(fullName="simplify", shortName="s", doc="Simplify all reads.", required=false) public boolean simplifyReads = false; @Hidden @Argument(fullName = "no_pg_tag", shortName = "npt", doc ="", required = false) public boolean NO_PG_TAG = false; List<ReadTransformer> readTransformers = Collections.emptyList(); private TreeSet<String> samplesToChoose = new TreeSet<String>(); private boolean SAMPLES_SPECIFIED = false; public static final String PROGRAM_RECORD_NAME = "GATK PrintReads"; // The name that will go in the @PG tag Random random; /** * The initialize function. */ public void initialize() { final GenomeAnalysisEngine toolkit = getToolkit(); if ( platform != null ) platform = platform.toUpperCase(); if ( getToolkit() != null ) readTransformers = getToolkit().getReadTransformers(); Collection<String> samplesFromFile; if (!sampleFile.isEmpty()) { samplesFromFile = SampleUtils.getSamplesFromFiles(sampleFile); samplesToChoose.addAll(samplesFromFile); } if (!sampleNames.isEmpty()) samplesToChoose.addAll(sampleNames); if(!samplesToChoose.isEmpty()) { SAMPLES_SPECIFIED = true; } random = GenomeAnalysisEngine.getRandomGenerator(); final boolean preSorted = true; if (getToolkit() != null && getToolkit().getArguments().BQSR_RECAL_FILE != null && !NO_PG_TAG ) { Utils.setupWriter(out, toolkit, toolkit.getSAMFileHeader(), preSorted, this, PROGRAM_RECORD_NAME); } } /** * The reads filter function. * * @param ref the reference bases that correspond to our read, if a reference was provided * @param read the read itself, as a GATKSAMRecord * @return true if the read passes the filter, false if it doesn't */ public boolean filter(ReferenceContext ref, GATKSAMRecord read) { // check the read group if ( readGroup != null ) { SAMReadGroupRecord myReadGroup = read.getReadGroup(); if ( myReadGroup == null || !readGroup.equals(myReadGroup.getReadGroupId()) ) return false; } // check the platform if ( platform != null ) { SAMReadGroupRecord readGroup = read.getReadGroup(); if ( readGroup == null ) return false; Object readPlatformAttr = readGroup.getAttribute("PL"); if ( readPlatformAttr == null || !readPlatformAttr.toString().toUpperCase().contains(platform)) return false; } if (SAMPLES_SPECIFIED ) { // user specified samples to select // todo - should be case-agnostic but for simplicity and speed this is ignored. // todo - can check at initialization intersection of requested samples and samples in BAM header to further speedup. if (!samplesToChoose.contains(read.getReadGroup().getSample())) return false; } // check if we've reached the output limit if ( nReadsToPrint == 0 ) { return false; // n == 0 means we've printed all we needed. } else if (nReadsToPrint > 0) { nReadsToPrint--; // n > 0 means there are still reads to be printed. } return true; } /** * The reads map function. * * @param ref the reference bases that correspond to our read, if a reference was provided * @param readIn the read itself, as a GATKSAMRecord * @return the read itself */ public GATKSAMRecord map( ReferenceContext ref, GATKSAMRecord readIn, RefMetaDataTracker metaDataTracker ) { GATKSAMRecord workingRead = readIn; for ( final ReadTransformer transformer : readTransformers ) { workingRead = transformer.apply(workingRead); } if ( simplifyReads ) workingRead = workingRead.simplify(); return workingRead; } /** * reduceInit is called once before any calls to the map function. We use it here to setup the output * bam file, if it was specified on the command line * * @return SAMFileWriter, set to the BAM output file if the command line option was set, null otherwise */ public SAMFileWriter reduceInit() { return out; } /** * given a read and a output location, reduce by emitting the read * * @param read the read itself * @param output the output source * @return the SAMFileWriter, so that the next reduce can emit to the same source */ public SAMFileWriter reduce( GATKSAMRecord read, SAMFileWriter output ) { output.addAlignment(read); return output; } }