comparison GenomeAnalysisTK-2.7-2-g6bda569/resources/PrintReads.java @ 0:1485d70afa12 draft default tip

Uploaded
author halley
date Tue, 15 Oct 2013 03:09:34 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1485d70afa12
1 /*
2 * Copyright (c) 2012 The Broad Institute
3 *
4 * Permission is hereby granted, free of charge, to any person
5 * obtaining a copy of this software and associated documentation
6 * files (the "Software"), to deal in the Software without
7 * restriction, including without limitation the rights to use,
8 * copy, modify, merge, publish, distribute, sublicense, and/or sell
9 * copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following
11 * conditions:
12 *
13 * The above copyright notice and this permission notice shall be
14 * included in all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
18 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
20 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
21 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
23 * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
24 */
25
26 package org.broadinstitute.sting.gatk.walkers.readutils;
27
28 import net.sf.samtools.SAMFileWriter;
29 import net.sf.samtools.SAMReadGroupRecord;
30 import org.broadinstitute.sting.commandline.Argument;
31 import org.broadinstitute.sting.commandline.Hidden;
32 import org.broadinstitute.sting.commandline.Output;
33 import org.broadinstitute.sting.gatk.CommandLineGATK;
34 import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
35 import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
36 import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
37 import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
38 import org.broadinstitute.sting.gatk.iterators.ReadTransformersMode;
39 import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
40 import org.broadinstitute.sting.gatk.walkers.*;
41 import org.broadinstitute.sting.utils.SampleUtils;
42 import org.broadinstitute.sting.utils.Utils;
43 import org.broadinstitute.sting.utils.baq.BAQ;
44 import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
45 import org.broadinstitute.sting.utils.help.HelpConstants;
46 import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
47
48 import java.io.File;
49 import java.util.*;
50
51 /**
52 * Renders, in SAM/BAM format, all reads from the input data set in the order in which they appear in the input file.
53 *
54 * <p>
55 * PrintReads can dynamically merge the contents of multiple input BAM files, resulting
56 * in merged output sorted in coordinate order. Can also optionally filter reads based on the
57 * --read_filter command line argument.
58 * </p>
59 *
60 * <p>
61 * Note that when PrintReads is used as part of the Base Quality Score Recalibration workflow,
62 * it takes the --BQSR engine argument, which is listed under Inherited Arguments > CommandLineGATK below.
63 * </p>
64 *
65 * <h3>Input</h3>
66 * <p>
67 * One or more bam files.
68 * </p>
69 *
70 * <h3>Output</h3>
71 * <p>
72 * A single processed bam file.
73 * </p>
74 *
75 * <h3>Examples</h3>
76 * <pre>
77 * java -Xmx2g -jar GenomeAnalysisTK.jar \
78 * -R ref.fasta \
79 * -T PrintReads \
80 * -o output.bam \
81 * -I input1.bam \
82 * -I input2.bam \
83 * --read_filter MappingQualityZero
84 *
85 * // Prints the first 2000 reads in the BAM file
86 * java -Xmx2g -jar GenomeAnalysisTK.jar \
87 * -R ref.fasta \
88 * -T PrintReads \
89 * -o output.bam \
90 * -I input.bam \
91 * -n 2000
92 *
93 * // Downsamples BAM file to 25%
94 * java -Xmx2g -jar GenomeAnalysisTK.jar \
95 * -R ref.fasta \
96 * -T PrintReads \
97 * -o output.bam \
98 * -I input.bam \
99 * -dfrac 0.25
100 * </pre>
101 *
102 */
103 @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_DATA, extraDocs = {CommandLineGATK.class} )
104 @ReadTransformersMode(ApplicationTime = ReadTransformer.ApplicationTime.HANDLED_IN_WALKER)
105 @BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = ReadTransformer.ApplicationTime.HANDLED_IN_WALKER)
106 @Requires({DataSource.READS, DataSource.REFERENCE})
107 public class PrintReads extends ReadWalker<GATKSAMRecord, SAMFileWriter> implements NanoSchedulable {
108
109 @Output(doc="Write output to this BAM filename instead of STDOUT")
110 StingSAMFileWriter out;
111
112 @Argument(fullName = "readGroup", shortName = "readGroup", doc="Exclude all reads with this read group from the output", required = false)
113 String readGroup = null;
114
115 /**
116 * For example, --platform ILLUMINA or --platform 454.
117 */
118 @Argument(fullName = "platform", shortName = "platform", doc="Exclude all reads with this platform from the output", required = false)
119 String platform = null;
120
121 /**
122 * Only prints the first n reads of the file
123 */
124 @Argument(fullName = "number", shortName = "n", doc="Print the first n reads from the file, discarding the rest", required = false)
125 int nReadsToPrint = -1;
126
127 /**
128 * Only reads from samples listed in the provided file(s) will be included in the output.
129 */
130 @Argument(fullName="sample_file", shortName="sf", doc="File containing a list of samples (one per line). Can be specified multiple times", required=false)
131 public Set<File> sampleFile = new TreeSet<File>();
132
133 /**
134 * Only reads from the sample(s) will be included in the output.
135 */
136 @Argument(fullName="sample_name", shortName="sn", doc="Sample name to be included in the analysis. Can be specified multiple times.", required=false)
137 public Set<String> sampleNames = new TreeSet<String>();
138
139 /**
140 * Erase all extra attributes in the read but keep the read group information
141 */
142 @Argument(fullName="simplify", shortName="s", doc="Simplify all reads.", required=false)
143 public boolean simplifyReads = false;
144
145 @Hidden
146 @Argument(fullName = "no_pg_tag", shortName = "npt", doc ="", required = false)
147 public boolean NO_PG_TAG = false;
148
149 List<ReadTransformer> readTransformers = Collections.emptyList();
150 private TreeSet<String> samplesToChoose = new TreeSet<String>();
151 private boolean SAMPLES_SPECIFIED = false;
152
153 public static final String PROGRAM_RECORD_NAME = "GATK PrintReads"; // The name that will go in the @PG tag
154
155 Random random;
156
157
158 /**
159 * The initialize function.
160 */
161 public void initialize() {
162 final GenomeAnalysisEngine toolkit = getToolkit();
163
164 if ( platform != null )
165 platform = platform.toUpperCase();
166
167 if ( getToolkit() != null )
168 readTransformers = getToolkit().getReadTransformers();
169
170 Collection<String> samplesFromFile;
171 if (!sampleFile.isEmpty()) {
172 samplesFromFile = SampleUtils.getSamplesFromFiles(sampleFile);
173 samplesToChoose.addAll(samplesFromFile);
174 }
175
176 if (!sampleNames.isEmpty())
177 samplesToChoose.addAll(sampleNames);
178
179 if(!samplesToChoose.isEmpty()) {
180 SAMPLES_SPECIFIED = true;
181 }
182
183 random = GenomeAnalysisEngine.getRandomGenerator();
184
185 final boolean preSorted = true;
186 if (getToolkit() != null && getToolkit().getArguments().BQSR_RECAL_FILE != null && !NO_PG_TAG ) {
187 Utils.setupWriter(out, toolkit, toolkit.getSAMFileHeader(), preSorted, this, PROGRAM_RECORD_NAME);
188 }
189
190 }
191
192 /**
193 * The reads filter function.
194 *
195 * @param ref the reference bases that correspond to our read, if a reference was provided
196 * @param read the read itself, as a GATKSAMRecord
197 * @return true if the read passes the filter, false if it doesn't
198 */
199 public boolean filter(ReferenceContext ref, GATKSAMRecord read) {
200 // check the read group
201 if ( readGroup != null ) {
202 SAMReadGroupRecord myReadGroup = read.getReadGroup();
203 if ( myReadGroup == null || !readGroup.equals(myReadGroup.getReadGroupId()) )
204 return false;
205 }
206
207 // check the platform
208 if ( platform != null ) {
209 SAMReadGroupRecord readGroup = read.getReadGroup();
210 if ( readGroup == null )
211 return false;
212
213 Object readPlatformAttr = readGroup.getAttribute("PL");
214 if ( readPlatformAttr == null || !readPlatformAttr.toString().toUpperCase().contains(platform))
215 return false;
216 }
217 if (SAMPLES_SPECIFIED ) {
218 // user specified samples to select
219 // todo - should be case-agnostic but for simplicity and speed this is ignored.
220 // todo - can check at initialization intersection of requested samples and samples in BAM header to further speedup.
221 if (!samplesToChoose.contains(read.getReadGroup().getSample()))
222 return false;
223 }
224
225
226 // check if we've reached the output limit
227 if ( nReadsToPrint == 0 ) {
228 return false; // n == 0 means we've printed all we needed.
229 }
230 else if (nReadsToPrint > 0) {
231 nReadsToPrint--; // n > 0 means there are still reads to be printed.
232 }
233
234 return true;
235 }
236
237 /**
238 * The reads map function.
239 *
240 * @param ref the reference bases that correspond to our read, if a reference was provided
241 * @param readIn the read itself, as a GATKSAMRecord
242 * @return the read itself
243 */
244 public GATKSAMRecord map( ReferenceContext ref, GATKSAMRecord readIn, RefMetaDataTracker metaDataTracker ) {
245 GATKSAMRecord workingRead = readIn;
246
247 for ( final ReadTransformer transformer : readTransformers ) {
248 workingRead = transformer.apply(workingRead);
249 }
250
251 if ( simplifyReads ) workingRead = workingRead.simplify();
252
253 return workingRead;
254 }
255
256 /**
257 * reduceInit is called once before any calls to the map function. We use it here to setup the output
258 * bam file, if it was specified on the command line
259 *
260 * @return SAMFileWriter, set to the BAM output file if the command line option was set, null otherwise
261 */
262 public SAMFileWriter reduceInit() {
263 return out;
264 }
265
266 /**
267 * given a read and a output location, reduce by emitting the read
268 *
269 * @param read the read itself
270 * @param output the output source
271 * @return the SAMFileWriter, so that the next reduce can emit to the same source
272 */
273 public SAMFileWriter reduce( GATKSAMRecord read, SAMFileWriter output ) {
274 output.addAlignment(read);
275 return output;
276 }
277 }