Mercurial > repos > halley > gatk_2_7
comparison GenomeAnalysisTK-2.7-2-g6bda569/resources/PrintReads.java @ 0:1485d70afa12 draft default tip
Uploaded
author | halley |
---|---|
date | Tue, 15 Oct 2013 03:09:34 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1485d70afa12 |
---|---|
1 /* | |
2 * Copyright (c) 2012 The Broad Institute | |
3 * | |
4 * Permission is hereby granted, free of charge, to any person | |
5 * obtaining a copy of this software and associated documentation | |
6 * files (the "Software"), to deal in the Software without | |
7 * restriction, including without limitation the rights to use, | |
8 * copy, modify, merge, publish, distribute, sublicense, and/or sell | |
9 * copies of the Software, and to permit persons to whom the | |
10 * Software is furnished to do so, subject to the following | |
11 * conditions: | |
12 * | |
13 * The above copyright notice and this permission notice shall be | |
14 * included in all copies or substantial portions of the Software. | |
15 * | |
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |
17 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES | |
18 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | |
19 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT | |
20 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, | |
21 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | |
22 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR | |
23 * THE USE OR OTHER DEALINGS IN THE SOFTWARE. | |
24 */ | |
25 | |
26 package org.broadinstitute.sting.gatk.walkers.readutils; | |
27 | |
28 import net.sf.samtools.SAMFileWriter; | |
29 import net.sf.samtools.SAMReadGroupRecord; | |
30 import org.broadinstitute.sting.commandline.Argument; | |
31 import org.broadinstitute.sting.commandline.Hidden; | |
32 import org.broadinstitute.sting.commandline.Output; | |
33 import org.broadinstitute.sting.gatk.CommandLineGATK; | |
34 import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; | |
35 import org.broadinstitute.sting.gatk.contexts.ReferenceContext; | |
36 import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; | |
37 import org.broadinstitute.sting.gatk.iterators.ReadTransformer; | |
38 import org.broadinstitute.sting.gatk.iterators.ReadTransformersMode; | |
39 import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; | |
40 import org.broadinstitute.sting.gatk.walkers.*; | |
41 import org.broadinstitute.sting.utils.SampleUtils; | |
42 import org.broadinstitute.sting.utils.Utils; | |
43 import org.broadinstitute.sting.utils.baq.BAQ; | |
44 import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; | |
45 import org.broadinstitute.sting.utils.help.HelpConstants; | |
46 import org.broadinstitute.sting.utils.sam.GATKSAMRecord; | |
47 | |
48 import java.io.File; | |
49 import java.util.*; | |
50 | |
51 /** | |
52 * Renders, in SAM/BAM format, all reads from the input data set in the order in which they appear in the input file. | |
53 * | |
54 * <p> | |
55 * PrintReads can dynamically merge the contents of multiple input BAM files, resulting | |
56 * in merged output sorted in coordinate order. Can also optionally filter reads based on the | |
57 * --read_filter command line argument. | |
58 * </p> | |
59 * | |
60 * <p> | |
61 * Note that when PrintReads is used as part of the Base Quality Score Recalibration workflow, | |
62 * it takes the --BQSR engine argument, which is listed under Inherited Arguments > CommandLineGATK below. | |
63 * </p> | |
64 * | |
65 * <h3>Input</h3> | |
66 * <p> | |
67 * One or more bam files. | |
68 * </p> | |
69 * | |
70 * <h3>Output</h3> | |
71 * <p> | |
72 * A single processed bam file. | |
73 * </p> | |
74 * | |
75 * <h3>Examples</h3> | |
76 * <pre> | |
77 * java -Xmx2g -jar GenomeAnalysisTK.jar \ | |
78 * -R ref.fasta \ | |
79 * -T PrintReads \ | |
80 * -o output.bam \ | |
81 * -I input1.bam \ | |
82 * -I input2.bam \ | |
83 * --read_filter MappingQualityZero | |
84 * | |
85 * // Prints the first 2000 reads in the BAM file | |
86 * java -Xmx2g -jar GenomeAnalysisTK.jar \ | |
87 * -R ref.fasta \ | |
88 * -T PrintReads \ | |
89 * -o output.bam \ | |
90 * -I input.bam \ | |
91 * -n 2000 | |
92 * | |
93 * // Downsamples BAM file to 25% | |
94 * java -Xmx2g -jar GenomeAnalysisTK.jar \ | |
95 * -R ref.fasta \ | |
96 * -T PrintReads \ | |
97 * -o output.bam \ | |
98 * -I input.bam \ | |
99 * -dfrac 0.25 | |
100 * </pre> | |
101 * | |
102 */ | |
103 @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_DATA, extraDocs = {CommandLineGATK.class} ) | |
104 @ReadTransformersMode(ApplicationTime = ReadTransformer.ApplicationTime.HANDLED_IN_WALKER) | |
105 @BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = ReadTransformer.ApplicationTime.HANDLED_IN_WALKER) | |
106 @Requires({DataSource.READS, DataSource.REFERENCE}) | |
107 public class PrintReads extends ReadWalker<GATKSAMRecord, SAMFileWriter> implements NanoSchedulable { | |
108 | |
109 @Output(doc="Write output to this BAM filename instead of STDOUT") | |
110 StingSAMFileWriter out; | |
111 | |
112 @Argument(fullName = "readGroup", shortName = "readGroup", doc="Exclude all reads with this read group from the output", required = false) | |
113 String readGroup = null; | |
114 | |
115 /** | |
116 * For example, --platform ILLUMINA or --platform 454. | |
117 */ | |
118 @Argument(fullName = "platform", shortName = "platform", doc="Exclude all reads with this platform from the output", required = false) | |
119 String platform = null; | |
120 | |
121 /** | |
122 * Only prints the first n reads of the file | |
123 */ | |
124 @Argument(fullName = "number", shortName = "n", doc="Print the first n reads from the file, discarding the rest", required = false) | |
125 int nReadsToPrint = -1; | |
126 | |
127 /** | |
128 * Only reads from samples listed in the provided file(s) will be included in the output. | |
129 */ | |
130 @Argument(fullName="sample_file", shortName="sf", doc="File containing a list of samples (one per line). Can be specified multiple times", required=false) | |
131 public Set<File> sampleFile = new TreeSet<File>(); | |
132 | |
133 /** | |
134 * Only reads from the sample(s) will be included in the output. | |
135 */ | |
136 @Argument(fullName="sample_name", shortName="sn", doc="Sample name to be included in the analysis. Can be specified multiple times.", required=false) | |
137 public Set<String> sampleNames = new TreeSet<String>(); | |
138 | |
139 /** | |
140 * Erase all extra attributes in the read but keep the read group information | |
141 */ | |
142 @Argument(fullName="simplify", shortName="s", doc="Simplify all reads.", required=false) | |
143 public boolean simplifyReads = false; | |
144 | |
145 @Hidden | |
146 @Argument(fullName = "no_pg_tag", shortName = "npt", doc ="", required = false) | |
147 public boolean NO_PG_TAG = false; | |
148 | |
149 List<ReadTransformer> readTransformers = Collections.emptyList(); | |
150 private TreeSet<String> samplesToChoose = new TreeSet<String>(); | |
151 private boolean SAMPLES_SPECIFIED = false; | |
152 | |
153 public static final String PROGRAM_RECORD_NAME = "GATK PrintReads"; // The name that will go in the @PG tag | |
154 | |
155 Random random; | |
156 | |
157 | |
158 /** | |
159 * The initialize function. | |
160 */ | |
161 public void initialize() { | |
162 final GenomeAnalysisEngine toolkit = getToolkit(); | |
163 | |
164 if ( platform != null ) | |
165 platform = platform.toUpperCase(); | |
166 | |
167 if ( getToolkit() != null ) | |
168 readTransformers = getToolkit().getReadTransformers(); | |
169 | |
170 Collection<String> samplesFromFile; | |
171 if (!sampleFile.isEmpty()) { | |
172 samplesFromFile = SampleUtils.getSamplesFromFiles(sampleFile); | |
173 samplesToChoose.addAll(samplesFromFile); | |
174 } | |
175 | |
176 if (!sampleNames.isEmpty()) | |
177 samplesToChoose.addAll(sampleNames); | |
178 | |
179 if(!samplesToChoose.isEmpty()) { | |
180 SAMPLES_SPECIFIED = true; | |
181 } | |
182 | |
183 random = GenomeAnalysisEngine.getRandomGenerator(); | |
184 | |
185 final boolean preSorted = true; | |
186 if (getToolkit() != null && getToolkit().getArguments().BQSR_RECAL_FILE != null && !NO_PG_TAG ) { | |
187 Utils.setupWriter(out, toolkit, toolkit.getSAMFileHeader(), preSorted, this, PROGRAM_RECORD_NAME); | |
188 } | |
189 | |
190 } | |
191 | |
192 /** | |
193 * The reads filter function. | |
194 * | |
195 * @param ref the reference bases that correspond to our read, if a reference was provided | |
196 * @param read the read itself, as a GATKSAMRecord | |
197 * @return true if the read passes the filter, false if it doesn't | |
198 */ | |
199 public boolean filter(ReferenceContext ref, GATKSAMRecord read) { | |
200 // check the read group | |
201 if ( readGroup != null ) { | |
202 SAMReadGroupRecord myReadGroup = read.getReadGroup(); | |
203 if ( myReadGroup == null || !readGroup.equals(myReadGroup.getReadGroupId()) ) | |
204 return false; | |
205 } | |
206 | |
207 // check the platform | |
208 if ( platform != null ) { | |
209 SAMReadGroupRecord readGroup = read.getReadGroup(); | |
210 if ( readGroup == null ) | |
211 return false; | |
212 | |
213 Object readPlatformAttr = readGroup.getAttribute("PL"); | |
214 if ( readPlatformAttr == null || !readPlatformAttr.toString().toUpperCase().contains(platform)) | |
215 return false; | |
216 } | |
217 if (SAMPLES_SPECIFIED ) { | |
218 // user specified samples to select | |
219 // todo - should be case-agnostic but for simplicity and speed this is ignored. | |
220 // todo - can check at initialization intersection of requested samples and samples in BAM header to further speedup. | |
221 if (!samplesToChoose.contains(read.getReadGroup().getSample())) | |
222 return false; | |
223 } | |
224 | |
225 | |
226 // check if we've reached the output limit | |
227 if ( nReadsToPrint == 0 ) { | |
228 return false; // n == 0 means we've printed all we needed. | |
229 } | |
230 else if (nReadsToPrint > 0) { | |
231 nReadsToPrint--; // n > 0 means there are still reads to be printed. | |
232 } | |
233 | |
234 return true; | |
235 } | |
236 | |
237 /** | |
238 * The reads map function. | |
239 * | |
240 * @param ref the reference bases that correspond to our read, if a reference was provided | |
241 * @param readIn the read itself, as a GATKSAMRecord | |
242 * @return the read itself | |
243 */ | |
244 public GATKSAMRecord map( ReferenceContext ref, GATKSAMRecord readIn, RefMetaDataTracker metaDataTracker ) { | |
245 GATKSAMRecord workingRead = readIn; | |
246 | |
247 for ( final ReadTransformer transformer : readTransformers ) { | |
248 workingRead = transformer.apply(workingRead); | |
249 } | |
250 | |
251 if ( simplifyReads ) workingRead = workingRead.simplify(); | |
252 | |
253 return workingRead; | |
254 } | |
255 | |
256 /** | |
257 * reduceInit is called once before any calls to the map function. We use it here to setup the output | |
258 * bam file, if it was specified on the command line | |
259 * | |
260 * @return SAMFileWriter, set to the BAM output file if the command line option was set, null otherwise | |
261 */ | |
262 public SAMFileWriter reduceInit() { | |
263 return out; | |
264 } | |
265 | |
266 /** | |
267 * given a read and a output location, reduce by emitting the read | |
268 * | |
269 * @param read the read itself | |
270 * @param output the output source | |
271 * @return the SAMFileWriter, so that the next reduce can emit to the same source | |
272 */ | |
273 public SAMFileWriter reduce( GATKSAMRecord read, SAMFileWriter output ) { | |
274 output.addAlignment(read); | |
275 return output; | |
276 } | |
277 } |