0
|
1 /*
|
|
2 * Copyright (c) 2012 The Broad Institute
|
|
3 *
|
|
4 * Permission is hereby granted, free of charge, to any person
|
|
5 * obtaining a copy of this software and associated documentation
|
|
6 * files (the "Software"), to deal in the Software without
|
|
7 * restriction, including without limitation the rights to use,
|
|
8 * copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9 * copies of the Software, and to permit persons to whom the
|
|
10 * Software is furnished to do so, subject to the following
|
|
11 * conditions:
|
|
12 *
|
|
13 * The above copyright notice and this permission notice shall be
|
|
14 * included in all copies or substantial portions of the Software.
|
|
15 *
|
|
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
17 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
|
18 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
19 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
|
20 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
|
21 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
22 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
|
23 * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
24 */
|
|
25
|
|
26 package org.broadinstitute.sting.gatk.walkers.readutils;
|
|
27
|
|
28 import net.sf.samtools.SAMFileWriter;
|
|
29 import net.sf.samtools.SAMReadGroupRecord;
|
|
30 import org.broadinstitute.sting.commandline.Argument;
|
|
31 import org.broadinstitute.sting.commandline.Hidden;
|
|
32 import org.broadinstitute.sting.commandline.Output;
|
|
33 import org.broadinstitute.sting.gatk.CommandLineGATK;
|
|
34 import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
|
35 import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
|
36 import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
|
|
37 import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
|
|
38 import org.broadinstitute.sting.gatk.iterators.ReadTransformersMode;
|
|
39 import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
|
40 import org.broadinstitute.sting.gatk.walkers.*;
|
|
41 import org.broadinstitute.sting.utils.SampleUtils;
|
|
42 import org.broadinstitute.sting.utils.Utils;
|
|
43 import org.broadinstitute.sting.utils.baq.BAQ;
|
|
44 import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
|
|
45 import org.broadinstitute.sting.utils.help.HelpConstants;
|
|
46 import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
|
47
|
|
48 import java.io.File;
|
|
49 import java.util.*;
|
|
50
|
|
51 /**
|
|
52 * Renders, in SAM/BAM format, all reads from the input data set in the order in which they appear in the input file.
|
|
53 *
|
|
54 * <p>
|
|
55 * PrintReads can dynamically merge the contents of multiple input BAM files, resulting
|
|
56 * in merged output sorted in coordinate order. Can also optionally filter reads based on the
|
|
57 * --read_filter command line argument.
|
|
58 * </p>
|
|
59 *
|
|
60 * <p>
|
|
61 * Note that when PrintReads is used as part of the Base Quality Score Recalibration workflow,
|
|
62 * it takes the --BQSR engine argument, which is listed under Inherited Arguments > CommandLineGATK below.
|
|
63 * </p>
|
|
64 *
|
|
65 * <h3>Input</h3>
|
|
66 * <p>
|
|
67 * One or more bam files.
|
|
68 * </p>
|
|
69 *
|
|
70 * <h3>Output</h3>
|
|
71 * <p>
|
|
72 * A single processed bam file.
|
|
73 * </p>
|
|
74 *
|
|
75 * <h3>Examples</h3>
|
|
76 * <pre>
|
|
77 * java -Xmx2g -jar GenomeAnalysisTK.jar \
|
|
78 * -R ref.fasta \
|
|
79 * -T PrintReads \
|
|
80 * -o output.bam \
|
|
81 * -I input1.bam \
|
|
82 * -I input2.bam \
|
|
83 * --read_filter MappingQualityZero
|
|
84 *
|
|
85 * // Prints the first 2000 reads in the BAM file
|
|
86 * java -Xmx2g -jar GenomeAnalysisTK.jar \
|
|
87 * -R ref.fasta \
|
|
88 * -T PrintReads \
|
|
89 * -o output.bam \
|
|
90 * -I input.bam \
|
|
91 * -n 2000
|
|
92 *
|
|
93 * // Downsamples BAM file to 25%
|
|
94 * java -Xmx2g -jar GenomeAnalysisTK.jar \
|
|
95 * -R ref.fasta \
|
|
96 * -T PrintReads \
|
|
97 * -o output.bam \
|
|
98 * -I input.bam \
|
|
99 * -dfrac 0.25
|
|
100 * </pre>
|
|
101 *
|
|
102 */
|
|
103 @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_DATA, extraDocs = {CommandLineGATK.class} )
|
|
104 @ReadTransformersMode(ApplicationTime = ReadTransformer.ApplicationTime.HANDLED_IN_WALKER)
|
|
105 @BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = ReadTransformer.ApplicationTime.HANDLED_IN_WALKER)
|
|
106 @Requires({DataSource.READS, DataSource.REFERENCE})
|
|
107 public class PrintReads extends ReadWalker<GATKSAMRecord, SAMFileWriter> implements NanoSchedulable {
|
|
108
|
|
109 @Output(doc="Write output to this BAM filename instead of STDOUT")
|
|
110 StingSAMFileWriter out;
|
|
111
|
|
112 @Argument(fullName = "readGroup", shortName = "readGroup", doc="Exclude all reads with this read group from the output", required = false)
|
|
113 String readGroup = null;
|
|
114
|
|
115 /**
|
|
116 * For example, --platform ILLUMINA or --platform 454.
|
|
117 */
|
|
118 @Argument(fullName = "platform", shortName = "platform", doc="Exclude all reads with this platform from the output", required = false)
|
|
119 String platform = null;
|
|
120
|
|
121 /**
|
|
122 * Only prints the first n reads of the file
|
|
123 */
|
|
124 @Argument(fullName = "number", shortName = "n", doc="Print the first n reads from the file, discarding the rest", required = false)
|
|
125 int nReadsToPrint = -1;
|
|
126
|
|
127 /**
|
|
128 * Only reads from samples listed in the provided file(s) will be included in the output.
|
|
129 */
|
|
130 @Argument(fullName="sample_file", shortName="sf", doc="File containing a list of samples (one per line). Can be specified multiple times", required=false)
|
|
131 public Set<File> sampleFile = new TreeSet<File>();
|
|
132
|
|
133 /**
|
|
134 * Only reads from the sample(s) will be included in the output.
|
|
135 */
|
|
136 @Argument(fullName="sample_name", shortName="sn", doc="Sample name to be included in the analysis. Can be specified multiple times.", required=false)
|
|
137 public Set<String> sampleNames = new TreeSet<String>();
|
|
138
|
|
139 /**
|
|
140 * Erase all extra attributes in the read but keep the read group information
|
|
141 */
|
|
142 @Argument(fullName="simplify", shortName="s", doc="Simplify all reads.", required=false)
|
|
143 public boolean simplifyReads = false;
|
|
144
|
|
145 @Hidden
|
|
146 @Argument(fullName = "no_pg_tag", shortName = "npt", doc ="", required = false)
|
|
147 public boolean NO_PG_TAG = false;
|
|
148
|
|
149 List<ReadTransformer> readTransformers = Collections.emptyList();
|
|
150 private TreeSet<String> samplesToChoose = new TreeSet<String>();
|
|
151 private boolean SAMPLES_SPECIFIED = false;
|
|
152
|
|
153 public static final String PROGRAM_RECORD_NAME = "GATK PrintReads"; // The name that will go in the @PG tag
|
|
154
|
|
155 Random random;
|
|
156
|
|
157
|
|
158 /**
|
|
159 * The initialize function.
|
|
160 */
|
|
161 public void initialize() {
|
|
162 final GenomeAnalysisEngine toolkit = getToolkit();
|
|
163
|
|
164 if ( platform != null )
|
|
165 platform = platform.toUpperCase();
|
|
166
|
|
167 if ( getToolkit() != null )
|
|
168 readTransformers = getToolkit().getReadTransformers();
|
|
169
|
|
170 Collection<String> samplesFromFile;
|
|
171 if (!sampleFile.isEmpty()) {
|
|
172 samplesFromFile = SampleUtils.getSamplesFromFiles(sampleFile);
|
|
173 samplesToChoose.addAll(samplesFromFile);
|
|
174 }
|
|
175
|
|
176 if (!sampleNames.isEmpty())
|
|
177 samplesToChoose.addAll(sampleNames);
|
|
178
|
|
179 if(!samplesToChoose.isEmpty()) {
|
|
180 SAMPLES_SPECIFIED = true;
|
|
181 }
|
|
182
|
|
183 random = GenomeAnalysisEngine.getRandomGenerator();
|
|
184
|
|
185 final boolean preSorted = true;
|
|
186 if (getToolkit() != null && getToolkit().getArguments().BQSR_RECAL_FILE != null && !NO_PG_TAG ) {
|
|
187 Utils.setupWriter(out, toolkit, toolkit.getSAMFileHeader(), preSorted, this, PROGRAM_RECORD_NAME);
|
|
188 }
|
|
189
|
|
190 }
|
|
191
|
|
192 /**
|
|
193 * The reads filter function.
|
|
194 *
|
|
195 * @param ref the reference bases that correspond to our read, if a reference was provided
|
|
196 * @param read the read itself, as a GATKSAMRecord
|
|
197 * @return true if the read passes the filter, false if it doesn't
|
|
198 */
|
|
199 public boolean filter(ReferenceContext ref, GATKSAMRecord read) {
|
|
200 // check the read group
|
|
201 if ( readGroup != null ) {
|
|
202 SAMReadGroupRecord myReadGroup = read.getReadGroup();
|
|
203 if ( myReadGroup == null || !readGroup.equals(myReadGroup.getReadGroupId()) )
|
|
204 return false;
|
|
205 }
|
|
206
|
|
207 // check the platform
|
|
208 if ( platform != null ) {
|
|
209 SAMReadGroupRecord readGroup = read.getReadGroup();
|
|
210 if ( readGroup == null )
|
|
211 return false;
|
|
212
|
|
213 Object readPlatformAttr = readGroup.getAttribute("PL");
|
|
214 if ( readPlatformAttr == null || !readPlatformAttr.toString().toUpperCase().contains(platform))
|
|
215 return false;
|
|
216 }
|
|
217 if (SAMPLES_SPECIFIED ) {
|
|
218 // user specified samples to select
|
|
219 // todo - should be case-agnostic but for simplicity and speed this is ignored.
|
|
220 // todo - can check at initialization intersection of requested samples and samples in BAM header to further speedup.
|
|
221 if (!samplesToChoose.contains(read.getReadGroup().getSample()))
|
|
222 return false;
|
|
223 }
|
|
224
|
|
225
|
|
226 // check if we've reached the output limit
|
|
227 if ( nReadsToPrint == 0 ) {
|
|
228 return false; // n == 0 means we've printed all we needed.
|
|
229 }
|
|
230 else if (nReadsToPrint > 0) {
|
|
231 nReadsToPrint--; // n > 0 means there are still reads to be printed.
|
|
232 }
|
|
233
|
|
234 return true;
|
|
235 }
|
|
236
|
|
237 /**
|
|
238 * The reads map function.
|
|
239 *
|
|
240 * @param ref the reference bases that correspond to our read, if a reference was provided
|
|
241 * @param readIn the read itself, as a GATKSAMRecord
|
|
242 * @return the read itself
|
|
243 */
|
|
244 public GATKSAMRecord map( ReferenceContext ref, GATKSAMRecord readIn, RefMetaDataTracker metaDataTracker ) {
|
|
245 GATKSAMRecord workingRead = readIn;
|
|
246
|
|
247 for ( final ReadTransformer transformer : readTransformers ) {
|
|
248 workingRead = transformer.apply(workingRead);
|
|
249 }
|
|
250
|
|
251 if ( simplifyReads ) workingRead = workingRead.simplify();
|
|
252
|
|
253 return workingRead;
|
|
254 }
|
|
255
|
|
256 /**
|
|
257 * reduceInit is called once before any calls to the map function. We use it here to setup the output
|
|
258 * bam file, if it was specified on the command line
|
|
259 *
|
|
260 * @return SAMFileWriter, set to the BAM output file if the command line option was set, null otherwise
|
|
261 */
|
|
262 public SAMFileWriter reduceInit() {
|
|
263 return out;
|
|
264 }
|
|
265
|
|
266 /**
|
|
267 * given a read and a output location, reduce by emitting the read
|
|
268 *
|
|
269 * @param read the read itself
|
|
270 * @param output the output source
|
|
271 * @return the SAMFileWriter, so that the next reduce can emit to the same source
|
|
272 */
|
|
273 public SAMFileWriter reduce( GATKSAMRecord read, SAMFileWriter output ) {
|
|
274 output.addAlignment(read);
|
|
275 return output;
|
|
276 }
|
|
277 }
|