6
|
1 #!/usr/bin/env perl
|
|
2
|
|
3 =head1 LICENSE
|
|
4
|
|
5 Strelka Workflow Software
|
|
6 Copyright (c) 2009-2013 Illumina, Inc.
|
|
7
|
|
8 This software is provided under the terms and conditions of the
|
|
9 Illumina Open Source Software License 1.
|
|
10
|
|
11 You should have received a copy of the Illumina Open Source
|
|
12 Software License 1 along with this program. If not, see
|
|
13 <https://github.com/downloads/sequencing/licenses/>.
|
|
14
|
|
15 =head1 SYNOPSIS
|
|
16
|
|
17 configureStrelkaWorkflow.pl --tumor FILE --normal FILE --ref FILE --config FILE [options]
|
|
18
|
|
19 This script configures the strelka workflow for somatic variant
|
|
20 calling on matched tumor-normal BAM files. The configuration process
|
|
21 will produce an analysis makefile and directory structure. The
|
|
22 makefile can be used to run the analysis on a workstation or compute
|
|
23 cluster via make/qmake or other makefile compatible process.
|
|
24
|
|
25 =head1 ARGUMENTS
|
|
26
|
|
27 =over 4
|
|
28
|
|
29 =item --tumor FILE
|
|
30
|
|
31 Path to tumor sample BAM file (required)
|
|
32
|
|
33 =item --normal FILE
|
|
34
|
|
35 Path to normal sample BAM file (required)
|
|
36
|
|
37 =item --ref FILE
|
|
38
|
|
39 Path to reference genome fasta (required)
|
|
40
|
|
41 =item --config FILE
|
|
42
|
|
43 Strelka configuration file. Default config files can be found in
|
|
44 ${STRELKA_INSTALL_ROOT}/etc/ for both ELAND and BWA alignments. (required)
|
|
45
|
|
46 =back
|
|
47
|
|
48 =head1 OPTIONS
|
|
49
|
|
50 =over 4
|
|
51
|
|
52 =item --output-dir DIRECTORY
|
|
53
|
|
54 Root of the analysis directory. This script will place all
|
|
55 configuration files in the analysis directory, and after configuration
|
|
56 all results and intermediate files will be written within the analysis
|
|
57 directory during a run. This directory must not already
|
|
58 exist. (default: ./strelkaAnalysis)
|
|
59
|
|
60 =back
|
|
61
|
|
62 =cut
|
|
63
|
|
64 use warnings FATAL => 'all';
|
|
65 use strict;
|
|
66
|
|
67 use Carp;
|
|
68 $SIG{__DIE__} = \&Carp::confess;
|
|
69
|
|
70 use Cwd qw(getcwd);
|
|
71 use File::Spec;
|
|
72 use Getopt::Long;
|
|
73 use Pod::Usage;
|
|
74
|
|
75 my $baseDir;
|
|
76 my $libDir;
|
|
77 BEGIN {
|
|
78 my $thisDir=(File::Spec->splitpath($0))[1];
|
|
79 $baseDir=$thisDir;#$baseDir=File::Spec->catdir($thisDir,File::Spec->updir());
|
|
80 $libDir=File::Spec->catdir($baseDir,'lib');
|
|
81 }
|
|
82 use lib $libDir;
|
|
83 use Utils;
|
|
84
|
|
85 if(getAbsPath($baseDir)) {
|
|
86 errorX("Can't resolve path for strelka_workflow install directory: '$baseDir'");
|
|
87 }
|
|
88 my $libexecDir=File::Spec->catdir($baseDir,'libexec');
|
|
89 #my $optDir=File::Spec->catdir($baseDir,'opt');
|
|
90
|
|
91
|
|
92 my $scriptName=(File::Spec->splitpath($0))[2];
|
|
93 my $argCount=scalar(@ARGV);
|
|
94 my $cmdline = join(' ',$0,@ARGV);
|
|
95
|
|
96
|
|
97 sub usage() { pod2usage(-verbose => 1,
|
|
98 -exitval => 2); }
|
|
99
|
|
100 #
|
|
101 # user configuration:
|
|
102 #
|
|
103
|
|
104 my ($tumorBam, $normalBam, $refFile, $configFile, $outDir);
|
|
105 my $help;
|
|
106
|
|
107 GetOptions( "tumor=s" => \$tumorBam,
|
|
108 "normal=s" => \$normalBam,
|
|
109 "ref=s" => \$refFile,
|
|
110 "config=s" => \$configFile,
|
|
111 "output-dir=s" => \$outDir,
|
|
112 "help|h" => \$help) || usage();
|
|
113
|
|
114 usage() if($help);
|
|
115 usage() unless($argCount);
|
|
116
|
|
117
|
|
118 #
|
|
119 # Validate input conditions:
|
|
120 #
|
|
121
|
|
122 sub checkFileArg($$) {
|
|
123 my ($file,$label) = @_;
|
|
124
|
|
125 errorX("Must specify $label file") unless(defined($file)); #raise an error if file not defined
|
|
126 checkFile($file,$label);
|
|
127 }
|
|
128
|
|
129 checkFileArg($tumorBam,"tumor BAM");
|
|
130 checkFileArg($normalBam,"normal BAM");
|
|
131 checkFileArg($refFile,"reference fasta");
|
|
132 checkFileArg($configFile,"configuration ini");
|
|
133
|
|
134 sub makeAbsoluteFilePaths(\$) {
|
|
135 my ($filePathRef) = @_;
|
|
136
|
|
137 my ($v,$fileDir,$fileName) = File::Spec->splitpath($$filePathRef);
|
|
138 if(getAbsPath($fileDir)) {
|
|
139 errorX("Can't resolve directory path for '$fileDir' from input file argument: '$$filePathRef'");
|
|
140 }
|
|
141 $$filePathRef = File::Spec->catfile($fileDir,$fileName);
|
|
142 }
|
|
143
|
|
144 makeAbsoluteFilePaths($tumorBam);
|
|
145 makeAbsoluteFilePaths($normalBam);
|
|
146 makeAbsoluteFilePaths($refFile);
|
|
147 makeAbsoluteFilePaths($configFile);
|
|
148
|
|
149 # also check for BAM index files:
|
|
150 sub checkBamIndex($) {
|
|
151 my ($file) = @_;
|
|
152 my $ifile = $file . ".bai";
|
|
153 if(! -f $ifile) {
|
|
154 errorX("Can't find index for BAM file '$file'");
|
|
155 }
|
|
156 }
|
|
157
|
|
158 checkBamIndex($tumorBam);
|
|
159 checkBamIndex($normalBam);
|
|
160
|
|
161
|
|
162 sub checkFaIndex($) {
|
|
163 my ($file) = @_;
|
|
164 my $ifile = $file . ".fai";
|
|
165 if(! -f $ifile) {
|
|
166 errorX("Can't find index for fasta file '$file'");
|
|
167 }
|
|
168 # check that fai file isn't improperly formatted (a la the GATK bundle NCBI 37 fai files)
|
|
169 open(my $FH,"< $ifile") || errorX("Can't open fai file '$ifile'");
|
|
170 my $lineno=1;
|
|
171 while(<$FH>) {
|
|
172 chomp;
|
|
173 my @F=split();
|
|
174 if(scalar(@F) != 5) {
|
|
175 errorX("Unexpected format for line number '$lineno' of fasta index file: '$ifile'\n\tRe-running fasta indexing may fix the issue. To do so, run: \"samtools faidx $file\"");
|
|
176 }
|
|
177 $lineno++;
|
|
178 }
|
|
179 close($FH);
|
|
180 }
|
|
181
|
|
182 checkFaIndex($refFile);
|
|
183
|
|
184
|
|
185 if(defined($outDir)) {
|
|
186 if(getAbsPath($outDir)) {
|
|
187 errorX("Can't resolve path for ouput directory: '$outDir'");
|
|
188 }
|
|
189 } else {
|
|
190 $outDir=File::Spec->catdir(Cwd::getcwd(),'strelkaAnalysis');
|
|
191 }
|
|
192
|
|
193 if(-e $outDir) {
|
|
194 errorX("Output path already exists: '$outDir'");
|
|
195 }
|
|
196
|
|
197 if(getAbsPath($baseDir)) {
|
|
198 errorX("Can't resolve path for strelka install directory: '$baseDir'");
|
|
199 }
|
|
200
|
|
201 #my $samtoolsDir = File::Spec->catdir($optDir,'samtools');
|
|
202 checkDir($libexecDir,"strelka libexec");
|
|
203 #checkDir($samtoolsDir,"samtools");
|
|
204
|
|
205 my $callScriptName = "callSomaticVariants.pl";
|
|
206 my $filterScriptName = "filterSomaticVariants.pl";
|
|
207 my $finishScriptName = "consolidateResults.pl";
|
|
208 my $callScript = File::Spec->catfile($libexecDir,$callScriptName);
|
|
209 my $filterScript = File::Spec->catfile($libexecDir,$filterScriptName);
|
|
210 my $finishScript = File::Spec->catfile($libexecDir,$finishScriptName);
|
|
211 my $countFasta = File::Spec->catfile($libexecDir,"countFastaBases");
|
|
212 #my $samtoolsBin = File::Spec->catfile($samtoolsDir,"samtools");
|
|
213 checkFile($callScript,"somatic variant call script");
|
|
214 checkFile($filterScript,"somatic variant filter script");
|
|
215 checkFile($finishScript,"result consolidation script");
|
|
216 checkFile($countFasta,"fasta scanner");
|
|
217 #checkFile($samtoolsBin,"samtools");
|
|
218
|
|
219 #
|
|
220 # Configure bin runs:
|
|
221 #
|
|
222 checkMakeDir($outDir);
|
|
223
|
|
224 #
|
|
225 # Configure bin runs: open and validate config ini
|
|
226 #
|
|
227 my $config = parseConfigIni($configFile);
|
|
228
|
|
229 sub checkConfigKeys($) {
|
|
230 my ($keyref) = @_;
|
|
231 for (@$keyref) {
|
|
232 errorX("Undefined configuration option: '$_'") unless(defined($config->{user}{$_}));
|
|
233 }
|
|
234 }
|
|
235
|
|
236 # these are the keys we need at configuration time:
|
|
237 my @config_keys = qw(binSize);
|
|
238
|
|
239 # these are additional keys we will need to run the workflow:
|
|
240 # (note we don't check for (maxInputDepth,minTier2Mapq) for back compatibility with older config files)
|
|
241 my @workflow_keys = qw(
|
|
242 minTier1Mapq isWriteRealignedBam ssnvPrior sindelPrior ssnvNoise sindelNoise ssnvNoiseStrandBiasFrac
|
|
243 ssnvQuality_LowerBound sindelQuality_LowerBound isSkipDepthFilters depthFilterMultiple
|
|
244 snvMaxFilteredBasecallFrac snvMaxSpanningDeletionFrac indelMaxRefRepeat
|
|
245 indelMaxWindowFilteredBasecallFrac indelMaxIntHpolLength);
|
|
246
|
|
247 checkConfigKeys(\@config_keys);
|
|
248 checkConfigKeys(\@workflow_keys);
|
|
249
|
|
250
|
|
251 my $binSize = int($config->{user}{binSize});
|
|
252
|
|
253 $config->{derived}{configurationCmdline} = $cmdline;
|
|
254 $config->{derived}{normalBam} = $normalBam;
|
|
255 $config->{derived}{tumorBam} = $tumorBam;
|
|
256 $config->{derived}{refFile} = $refFile;
|
|
257 $config->{derived}{outDir} = $outDir;
|
|
258
|
|
259 #
|
|
260 # Configure bin runs: check for consistent chrom info between BAMs and reference
|
|
261 #
|
|
262 sub getBamChromInfo($) {
|
|
263 my $file = shift;
|
|
264 my $cmd = "samtools view -H $file |";
|
|
265 open(my $FH,$cmd) || errorX("Can't open process $cmd");
|
|
266
|
|
267 my %info;
|
|
268 my $n=0;
|
|
269 while(<$FH>) {
|
|
270 next unless(/^\@SQ/);
|
|
271 chomp;
|
|
272 my @F = split(/\t/);
|
|
273 scalar(@F) >= 3 || errorX("Unexpected bam header for file '$file'");
|
|
274
|
|
275 my %h = ();
|
|
276 foreach (@F) {
|
|
277 my @vals = split(':');
|
|
278 $h{$vals[0]} = $vals[1];
|
|
279 }
|
|
280 $F[1] = $h{'SN'};
|
|
281 $F[2] = $h{'LN'};
|
|
282
|
|
283 my $size = int($F[2]);
|
|
284 ($size > 0) || errorX("Unexpected chromosome size '$size' in bam header for file '$file'");
|
|
285 $info{$F[1]}{size} = $size;
|
|
286 $info{$F[1]}{order} = $n;
|
|
287 $n++;
|
|
288 }
|
|
289 close($FH) || errorX("Can't close process $cmd");
|
|
290 return %info;
|
|
291 }
|
|
292
|
|
293
|
|
294 my %chromInfo = getBamChromInfo($normalBam);
|
|
295 my @chroms = sort { $chromInfo{$a}{order} <=> $chromInfo{$b}{order} } (keys(%chromInfo));
|
|
296
|
|
297 {
|
|
298 #consistency check:
|
|
299 my %tumorChromInfo = getBamChromInfo($tumorBam);
|
|
300 for my $chrom (@chroms) {
|
|
301 my $ln = $chromInfo{$chrom}{size};
|
|
302 my $tln = $tumorChromInfo{$chrom}{size};
|
|
303 my $order = $chromInfo{$chrom}{order};
|
|
304 my $torder = $tumorChromInfo{$chrom}{order};
|
|
305 unless(defined($tln) && ($tln==$ln) && ($torder==$order)) {
|
|
306 errorX("Tumor and normal BAM file headers disagree on chromosome: '$chrom'");
|
|
307 }
|
|
308 delete $tumorChromInfo{$chrom};
|
|
309 }
|
|
310 for my $chrom (keys(%tumorChromInfo)) {
|
|
311 errorX("Tumor and normal BAM file headers disagree on chromosome: '$chrom'");
|
|
312 }
|
|
313 }
|
|
314
|
|
315
|
|
316 my %refChromInfo;
|
|
317 logX("Scanning reference genome");
|
|
318 {
|
|
319 my $knownGenomeSize=0;
|
|
320 my $cmd="$countFasta $refFile |";
|
|
321 open(my $FFH,$cmd) || errorX("Failed to open process '$cmd'");
|
|
322
|
|
323 while(<$FFH>) {
|
|
324 chomp;
|
|
325 my @F = split(/\t/);
|
|
326 scalar(@F) == 4 || errorX("Unexpected value from '$cmd'");
|
|
327 $knownGenomeSize += int($F[2]);
|
|
328 $refChromInfo{$F[1]}{knownSize} = int($F[2]);
|
|
329 $refChromInfo{$F[1]}{size} = int($F[3]);
|
|
330 }
|
|
331 close($FFH) || errorX("Failed to close process '$cmd'");
|
|
332
|
|
333 #consistency check:
|
|
334 for my $chrom (@chroms) {
|
|
335 my $ln = $chromInfo{$chrom}{size};
|
|
336 my $rln = $refChromInfo{$chrom}{size};
|
|
337 unless(defined($rln) && ($rln==$ln)) {
|
|
338 errorX("BAM headers and reference fasta disagree on chromosome: '$chrom'");
|
|
339 }
|
|
340 $config->{derived}{"chrom_${chrom}_size"} = $rln;
|
|
341 $config->{derived}{"chrom_${chrom}_knownSize"} = $refChromInfo{$chrom}{knownSize};
|
|
342 }
|
|
343 $config->{derived}{chromOrder} = join("\t",@chroms);
|
|
344
|
|
345 $config->{derived}{knownGenomeSize} = $knownGenomeSize;
|
|
346 }
|
|
347 logX("Scanning reference genome complete");
|
|
348
|
|
349
|
|
350
|
|
351 #
|
|
352 # Configure bin runs: create directory structure
|
|
353 #
|
|
354 my $resultsDir = File::Spec->catdir($outDir,'results');
|
|
355 checkMakeDir($resultsDir);
|
|
356 if($config->{user}{isWriteRealignedBam}) {
|
|
357 my $bamDir = File::Spec->catdir($outDir,'realigned');
|
|
358 checkMakeDir($bamDir);
|
|
359 }
|
|
360 my $chromRootDir = File::Spec->catdir($outDir,'chromosomes');
|
|
361 checkMakeDir($chromRootDir);
|
|
362 for my $chrom (@chroms) {
|
|
363 my $chromDir = File::Spec->catdir($chromRootDir,$chrom);
|
|
364 checkMakeDir($chromDir);
|
|
365
|
|
366 my $chromRef = $chromInfo{$chrom};
|
|
367 $chromRef->{dir} = $chromDir;
|
|
368 $chromRef->{binList} = getBinList($chromRef->{size},$binSize);
|
|
369
|
|
370 my $binRootDir = File::Spec->catdir($chromDir,'bins');
|
|
371 checkMakeDir($binRootDir);
|
|
372
|
|
373 for my $binId ( @{$chromRef->{binList}} ) {
|
|
374 my $binDir = File::Spec->catdir($binRootDir,$binId);
|
|
375 checkMakeDir($binDir);
|
|
376 }
|
|
377 }
|
|
378
|
|
379
|
|
380
|
|
381 #
|
|
382 # write run config file:
|
|
383 #
|
|
384 my $runConfigFile;
|
|
385 {
|
|
386 my $cstr = <<END;
|
|
387 ;
|
|
388 ; Strelka workflow configuration file
|
|
389 ;
|
|
390 ; This is an automatically generated file, you probably don't want to edit it. If starting a new run,
|
|
391 ; input configuration templates (with comments) can be found in the Strelka etc/ directory.
|
|
392 ;
|
|
393 END
|
|
394
|
|
395 $cstr .= writeConfigIni($config);
|
|
396
|
|
397 my $configDir = File::Spec->catdir($outDir,'config');
|
|
398 checkMakeDir($configDir);
|
|
399 $runConfigFile = File::Spec->catdir($configDir,'run.config.ini');
|
|
400 open(my $FH,"> $runConfigFile") || errorX("Can't open file '$runConfigFile'");
|
|
401 print $FH $cstr;
|
|
402 close($FH);
|
|
403 }
|
|
404
|
|
405
|
|
406
|
|
407 #
|
|
408 # create makefile
|
|
409 #
|
|
410 my $makeFile = File::Spec->catfile($outDir,"Makefile");
|
|
411 open(my $MAKEFH, "> $makeFile") || errorX("Can't open file: '$makeFile'");
|
|
412
|
|
413 my $completeFile = "task.complete";
|
|
414
|
|
415 print $MAKEFH <<ENDE;
|
|
416 # This makefile was automatically generated by $scriptName
|
|
417 #
|
|
418 # Please do not edit.
|
|
419
|
|
420 script_dir := $libexecDir
|
|
421 call_script := \$(script_dir)/$callScriptName
|
|
422 filter_script := \$(script_dir)/$filterScriptName
|
|
423 finish_script := \$(script_dir)/$finishScriptName
|
|
424
|
|
425 config_file := $runConfigFile
|
|
426
|
|
427 analysis_dir := $outDir
|
|
428 results_dir := \$(analysis_dir)/results
|
|
429
|
|
430 ENDE
|
|
431
|
|
432 print $MAKEFH <<'ENDE';
|
|
433
|
|
434 complete_tag := task.complete
|
|
435
|
|
436 finish_task := $(analysis_dir)/$(complete_tag)
|
|
437
|
|
438 get_chrom_dir = $(analysis_dir)/chromosomes/$1
|
|
439 get_chrom_task = $(call get_chrom_dir,$1)/$(complete_tag)
|
|
440 get_bin_task = $(call get_chrom_dir,$1)/bins/$2/$(complete_tag)
|
|
441
|
|
442
|
|
443
|
|
444 all: $(finish_task)
|
|
445 @$(print_success)
|
|
446
|
|
447
|
|
448 define print_success
|
|
449 echo;\
|
|
450 echo Analysis complete. Final somatic calls can be found in $(results_dir);\
|
|
451 echo
|
|
452 endef
|
|
453
|
|
454
|
|
455 # top level results target:
|
|
456 #
|
|
457 $(finish_task):
|
|
458 $(finish_script) --config=$(config_file) && touch $@
|
|
459
|
|
460
|
|
461 # chromosome targets:
|
|
462 #
|
|
463 ENDE
|
|
464
|
|
465 for my $chrom (@chroms) {
|
|
466
|
|
467 print $MAKEFH <<ENDE;
|
|
468 chrom_${chrom}_task := \$(call get_chrom_task,$chrom)
|
|
469 \$(finish_task): \$(chrom_${chrom}_task)
|
|
470 \$(chrom_${chrom}_task):
|
|
471 \$(filter_script) --config=\$(config_file) --chrom=$chrom && touch \$@
|
|
472
|
|
473 ENDE
|
|
474
|
|
475 }
|
|
476
|
|
477 print $MAKEFH <<ENDE;
|
|
478
|
|
479 # chromosome bin targets:
|
|
480 #
|
|
481 ENDE
|
|
482
|
|
483 for my $chrom (@chroms) {
|
|
484 for my $bin (@{$chromInfo{$chrom}{binList}}) {
|
|
485
|
|
486 print $MAKEFH <<ENDE;
|
|
487 chrom_${chrom}_bin_${bin}_task := \$(call get_bin_task,$chrom,$bin)
|
|
488 \$(chrom_${chrom}_task): \$(chrom_${chrom}_bin_${bin}_task)
|
|
489 \$(chrom_${chrom}_bin_${bin}_task):
|
|
490 \$(call_script) --config=\$(config_file) --chrom=$chrom --bin=$bin && touch \$@
|
|
491
|
|
492 ENDE
|
|
493
|
|
494 }
|
|
495 }
|
|
496
|
|
497
|
|
498 # If the eval function is available, this is the way we could finish
|
|
499 # the makefile without being so verbose but it doesn't look like qmake
|
|
500 # understands this function.
|
|
501
|
|
502 =cut
|
|
503
|
|
504 print $MAKEFH <<ENDE;
|
|
505
|
|
506 chroms := @chroms
|
|
507
|
|
508 ENDE
|
|
509
|
|
510 for my $chrom (@chroms) {
|
|
511 print $MAKEFH "${chrom}_bins := " . join(" ",@{$chromInfo{$chrom}{binList}}) . "\n";
|
|
512 }
|
|
513
|
|
514 print $MAKEFH <<'ENDE';
|
|
515
|
|
516 define chrom_task_template
|
|
517 chrom_$1_task := $(call get_chrom_task,$1)
|
|
518 $(finish_task): $$(chrom_$1_task)
|
|
519 $$(chrom_$1_task):
|
|
520 $$(filter_script) --config=$$(config_file) --chrom=$1 && touch $$@
|
|
521 endef
|
|
522
|
|
523 $(foreach c,$(chroms),$(eval $(call chrom_task_template,$c)))
|
|
524
|
|
525
|
|
526 # chromosome bin targets:
|
|
527 #
|
|
528 define chrom_bin_task_template
|
|
529 chrom_$1_bin_$2_task := $(call get_bin_task,$1,$2)
|
|
530 $$(chrom_$1_task): $$(chrom_$1_bin_$2_task)
|
|
531 $$(chrom_$1_bin_$2_task):
|
|
532 $$(call_script) --config=$$(config_file) --chrom=$1 --bin=$2 && touch $$@
|
|
533 endef
|
|
534
|
|
535 $(foreach c,$(chroms), \
|
|
536 $(foreach b,$($c_bins),$(eval $(call chrom_bin_task_template,$c,$b))) \
|
|
537 )
|
|
538
|
|
539 ENDE
|
|
540
|
|
541 =cut
|
|
542
|
|
543
|
|
544
|
|
545 print <<END;
|
|
546
|
|
547
|
|
548 Successfully configured analysis and created makefile '$makeFile'.
|
|
549
|
|
550 To run the analysis locally using make, run:
|
|
551
|
|
552 make -C $outDir
|
|
553
|
|
554 ...or:
|
|
555
|
|
556 cd $outDir
|
|
557 make
|
|
558
|
|
559 END
|
|
560
|
|
561 1;
|
|
562
|
|
563 __END__
|
|
564
|