comparison configureStrelkaWorkflow.pl @ 6:87568e5a7d4f

Testing strelka version 0.0.1
author mini
date Fri, 26 Sep 2014 13:24:13 +0200
parents
children
comparison
equal deleted inserted replaced
5:07cbbd662111 6:87568e5a7d4f
1 #!/usr/bin/env perl
2
3 =head1 LICENSE
4
5 Strelka Workflow Software
6 Copyright (c) 2009-2013 Illumina, Inc.
7
8 This software is provided under the terms and conditions of the
9 Illumina Open Source Software License 1.
10
11 You should have received a copy of the Illumina Open Source
12 Software License 1 along with this program. If not, see
13 <https://github.com/downloads/sequencing/licenses/>.
14
15 =head1 SYNOPSIS
16
17 configureStrelkaWorkflow.pl --tumor FILE --normal FILE --ref FILE --config FILE [options]
18
19 This script configures the strelka workflow for somatic variant
20 calling on matched tumor-normal BAM files. The configuration process
21 will produce an analysis makefile and directory structure. The
22 makefile can be used to run the analysis on a workstation or compute
23 cluster via make/qmake or other makefile compatible process.
24
25 =head1 ARGUMENTS
26
27 =over 4
28
29 =item --tumor FILE
30
31 Path to tumor sample BAM file (required)
32
33 =item --normal FILE
34
35 Path to normal sample BAM file (required)
36
37 =item --ref FILE
38
39 Path to reference genome fasta (required)
40
41 =item --config FILE
42
43 Strelka configuration file. Default config files can be found in
44 ${STRELKA_INSTALL_ROOT}/etc/ for both ELAND and BWA alignments. (required)
45
46 =back
47
48 =head1 OPTIONS
49
50 =over 4
51
52 =item --output-dir DIRECTORY
53
54 Root of the analysis directory. This script will place all
55 configuration files in the analysis directory, and after configuration
56 all results and intermediate files will be written within the analysis
57 directory during a run. This directory must not already
58 exist. (default: ./strelkaAnalysis)
59
60 =back
61
62 =cut
63
64 use warnings FATAL => 'all';
65 use strict;
66
67 use Carp;
68 $SIG{__DIE__} = \&Carp::confess;
69
70 use Cwd qw(getcwd);
71 use File::Spec;
72 use Getopt::Long;
73 use Pod::Usage;
74
75 my $baseDir;
76 my $libDir;
77 BEGIN {
78 my $thisDir=(File::Spec->splitpath($0))[1];
79 $baseDir=$thisDir;#$baseDir=File::Spec->catdir($thisDir,File::Spec->updir());
80 $libDir=File::Spec->catdir($baseDir,'lib');
81 }
82 use lib $libDir;
83 use Utils;
84
85 if(getAbsPath($baseDir)) {
86 errorX("Can't resolve path for strelka_workflow install directory: '$baseDir'");
87 }
88 my $libexecDir=File::Spec->catdir($baseDir,'libexec');
89 #my $optDir=File::Spec->catdir($baseDir,'opt');
90
91
92 my $scriptName=(File::Spec->splitpath($0))[2];
93 my $argCount=scalar(@ARGV);
94 my $cmdline = join(' ',$0,@ARGV);
95
96
97 sub usage() { pod2usage(-verbose => 1,
98 -exitval => 2); }
99
100 #
101 # user configuration:
102 #
103
104 my ($tumorBam, $normalBam, $refFile, $configFile, $outDir);
105 my $help;
106
107 GetOptions( "tumor=s" => \$tumorBam,
108 "normal=s" => \$normalBam,
109 "ref=s" => \$refFile,
110 "config=s" => \$configFile,
111 "output-dir=s" => \$outDir,
112 "help|h" => \$help) || usage();
113
114 usage() if($help);
115 usage() unless($argCount);
116
117
118 #
119 # Validate input conditions:
120 #
121
122 sub checkFileArg($$) {
123 my ($file,$label) = @_;
124
125 errorX("Must specify $label file") unless(defined($file)); #raise an error if file not defined
126 checkFile($file,$label);
127 }
128
129 checkFileArg($tumorBam,"tumor BAM");
130 checkFileArg($normalBam,"normal BAM");
131 checkFileArg($refFile,"reference fasta");
132 checkFileArg($configFile,"configuration ini");
133
134 sub makeAbsoluteFilePaths(\$) {
135 my ($filePathRef) = @_;
136
137 my ($v,$fileDir,$fileName) = File::Spec->splitpath($$filePathRef);
138 if(getAbsPath($fileDir)) {
139 errorX("Can't resolve directory path for '$fileDir' from input file argument: '$$filePathRef'");
140 }
141 $$filePathRef = File::Spec->catfile($fileDir,$fileName);
142 }
143
144 makeAbsoluteFilePaths($tumorBam);
145 makeAbsoluteFilePaths($normalBam);
146 makeAbsoluteFilePaths($refFile);
147 makeAbsoluteFilePaths($configFile);
148
149 # also check for BAM index files:
150 sub checkBamIndex($) {
151 my ($file) = @_;
152 my $ifile = $file . ".bai";
153 if(! -f $ifile) {
154 errorX("Can't find index for BAM file '$file'");
155 }
156 }
157
158 checkBamIndex($tumorBam);
159 checkBamIndex($normalBam);
160
161
162 sub checkFaIndex($) {
163 my ($file) = @_;
164 my $ifile = $file . ".fai";
165 if(! -f $ifile) {
166 errorX("Can't find index for fasta file '$file'");
167 }
168 # check that fai file isn't improperly formatted (a la the GATK bundle NCBI 37 fai files)
169 open(my $FH,"< $ifile") || errorX("Can't open fai file '$ifile'");
170 my $lineno=1;
171 while(<$FH>) {
172 chomp;
173 my @F=split();
174 if(scalar(@F) != 5) {
175 errorX("Unexpected format for line number '$lineno' of fasta index file: '$ifile'\n\tRe-running fasta indexing may fix the issue. To do so, run: \"samtools faidx $file\"");
176 }
177 $lineno++;
178 }
179 close($FH);
180 }
181
182 checkFaIndex($refFile);
183
184
185 if(defined($outDir)) {
186 if(getAbsPath($outDir)) {
187 errorX("Can't resolve path for ouput directory: '$outDir'");
188 }
189 } else {
190 $outDir=File::Spec->catdir(Cwd::getcwd(),'strelkaAnalysis');
191 }
192
193 if(-e $outDir) {
194 errorX("Output path already exists: '$outDir'");
195 }
196
197 if(getAbsPath($baseDir)) {
198 errorX("Can't resolve path for strelka install directory: '$baseDir'");
199 }
200
201 #my $samtoolsDir = File::Spec->catdir($optDir,'samtools');
202 checkDir($libexecDir,"strelka libexec");
203 #checkDir($samtoolsDir,"samtools");
204
205 my $callScriptName = "callSomaticVariants.pl";
206 my $filterScriptName = "filterSomaticVariants.pl";
207 my $finishScriptName = "consolidateResults.pl";
208 my $callScript = File::Spec->catfile($libexecDir,$callScriptName);
209 my $filterScript = File::Spec->catfile($libexecDir,$filterScriptName);
210 my $finishScript = File::Spec->catfile($libexecDir,$finishScriptName);
211 my $countFasta = File::Spec->catfile($libexecDir,"countFastaBases");
212 #my $samtoolsBin = File::Spec->catfile($samtoolsDir,"samtools");
213 checkFile($callScript,"somatic variant call script");
214 checkFile($filterScript,"somatic variant filter script");
215 checkFile($finishScript,"result consolidation script");
216 checkFile($countFasta,"fasta scanner");
217 #checkFile($samtoolsBin,"samtools");
218
219 #
220 # Configure bin runs:
221 #
222 checkMakeDir($outDir);
223
224 #
225 # Configure bin runs: open and validate config ini
226 #
227 my $config = parseConfigIni($configFile);
228
229 sub checkConfigKeys($) {
230 my ($keyref) = @_;
231 for (@$keyref) {
232 errorX("Undefined configuration option: '$_'") unless(defined($config->{user}{$_}));
233 }
234 }
235
236 # these are the keys we need at configuration time:
237 my @config_keys = qw(binSize);
238
239 # these are additional keys we will need to run the workflow:
240 # (note we don't check for (maxInputDepth,minTier2Mapq) for back compatibility with older config files)
241 my @workflow_keys = qw(
242 minTier1Mapq isWriteRealignedBam ssnvPrior sindelPrior ssnvNoise sindelNoise ssnvNoiseStrandBiasFrac
243 ssnvQuality_LowerBound sindelQuality_LowerBound isSkipDepthFilters depthFilterMultiple
244 snvMaxFilteredBasecallFrac snvMaxSpanningDeletionFrac indelMaxRefRepeat
245 indelMaxWindowFilteredBasecallFrac indelMaxIntHpolLength);
246
247 checkConfigKeys(\@config_keys);
248 checkConfigKeys(\@workflow_keys);
249
250
251 my $binSize = int($config->{user}{binSize});
252
253 $config->{derived}{configurationCmdline} = $cmdline;
254 $config->{derived}{normalBam} = $normalBam;
255 $config->{derived}{tumorBam} = $tumorBam;
256 $config->{derived}{refFile} = $refFile;
257 $config->{derived}{outDir} = $outDir;
258
259 #
260 # Configure bin runs: check for consistent chrom info between BAMs and reference
261 #
262 sub getBamChromInfo($) {
263 my $file = shift;
264 my $cmd = "samtools view -H $file |";
265 open(my $FH,$cmd) || errorX("Can't open process $cmd");
266
267 my %info;
268 my $n=0;
269 while(<$FH>) {
270 next unless(/^\@SQ/);
271 chomp;
272 my @F = split(/\t/);
273 scalar(@F) >= 3 || errorX("Unexpected bam header for file '$file'");
274
275 my %h = ();
276 foreach (@F) {
277 my @vals = split(':');
278 $h{$vals[0]} = $vals[1];
279 }
280 $F[1] = $h{'SN'};
281 $F[2] = $h{'LN'};
282
283 my $size = int($F[2]);
284 ($size > 0) || errorX("Unexpected chromosome size '$size' in bam header for file '$file'");
285 $info{$F[1]}{size} = $size;
286 $info{$F[1]}{order} = $n;
287 $n++;
288 }
289 close($FH) || errorX("Can't close process $cmd");
290 return %info;
291 }
292
293
294 my %chromInfo = getBamChromInfo($normalBam);
295 my @chroms = sort { $chromInfo{$a}{order} <=> $chromInfo{$b}{order} } (keys(%chromInfo));
296
297 {
298 #consistency check:
299 my %tumorChromInfo = getBamChromInfo($tumorBam);
300 for my $chrom (@chroms) {
301 my $ln = $chromInfo{$chrom}{size};
302 my $tln = $tumorChromInfo{$chrom}{size};
303 my $order = $chromInfo{$chrom}{order};
304 my $torder = $tumorChromInfo{$chrom}{order};
305 unless(defined($tln) && ($tln==$ln) && ($torder==$order)) {
306 errorX("Tumor and normal BAM file headers disagree on chromosome: '$chrom'");
307 }
308 delete $tumorChromInfo{$chrom};
309 }
310 for my $chrom (keys(%tumorChromInfo)) {
311 errorX("Tumor and normal BAM file headers disagree on chromosome: '$chrom'");
312 }
313 }
314
315
316 my %refChromInfo;
317 logX("Scanning reference genome");
318 {
319 my $knownGenomeSize=0;
320 my $cmd="$countFasta $refFile |";
321 open(my $FFH,$cmd) || errorX("Failed to open process '$cmd'");
322
323 while(<$FFH>) {
324 chomp;
325 my @F = split(/\t/);
326 scalar(@F) == 4 || errorX("Unexpected value from '$cmd'");
327 $knownGenomeSize += int($F[2]);
328 $refChromInfo{$F[1]}{knownSize} = int($F[2]);
329 $refChromInfo{$F[1]}{size} = int($F[3]);
330 }
331 close($FFH) || errorX("Failed to close process '$cmd'");
332
333 #consistency check:
334 for my $chrom (@chroms) {
335 my $ln = $chromInfo{$chrom}{size};
336 my $rln = $refChromInfo{$chrom}{size};
337 unless(defined($rln) && ($rln==$ln)) {
338 errorX("BAM headers and reference fasta disagree on chromosome: '$chrom'");
339 }
340 $config->{derived}{"chrom_${chrom}_size"} = $rln;
341 $config->{derived}{"chrom_${chrom}_knownSize"} = $refChromInfo{$chrom}{knownSize};
342 }
343 $config->{derived}{chromOrder} = join("\t",@chroms);
344
345 $config->{derived}{knownGenomeSize} = $knownGenomeSize;
346 }
347 logX("Scanning reference genome complete");
348
349
350
351 #
352 # Configure bin runs: create directory structure
353 #
354 my $resultsDir = File::Spec->catdir($outDir,'results');
355 checkMakeDir($resultsDir);
356 if($config->{user}{isWriteRealignedBam}) {
357 my $bamDir = File::Spec->catdir($outDir,'realigned');
358 checkMakeDir($bamDir);
359 }
360 my $chromRootDir = File::Spec->catdir($outDir,'chromosomes');
361 checkMakeDir($chromRootDir);
362 for my $chrom (@chroms) {
363 my $chromDir = File::Spec->catdir($chromRootDir,$chrom);
364 checkMakeDir($chromDir);
365
366 my $chromRef = $chromInfo{$chrom};
367 $chromRef->{dir} = $chromDir;
368 $chromRef->{binList} = getBinList($chromRef->{size},$binSize);
369
370 my $binRootDir = File::Spec->catdir($chromDir,'bins');
371 checkMakeDir($binRootDir);
372
373 for my $binId ( @{$chromRef->{binList}} ) {
374 my $binDir = File::Spec->catdir($binRootDir,$binId);
375 checkMakeDir($binDir);
376 }
377 }
378
379
380
381 #
382 # write run config file:
383 #
384 my $runConfigFile;
385 {
386 my $cstr = <<END;
387 ;
388 ; Strelka workflow configuration file
389 ;
390 ; This is an automatically generated file, you probably don't want to edit it. If starting a new run,
391 ; input configuration templates (with comments) can be found in the Strelka etc/ directory.
392 ;
393 END
394
395 $cstr .= writeConfigIni($config);
396
397 my $configDir = File::Spec->catdir($outDir,'config');
398 checkMakeDir($configDir);
399 $runConfigFile = File::Spec->catdir($configDir,'run.config.ini');
400 open(my $FH,"> $runConfigFile") || errorX("Can't open file '$runConfigFile'");
401 print $FH $cstr;
402 close($FH);
403 }
404
405
406
407 #
408 # create makefile
409 #
410 my $makeFile = File::Spec->catfile($outDir,"Makefile");
411 open(my $MAKEFH, "> $makeFile") || errorX("Can't open file: '$makeFile'");
412
413 my $completeFile = "task.complete";
414
415 print $MAKEFH <<ENDE;
416 # This makefile was automatically generated by $scriptName
417 #
418 # Please do not edit.
419
420 script_dir := $libexecDir
421 call_script := \$(script_dir)/$callScriptName
422 filter_script := \$(script_dir)/$filterScriptName
423 finish_script := \$(script_dir)/$finishScriptName
424
425 config_file := $runConfigFile
426
427 analysis_dir := $outDir
428 results_dir := \$(analysis_dir)/results
429
430 ENDE
431
432 print $MAKEFH <<'ENDE';
433
434 complete_tag := task.complete
435
436 finish_task := $(analysis_dir)/$(complete_tag)
437
438 get_chrom_dir = $(analysis_dir)/chromosomes/$1
439 get_chrom_task = $(call get_chrom_dir,$1)/$(complete_tag)
440 get_bin_task = $(call get_chrom_dir,$1)/bins/$2/$(complete_tag)
441
442
443
444 all: $(finish_task)
445 @$(print_success)
446
447
448 define print_success
449 echo;\
450 echo Analysis complete. Final somatic calls can be found in $(results_dir);\
451 echo
452 endef
453
454
455 # top level results target:
456 #
457 $(finish_task):
458 $(finish_script) --config=$(config_file) && touch $@
459
460
461 # chromosome targets:
462 #
463 ENDE
464
465 for my $chrom (@chroms) {
466
467 print $MAKEFH <<ENDE;
468 chrom_${chrom}_task := \$(call get_chrom_task,$chrom)
469 \$(finish_task): \$(chrom_${chrom}_task)
470 \$(chrom_${chrom}_task):
471 \$(filter_script) --config=\$(config_file) --chrom=$chrom && touch \$@
472
473 ENDE
474
475 }
476
477 print $MAKEFH <<ENDE;
478
479 # chromosome bin targets:
480 #
481 ENDE
482
483 for my $chrom (@chroms) {
484 for my $bin (@{$chromInfo{$chrom}{binList}}) {
485
486 print $MAKEFH <<ENDE;
487 chrom_${chrom}_bin_${bin}_task := \$(call get_bin_task,$chrom,$bin)
488 \$(chrom_${chrom}_task): \$(chrom_${chrom}_bin_${bin}_task)
489 \$(chrom_${chrom}_bin_${bin}_task):
490 \$(call_script) --config=\$(config_file) --chrom=$chrom --bin=$bin && touch \$@
491
492 ENDE
493
494 }
495 }
496
497
498 # If the eval function is available, this is the way we could finish
499 # the makefile without being so verbose but it doesn't look like qmake
500 # understands this function.
501
502 =cut
503
504 print $MAKEFH <<ENDE;
505
506 chroms := @chroms
507
508 ENDE
509
510 for my $chrom (@chroms) {
511 print $MAKEFH "${chrom}_bins := " . join(" ",@{$chromInfo{$chrom}{binList}}) . "\n";
512 }
513
514 print $MAKEFH <<'ENDE';
515
516 define chrom_task_template
517 chrom_$1_task := $(call get_chrom_task,$1)
518 $(finish_task): $$(chrom_$1_task)
519 $$(chrom_$1_task):
520 $$(filter_script) --config=$$(config_file) --chrom=$1 && touch $$@
521 endef
522
523 $(foreach c,$(chroms),$(eval $(call chrom_task_template,$c)))
524
525
526 # chromosome bin targets:
527 #
528 define chrom_bin_task_template
529 chrom_$1_bin_$2_task := $(call get_bin_task,$1,$2)
530 $$(chrom_$1_task): $$(chrom_$1_bin_$2_task)
531 $$(chrom_$1_bin_$2_task):
532 $$(call_script) --config=$$(config_file) --chrom=$1 --bin=$2 && touch $$@
533 endef
534
535 $(foreach c,$(chroms), \
536 $(foreach b,$($c_bins),$(eval $(call chrom_bin_task_template,$c,$b))) \
537 )
538
539 ENDE
540
541 =cut
542
543
544
545 print <<END;
546
547
548 Successfully configured analysis and created makefile '$makeFile'.
549
550 To run the analysis locally using make, run:
551
552 make -C $outDir
553
554 ...or:
555
556 cd $outDir
557 make
558
559 END
560
561 1;
562
563 __END__
564