Mercurial > repos > yusuf > miseq_bam_variants
comparison freebayes_parallel @ 0:1a23ea467feb default tip
intial commit
| author | Yusuf Ali <ali@yusuf.email> |
|---|---|
| date | Thu, 26 Mar 2015 09:36:17 -0600 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:1a23ea467feb |
|---|---|
| 1 #!/usr/bin/env perl | |
| 2 | |
| 3 use strict; | |
| 4 use warnings; | |
| 5 use File::Temp; | |
| 6 | |
| 7 @ARGV >= 3 or die "Usage: $0 <input.bam> <num processes> <output.vcf> [freebayes options]\nRuns FreeBayes 0.8.7 separately for each reference chromosome, with as many concurrent processes as specified\n"; | |
| 8 | |
| 9 my $in_bam = shift @ARGV; | |
| 10 my $num_procs = shift @ARGV; | |
| 11 my $out_vcf = shift @ARGV; | |
| 12 | |
| 13 $SIG{__DIE__} = $SIG{INT} = $SIG{HUP} = $SIG{TERM} = $SIG{ABRT} = \&cleanup; | |
| 14 # Lines look some thing like "@SQ SN:chr1 LN:249250621" | |
| 15 open(SAM_HEADERS, "samtools view -H $in_bam |") | |
| 16 or die "Cannot run samtools on $in_bam: $!\n"; | |
| 17 my %seq2length; | |
| 18 my %seqname2orig_order; | |
| 19 while(<SAM_HEADERS>){ | |
| 20 if(/^\@SQ\s+SN:(\S+)\s+LN:(\d+)/){ | |
| 21 $seq2length{$1} = $2; | |
| 22 $seqname2orig_order{$1} = $.; | |
| 23 } | |
| 24 } | |
| 25 close(SAM_HEADERS); | |
| 26 | |
| 27 # Make sure the index is available and has the name FreeBayes expects | |
| 28 my $bamfile_stem = $in_bam; | |
| 29 $bamfile_stem =~ s/\.[^.]+$//; # remove last file extension, if there is one | |
| 30 if(not -e "$bamfile_stem.bai"){ | |
| 31 if(not -e "$in_bam.bai"){ | |
| 32 system("samtools index $in_bam") >> 8 and die "Cannot index $in_bam: samtools had exit startus $?\n"; | |
| 33 } | |
| 34 system("ln -s $in_bam.bai $bamfile_stem.bai")>> 8 and die "Cannot link exsiting BAM index $in_bam.bai to FreeBayes required file name $bamfile_stem.bai: ln had exit status $?\n"; | |
| 35 } | |
| 36 | |
| 37 my @cmds; | |
| 38 my @tmp_outfiles; | |
| 39 my %tmpfile2orig_order; | |
| 40 # Sort contigs from largest to smallest for scheduling efficiency | |
| 41 for my $seq_name (sort {$seq2length{$b} <=> $seq2length{$a}} keys %seq2length){ | |
| 42 my ($tmp_fh, $tmp_filename) = tmpnam(); | |
| 43 push @cmds, "/export/achri_data/programs/freebayes0.8.7 -b $in_bam -v $tmp_filename " . join(" ", @ARGV) . " -r $seq_name:1..$seq2length{$seq_name}"; | |
| 44 push @tmp_outfiles, $tmp_filename; | |
| 45 $tmpfile2orig_order{$tmp_filename} = $seqname2orig_order{$seq_name}; | |
| 46 } | |
| 47 | |
| 48 open(OUT_VCF, ">$out_vcf") | |
| 49 or die "Cannot open $out_vcf for writing: $!\n"; | |
| 50 | |
| 51 run_cmds($num_procs, @cmds); | |
| 52 | |
| 53 # Grab output header from first temp output file | |
| 54 open(H, $tmp_outfiles[0]) | |
| 55 or die "Cannot open $tmp_outfiles[0] for reading: $!\n"; | |
| 56 while(<H>){ | |
| 57 last if not /^#/; # end of headers | |
| 58 # mod for self-referencing meta-data | |
| 59 if(/^##commandline=/){ | |
| 60 print OUT_VCF "##commandline=$0 $in_bam $num_procs $out_vcf ".join(" ", @ARGV)."\n"; | |
| 61 } | |
| 62 else{ | |
| 63 # Otherwise verbatim | |
| 64 print OUT_VCF $_; | |
| 65 } | |
| 66 | |
| 67 } | |
| 68 close(H); | |
| 69 | |
| 70 # Concatenate the temporary results into a final outfile | |
| 71 for my $tmp_outfile (sort {$tmpfile2orig_order{$a} <=> $tmpfile2orig_order{$b}} @tmp_outfiles){ | |
| 72 open(TMP_VCF, $tmp_outfile) | |
| 73 or die "Cannot open $tmp_outfile for reading: $!\n"; | |
| 74 while(<TMP_VCF>){ | |
| 75 print OUT_VCF unless /^#/; | |
| 76 } | |
| 77 close(TMP_VCF); | |
| 78 } | |
| 79 close(OUT_VCF); | |
| 80 &cleanup; | |
| 81 | |
| 82 sub cleanup{ | |
| 83 unlink @tmp_outfiles; | |
| 84 } | |
| 85 sub run_cmds{ | |
| 86 | |
| 87 my ($max_cmds, @cmd) = @_; | |
| 88 | |
| 89 my ($num_children, $pid); | |
| 90 | |
| 91 for($num_children = 1; $num_children < $max_cmds && @cmds; $num_children++){ | |
| 92 # initialize the number of child processes at 1, and increment it by one | |
| 93 #while it is less than $max_cmds | |
| 94 | |
| 95 my $cmd = shift (@cmds); | |
| 96 if($pid = fork) { | |
| 97 # do nothing if parent | |
| 98 } elsif(defined $pid) { # $pid is zero here if defined | |
| 99 print STDERR $cmd, "\n"; | |
| 100 system $cmd; | |
| 101 exit; | |
| 102 } else { | |
| 103 #weird fork error | |
| 104 die "Can't fork: $!\n"; | |
| 105 } | |
| 106 } | |
| 107 | |
| 108 while(@cmds) { | |
| 109 undef $pid; | |
| 110 FORK: { | |
| 111 my $cmd = shift (@cmds); | |
| 112 if($pid = fork) { | |
| 113 # parent here | |
| 114 $num_children++; | |
| 115 wait; | |
| 116 $num_children--; | |
| 117 next; | |
| 118 | |
| 119 } elsif(defined $pid) { # $pid is zero here if defined | |
| 120 print STDERR $cmd, "\n"; | |
| 121 system $cmd; | |
| 122 exit; | |
| 123 | |
| 124 } else { | |
| 125 #weird fork error | |
| 126 die "Can't fork: $!\n"; | |
| 127 } | |
| 128 } | |
| 129 } | |
| 130 wait while $num_children--; | |
| 131 } | |
| 132 |
