#!/usr/bin/env perl

use strict;
use warnings;
use File::Temp;
use vars qw($tmpbam_fh $tmpbam_filename);
use File::Basename;
@ARGV >= 3 or die "Usage: $0 <input.bam> <num processes> <output.vcf> [freebayes options]\nRuns GATK 3.1 separately for each reference chromosome, with as many concurrent processes as specified\n"; 

my $in_bam = shift @ARGV;
my $num_procs = shift @ARGV;
my $out_vcf = shift @ARGV;
my $dirname = dirname(__FILE__);
$SIG{__DIE__} = $SIG{INT} = $SIG{HUP} = $SIG{TERM} = $SIG{ABRT} = \&cleanup;

# Lines look some thing like "@SQ	SN:chr1	LN:249250621"
open(SAM_HEADERS, "samtools view -H $in_bam |")
  or die "Cannot run samtools on $in_bam: $!\n";
my %seq2length;
my %seqname2orig_order;
while(<SAM_HEADERS>){
  if(/^\@SQ\s+SN:(\S+)\s+LN:(\d+)/){
    $seq2length{$1} = $2;
    $seqname2orig_order{$1} = $.;
  }
}
close(SAM_HEADERS);

# GATK will complain of the file doesn't have a .bam ending, which will happen if the file was uploaded to Galaxy (###.dat)
# Also, we need to make sure there is an index file
if($in_bam !~ /\.bam$/){
  ($tmpbam_fh, $tmpbam_filename) = tmpnam();
  system("ln -s $in_bam $tmpbam_filename.bam")>> 8 and die "Cannot create symbolic link from $in_bam to $tmpbam_filename.bam: exit status $?\n";
  if(not -e "$in_bam.bai"){
    system("samtools index $in_bam")>> 8 and die "Cannot create index for BAM file $in_bam: exit status $?\n";
  }
  system("ln -s $in_bam.bai $tmpbam_filename.bam.bai")>> 8 and die "Cannot create symbolic link from $in_bam.bai to $tmpbam_filename.bam.bai: exit status $?\n";
  $in_bam = "$tmpbam_filename.bam";
}
elsif(not -e "$in_bam.bai"){
  system("samtools index $in_bam")>> 8 and die "Cannot create index for BAM file $in_bam: exit status $?\n";
}

# Peak into the BAM to see if it's got Illumina encoded quality values, which will require an extra flag for GATK
my $is_illumina_encoded_qual = 1;
my $num_mapped = 0;
if(open(SAMTOOLS, "samtools view $in_bam |")){
    while(<SAMTOOLS>){
        my @F = split /\t/, $_;
        next unless $F[2] ne "*"; # ignore unmapped reads
        my @quals = map {ord($_)} split(//, $F[10]);
        if(grep {$_ < 64} @quals){
            $is_illumina_encoded_qual = 0; last;
        }
        last if ++$num_mapped > 100; # only look at the first 100 mapped reads 
    }
} # take our chances if samtools call failed that it's not illumina encoded, which will fail quickly in GATK anyways
close(SAMTOOLS);

my @cmds;
my @tmp_outfiles;
my %tmpfile2orig_order;
# Sort contigs from largest to smallest for scheduling efficiency
for my $seq_name (sort {$seq2length{$b} <=> $seq2length{$a}} keys %seq2length){
  my ($tmp_fh, $tmp_filename) = tmpnam();
  my $intervals_tmpfile = $tmp_filename.".intervals";
  open(INTERVAL, ">$intervals_tmpfile") or die "Cannot open $intervals_tmpfile for writing: $!\n";
  print INTERVAL "$seq_name:1-$seq2length{$seq_name}\n";
  close(INTERVAL);
  push @cmds, "java -Xmx24G -jar $dirname/GenomeAnalysisTK.jar -I $in_bam -T HaplotypeCaller -o $tmp_filename ".join(" ", @ARGV)." -L $intervals_tmpfile -nct 2".
              ($is_illumina_encoded_qual?" --fix_misencoded_quality_scores":"");
  push @tmp_outfiles, $tmp_filename;
  $tmpfile2orig_order{$tmp_filename} = $seqname2orig_order{$seq_name};
}

open(OUT_VCF, ">$out_vcf")
  or die "Cannot open $out_vcf for writing: $!\n";

# div 2 because assigning 2 threads per process for efficiency
run_cmds(int($num_procs/2+0.5), @cmds);

# Grab output header from first temp output file
open(H, $tmp_outfiles[0])
  or die "Cannot open $tmp_outfiles[0] for reading: $!\n";
while(<H>){
  last if not /^#/; # end of headers
  # mod for self-referencing meta-data
  if(/^##commandline=/){
    print OUT_VCF "##commandline=$0 $in_bam $num_procs $out_vcf ".join(" ", @ARGV)."\n";
  }
  else{
    # Otherwise verbatim
    print OUT_VCF $_;
  }

}
close(H);

# Concatenate the temporary results into a final outfile
for my $tmp_outfile (sort {$tmpfile2orig_order{$a} <=> $tmpfile2orig_order{$b}} @tmp_outfiles){
  open(TMP_VCF, $tmp_outfile) 
    or die "Cannot open $tmp_outfile for reading: $!\n";
  while(<TMP_VCF>){
    print OUT_VCF unless /^#/;
  }
  close(TMP_VCF);
}
close(OUT_VCF);
&cleanup;

sub cleanup{
  for my $t (@tmp_outfiles){
    unlink $t, "$t.intervals";
  }
  unlink "$tmpbam_filename.bam", "$tmpbam_filename.bam.bai" if defined $tmpbam_filename;
}

sub run_cmds{

  my ($max_cmds, @cmd) = @_;

  my ($num_children, $pid);

        for($num_children = 1; $num_children < $max_cmds && @cmds; $num_children++){
        # initialize the number of child processes at 1, and increment it by one
        #while it is less than $max_cmds

                my $cmd = shift (@cmds);
                if($pid = fork) {
                        # do nothing if parent
                } elsif(defined $pid) { # $pid is zero here if defined
                        print STDERR $cmd, "\n";
                        system $cmd;
                        exit;
                } else {
                        #weird fork error
                        die "Can't fork: $!\n";
                }
        }

        while(@cmds) {
                undef $pid;
                FORK: {
                        my $cmd = shift (@cmds);
                        if($pid = fork) {
                                # parent here
                                $num_children++;
                                wait;
                                $num_children--;
                                next;

                        } elsif(defined $pid) { # $pid is zero here if defined
                                print STDERR $cmd, "\n";
                                system $cmd;
                                exit;

                        } else {
                                #weird fork error
                                die "Can't fork: $!\n";
                        }
                }
        }
        wait while $num_children--;
}

