view varscan/varscan_processSomatic.pl @ 4:572397bbe057 draft default tip

Added Xmx10G setting to both varscan_somatic wrappers
author geert-vandeweyer
date Wed, 26 Mar 2014 05:24:22 -0400
parents 848f3dc54593
children
line wrap: on
line source

#!/usr/bin/perl


use strict;
use Cwd;

die qq(
Bad numbr of inputs

) if(!@ARGV);

my $options ="";
my $command="";
my $input="";
#my $output="";
my $working_dir = cwd();
my $log = '';
my %outfiles;
foreach my $arg (@ARGV) 
{
	my @tmp = split "::", $arg;
	if($tmp[0] eq "COMMAND") 
	{
		$command = $tmp[1];
	} 
	elsif($tmp[0] eq "INPUT")
	{
		$input = $tmp[1];
	}
	elsif($tmp[0] eq "OPTION") 
	{
		my @p = split(/\s+/,$tmp[1]);
		$options = "$options ${tmp[1]}";
	}
	elsif($tmp[0] eq "OUTPUT") 
	{
		my @p = split(/\s+/,$tmp[1]);
		$p[0] = substr($p[0],2);
		$outfiles{$p[0]} = $p[1];
	}
	
	elsif($tmp[0] eq "LOG")
        {
                $log = $tmp[1];
        }
	else 
	{
		die("Unknown Input: $input\n");
	}
}
system("ln -s $input $working_dir/in.dat");



## RUN
$command = "$command $working_dir/in.dat $options > $log 2>&1";
system($command);

## if tabular files are kept, write them to galaxy-datafile
if ($outfiles{'loh'} ne 'None') {
	## 
	system("cp '$working_dir/in.dat.LOH' '".$outfiles{'loh'}."'");
	system("cp '$working_dir/in.dat.LOH.hc' '".$outfiles{'loh'}."'");
	system("cp '$working_dir/in.dat.Germline' '".$outfiles{'germ'}."'");
	system("cp '$working_dir/in.dat.Germline.hc' '".$outfiles{'germ_hc'}."'");
	system("cp '$working_dir/in.dat.Somatic' '".$outfiles{'som'}."'");
	system("cp '$working_dir/in.dat.Somatic.hc' '".$outfiles{'som_hc'}."'");
}
## if vcf files are kept, generate them, and write to galaxy-datafile
if ($outfiles{'loh_hc_vcf'} ne 'None') {
	tab2vcf($working_dir,'in.dat.LOH.hc',$outfiles{'loh_hc_vcf'});
	tab2vcf($working_dir,'in.dat.Germline.hc',$outfiles{'germ_hc_vcf'});
	tab2vcf($working_dir,'in.dat.Somatic.hc',$outfiles{'som_hc_vcf'});
}
exit;

sub tab2vcf
{
	my $wd = shift;
	my $in = shift;
	my $out = shift;
	open OUT, ">$out";
	print OUT "##fileformat=VCFv4.1\n";
	print OUT "##source=processSomatic\n";
	print OUT '##INFO=<ID=DP,Number=1,Type=Integer,Description="Total depth of quality bases">'."\n";
	print OUT '##INFO=<ID=SS,Number=1,Type=String,Description="Somatic status of variant (Germline,Somatic,LOH)">'."\n";
	print OUT '##INFO=<ID=VPV,Number=1,Type=Float,Description="Variant P-value">'."\n";
	print OUT '##INFO=<ID=SPV,Number=1,Type=Float,Description="Somatic Variant P-value">'."\n";
	print OUT '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">'."\n";
	print OUT '##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">'."\n";
	print OUT '##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">'."\n";
	print OUT '##FORMAT=<ID=RD,Number=1,Type=Integer,Description="Depth of reference-supporting bases (reads1)">'."\n";
	print OUT '##FORMAT=<ID=AD,Number=1,Type=Integer,Description="Depth of variant-supporting bases (reads2)">'."\n";
	print OUT '##FORMAT=<ID=FREQ,Number=1,Type=String,Description="Variant allele frequency">'."\n";
	print OUT '##FORMAT=<ID=DP4,Number=1,Type=String,Description="Strand read counts: ref/fwd, ref/rev, var/fwd, var/rev">'."\n";
	print OUT '#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	NORMAL	TUMOR'."\n";
	open IN, "$wd/$in";
	my $head = <IN>;
	while (<IN>) {
		chomp;
		my @p = split(/\t/,$_);
		my $td = $p[4] + $p[5] + $p[8] + $p[9];
		my $vpv = sprintf('%.3E',$p[13]);
		my $spv = sprintf('%.3E',$p[14]);
		my $info = "DP=$td;SS=$p[12];VPV=$vpv;SPV=$spv";
		## gt normal string
		my $gtnormal = '';
		if ("$p[7]" eq "$p[2]") {
			$gtnormal = "0/0";
		}
		elsif ("$p[7]" eq "$p[3]") {
			$gtnormal = "1/1";
		}
		else {
			$gtnormal = "0/1";
		}
		my $nsum = $p[4] + $p[5];
		$gtnormal .= ":.:$nsum:$p[4]:$p[5]:$p[6]:$p[19],$p[20],$p[21],$p[22]";
		## gt tumor string
		my $gttumor = '';
		if ("$p[11]" eq "$p[2]") {
			$gttumor = "0/0";
		}
		elsif ("$p[11]" eq "$p[3]") {
			$gttumor = "1/1";
		}
		else {
			$gttumor = "0/1";
		}
		my $tsum = $p[8] + $p[9];
		$gttumor .= ":.:$tsum:$p[8]:$p[9]:$p[10]:$p[15],$p[16],$p[17],$p[18]";

		## outline 
		my $line = "$p[0]\t$p[1]\t.\t$p[2]\t$p[3]\t.\tPASS\t$info\tGT:GQ:DP:RD:AD:FREQ:DP4\t$gtnormal\t$gttumor\n";
		print OUT $line;
	}
	close IN;
	close OUT;
}
sub vs2vcf 
{

	#
	# G l o b a l     v a r i a b l e s 
	#
	my $version = '0.1';

	#
	# Read in file
	#
	my $input = shift;
	my $output = shift;
	my $chr_ord = shift;
	open(IN, $input) or die "Can't open $input': $!\n";
	open(OUT, ">$output") or die "Can't create $output': $!\n";
	my %output;

	while ( <IN> )
	{
		if ( /^#/ )
		{
			print OUT;
			next;
		}
		chomp;
		my $line = $_;

		my @flds = split ( "\t", $line );
		my $ref = $flds[3];
		my $alt = $flds[4];
		#
		# Deletion of bases
		#
		if ( $alt =~ /^\-/ )
		{
			($flds[3], $flds[4]) = ($ref.substr($alt,1), $ref);
		}

		#
		# Insertion of bases
		#
		if ( $alt =~ /^\+/ )
		{
			$flds[4] = $ref.substr($alt,1);
		}
		## insert dot for reference positions.
		if ($flds[4] eq '') {
			$flds[4] = '.';
		}
		print OUT join( "\t", @flds),"\n" unless defined $chr_ord;
		$output{$flds[0]}{$flds[1]} = join( "\t", @flds)."\n" if defined $chr_ord;
	}
	close(IN);
	# if chromosome order given return in sorted order
	if(defined $chr_ord) 
	{
		for my $chrom (@{ $chr_ord }) 
		{
			for my $pos (sort {$a<=>$b} keys %{ $output{$chrom} }) 
			{
				print OUT $output{$chrom}{$pos};
			}
		}
	}
	close(OUT);
}


sub chromosome_order 
{
	my $input = shift;
	# calculate flagstats
	my $COMM = "samtools view -H $input | grep '^\@SQ'";
	my @SQ = `$COMM`;
	chomp @SQ;
	for(my $i = 0; $i <= $#SQ; $i++) 
	{
		$SQ[$i] =~ s/^\@SQ\tSN:(.*?)\tLN:\d+$/$1/;
	} 
	return(@SQ);
}