view rapsodyn/mpileupfilterandstat.pl @ 3:9332b9da7491 draft

Uploaded
author mcharles
date Thu, 11 Sep 2014 07:31:20 -0400
parents 442a7c88b886
children 3f7b0788a1c4
line wrap: on
line source

#!/usr/bin/perl
use strict;
use Getopt::Long;

#
# Filter a pileup file on forward/reverse presence and %read having the variant
# The error code 
#  1 : multiple variant type detected insertion/deletion/mutation
#  1i : inconsistency in insertion
#  1d : inconsistency in deletion
#  1m : inconsistency in mutation
#  2 : insufficient depth	
#  3 : insufficient variant frequency 
#  4 : variant position not covered by forward and reverse reads
#  5 : variant with other variant in neighbourhood
#  6 : too much depth
#  8 : parsing error (couldn't parse the mpileup line correctly)
#  9 : parsing error (couldn't parse the readbase string correctly)


my $inputfile;
my $logfile;
my $MIN_DISTANCE=0;
my $MIN_VARIANTFREQUENCY=0;
my $MIN_FORWARDREVERSE=0;
my $MIN_DEPTH=0;
my $MAX_DEPTH=500;
my $VERBOSE=0;
my $ONLY_UNFILTERED_VARIANT="OFF";
my $DO_STAT="NO";


my $STAT_MIN_DEPTH_MIN = 2;
my $STAT_MIN_DEPTH_MAX = 10;
my $STAT_MIN_DEPTH_STEP = 2;
my $STAT_MAX_DEPTH_MIN = 100;
my $STAT_MAX_DEPTH_MAX = 200;
my $STAT_MAX_DEPTH_STEP = 100;
my $STAT_FREQ_MIN = 0.8;
my $STAT_FREQ_MAX = 1;
my $STAT_FREQ_STEP = 0.1;
my $STAT_DIST_MIN = 0;
my $STAT_DIST_MAX = 50;
my $STAT_DIST_STEP = 50;

GetOptions (
"input_file=s" => \$inputfile,
"log_file=s" => \$logfile,
"min_depth=i" => \$MIN_DEPTH,
"max_depth=i" => \$MAX_DEPTH,
"min_frequency=f" => \$MIN_VARIANTFREQUENCY,
"min_distance=i" => \$MIN_DISTANCE,
"min_forward_and_reverse=i" => \$MIN_FORWARDREVERSE,
"variant_only=s" => \$ONLY_UNFILTERED_VARIANT,
"v=i" => \$VERBOSE,
"do_stat=s" => \$DO_STAT,
"stat_min_depth_min=i" => \$STAT_MIN_DEPTH_MIN,
"stat_min_depth_max=i" => \$STAT_MIN_DEPTH_MAX,
"stat_min_depth_step=i" => \$STAT_MIN_DEPTH_STEP,
"stat_max_depth_min=i" => \$STAT_MAX_DEPTH_MIN,
"stat_max_depth_max=i" => \$STAT_MAX_DEPTH_MAX,
"stat_max_depth_step=i" => \$STAT_MAX_DEPTH_STEP,
"stat_freq_min=f" => \$STAT_FREQ_MIN,
"stat_freq_max=f" => \$STAT_FREQ_MAX,
"stat_freq_step=f" => \$STAT_FREQ_STEP,
"stat_dist_min=i" => \$STAT_DIST_MIN,
"stat_dist_max=i" => \$STAT_DIST_MAX,
"stat_dist_step=i" => \$STAT_DIST_STEP
) or die("Error in command line arguments\n");

open(IF, $inputfile)  or die("Can't open $inputfile\n");

my @tbl_line;
my %USR_PARAM;
$USR_PARAM{"min_depth"} = $MIN_DEPTH;
$USR_PARAM{"max_depth"} = $MAX_DEPTH;
$USR_PARAM{"min_freq"} = $MIN_VARIANTFREQUENCY;
$USR_PARAM{"min_dist"} = $MIN_DISTANCE;
$USR_PARAM{"min_fr"} = $MIN_FORWARDREVERSE;



#Extraction des variants
my $nb_line=0;
while (my $line=<IF>){
	$nb_line++;
	if (($nb_line % 1000000 == 0)&&($VERBOSE==1)){
		print "$nb_line\n";
	}
	my $error_code=0;
	if ($line=~/(.*?)\s+(\d+)\s+([ATGCN])\s+(\d+)\s+(.*?)\s+(.*?)$/){
		my $current_chromosome = $1;
		my $current_position = $2;
		my $current_refbase = $3;
		my $current_coverage = $4;
		my $current_readbase_string = $5;
		my $current_quality_string = $6;
		
		#Suppression of mPileUp special character
		$current_readbase_string =~ s/\$//g; #the read start at this position
		$current_readbase_string =~ s/\^.//g; #the read end at this position followed by quality char
		
		if ($current_readbase_string =~ /[ATGCNatgcn\d]/){
			my %variant;
			$variant{"line"} = $line;
			$variant{"chr"} = $current_chromosome;
			$variant{"pos"} = $current_position;
			$variant{"refbase"} = $current_refbase;
			$variant{"coverage"} = $current_coverage;
			$variant{"readbase"} = $current_readbase_string;
			$variant{"quality"} = $current_quality_string;
			push(@tbl_line,\%variant);

			if ($ONLY_UNFILTERED_VARIANT eq "ON"){
				print $line;
			}
			
		}
		else {
			#Position with no variant
		}
		
	}
	else {
		#Error Parsing
		print STDERR "$line #8";
	}
}
close(IF);

if ($ONLY_UNFILTERED_VARIANT eq "ON"){
	exit(0);
}

####Checking the distance between variant and other filter


my @error;
for (my $i=0;$i<=$#tbl_line;$i++){
	# print "ligne : $tbl_line[$i]\n";
	my $before="";
	my $after="";
	my %line = %{$tbl_line[$i]};

	if ($tbl_line[$i-1]){
		$before = $tbl_line[$i-1];
	}
	if ($tbl_line[$i+1]){
		$after = $tbl_line[$i+1];
	}		
	my $error_code = check_error($tbl_line[$i],$before,$after,\%USR_PARAM);
	if ($error_code == 0){
		print $line{"line"};
	}
	else {
		push(@error,$error_code,"\t",$line{"line"});
	}
}

### LOG
open(LF,">$logfile") or die ("Can't open $logfile\n");

if ($DO_STAT eq "YES"){
	for (my $idx_min_depth=$STAT_MIN_DEPTH_MIN;$idx_min_depth<=$STAT_MIN_DEPTH_MAX;$idx_min_depth = $idx_min_depth + $STAT_MIN_DEPTH_STEP ){
		for (my $idx_max_depth=$STAT_MAX_DEPTH_MIN;$idx_max_depth<=$STAT_MAX_DEPTH_MAX;$idx_max_depth = $idx_max_depth + $STAT_MAX_DEPTH_STEP ){
			for (my $idx_freq = $STAT_FREQ_MIN;$idx_freq<=$STAT_FREQ_MAX;$idx_freq= $idx_freq+$STAT_FREQ_STEP){ 
				for (my $idx_dist=$STAT_DIST_MIN;$idx_dist<=$STAT_DIST_MAX;$idx_dist = $idx_dist + $STAT_DIST_STEP){
					for (my $idx_fr=0;$idx_fr<=1;$idx_fr++){
						my %stat_param;
						$stat_param{"min_depth"}=$idx_min_depth;
						$stat_param{"max_depth"}=$idx_max_depth;
						$stat_param{"min_freq"}=$idx_freq;
						$stat_param{"min_fr"}=$idx_fr;
						$stat_param{"min_dist"}=$idx_dist;

						print LF "#SNP = ",&test_check(\@tbl_line,\%stat_param),"\tdepth (min/max) = ",$stat_param{"min_depth"}," / ",$stat_param{"max_depth"},"\tmin_dist=",$stat_param{"min_dist"},"\tmin_freq=",$stat_param{"min_freq"},"\tmin_forwardreverse = ",$stat_param{"min_fr"},"\n";
					}
				}	
			}
			print "\n";
		}
	}
}


for (my $i=0;$i<=$#error;$i++){
	print LF $error[$i];
}
close (LF);




sub test_check{
	my $ref_tbl_line = shift;
	my $ref_param = shift;
	my @tbl_line = @$ref_tbl_line;
	my %param = %$ref_param;
	my $nb=0;

	for (my $i=0;$i<=$#tbl_line;$i++){
		my $before="";
		my $after="";
		my %line = %{$tbl_line[$i]};

		if ($tbl_line[$i-1]){
			$before = $tbl_line[$i-1];
		}
		if ($tbl_line[$i+1]){
			$after = $tbl_line[$i+1];
		}		
		my $error_code = check_error($tbl_line[$i],$before,$after,\%param);
		if ($error_code == 0){
			$nb++;
		}
	}

	return $nb;
}

sub check_error{
	my $refline = shift;
	my %line = %$refline;
	my $refbefore = shift;
	my $refafter = shift;
	my $refparam = shift;
	my %param = %$refparam;
	
	
	my $current_chromosome = $line{"chr"};
	my $current_position = $line{"pos"};
	my $current_refbase = $line{"refbase"};
	my $current_coverage = $line{"coverage"};
	my $current_readbase_string = $line{"readbase"};
	

	my $min_depth = $param{"min_depth"};
	my $max_depth = $param{"max_depth"};
	my $min_variant_frequency = $param{"min_freq"};
	my $min_forward_reverse = $param{"min_fr"};
	my $min_dist = $param{"min_dist"};
	
	#Verification of neightbourhood
	if ($refbefore){
		my %compareline = %$refbefore;
		my $compare_chromosome = $compareline{"chr"};
		my $compare_position = $compareline{"pos"};
		my $compare_refbase = $compareline{"refbase"};
		my $compare_coverage = $compareline{"coverage"};
		my $compare_readbase_string = $compareline{"readbase"};
		
		if (($current_chromosome eq $compare_chromosome )&&($compare_position + $min_dist >= $current_position)){
			return 5;
		}
	}
	
	if ($refafter){
		my %compareline = %$refafter;
		my $compare_chromosome = $compareline{"chr"};
		my $compare_position = $compareline{"pos"};
		my $compare_refbase = $compareline{"refbase"};
		my $compare_coverage = $compareline{"coverage"};
		my $compare_readbase_string = $compareline{"readbase"};
		
		if (($current_chromosome eq $compare_chromosome )&&($current_position + $min_dist >= $compare_position)){
			return 5;
		}
	}
	
	

	
	#Extraction of insertions
	
	##################################################################
	# my @IN = $current_readbase_string =~ m/\+[0-9]+[ACGTNacgtn]+/g;
	# my @DEL = $current_readbase_string =~ m/\-[0-9]+[ACGTNacgtn]+/g;
	# print "IN : @IN\n";
	# print "DEL :@DEL\n";
	#$current_readbase_string=~s/[\+\-][0-9]+[ACGTNacgtn]+//g; 
	##################################################################
	#!!! marche pas : exemple .+1Ct. correspond a . / +1C / t /. mais le match de l'expression vire +1Ct
	##################################################################
	
	# => parcours de boucle
	my @readbase = split(//,$current_readbase_string);
	my $cleaned_readbase_string="";
	my @IN;
	my @DEL;
	my $current_IN="";
	my $current_DEL="";
	my $current_size=0;
	
	for (my $i=0;$i<=$#readbase;$i++){
		if ($readbase[$i] eq "+"){
			#Ouverture de IN
			$current_IN="+";
			
			#Recuperation de la taille
			my $sub = substr $current_readbase_string,$i;
			if ($sub=~/^\+(\d+)/){
				$current_size = $1;
			}
			my $remaining_size = $current_size;
			while (($remaining_size>0)&&($i<=$#readbase)){
				$i++;
				$current_IN.=$readbase[$i];
				if ($readbase[$i]=~ /[ATGCNatgcn]/){
					$remaining_size--;
				}
			}
			push(@IN,$current_IN);
		}
		elsif ($readbase[$i] eq "-"){
			#Ouverture de DEL
			$current_DEL="-";
			
			#Recuperation de la taille
			my $sub = substr $current_readbase_string,$i;
			if ($sub=~/^\-(\d+)/){
				$current_size = $1;
			}
			my $remaining_size = $current_size;
			while (($remaining_size>0)&&($i<=$#readbase)){
				$i++;
				$current_DEL.=$readbase[$i];
				if ($readbase[$i]=~ /[ATGCNatgcn]/){
					$remaining_size--;
				}
			}
			push(@DEL,$current_DEL);
			
		}
		else {
			#Ajout a la string
			$cleaned_readbase_string .= $readbase[$i];
		}
	}

	
	# print "IN : @IN\n";
	# print "DEL :@DEL\n";
	# print "$cleaned_readbase_string\n";	
	
	my @current_readbase_array = split(//,$cleaned_readbase_string);

	#Filtering : error detection

	if ($#current_readbase_array+1 != $current_coverage){
		return 9;
		#parsing error (couldn't parse the readbase string correctly)
	}
	elsif ($current_coverage<$min_depth){
		return 2;
		#  2 : insufficient depth
	}
	elsif ($current_coverage>$max_depth){
		return 6;
		#  6 : too much depth
	}
	else {
		if ($#IN>=0){
			if (($cleaned_readbase_string=~/[ACGTNacgtn]/)){
				return 1;
				#  1 : variant type overload (multiple variant type detected insertion/deletion/mutation)
			}
			else {
				########## TEST de coherence des insertions ################
				# for (my $i=0;$i<=$#IN;$i++){
					# if (uc($IN[0]) ne uc($IN[$i])){
						# print uc($IN[0]),"\n";
						# print uc($IN[$i]),"\n";
						# return "1i";
					# }
				# }		
				###########################################################

				if($#IN+1 < $current_coverage*$min_variant_frequency ){
					return 3;
					#  3 : insufficient variant frequency 
				}
			}
		}
		elsif ($#DEL>=0){
			if (($cleaned_readbase_string=~/[ACGTNacgtn]/)){
				return 1;
				#  1 : variant type overload (multiple variant type detected insertion/deletion/mutation)
			}
			else {
				########## TEST de coherence des deletions ################
				# for (my $i=0;$i<=$#DEL;$i++){
					# if (uc($DEL[0]) ne uc($DEL[$i])){
						# print uc($DEL[0]),"\n";
						# print uc($DEL[$i]),"\n";
						# return "1d";
					# }
				# }
				###########################################################

				if($#DEL+1 < $current_coverage*$min_variant_frequency){
					return 3;
					#  3 : insufficient variant frequency 
				}
			}
		}
		else {
			my $nbA=0;
			$nbA++ while ($current_readbase_string =~ m/A/g);
			my $nbC=0;
			$nbC++ while ($current_readbase_string =~ m/C/g);
			my $nbT=0;
			$nbT++ while ($current_readbase_string =~ m/T/g);
			my $nbG=0;
			$nbG++ while ($current_readbase_string =~ m/G/g);
			my $nbN=0;
			$nbN++ while ($current_readbase_string =~ m/N/g);
			my $nba=0;
			$nba++ while ($current_readbase_string =~ m/a/g);
			my $nbc=0;
			$nbc++ while ($current_readbase_string =~ m/c/g);
			my $nbt=0;
			$nbt++ while ($current_readbase_string =~ m/t/g);
			my $nbg=0;
			$nbg++ while ($current_readbase_string =~ m/g/g);
			my $nbn=0;
			$nbn++ while ($current_readbase_string =~ m/n/g);

			if (($nbA+$nba>0)&&($nbT+$nbt+$nbG+$nbg+$nbC+$nbc+$nbN+$nbn>0)){
				return "1m";
			}
			if (($nbT+$nbt>0)&&($nbA+$nba+$nbG+$nbg+$nbC+$nbc+$nbN+$nbn>0)){
				return "1m";
			}
			if (($nbG+$nbg>0)&&($nbA+$nba+$nbT+$nbt+$nbC+$nbc+$nbN+$nbn>0)){
				return "1m";
			}
			if (($nbC+$nbc>0)&&($nbA+$nba+$nbT+$nbt+$nbG+$nbg+$nbN+$nbn>0)){
				return "1m";
			}
			if (($nbN+$nbn>0)&&($nbA+$nba+$nbT+$nbt+$nbG+$nbg+$nbC+$nbc>0)){
				return "1m";
			}

			if ($nbA+$nba >= $current_coverage*$min_variant_frequency){
				if (($nbA<$min_forward_reverse)||($nba<$min_forward_reverse)){
					return 4;
					#  4 : variant position not covered by forward and reverse reads
				}
			}
			elsif ($nbT+$nbt >= $current_coverage*$min_variant_frequency){
				if (($nbT<$min_forward_reverse)||($nbt<$min_forward_reverse)){
					return 4;
					#  4 : variant position not covered by forward and reverse reads
				}
			}	 
			elsif ($nbG+$nbg >= $current_coverage*$min_variant_frequency){
				if (($nbG<$min_forward_reverse)||($nbg<$min_forward_reverse)){
					return 4;
					#  4 : variant position not covered by forward and reverse reads
				}
			}
			elsif ($nbC+$nbc >= $current_coverage*$min_variant_frequency){
				if (($nbC<$min_forward_reverse)||($nbc<$min_forward_reverse)){
					return 4;
					#  4 : variant position not covered by forward and reverse reads
				}
			}
			elsif ($nbN+$nbn >= $current_coverage*$min_variant_frequency){
				if (($nbN<$min_forward_reverse)||($nbn<$min_forward_reverse)){
					return 4;
					#  4 : variant position not covered by forward and reverse reads
				}
			}
			else {
				return 3;
				#  3 : insufficient variant frequency 
			}	
		}
	}
	
	return 0;
}