Mercurial > repos > mcharles > rapsodyn
view mpileupfilter/mpileupfilter.pl @ 5:26d2c851d48e draft
Uploaded
author | mcharles |
---|---|
date | Mon, 16 Jun 2014 06:18:07 -0400 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/perl use strict; use Getopt::Long; # # Filter a pileup file on forward/reverse presence and %read having the variant # The error code # 1 : multiple variant type detected insertion/deletion/mutation # 1i : inconsistency in insertion # 1d : inconsistency in deletion # 1m : inconsistency in mutation # 2 : insufficient depth # 3 : insufficient variant frequency # 4 : variant position not covered by forward and reverse reads # 5 : variant with other variant in neighbourhood # 6 : too much depth # 8 : parsing error (couldn't parse the mpileup line correctly) # 9 : parsing error (couldn't parse the readbase string correctly) my $inputfile; my $logfile; my $MIN_DISTANCE=0; my $MIN_VARIANTFREQUENCY=0; my $MIN_FORWARDREVERSE=0; my $MIN_DEPTH=0; my $MAX_DEPTH=500; my $VERBOSE=0; my $ONLY_UNFILTERED_VARIANT="OFF"; if ($#ARGV<0){ print "\n"; print "perl 020_FilterPileupv6 -input_file <mpileup_file> [OPTION]\n"; print "-input_file \tinputfile in mpileup format\n"; print "-log_file \tlogfile containing discarded mpileup lines and the errorcode associated\n"; print "-min_depth \tminimum depth required [1]\n"; print "-max_depth \tmaximim depth (position with more coverage will be discarded) [100]\n"; print "-min_frequency \tminimum variant frequency (0->1) [1] (default 1 => 100% reads show the variant at this position)\n"; print "-min_distance \tminimum distance between variant [0]\n"; print "-min_forward_and_reverse \tminimum number of reads in forward and reverse covering the variant required [0]\n"; print "\n"; exit(0); } GetOptions ( "input_file=s" => \$inputfile, "log_file=s" => \$logfile, "min_depth=i" => \$MIN_DEPTH, "max_depth=i" => \$MAX_DEPTH, "min_frequency=f" => \$MIN_VARIANTFREQUENCY, "min_distance=i" => \$MIN_DISTANCE, "min_forward_and_reverse=i" => \$MIN_FORWARDREVERSE, "variant_only=s" => \$ONLY_UNFILTERED_VARIANT, "v=i" => \$VERBOSE ) or die("Error in command line arguments\n"); open(IF, $inputfile) or die("Can't open $inputfile\n"); my @tbl_line; my @tbl_variant_position; my @tbl_variant_chr; my @tbl_variant_refbase; my @tbl_variant_coverage; my @tbl_variant_readbase_string; my @tbl_variant_quality_string; #Extraction des variants my $nb_line=0; while (my $line=<IF>){ $nb_line++; if (($nb_line % 1000000 == 0)&&($VERBOSE==1)){ print "$nb_line\n"; } my $error_code=0; if ($line=~/(.*?)\s+(\d+)\s+([ATGCN])\s+(\d+)\s+(.*?)\s+(.*?)$/){ my $current_chromosome = $1; my $current_position = $2; my $current_refbase = $3; my $current_coverage = $4; my $current_readbase_string = $5; my $current_quality_string = $6; #Suppression of mPileUp special character $current_readbase_string =~ s/\$//g; #the read start at this position $current_readbase_string =~ s/\^.//g; #the read end at this position followed by quality char if ($current_readbase_string =~ /[ATGCNatgcn\d]/){ push(@tbl_line,$line); push(@tbl_variant_chr,$current_chromosome); push(@tbl_variant_position,$current_position); push(@tbl_variant_refbase,$current_refbase); push(@tbl_variant_coverage,$current_coverage); push(@tbl_variant_readbase_string,$current_readbase_string); push(@tbl_variant_quality_string,$current_quality_string); if ($ONLY_UNFILTERED_VARIANT eq "ON"){ print $line; } } else { #Position with no variant } } else { #Error Parsing print STDERR "$line #8"; } } close(IF); if ($ONLY_UNFILTERED_VARIANT eq "ON"){ exit(0); } ####Checking the distance between variant and other filter if ($logfile){ open(LF,">$logfile") or die ("Cant't open $logfile\n"); } for (my $i=0;$i<=$#tbl_line;$i++){ # print "ligne : $tbl_line[$i]\n"; my $error_code=0; if ($i==0){ #Comparing $i and $i+1 for neighbourhood filter; if ($#tbl_line>0){ if (($tbl_variant_chr[$i+1] eq $tbl_variant_chr[$i])&&($tbl_variant_position[$i]+$MIN_DISTANCE>=$tbl_variant_position[$i+1])){ $error_code=5; chomp($tbl_line[$i]); if ($logfile){ print LF "$tbl_line[$i]\tcode:$error_code\n"; } next; } } #Additionnal filters $error_code = check_error($tbl_variant_chr[$i],$tbl_variant_position[$i],$tbl_variant_refbase[$i],$tbl_variant_coverage[$i],$tbl_variant_readbase_string[$i]); } else { #Compairing $i and $i-1 for neighbourhood filter if (($tbl_variant_chr[$i-1] eq $tbl_variant_chr[$i])&&($tbl_variant_position[$i-1]+$MIN_DISTANCE>=$tbl_variant_position[$i])){ $error_code=5; chomp($tbl_line[$i]); if ($logfile){ print LF "$tbl_line[$i]\tcode:$error_code\n"; } next; } else { #Additionnal filters $error_code = check_error($tbl_variant_chr[$i],$tbl_variant_position[$i],$tbl_variant_refbase[$i],$tbl_variant_coverage[$i],$tbl_variant_readbase_string[$i]); } } if ($error_code == 0){ print $tbl_line[$i]; } else { chomp($tbl_line[$i]); if ($logfile){ print LF "$tbl_line[$i]\tcode:$error_code\n"; } } } if ($logfile){ close (LF); } sub check_error{ my $current_chromosome = shift; my $current_position = shift; my $current_refbase = shift; my $current_coverage = shift; my $current_readbase_string = shift; # print "test : $current_readbase_string\n"; #Extraction of insertions ################################################################## # my @IN = $current_readbase_string =~ m/\+[0-9]+[ACGTNacgtn]+/g; # my @DEL = $current_readbase_string =~ m/\-[0-9]+[ACGTNacgtn]+/g; # print "IN : @IN\n"; # print "DEL :@DEL\n"; #$current_readbase_string=~s/[\+\-][0-9]+[ACGTNacgtn]+//g; ################################################################## #!!! marche pas : exemple .+1Ct. correspond a . / +1C / t /. mais le match de l'expression vire +1Ct ################################################################## # => parcours de boucle my @readbase = split(//,$current_readbase_string); my $cleaned_readbase_string=""; my @IN; my @DEL; my $current_IN=""; my $current_DEL=""; my $current_size=0; for (my $i=0;$i<=$#readbase;$i++){ if ($readbase[$i] eq "+"){ #Ouverture de IN $current_IN="+"; #Recuperation de la taille my $sub = substr $current_readbase_string,$i; if ($sub=~/^\+(\d+)/){ $current_size = $1; } my $remaining_size = $current_size; while (($remaining_size>0)&&($i<=$#readbase)){ $i++; $current_IN.=$readbase[$i]; if ($readbase[$i]=~ /[ATGCNatgcn]/){ $remaining_size--; } } push(@IN,$current_IN); } elsif ($readbase[$i] eq "-"){ #Ouverture de DEL $current_DEL="-"; #Recuperation de la taille my $sub = substr $current_readbase_string,$i; if ($sub=~/^\-(\d+)/){ $current_size = $1; } my $remaining_size = $current_size; while (($remaining_size>0)&&($i<=$#readbase)){ $i++; $current_DEL.=$readbase[$i]; if ($readbase[$i]=~ /[ATGCNatgcn]/){ $remaining_size--; } } push(@DEL,$current_DEL); } else { #Ajout a la string $cleaned_readbase_string .= $readbase[$i]; } } # print "IN : @IN\n"; # print "DEL :@DEL\n"; # print "$cleaned_readbase_string\n"; my @current_readbase_array = split(//,$cleaned_readbase_string); #Filtering : error detection if ($#current_readbase_array+1 != $current_coverage){ return 9; #parsing error (couldn't parse the readbase string correctly) } elsif ($current_coverage<$MIN_DEPTH){ return 2; # 2 : insufficient depth } elsif ($current_coverage>$MAX_DEPTH){ return 6; # 6 : too much depth } else { if ($#IN>=0){ if (($cleaned_readbase_string=~/[ACGTNacgtn]/)){ return 1; # 1 : variant type overload (multiple variant type detected insertion/deletion/mutation) } else { ########## TEST de coherence des insertions ################ # for (my $i=0;$i<=$#IN;$i++){ # if (uc($IN[0]) ne uc($IN[$i])){ # print uc($IN[0]),"\n"; # print uc($IN[$i]),"\n"; # return "1i"; # } # } ########################################################### if($#IN+1 < $current_coverage*$MIN_VARIANTFREQUENCY){ return 3; # 3 : insufficient variant frequency } } } elsif ($#DEL>=0){ if (($cleaned_readbase_string=~/[ACGTNacgtn]/)){ return 1; # 1 : variant type overload (multiple variant type detected insertion/deletion/mutation) } else { ########## TEST de coherence des deletions ################ # for (my $i=0;$i<=$#DEL;$i++){ # if (uc($DEL[0]) ne uc($DEL[$i])){ # print uc($DEL[0]),"\n"; # print uc($DEL[$i]),"\n"; # return "1d"; # } # } ########################################################### if($#DEL+1 < $current_coverage*$MIN_VARIANTFREQUENCY){ return 3; # 3 : insufficient variant frequency } } } else { my $nbA=0; $nbA++ while ($current_readbase_string =~ m/A/g); my $nbC=0; $nbC++ while ($current_readbase_string =~ m/C/g); my $nbT=0; $nbT++ while ($current_readbase_string =~ m/T/g); my $nbG=0; $nbG++ while ($current_readbase_string =~ m/G/g); my $nbN=0; $nbN++ while ($current_readbase_string =~ m/N/g); my $nba=0; $nba++ while ($current_readbase_string =~ m/a/g); my $nbc=0; $nbc++ while ($current_readbase_string =~ m/c/g); my $nbt=0; $nbt++ while ($current_readbase_string =~ m/t/g); my $nbg=0; $nbg++ while ($current_readbase_string =~ m/g/g); my $nbn=0; $nbn++ while ($current_readbase_string =~ m/n/g); if (($nbA+$nba>0)&&($nbT+$nbt+$nbG+$nbg+$nbC+$nbc+$nbN+$nbn>0)){ return "1m"; } if (($nbT+$nbt>0)&&($nbA+$nba+$nbG+$nbg+$nbC+$nbc+$nbN+$nbn>0)){ return "1m"; } if (($nbG+$nbg>0)&&($nbA+$nba+$nbT+$nbt+$nbC+$nbc+$nbN+$nbn>0)){ return "1m"; } if (($nbC+$nbc>0)&&($nbA+$nba+$nbT+$nbt+$nbG+$nbg+$nbN+$nbn>0)){ return "1m"; } if (($nbN+$nbn>0)&&($nbA+$nba+$nbT+$nbt+$nbG+$nbg+$nbC+$nbc>0)){ return "1m"; } if ($nbA+$nba >= $current_coverage*$MIN_VARIANTFREQUENCY){ if (($nbA<$MIN_FORWARDREVERSE)||($nba<$MIN_FORWARDREVERSE)){ return 4; # 4 : variant position not covered by forward and reverse reads } } elsif ($nbT+$nbt >= $current_coverage*$MIN_VARIANTFREQUENCY){ if (($nbT<$MIN_FORWARDREVERSE)||($nbt<$MIN_FORWARDREVERSE)){ return 4; # 4 : variant position not covered by forward and reverse reads } } elsif ($nbG+$nbg >= $current_coverage*$MIN_VARIANTFREQUENCY){ if (($nbG<$MIN_FORWARDREVERSE)||($nbg<$MIN_FORWARDREVERSE)){ return 4; # 4 : variant position not covered by forward and reverse reads } } elsif ($nbC+$nbc >= $current_coverage*$MIN_VARIANTFREQUENCY){ if (($nbC<$MIN_FORWARDREVERSE)||($nbc<$MIN_FORWARDREVERSE)){ return 4; # 4 : variant position not covered by forward and reverse reads } } elsif ($nbN+$nbn >= $current_coverage*$MIN_VARIANTFREQUENCY){ if (($nbN<$MIN_FORWARDREVERSE)||($nbn<$MIN_FORWARDREVERSE)){ return 4; # 4 : variant position not covered by forward and reverse reads } } else { return 3; # 3 : insufficient variant frequency } } } return 0; }