Mercurial > repos > ryanmorin > nextgen_variant_identification
diff SNV/filter_snvmix.pl @ 0:74f5ea818cea
Uploaded
author | ryanmorin |
---|---|
date | Wed, 12 Oct 2011 19:50:38 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SNV/filter_snvmix.pl Wed Oct 12 19:50:38 2011 -0400 @@ -0,0 +1,89 @@ +#!/usr/bin/perl +use strict; +use Getopt::Std; +use vars qw($opt_d $opt_p $opt_c $opt_C $opt_i $opt_S $opt_P $opt_I); +getopts("d:cC:i:p:SPI:"); + +my $usage = "$0 -p nonref_p-value_threshold [0.99] -c track_sequenced_bases -d minimum_nonref_depth [2] -C minimum_number_of_read_centered_calls [1] -i max_num_indels [0] -S need_dual_strand_coverage [0] -P use_unpaired_reads_too"; + +#chr:pos ref nref ref:ref_count,nref:nref_count,pAA,pAB,pBB,maxPr+ r- n+ n- nC nE nD +#chr:pos ref nref ref:ref_count,nref:nref_count,pAA,pAB,pBB,maxP ref_fwd ref_rev nref_fwd nref_rev nref_center nref_edges indel_pos ref_clean nref_clean ref_bad_pair nref_bad_pair ref_ins nref_ins ref_del nref_del ref_junc nref_junc nref_in_unique_pos +my $nonref_depth_thresh = $opt_d || 2; +my $min_centered = $opt_C || 1; +my $max_indels = $opt_i || 0; +my $dual_strand = $opt_S; +my $p_threshold = $opt_p || 0.99; +my $track_sequenced_bases = $opt_c; +my $pair_filtering = 1; +my $max_indel_relative = $opt_I; #filter on proportion of indels at SNV position relative to nonref calls. E.g. 0.5 would allow 1/2 the number of indel reads at that position vs indels. +if($opt_P){ + $pair_filtering = 0; +} +while(<STDIN>){ + chomp; + my ($chrpos,$ref,$nonref,$info,$ref_plus,$ref_minus,$nonref_plus,$nonref_minus,$centered,$ended,$indels,$ref_clean,$nonref_clean,$ref_bad_pair,$nonref_bad_pair,$ref_ins,$nonref_ins,$ref_del,$nonref_del,$ref_junc,$nref_junc,$nonref_in_unique_pos) = split; + my $ok; + my ($ref_info, $nref_info, $pAA, $pAB, $pBB, $call) = split(/,/, $info); + my $skip; + if($track_sequenced_bases){ + $skip = 1 if $call == 1; + } + else{ + next if $call == 1; + } + my $is_sequenced; + my ($foo,$nref_num) = split /:/, $nref_info; + if($pAA < ($pAB + $pBB)) { + if( ($pAB + $pBB) >= $p_threshold) { + $is_sequenced = $pAB+$pBB; + } + else{ + next; + } + } + else{ + if($track_sequenced_bases){ + if($pAA >= $p_threshold){ + $is_sequenced = $pAA; + } +# print STDERR "skip $skip\n"; + $skip = 1; + } + else{ + next; + } + + } + + if($track_sequenced_bases){ + if($is_sequenced){ + print STDERR "$chrpos\t$is_sequenced\n"; + } + next if $skip; + } + if($indels <= $max_indels && $centered >= $min_centered && $nref_num >= $nonref_depth_thresh){ + if($dual_strand){ + $ok=1 if $nonref_minus > 0 && $nonref_plus > 0; + } + else{ + $ok= 1; + } + } + else{ + #print "$_ FAIL\n$indels <= $max_indels && $centered >= $min_centered && $nref_num >= $nonref_depth_thresh\n"; + } +# if($nonref_bad_pair > ($nonref_plus+$nonref_minus)/2){ + if($nonref_bad_pair > 1 && $pair_filtering){ + #Rodrigo's bad pair filter fail + #print STDERR "skipping $chrpos due to $nonref_bad_pair > ($nonref_plus+$nonref_minus)/2\n"; + $ok = 0; + + } + + unless($chrpos =~ /^chr/){ + $chrpos = "chr$chrpos"; + } + if($ok){ + print "$chrpos\t$ref\t$nonref\t$info\n"; + } +}