diff SNV/filter_snvmix.pl @ 0:74f5ea818cea

Uploaded
author ryanmorin
date Wed, 12 Oct 2011 19:50:38 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/SNV/filter_snvmix.pl	Wed Oct 12 19:50:38 2011 -0400
@@ -0,0 +1,89 @@
+#!/usr/bin/perl
+use strict;
+use Getopt::Std;
+use vars qw($opt_d $opt_p $opt_c $opt_C $opt_i $opt_S $opt_P $opt_I);
+getopts("d:cC:i:p:SPI:");
+
+my $usage = "$0 -p nonref_p-value_threshold [0.99] -c track_sequenced_bases -d minimum_nonref_depth [2] -C minimum_number_of_read_centered_calls [1] -i max_num_indels [0] -S need_dual_strand_coverage [0] -P use_unpaired_reads_too";
+
+#chr:pos ref nref ref:ref_count,nref:nref_count,pAA,pAB,pBB,maxPr+ r- n+ n- nC nE nD
+#chr:pos ref nref ref:ref_count,nref:nref_count,pAA,pAB,pBB,maxP ref_fwd ref_rev nref_fwd nref_rev       nref_center nref_edges  indel_pos       ref_clean nref_clean    ref_bad_pair nref_bad_pair      ref_ins nref_ins        ref_del nref_del   ref_junc nref_junc      nref_in_unique_pos
+my $nonref_depth_thresh = $opt_d || 2;
+my $min_centered = $opt_C || 1;
+my $max_indels = $opt_i || 0;
+my $dual_strand = $opt_S;
+my $p_threshold = $opt_p || 0.99;
+my $track_sequenced_bases = $opt_c;
+my $pair_filtering = 1;
+my $max_indel_relative = $opt_I;  #filter on proportion of indels at SNV position relative to nonref calls. E.g. 0.5 would allow 1/2 the number of indel reads at that position vs indels.  
+if($opt_P){
+    $pair_filtering = 0;
+}
+while(<STDIN>){
+    chomp;
+    my ($chrpos,$ref,$nonref,$info,$ref_plus,$ref_minus,$nonref_plus,$nonref_minus,$centered,$ended,$indels,$ref_clean,$nonref_clean,$ref_bad_pair,$nonref_bad_pair,$ref_ins,$nonref_ins,$ref_del,$nonref_del,$ref_junc,$nref_junc,$nonref_in_unique_pos) = split;
+    my $ok;
+    my ($ref_info, $nref_info, $pAA, $pAB, $pBB, $call) = split(/,/, $info);
+    my $skip;
+    if($track_sequenced_bases){
+	$skip = 1 if $call == 1;
+    }
+    else{
+	next if $call == 1;
+    }
+    my $is_sequenced;
+    my ($foo,$nref_num) = split /:/, $nref_info;
+    if($pAA < ($pAB + $pBB)) {
+	if( ($pAB + $pBB) >= $p_threshold) {
+	    $is_sequenced = $pAB+$pBB;
+	}
+	else{
+	    next;
+	}
+    }   
+    else{
+	if($track_sequenced_bases){
+	    if($pAA >= $p_threshold){
+		$is_sequenced = $pAA;
+	    }
+#	    print STDERR "skip $skip\n";
+	    $skip = 1;
+	}
+	else{
+	    next;
+	}
+
+    }
+
+    if($track_sequenced_bases){
+	if($is_sequenced){
+	    print STDERR "$chrpos\t$is_sequenced\n";
+	}
+	next if $skip;
+    }
+    if($indels <= $max_indels && $centered >= $min_centered && $nref_num >= $nonref_depth_thresh){
+	if($dual_strand){
+	    $ok=1 if $nonref_minus > 0 && $nonref_plus > 0;
+	}
+	else{
+	    $ok= 1;
+	}
+    }
+    else{
+	#print "$_ FAIL\n$indels <= $max_indels && $centered >= $min_centered && $nref_num >= $nonref_depth_thresh\n";
+    }
+#    if($nonref_bad_pair > ($nonref_plus+$nonref_minus)/2){
+    if($nonref_bad_pair > 1 && $pair_filtering){
+        #Rodrigo's bad pair filter fail
+        #print STDERR "skipping $chrpos due to $nonref_bad_pair >  ($nonref_plus+$nonref_minus)/2\n";
+	$ok = 0;
+
+    }
+
+    unless($chrpos =~ /^chr/){
+	$chrpos = "chr$chrpos";
+    }
+    if($ok){
+	print "$chrpos\t$ref\t$nonref\t$info\n";
+    }
+}