diff snp_reports @ 0:14e1cf728269 default tip

init commit
author Yusuf Ali <ali@yusuf.email>
date Wed, 25 Mar 2015 13:42:26 -0600
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/snp_reports	Wed Mar 25 13:42:26 2015 -0600
@@ -0,0 +1,301 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+
+if(@ARGV == 1 and $ARGV[0] eq "-v"){
+  print "Version 1.0\n";
+  exit;
+}
+
+my $quiet = 0;
+if(@ARGV and $ARGV[0] =~ /^-q(?:uiet)?$/){
+  $quiet = 1;
+  shift @ARGV;
+}
+
+@ARGV == 5 or die "Usage: $0 [-q(uiet)] <snp table> <targets.bed> <coding.gtf> <coverage stats summary.txt> <output table>\n";
+
+my $hgvs_file = $ARGV[0];
+my $target_bed = $ARGV[1];
+my $coding_gtf = $ARGV[2];
+my $stats_file = $ARGV[3];
+my $output = $ARGV[4];
+
+open(TAB, $hgvs_file)
+  or die "Cannot open SNP HGVS file $hgvs_file for reading: $!\n";
+my $header = <TAB>; # header
+chomp $header;
+my @header = split /\t/, $header;
+my ($chr_column, $pos_column, $ref_column, $var_column, $depth_column, $caveat_column, $hgvs_aa_column, $zygosity_column, $rsid_column, $maf_column);
+# Transcript type	Transcript length	Transcript	HGVS cDNA	Strand	Chr	Pos	Zygosity	P-value	Variant Reads	Total Reads	Ref Bases	Var Bases	Population Frequency Source	Pop. freq. or DGV2 gain/loss coverage	Since dbSNP Release #	HGVS AA	Distance from an exon boundary (AA coding variants only)	Caveats	Phase	Sources
+for(my $i = 0; $i < $#header; $i++){
+  if($header[$i] eq "Chr"){
+    $chr_column = $i;
+  }
+  elsif($header[$i] eq "DNA From"){
+    $pos_column = $i;
+  }
+  elsif($header[$i] eq "Ref base"){
+    $ref_column = $i;
+  }
+  elsif($header[$i] eq "Obs base"){
+    $var_column = $i;
+  }
+  elsif($header[$i] eq "Total Reads"){
+    $depth_column = $i;
+  }
+  elsif($header[$i] eq "Caveats"){
+    $caveat_column = $i;
+  }
+  elsif($header[$i] eq "Protein HGVS"){
+    $hgvs_aa_column = $i;
+  }
+  elsif($header[$i] eq "Zygosity"){
+    $zygosity_column = $i;
+  }
+  elsif($header[$i] eq "Pop. freq."){
+    $maf_column = $i;
+  }
+  elsif($header[$i] eq "Pop. freq. source"){
+    $rsid_column = $i;
+  }
+}
+if(not defined $chr_column){
+  die "Cannot find Chr header in $hgvs_file, aborting\n";
+}
+if(not defined $pos_column){
+  die "Cannot find Pos header in $hgvs_file, aborting\n";
+}
+if(not defined $ref_column){
+  die "Cannot find Ref Bases header in $hgvs_file, aborting\n";
+}
+if(not defined $var_column){
+  die "Cannot find Var Bases header in $hgvs_file, aborting\n";
+}
+if(not defined $depth_column){
+  die "Cannot find Total Reads header in $hgvs_file, aborting\n";
+}
+if(not defined $caveat_column){
+  die "Cannot find Caveats header in $hgvs_file, aborting\n";
+}
+if(not defined $hgvs_aa_column){
+  die "Cannot find HGVS AA header in $hgvs_file, aborting\n";
+}
+if(not defined $zygosity_column){
+  die "Cannot find Zygosity header in $hgvs_file, aborting\n";
+}
+if(not defined $maf_column){
+  die "Cannot find Pop. freq. header in $hgvs_file, aborting\n";
+}
+if(not defined $rsid_column){
+  die "Cannot find rsID header in $hgvs_file, aborting\n";
+}
+
+my %target_regions;
+print STDERR "Reading in coding sequence definitions...\n" unless $quiet;
+open(GTF, $coding_gtf)
+    or die "Cannot open coding sequence GTF file $coding_gtf for reading: $!\n";
+while(<GTF>){
+    next if /^\s*#/;
+    tr/\r//d;
+    chomp;
+    my @fields = split /\t/, $_;
+    next unless $fields[2] eq "CDS";
+    if(not exists $target_regions{$fields[0]}){
+        $target_regions{$fields[0]} = [];
+    }
+    my $chr = $target_regions{$fields[0]};
+    for my $pos ($fields[3]..$fields[4]){
+      $target_regions{$fields[0]}->[$pos] = "C";
+    }
+}
+close(GTF);
+
+print STDERR "Reading in targeted sequence definitions...\n" unless $quiet;
+my $intersection_count = 0;
+my $targeted_total = 0;
+my $non_coding_target_total = 0;
+open(BED, $target_bed)
+    or die "Cannot open target regions BED file $target_bed for reading: $!\n"; 
+while(<BED>){
+    next if /^\s*#/;    
+    tr/\r//d;
+    chomp;
+    my @fields = split /\t/, $_;
+    $targeted_total += $fields[2]-$fields[1]+1;
+
+    if(not exists $target_regions{$fields[0]}){
+	$target_regions{$fields[0]} = [];
+    }
+    my $chr = $target_regions{$fields[0]};
+    for my $pos ($fields[1]..$fields[2]){
+      if(defined $target_regions{$fields[0]}->[$pos]){
+        $intersection_count++;
+      }
+      else{
+        $target_regions{$fields[0]}->[$pos] = "T";
+        $non_coding_target_total++;
+      }
+    }
+}
+close(BED);
+
+print STDERR "Reading in coverage information...\n" unless $quiet;
+open(STATS, $stats_file)
+ or die "Cannot open $stats_file for reading: $!\n";
+my $total_bases_lt_10x;
+my $total_bases_lt_20x;
+while(<STATS>){
+  if(/^Total bases with less than 10-fold coverage\t(\d+)/){
+    $total_bases_lt_10x = $1;
+  }
+  elsif(/^Total bases with less than 20-fold coverage\t(\d+)/){
+    $total_bases_lt_20x = $1;
+  }
+}
+close(STATS);
+my $total_bases_under_consideration_10 = $targeted_total-$non_coding_target_total-$total_bases_lt_10x*(1-$non_coding_target_total/$targeted_total);
+my $total_bases_under_consideration_20 = $targeted_total-$non_coding_target_total-$total_bases_lt_20x*(1-$non_coding_target_total/$targeted_total);
+my $total_noncoding_bases_under_consideration_10 = $non_coding_target_total-$total_bases_lt_10x*($non_coding_target_total/$targeted_total);
+my $total_noncoding_bases_under_consideration_20 = $non_coding_target_total-$total_bases_lt_20x*($non_coding_target_total/$targeted_total);
+
+print STDERR "Processing called SNPs...\n" unless $quiet;
+my %seen;
+my $coding_transition_count_10 = 0; # A->G, G->A, C->T, T->C, i.e. effect of deamination of 5'-methyl C to uracil
+my $coding_transition_count_20 = 0;
+my $coding_transversion_count_10 = 0; # A <-> C, A <-> T, G <-> C or G <-> T
+my $coding_transversion_count_20 = 0;
+my $coding_snp_count_10 = 0;
+my $noncoding_transition_count_10 = 0;
+my $noncoding_transition_count_20 = 0;
+my $noncoding_transversion_count_10 = 0;
+my $noncoding_transversion_count_20 = 0;
+my $synonymous_snp_count_10 = 0;
+my $snp_count_10 = 0;
+my $autosomal_snp_count_10 = 0;
+my $novel_snp_count_10 = 0;
+my $homo_snp_count_10 = 0;
+my $non_coding_snp_count_10 = 0;
+my $coding_snp_count_20 = 0;
+my $synonymous_snp_count_20 = 0;
+my $snp_count_20 = 0;
+my $autosomal_snp_count_20 = 0;
+my $novel_snp_count_20 = 0;
+my $homo_snp_count_20 = 0;
+my $non_coding_snp_count_20 = 0;
+
+# Okay, put all of the protein-coding lines at the start so that if any transcript for a 
+# gene says it's protein coding, that'll be the state recorded for the SNP.
+my @datalines = <TAB>;
+
+my $tot_snps = 0;
+for(@datalines){
+  chomp;
+  my @F = split /\t/, $_;
+
+  my $newbases = $F[$var_column];
+  my $refbases = $F[$ref_column];
+  next unless length($newbases) eq length($refbases); #SNPs and MNPs only
+  for (my $i = 0; $i < length($newbases); $i++){
+    my $newbase = substr($newbases, $i, 1);
+    my $refbase = substr($refbases, $i, 1);
+    next if $refbase eq $newbase; # ref in the middle of a phased MNP
+
+    next if exists $seen{"$F[$chr_column]:$F[$pos_column]:$newbase"}; # seen SNP already
+    $seen{"$F[$chr_column]:$F[$pos_column]:$newbase"} = 1;
+
+    $tot_snps++; 
+    next unless $F[$depth_column] >= 10 and $F[$caveat_column] =~ /^(;?[^;]+auto set to \d\.\d+|)$/; # only look at areas with no caveats (besides auto-set allele frequencies)
+  
+    $snp_count_10++;
+    $snp_count_20++ if $F[$depth_column] >= 20;
+    if($F[$hgvs_aa_column] eq "NA"){ #HGVS protein field
+      if(defined $target_regions{$F[$chr_column]}->[$F[$pos_column]]){
+        if($target_regions{$F[$chr_column]}->[$F[$pos_column]] eq "C"){
+          #print STDERR "Counting $F[18] as alternate coding targeted\n";
+          $coding_snp_count_10++;
+          $coding_snp_count_20++ if $F[$depth_column] >= 20;
+          $homo_snp_count_10++ if $F[$zygosity_column] =~ /homozygote/;
+          $homo_snp_count_20++ if $F[$zygosity_column] =~ /homozygote/ and $F[$depth_column] >= 20;
+        }
+        elsif($target_regions{$F[$chr_column]}->[$F[$pos_column]] eq "T"){
+          #print STDERR "Counting $F[18] as non-coding targeted\n";
+          $non_coding_snp_count_10++;
+          $non_coding_snp_count_20++ if $F[$depth_column] >= 20;
+        }
+        # else non-target flanking
+        else{
+          #print STDERR "Ignoring $F[1]:$F[2] as flanking targeted areas (but shouldn't be here)\n";
+        }
+      }
+      #else non-target flanking
+      else{
+        #print STDERR "Ignoring $F[1]:$F[2] as flanking targeted areas\n";
+      }
+      if($refbase eq "C" and $newbase eq "T" or
+         $refbase eq "T" and $newbase eq "C" or 
+         $refbase eq "A" and $newbase eq "G" or
+         $refbase eq "G" and $newbase eq "A"){
+        $noncoding_transition_count_10++;
+        $noncoding_transition_count_20++ if $F[$depth_column] >= 20;
+      }
+      else{
+        $noncoding_transversion_count_10++;
+        $noncoding_transversion_count_20++ if $F[$depth_column] >= 20;
+      }
+      next;
+    }
+    elsif($F[$hgvs_aa_column] =~ /^p\..\d+=$/){
+      $synonymous_snp_count_10++;
+      $synonymous_snp_count_20++ if $F[$depth_column] >= 20;
+    }
+    #print STDERR "Counting $F[18] as coding targeted\n";
+    if($F[$chr_column] !~ /X/ and $F[$chr_column] !~ /Y/){
+      $autosomal_snp_count_10++;
+      $autosomal_snp_count_20++ if $F[$depth_column] >= 20;
+      $homo_snp_count_10++ if $F[$zygosity_column] =~ /homozygote/;
+      $homo_snp_count_20++ if $F[$zygosity_column] =~ /homozygote/ and $F[$depth_column] >= 20;
+    }
+    $novel_snp_count_10++ if $F[$rsid_column] eq "novel" and $F[$maf_column] eq "NA";
+    $novel_snp_count_20++ if $F[$rsid_column] eq "novel" and $F[$maf_column] eq "NA" and $F[$depth_column] >= 20;
+    $coding_snp_count_10++;
+    $coding_snp_count_20++ if $F[$depth_column] >= 20;
+    if($refbase eq "C" and $newbase eq "T" or
+       $refbase eq "T" and $newbase eq "C" or 
+       $refbase eq "A" and $newbase eq "G" or
+       $refbase eq "G" and $newbase eq "A"){
+      $coding_transition_count_10++;
+      $coding_transition_count_20++ if $F[$depth_column] >= 20;
+    }
+    else{
+      $coding_transversion_count_10++;
+      $coding_transversion_count_20++ if $F[$depth_column] >= 20;
+    }
+  } # end for each new base
+}
+
+open(SUMMARY, ">$output")
+  or die "Cannot open $output for writing: $!\n";
+printf SUMMARY "Measure\tActual\tIdeal (human)\n";
+printf SUMMARY "Non-coding SNPs observed %% rate (in the ~%d target non-coding bases with >= 10x coverage)\t%.4f\t0.1\n", $total_noncoding_bases_under_consideration_10, $non_coding_snp_count_10/$total_noncoding_bases_under_consideration_10*100;
+printf SUMMARY "Non-coding SNPs observed %% rate (in the ~%d target non-coding bases with >= 20x coverage)\t%.4f\t0.1\n", $total_noncoding_bases_under_consideration_20, $non_coding_snp_count_20/$total_noncoding_bases_under_consideration_20*100;
+printf SUMMARY "Total coding region of interest\t\t%d\n", $intersection_count;
+printf SUMMARY "Total SNP count (10x)\t%d\t%d\n", $coding_snp_count_10, $total_bases_under_consideration_10*0.00048;
+printf SUMMARY "Total SNP count (20x)\t%d\t%d\n", $coding_snp_count_20, $total_bases_under_consideration_20*0.00048;
+printf SUMMARY "Coding SNPs observed %% rate (in the ~%d target coding bases with >= 10x coverage)\t%.4f\t0.048\n", $total_bases_under_consideration_10, $coding_snp_count_10/$total_bases_under_consideration_10*100;
+printf SUMMARY "Coding SNPs observed %% rate (in the ~%d target coding bases with >= 20x coverage)\t%.4f\t0.048\n", $total_bases_under_consideration_20, $coding_snp_count_20/$total_bases_under_consideration_20*100;
+printf SUMMARY "Non-synonymous SNPs observed %% rate (10x)\t%.2f\t45\n", ($coding_snp_count_10-$synonymous_snp_count_10)/$coding_snp_count_10*100;
+printf SUMMARY "Non-synonymous SNPs observed %% rate (20x)\t%.2f\t45\n", ($coding_snp_count_20-$synonymous_snp_count_20)/$coding_snp_count_20*100;
+# 37% comes from 0.59 homo het ratio report for 20 human genomes in doi:10.1371/journal.pgen.1001111
+printf SUMMARY "Homo SNPs observed %% rate (10x, autosomal chromosomes)\t%.4f\t37\n", $homo_snp_count_10/$autosomal_snp_count_10*100;
+printf SUMMARY "Homo SNPs observed %% rate (20x, autosomal chromosomes)\t%.4f\t37\n", $homo_snp_count_20/$autosomal_snp_count_20*100;
+printf SUMMARY "Novel SNP observed %% rate (10x)\t%.4f\t<1\n", $novel_snp_count_10/$snp_count_10*100;
+printf SUMMARY "Novel SNP observed %% rate (20x)\t%.4f\t<1\n", $novel_snp_count_20/$snp_count_20*100;
+printf SUMMARY "Coding SNPs transition:transversion (10x)\t%.2f\t3\n", $coding_transition_count_10/$coding_transversion_count_10 if $coding_transversion_count_10;
+printf SUMMARY "Coding SNPs transition:transversion (20x)\t%.2f\t3\n", $coding_transition_count_20/$coding_transversion_count_20 if $coding_transversion_count_20;
+printf SUMMARY "Non-coding SNPs transition:transversion (10x)\t%.2f\t2.1\n", $noncoding_transition_count_10/$noncoding_transversion_count_10 if $noncoding_transversion_count_10;
+printf SUMMARY "Non-coding SNPs transition:transversion (20x)\t%.2f\t2.1\n", $noncoding_transition_count_20/$noncoding_transversion_count_20 if $noncoding_transversion_count_20;
+printf SUMMARY "All SNPs transition:transversion (10x)\t%.2f\tNA\n", ($noncoding_transition_count_10+$coding_transition_count_10)/($noncoding_transversion_count_10+$coding_transversion_count_10) if $noncoding_transversion_count_10 or $coding_transversion_count_10;
+printf SUMMARY "All SNPs transition:transversion (20x)\t%.2f\tNA\n", ($noncoding_transition_count_20+$coding_transition_count_20)/($noncoding_transversion_count_20+$coding_transversion_count_20) if $noncoding_transversion_count_20 or $noncoding_transversion_count_20;
+close(SUMMARY);