changeset 0:14e1cf728269 default tip

init commit
author Yusuf Ali <ali@yusuf.email>
date Wed, 25 Mar 2015 13:42:26 -0600
parents
children
files SNPReports.xml snp_reports
diffstat 2 files changed, 340 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/SNPReports.xml	Wed Mar 25 13:42:26 2015 -0600
@@ -0,0 +1,39 @@
+<?xml version="1.0"?>
+
+<tool id="snp_reports_1" name="Compute protein coding SNP rates">
+  <description>over targeted regions in a genome</description>
+  <version_string>snp_reports -v</version_string>
+  <command interpreter="perl">snp_reports -q $snp_table $target_bed $coding_gtf $depth_summary $out_snp_summary_table</command>
+  <inputs>
+    <param format="achri_snp_table" name="snp_table" type="data" label="SNP table" help="The output from running the tool 'Convert VCF file to table with HGVS cDNA and protein syntax'. If you have no selection available, please run that tool first in the current session, or import it from a previous analysis history."/>
+    <param format="bed" name="target_bed" type="data" label="Target sequencing regions" help="e.g. a BED file from the Shared Data 'Exome target regions' folder, corresponding the to the capture kit used for sequencing the sample"/>
+    <param format="gtf" name="coding_gtf" type="data" label="Coding sequences" help="e.g. a GTF file from the Shared Data 'Genomic coding sequence regions' folder, corresponding to the types of IDs desired (e.g. NCBI RefGene for NM_######)"/>
+    <param format="achri_depth_summary_table" name="depth_summary" type="data" label="Depth report summary" help="The output from the tool 'Compute depth of sequencing coverage'. If you have no selection available, please run that tool first in the current session, or import it from a previous analysis history."/>
+  </inputs>
+  <outputs>
+    <data name="out_snp_summary_table" format="achri_snp_summary_table" label="Summary of expected vs actual protein-coding SNP calls"/>
+  </outputs>
+
+  <tests>
+    <test>
+     <param name="snp_table" value="brca_depth_summary.txt" ftype="achri_annotated_snp_table"/>
+     <param name="target_bed" value="brcas.bed" ftype="bed"/>
+     <param name="depth_summary" value="brca_depth_summary.txt" ftype="achri_depth_summary_table"/>
+     <param name="coding_gtf" value="brcas.gtf" ftype="gtf"/>
+     <output name="out_snp_summary_table">
+       <assert_contents>
+         <has_text text="targeted nucleotide bases: 155091"/>
+         <has_text text="bases mapped to targeted regions: 11473773"/>
+         <has_text text="bases with less than 20-fold coverage: 19046"/>
+       </assert_contents>
+     </output>
+    </test>
+  </tests>
+
+  <help>
+This tool reports several statistics describing the rate of protein coding SNP calls from a next-gen sequencing analysis such as an exome capture. 
+These results can be used to assess the quality of the SNP caller used, filtering thresholds, etc. Location with less than 10 fold coverage, and 
+variants with caveat are ignored.
+  </help>
+
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/snp_reports	Wed Mar 25 13:42:26 2015 -0600
@@ -0,0 +1,301 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+
+if(@ARGV == 1 and $ARGV[0] eq "-v"){
+  print "Version 1.0\n";
+  exit;
+}
+
+my $quiet = 0;
+if(@ARGV and $ARGV[0] =~ /^-q(?:uiet)?$/){
+  $quiet = 1;
+  shift @ARGV;
+}
+
+@ARGV == 5 or die "Usage: $0 [-q(uiet)] <snp table> <targets.bed> <coding.gtf> <coverage stats summary.txt> <output table>\n";
+
+my $hgvs_file = $ARGV[0];
+my $target_bed = $ARGV[1];
+my $coding_gtf = $ARGV[2];
+my $stats_file = $ARGV[3];
+my $output = $ARGV[4];
+
+open(TAB, $hgvs_file)
+  or die "Cannot open SNP HGVS file $hgvs_file for reading: $!\n";
+my $header = <TAB>; # header
+chomp $header;
+my @header = split /\t/, $header;
+my ($chr_column, $pos_column, $ref_column, $var_column, $depth_column, $caveat_column, $hgvs_aa_column, $zygosity_column, $rsid_column, $maf_column);
+# Transcript type	Transcript length	Transcript	HGVS cDNA	Strand	Chr	Pos	Zygosity	P-value	Variant Reads	Total Reads	Ref Bases	Var Bases	Population Frequency Source	Pop. freq. or DGV2 gain/loss coverage	Since dbSNP Release #	HGVS AA	Distance from an exon boundary (AA coding variants only)	Caveats	Phase	Sources
+for(my $i = 0; $i < $#header; $i++){
+  if($header[$i] eq "Chr"){
+    $chr_column = $i;
+  }
+  elsif($header[$i] eq "DNA From"){
+    $pos_column = $i;
+  }
+  elsif($header[$i] eq "Ref base"){
+    $ref_column = $i;
+  }
+  elsif($header[$i] eq "Obs base"){
+    $var_column = $i;
+  }
+  elsif($header[$i] eq "Total Reads"){
+    $depth_column = $i;
+  }
+  elsif($header[$i] eq "Caveats"){
+    $caveat_column = $i;
+  }
+  elsif($header[$i] eq "Protein HGVS"){
+    $hgvs_aa_column = $i;
+  }
+  elsif($header[$i] eq "Zygosity"){
+    $zygosity_column = $i;
+  }
+  elsif($header[$i] eq "Pop. freq."){
+    $maf_column = $i;
+  }
+  elsif($header[$i] eq "Pop. freq. source"){
+    $rsid_column = $i;
+  }
+}
+if(not defined $chr_column){
+  die "Cannot find Chr header in $hgvs_file, aborting\n";
+}
+if(not defined $pos_column){
+  die "Cannot find Pos header in $hgvs_file, aborting\n";
+}
+if(not defined $ref_column){
+  die "Cannot find Ref Bases header in $hgvs_file, aborting\n";
+}
+if(not defined $var_column){
+  die "Cannot find Var Bases header in $hgvs_file, aborting\n";
+}
+if(not defined $depth_column){
+  die "Cannot find Total Reads header in $hgvs_file, aborting\n";
+}
+if(not defined $caveat_column){
+  die "Cannot find Caveats header in $hgvs_file, aborting\n";
+}
+if(not defined $hgvs_aa_column){
+  die "Cannot find HGVS AA header in $hgvs_file, aborting\n";
+}
+if(not defined $zygosity_column){
+  die "Cannot find Zygosity header in $hgvs_file, aborting\n";
+}
+if(not defined $maf_column){
+  die "Cannot find Pop. freq. header in $hgvs_file, aborting\n";
+}
+if(not defined $rsid_column){
+  die "Cannot find rsID header in $hgvs_file, aborting\n";
+}
+
+my %target_regions;
+print STDERR "Reading in coding sequence definitions...\n" unless $quiet;
+open(GTF, $coding_gtf)
+    or die "Cannot open coding sequence GTF file $coding_gtf for reading: $!\n";
+while(<GTF>){
+    next if /^\s*#/;
+    tr/\r//d;
+    chomp;
+    my @fields = split /\t/, $_;
+    next unless $fields[2] eq "CDS";
+    if(not exists $target_regions{$fields[0]}){
+        $target_regions{$fields[0]} = [];
+    }
+    my $chr = $target_regions{$fields[0]};
+    for my $pos ($fields[3]..$fields[4]){
+      $target_regions{$fields[0]}->[$pos] = "C";
+    }
+}
+close(GTF);
+
+print STDERR "Reading in targeted sequence definitions...\n" unless $quiet;
+my $intersection_count = 0;
+my $targeted_total = 0;
+my $non_coding_target_total = 0;
+open(BED, $target_bed)
+    or die "Cannot open target regions BED file $target_bed for reading: $!\n"; 
+while(<BED>){
+    next if /^\s*#/;    
+    tr/\r//d;
+    chomp;
+    my @fields = split /\t/, $_;
+    $targeted_total += $fields[2]-$fields[1]+1;
+
+    if(not exists $target_regions{$fields[0]}){
+	$target_regions{$fields[0]} = [];
+    }
+    my $chr = $target_regions{$fields[0]};
+    for my $pos ($fields[1]..$fields[2]){
+      if(defined $target_regions{$fields[0]}->[$pos]){
+        $intersection_count++;
+      }
+      else{
+        $target_regions{$fields[0]}->[$pos] = "T";
+        $non_coding_target_total++;
+      }
+    }
+}
+close(BED);
+
+print STDERR "Reading in coverage information...\n" unless $quiet;
+open(STATS, $stats_file)
+ or die "Cannot open $stats_file for reading: $!\n";
+my $total_bases_lt_10x;
+my $total_bases_lt_20x;
+while(<STATS>){
+  if(/^Total bases with less than 10-fold coverage\t(\d+)/){
+    $total_bases_lt_10x = $1;
+  }
+  elsif(/^Total bases with less than 20-fold coverage\t(\d+)/){
+    $total_bases_lt_20x = $1;
+  }
+}
+close(STATS);
+my $total_bases_under_consideration_10 = $targeted_total-$non_coding_target_total-$total_bases_lt_10x*(1-$non_coding_target_total/$targeted_total);
+my $total_bases_under_consideration_20 = $targeted_total-$non_coding_target_total-$total_bases_lt_20x*(1-$non_coding_target_total/$targeted_total);
+my $total_noncoding_bases_under_consideration_10 = $non_coding_target_total-$total_bases_lt_10x*($non_coding_target_total/$targeted_total);
+my $total_noncoding_bases_under_consideration_20 = $non_coding_target_total-$total_bases_lt_20x*($non_coding_target_total/$targeted_total);
+
+print STDERR "Processing called SNPs...\n" unless $quiet;
+my %seen;
+my $coding_transition_count_10 = 0; # A->G, G->A, C->T, T->C, i.e. effect of deamination of 5'-methyl C to uracil
+my $coding_transition_count_20 = 0;
+my $coding_transversion_count_10 = 0; # A <-> C, A <-> T, G <-> C or G <-> T
+my $coding_transversion_count_20 = 0;
+my $coding_snp_count_10 = 0;
+my $noncoding_transition_count_10 = 0;
+my $noncoding_transition_count_20 = 0;
+my $noncoding_transversion_count_10 = 0;
+my $noncoding_transversion_count_20 = 0;
+my $synonymous_snp_count_10 = 0;
+my $snp_count_10 = 0;
+my $autosomal_snp_count_10 = 0;
+my $novel_snp_count_10 = 0;
+my $homo_snp_count_10 = 0;
+my $non_coding_snp_count_10 = 0;
+my $coding_snp_count_20 = 0;
+my $synonymous_snp_count_20 = 0;
+my $snp_count_20 = 0;
+my $autosomal_snp_count_20 = 0;
+my $novel_snp_count_20 = 0;
+my $homo_snp_count_20 = 0;
+my $non_coding_snp_count_20 = 0;
+
+# Okay, put all of the protein-coding lines at the start so that if any transcript for a 
+# gene says it's protein coding, that'll be the state recorded for the SNP.
+my @datalines = <TAB>;
+
+my $tot_snps = 0;
+for(@datalines){
+  chomp;
+  my @F = split /\t/, $_;
+
+  my $newbases = $F[$var_column];
+  my $refbases = $F[$ref_column];
+  next unless length($newbases) eq length($refbases); #SNPs and MNPs only
+  for (my $i = 0; $i < length($newbases); $i++){
+    my $newbase = substr($newbases, $i, 1);
+    my $refbase = substr($refbases, $i, 1);
+    next if $refbase eq $newbase; # ref in the middle of a phased MNP
+
+    next if exists $seen{"$F[$chr_column]:$F[$pos_column]:$newbase"}; # seen SNP already
+    $seen{"$F[$chr_column]:$F[$pos_column]:$newbase"} = 1;
+
+    $tot_snps++; 
+    next unless $F[$depth_column] >= 10 and $F[$caveat_column] =~ /^(;?[^;]+auto set to \d\.\d+|)$/; # only look at areas with no caveats (besides auto-set allele frequencies)
+  
+    $snp_count_10++;
+    $snp_count_20++ if $F[$depth_column] >= 20;
+    if($F[$hgvs_aa_column] eq "NA"){ #HGVS protein field
+      if(defined $target_regions{$F[$chr_column]}->[$F[$pos_column]]){
+        if($target_regions{$F[$chr_column]}->[$F[$pos_column]] eq "C"){
+          #print STDERR "Counting $F[18] as alternate coding targeted\n";
+          $coding_snp_count_10++;
+          $coding_snp_count_20++ if $F[$depth_column] >= 20;
+          $homo_snp_count_10++ if $F[$zygosity_column] =~ /homozygote/;
+          $homo_snp_count_20++ if $F[$zygosity_column] =~ /homozygote/ and $F[$depth_column] >= 20;
+        }
+        elsif($target_regions{$F[$chr_column]}->[$F[$pos_column]] eq "T"){
+          #print STDERR "Counting $F[18] as non-coding targeted\n";
+          $non_coding_snp_count_10++;
+          $non_coding_snp_count_20++ if $F[$depth_column] >= 20;
+        }
+        # else non-target flanking
+        else{
+          #print STDERR "Ignoring $F[1]:$F[2] as flanking targeted areas (but shouldn't be here)\n";
+        }
+      }
+      #else non-target flanking
+      else{
+        #print STDERR "Ignoring $F[1]:$F[2] as flanking targeted areas\n";
+      }
+      if($refbase eq "C" and $newbase eq "T" or
+         $refbase eq "T" and $newbase eq "C" or 
+         $refbase eq "A" and $newbase eq "G" or
+         $refbase eq "G" and $newbase eq "A"){
+        $noncoding_transition_count_10++;
+        $noncoding_transition_count_20++ if $F[$depth_column] >= 20;
+      }
+      else{
+        $noncoding_transversion_count_10++;
+        $noncoding_transversion_count_20++ if $F[$depth_column] >= 20;
+      }
+      next;
+    }
+    elsif($F[$hgvs_aa_column] =~ /^p\..\d+=$/){
+      $synonymous_snp_count_10++;
+      $synonymous_snp_count_20++ if $F[$depth_column] >= 20;
+    }
+    #print STDERR "Counting $F[18] as coding targeted\n";
+    if($F[$chr_column] !~ /X/ and $F[$chr_column] !~ /Y/){
+      $autosomal_snp_count_10++;
+      $autosomal_snp_count_20++ if $F[$depth_column] >= 20;
+      $homo_snp_count_10++ if $F[$zygosity_column] =~ /homozygote/;
+      $homo_snp_count_20++ if $F[$zygosity_column] =~ /homozygote/ and $F[$depth_column] >= 20;
+    }
+    $novel_snp_count_10++ if $F[$rsid_column] eq "novel" and $F[$maf_column] eq "NA";
+    $novel_snp_count_20++ if $F[$rsid_column] eq "novel" and $F[$maf_column] eq "NA" and $F[$depth_column] >= 20;
+    $coding_snp_count_10++;
+    $coding_snp_count_20++ if $F[$depth_column] >= 20;
+    if($refbase eq "C" and $newbase eq "T" or
+       $refbase eq "T" and $newbase eq "C" or 
+       $refbase eq "A" and $newbase eq "G" or
+       $refbase eq "G" and $newbase eq "A"){
+      $coding_transition_count_10++;
+      $coding_transition_count_20++ if $F[$depth_column] >= 20;
+    }
+    else{
+      $coding_transversion_count_10++;
+      $coding_transversion_count_20++ if $F[$depth_column] >= 20;
+    }
+  } # end for each new base
+}
+
+open(SUMMARY, ">$output")
+  or die "Cannot open $output for writing: $!\n";
+printf SUMMARY "Measure\tActual\tIdeal (human)\n";
+printf SUMMARY "Non-coding SNPs observed %% rate (in the ~%d target non-coding bases with >= 10x coverage)\t%.4f\t0.1\n", $total_noncoding_bases_under_consideration_10, $non_coding_snp_count_10/$total_noncoding_bases_under_consideration_10*100;
+printf SUMMARY "Non-coding SNPs observed %% rate (in the ~%d target non-coding bases with >= 20x coverage)\t%.4f\t0.1\n", $total_noncoding_bases_under_consideration_20, $non_coding_snp_count_20/$total_noncoding_bases_under_consideration_20*100;
+printf SUMMARY "Total coding region of interest\t\t%d\n", $intersection_count;
+printf SUMMARY "Total SNP count (10x)\t%d\t%d\n", $coding_snp_count_10, $total_bases_under_consideration_10*0.00048;
+printf SUMMARY "Total SNP count (20x)\t%d\t%d\n", $coding_snp_count_20, $total_bases_under_consideration_20*0.00048;
+printf SUMMARY "Coding SNPs observed %% rate (in the ~%d target coding bases with >= 10x coverage)\t%.4f\t0.048\n", $total_bases_under_consideration_10, $coding_snp_count_10/$total_bases_under_consideration_10*100;
+printf SUMMARY "Coding SNPs observed %% rate (in the ~%d target coding bases with >= 20x coverage)\t%.4f\t0.048\n", $total_bases_under_consideration_20, $coding_snp_count_20/$total_bases_under_consideration_20*100;
+printf SUMMARY "Non-synonymous SNPs observed %% rate (10x)\t%.2f\t45\n", ($coding_snp_count_10-$synonymous_snp_count_10)/$coding_snp_count_10*100;
+printf SUMMARY "Non-synonymous SNPs observed %% rate (20x)\t%.2f\t45\n", ($coding_snp_count_20-$synonymous_snp_count_20)/$coding_snp_count_20*100;
+# 37% comes from 0.59 homo het ratio report for 20 human genomes in doi:10.1371/journal.pgen.1001111
+printf SUMMARY "Homo SNPs observed %% rate (10x, autosomal chromosomes)\t%.4f\t37\n", $homo_snp_count_10/$autosomal_snp_count_10*100;
+printf SUMMARY "Homo SNPs observed %% rate (20x, autosomal chromosomes)\t%.4f\t37\n", $homo_snp_count_20/$autosomal_snp_count_20*100;
+printf SUMMARY "Novel SNP observed %% rate (10x)\t%.4f\t<1\n", $novel_snp_count_10/$snp_count_10*100;
+printf SUMMARY "Novel SNP observed %% rate (20x)\t%.4f\t<1\n", $novel_snp_count_20/$snp_count_20*100;
+printf SUMMARY "Coding SNPs transition:transversion (10x)\t%.2f\t3\n", $coding_transition_count_10/$coding_transversion_count_10 if $coding_transversion_count_10;
+printf SUMMARY "Coding SNPs transition:transversion (20x)\t%.2f\t3\n", $coding_transition_count_20/$coding_transversion_count_20 if $coding_transversion_count_20;
+printf SUMMARY "Non-coding SNPs transition:transversion (10x)\t%.2f\t2.1\n", $noncoding_transition_count_10/$noncoding_transversion_count_10 if $noncoding_transversion_count_10;
+printf SUMMARY "Non-coding SNPs transition:transversion (20x)\t%.2f\t2.1\n", $noncoding_transition_count_20/$noncoding_transversion_count_20 if $noncoding_transversion_count_20;
+printf SUMMARY "All SNPs transition:transversion (10x)\t%.2f\tNA\n", ($noncoding_transition_count_10+$coding_transition_count_10)/($noncoding_transversion_count_10+$coding_transversion_count_10) if $noncoding_transversion_count_10 or $coding_transversion_count_10;
+printf SUMMARY "All SNPs transition:transversion (20x)\t%.2f\tNA\n", ($noncoding_transition_count_20+$coding_transition_count_20)/($noncoding_transversion_count_20+$coding_transversion_count_20) if $noncoding_transversion_count_20 or $noncoding_transversion_count_20;
+close(SUMMARY);