# HG changeset patch # User Yusuf Ali # Date 1427312546 21600 # Node ID 14e1cf728269de2b917f664d4e14133b750dfefb init commit diff -r 000000000000 -r 14e1cf728269 SNPReports.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SNPReports.xml Wed Mar 25 13:42:26 2015 -0600 @@ -0,0 +1,39 @@ + + + + over targeted regions in a genome + snp_reports -v + snp_reports -q $snp_table $target_bed $coding_gtf $depth_summary $out_snp_summary_table + + + + + + + + + + + + + + + + + + + + + + + + + + + +This tool reports several statistics describing the rate of protein coding SNP calls from a next-gen sequencing analysis such as an exome capture. +These results can be used to assess the quality of the SNP caller used, filtering thresholds, etc. Location with less than 10 fold coverage, and +variants with caveat are ignored. + + + diff -r 000000000000 -r 14e1cf728269 snp_reports --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/snp_reports Wed Mar 25 13:42:26 2015 -0600 @@ -0,0 +1,301 @@ +#!/usr/bin/env perl + +use strict; +use warnings; + +if(@ARGV == 1 and $ARGV[0] eq "-v"){ + print "Version 1.0\n"; + exit; +} + +my $quiet = 0; +if(@ARGV and $ARGV[0] =~ /^-q(?:uiet)?$/){ + $quiet = 1; + shift @ARGV; +} + +@ARGV == 5 or die "Usage: $0 [-q(uiet)] \n"; + +my $hgvs_file = $ARGV[0]; +my $target_bed = $ARGV[1]; +my $coding_gtf = $ARGV[2]; +my $stats_file = $ARGV[3]; +my $output = $ARGV[4]; + +open(TAB, $hgvs_file) + or die "Cannot open SNP HGVS file $hgvs_file for reading: $!\n"; +my $header = ; # header +chomp $header; +my @header = split /\t/, $header; +my ($chr_column, $pos_column, $ref_column, $var_column, $depth_column, $caveat_column, $hgvs_aa_column, $zygosity_column, $rsid_column, $maf_column); +# Transcript type Transcript length Transcript HGVS cDNA Strand Chr Pos Zygosity P-value Variant Reads Total Reads Ref Bases Var Bases Population Frequency Source Pop. freq. or DGV2 gain/loss coverage Since dbSNP Release # HGVS AA Distance from an exon boundary (AA coding variants only) Caveats Phase Sources +for(my $i = 0; $i < $#header; $i++){ + if($header[$i] eq "Chr"){ + $chr_column = $i; + } + elsif($header[$i] eq "DNA From"){ + $pos_column = $i; + } + elsif($header[$i] eq "Ref base"){ + $ref_column = $i; + } + elsif($header[$i] eq "Obs base"){ + $var_column = $i; + } + elsif($header[$i] eq "Total Reads"){ + $depth_column = $i; + } + elsif($header[$i] eq "Caveats"){ + $caveat_column = $i; + } + elsif($header[$i] eq "Protein HGVS"){ + $hgvs_aa_column = $i; + } + elsif($header[$i] eq "Zygosity"){ + $zygosity_column = $i; + } + elsif($header[$i] eq "Pop. freq."){ + $maf_column = $i; + } + elsif($header[$i] eq "Pop. freq. source"){ + $rsid_column = $i; + } +} +if(not defined $chr_column){ + die "Cannot find Chr header in $hgvs_file, aborting\n"; +} +if(not defined $pos_column){ + die "Cannot find Pos header in $hgvs_file, aborting\n"; +} +if(not defined $ref_column){ + die "Cannot find Ref Bases header in $hgvs_file, aborting\n"; +} +if(not defined $var_column){ + die "Cannot find Var Bases header in $hgvs_file, aborting\n"; +} +if(not defined $depth_column){ + die "Cannot find Total Reads header in $hgvs_file, aborting\n"; +} +if(not defined $caveat_column){ + die "Cannot find Caveats header in $hgvs_file, aborting\n"; +} +if(not defined $hgvs_aa_column){ + die "Cannot find HGVS AA header in $hgvs_file, aborting\n"; +} +if(not defined $zygosity_column){ + die "Cannot find Zygosity header in $hgvs_file, aborting\n"; +} +if(not defined $maf_column){ + die "Cannot find Pop. freq. header in $hgvs_file, aborting\n"; +} +if(not defined $rsid_column){ + die "Cannot find rsID header in $hgvs_file, aborting\n"; +} + +my %target_regions; +print STDERR "Reading in coding sequence definitions...\n" unless $quiet; +open(GTF, $coding_gtf) + or die "Cannot open coding sequence GTF file $coding_gtf for reading: $!\n"; +while(){ + next if /^\s*#/; + tr/\r//d; + chomp; + my @fields = split /\t/, $_; + next unless $fields[2] eq "CDS"; + if(not exists $target_regions{$fields[0]}){ + $target_regions{$fields[0]} = []; + } + my $chr = $target_regions{$fields[0]}; + for my $pos ($fields[3]..$fields[4]){ + $target_regions{$fields[0]}->[$pos] = "C"; + } +} +close(GTF); + +print STDERR "Reading in targeted sequence definitions...\n" unless $quiet; +my $intersection_count = 0; +my $targeted_total = 0; +my $non_coding_target_total = 0; +open(BED, $target_bed) + or die "Cannot open target regions BED file $target_bed for reading: $!\n"; +while(){ + next if /^\s*#/; + tr/\r//d; + chomp; + my @fields = split /\t/, $_; + $targeted_total += $fields[2]-$fields[1]+1; + + if(not exists $target_regions{$fields[0]}){ + $target_regions{$fields[0]} = []; + } + my $chr = $target_regions{$fields[0]}; + for my $pos ($fields[1]..$fields[2]){ + if(defined $target_regions{$fields[0]}->[$pos]){ + $intersection_count++; + } + else{ + $target_regions{$fields[0]}->[$pos] = "T"; + $non_coding_target_total++; + } + } +} +close(BED); + +print STDERR "Reading in coverage information...\n" unless $quiet; +open(STATS, $stats_file) + or die "Cannot open $stats_file for reading: $!\n"; +my $total_bases_lt_10x; +my $total_bases_lt_20x; +while(){ + if(/^Total bases with less than 10-fold coverage\t(\d+)/){ + $total_bases_lt_10x = $1; + } + elsif(/^Total bases with less than 20-fold coverage\t(\d+)/){ + $total_bases_lt_20x = $1; + } +} +close(STATS); +my $total_bases_under_consideration_10 = $targeted_total-$non_coding_target_total-$total_bases_lt_10x*(1-$non_coding_target_total/$targeted_total); +my $total_bases_under_consideration_20 = $targeted_total-$non_coding_target_total-$total_bases_lt_20x*(1-$non_coding_target_total/$targeted_total); +my $total_noncoding_bases_under_consideration_10 = $non_coding_target_total-$total_bases_lt_10x*($non_coding_target_total/$targeted_total); +my $total_noncoding_bases_under_consideration_20 = $non_coding_target_total-$total_bases_lt_20x*($non_coding_target_total/$targeted_total); + +print STDERR "Processing called SNPs...\n" unless $quiet; +my %seen; +my $coding_transition_count_10 = 0; # A->G, G->A, C->T, T->C, i.e. effect of deamination of 5'-methyl C to uracil +my $coding_transition_count_20 = 0; +my $coding_transversion_count_10 = 0; # A <-> C, A <-> T, G <-> C or G <-> T +my $coding_transversion_count_20 = 0; +my $coding_snp_count_10 = 0; +my $noncoding_transition_count_10 = 0; +my $noncoding_transition_count_20 = 0; +my $noncoding_transversion_count_10 = 0; +my $noncoding_transversion_count_20 = 0; +my $synonymous_snp_count_10 = 0; +my $snp_count_10 = 0; +my $autosomal_snp_count_10 = 0; +my $novel_snp_count_10 = 0; +my $homo_snp_count_10 = 0; +my $non_coding_snp_count_10 = 0; +my $coding_snp_count_20 = 0; +my $synonymous_snp_count_20 = 0; +my $snp_count_20 = 0; +my $autosomal_snp_count_20 = 0; +my $novel_snp_count_20 = 0; +my $homo_snp_count_20 = 0; +my $non_coding_snp_count_20 = 0; + +# Okay, put all of the protein-coding lines at the start so that if any transcript for a +# gene says it's protein coding, that'll be the state recorded for the SNP. +my @datalines = ; + +my $tot_snps = 0; +for(@datalines){ + chomp; + my @F = split /\t/, $_; + + my $newbases = $F[$var_column]; + my $refbases = $F[$ref_column]; + next unless length($newbases) eq length($refbases); #SNPs and MNPs only + for (my $i = 0; $i < length($newbases); $i++){ + my $newbase = substr($newbases, $i, 1); + my $refbase = substr($refbases, $i, 1); + next if $refbase eq $newbase; # ref in the middle of a phased MNP + + next if exists $seen{"$F[$chr_column]:$F[$pos_column]:$newbase"}; # seen SNP already + $seen{"$F[$chr_column]:$F[$pos_column]:$newbase"} = 1; + + $tot_snps++; + next unless $F[$depth_column] >= 10 and $F[$caveat_column] =~ /^(;?[^;]+auto set to \d\.\d+|)$/; # only look at areas with no caveats (besides auto-set allele frequencies) + + $snp_count_10++; + $snp_count_20++ if $F[$depth_column] >= 20; + if($F[$hgvs_aa_column] eq "NA"){ #HGVS protein field + if(defined $target_regions{$F[$chr_column]}->[$F[$pos_column]]){ + if($target_regions{$F[$chr_column]}->[$F[$pos_column]] eq "C"){ + #print STDERR "Counting $F[18] as alternate coding targeted\n"; + $coding_snp_count_10++; + $coding_snp_count_20++ if $F[$depth_column] >= 20; + $homo_snp_count_10++ if $F[$zygosity_column] =~ /homozygote/; + $homo_snp_count_20++ if $F[$zygosity_column] =~ /homozygote/ and $F[$depth_column] >= 20; + } + elsif($target_regions{$F[$chr_column]}->[$F[$pos_column]] eq "T"){ + #print STDERR "Counting $F[18] as non-coding targeted\n"; + $non_coding_snp_count_10++; + $non_coding_snp_count_20++ if $F[$depth_column] >= 20; + } + # else non-target flanking + else{ + #print STDERR "Ignoring $F[1]:$F[2] as flanking targeted areas (but shouldn't be here)\n"; + } + } + #else non-target flanking + else{ + #print STDERR "Ignoring $F[1]:$F[2] as flanking targeted areas\n"; + } + if($refbase eq "C" and $newbase eq "T" or + $refbase eq "T" and $newbase eq "C" or + $refbase eq "A" and $newbase eq "G" or + $refbase eq "G" and $newbase eq "A"){ + $noncoding_transition_count_10++; + $noncoding_transition_count_20++ if $F[$depth_column] >= 20; + } + else{ + $noncoding_transversion_count_10++; + $noncoding_transversion_count_20++ if $F[$depth_column] >= 20; + } + next; + } + elsif($F[$hgvs_aa_column] =~ /^p\..\d+=$/){ + $synonymous_snp_count_10++; + $synonymous_snp_count_20++ if $F[$depth_column] >= 20; + } + #print STDERR "Counting $F[18] as coding targeted\n"; + if($F[$chr_column] !~ /X/ and $F[$chr_column] !~ /Y/){ + $autosomal_snp_count_10++; + $autosomal_snp_count_20++ if $F[$depth_column] >= 20; + $homo_snp_count_10++ if $F[$zygosity_column] =~ /homozygote/; + $homo_snp_count_20++ if $F[$zygosity_column] =~ /homozygote/ and $F[$depth_column] >= 20; + } + $novel_snp_count_10++ if $F[$rsid_column] eq "novel" and $F[$maf_column] eq "NA"; + $novel_snp_count_20++ if $F[$rsid_column] eq "novel" and $F[$maf_column] eq "NA" and $F[$depth_column] >= 20; + $coding_snp_count_10++; + $coding_snp_count_20++ if $F[$depth_column] >= 20; + if($refbase eq "C" and $newbase eq "T" or + $refbase eq "T" and $newbase eq "C" or + $refbase eq "A" and $newbase eq "G" or + $refbase eq "G" and $newbase eq "A"){ + $coding_transition_count_10++; + $coding_transition_count_20++ if $F[$depth_column] >= 20; + } + else{ + $coding_transversion_count_10++; + $coding_transversion_count_20++ if $F[$depth_column] >= 20; + } + } # end for each new base +} + +open(SUMMARY, ">$output") + or die "Cannot open $output for writing: $!\n"; +printf SUMMARY "Measure\tActual\tIdeal (human)\n"; +printf SUMMARY "Non-coding SNPs observed %% rate (in the ~%d target non-coding bases with >= 10x coverage)\t%.4f\t0.1\n", $total_noncoding_bases_under_consideration_10, $non_coding_snp_count_10/$total_noncoding_bases_under_consideration_10*100; +printf SUMMARY "Non-coding SNPs observed %% rate (in the ~%d target non-coding bases with >= 20x coverage)\t%.4f\t0.1\n", $total_noncoding_bases_under_consideration_20, $non_coding_snp_count_20/$total_noncoding_bases_under_consideration_20*100; +printf SUMMARY "Total coding region of interest\t\t%d\n", $intersection_count; +printf SUMMARY "Total SNP count (10x)\t%d\t%d\n", $coding_snp_count_10, $total_bases_under_consideration_10*0.00048; +printf SUMMARY "Total SNP count (20x)\t%d\t%d\n", $coding_snp_count_20, $total_bases_under_consideration_20*0.00048; +printf SUMMARY "Coding SNPs observed %% rate (in the ~%d target coding bases with >= 10x coverage)\t%.4f\t0.048\n", $total_bases_under_consideration_10, $coding_snp_count_10/$total_bases_under_consideration_10*100; +printf SUMMARY "Coding SNPs observed %% rate (in the ~%d target coding bases with >= 20x coverage)\t%.4f\t0.048\n", $total_bases_under_consideration_20, $coding_snp_count_20/$total_bases_under_consideration_20*100; +printf SUMMARY "Non-synonymous SNPs observed %% rate (10x)\t%.2f\t45\n", ($coding_snp_count_10-$synonymous_snp_count_10)/$coding_snp_count_10*100; +printf SUMMARY "Non-synonymous SNPs observed %% rate (20x)\t%.2f\t45\n", ($coding_snp_count_20-$synonymous_snp_count_20)/$coding_snp_count_20*100; +# 37% comes from 0.59 homo het ratio report for 20 human genomes in doi:10.1371/journal.pgen.1001111 +printf SUMMARY "Homo SNPs observed %% rate (10x, autosomal chromosomes)\t%.4f\t37\n", $homo_snp_count_10/$autosomal_snp_count_10*100; +printf SUMMARY "Homo SNPs observed %% rate (20x, autosomal chromosomes)\t%.4f\t37\n", $homo_snp_count_20/$autosomal_snp_count_20*100; +printf SUMMARY "Novel SNP observed %% rate (10x)\t%.4f\t<1\n", $novel_snp_count_10/$snp_count_10*100; +printf SUMMARY "Novel SNP observed %% rate (20x)\t%.4f\t<1\n", $novel_snp_count_20/$snp_count_20*100; +printf SUMMARY "Coding SNPs transition:transversion (10x)\t%.2f\t3\n", $coding_transition_count_10/$coding_transversion_count_10 if $coding_transversion_count_10; +printf SUMMARY "Coding SNPs transition:transversion (20x)\t%.2f\t3\n", $coding_transition_count_20/$coding_transversion_count_20 if $coding_transversion_count_20; +printf SUMMARY "Non-coding SNPs transition:transversion (10x)\t%.2f\t2.1\n", $noncoding_transition_count_10/$noncoding_transversion_count_10 if $noncoding_transversion_count_10; +printf SUMMARY "Non-coding SNPs transition:transversion (20x)\t%.2f\t2.1\n", $noncoding_transition_count_20/$noncoding_transversion_count_20 if $noncoding_transversion_count_20; +printf SUMMARY "All SNPs transition:transversion (10x)\t%.2f\tNA\n", ($noncoding_transition_count_10+$coding_transition_count_10)/($noncoding_transversion_count_10+$coding_transversion_count_10) if $noncoding_transversion_count_10 or $coding_transversion_count_10; +printf SUMMARY "All SNPs transition:transversion (20x)\t%.2f\tNA\n", ($noncoding_transition_count_20+$coding_transition_count_20)/($noncoding_transversion_count_20+$coding_transversion_count_20) if $noncoding_transversion_count_20 or $noncoding_transversion_count_20; +close(SUMMARY);