Mercurial > repos > yusuf > snp_report
comparison snp_reports @ 0:14e1cf728269 default tip
init commit
| author | Yusuf Ali <ali@yusuf.email> |
|---|---|
| date | Wed, 25 Mar 2015 13:42:26 -0600 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:14e1cf728269 |
|---|---|
| 1 #!/usr/bin/env perl | |
| 2 | |
| 3 use strict; | |
| 4 use warnings; | |
| 5 | |
| 6 if(@ARGV == 1 and $ARGV[0] eq "-v"){ | |
| 7 print "Version 1.0\n"; | |
| 8 exit; | |
| 9 } | |
| 10 | |
| 11 my $quiet = 0; | |
| 12 if(@ARGV and $ARGV[0] =~ /^-q(?:uiet)?$/){ | |
| 13 $quiet = 1; | |
| 14 shift @ARGV; | |
| 15 } | |
| 16 | |
| 17 @ARGV == 5 or die "Usage: $0 [-q(uiet)] <snp table> <targets.bed> <coding.gtf> <coverage stats summary.txt> <output table>\n"; | |
| 18 | |
| 19 my $hgvs_file = $ARGV[0]; | |
| 20 my $target_bed = $ARGV[1]; | |
| 21 my $coding_gtf = $ARGV[2]; | |
| 22 my $stats_file = $ARGV[3]; | |
| 23 my $output = $ARGV[4]; | |
| 24 | |
| 25 open(TAB, $hgvs_file) | |
| 26 or die "Cannot open SNP HGVS file $hgvs_file for reading: $!\n"; | |
| 27 my $header = <TAB>; # header | |
| 28 chomp $header; | |
| 29 my @header = split /\t/, $header; | |
| 30 my ($chr_column, $pos_column, $ref_column, $var_column, $depth_column, $caveat_column, $hgvs_aa_column, $zygosity_column, $rsid_column, $maf_column); | |
| 31 # Transcript type Transcript length Transcript HGVS cDNA Strand Chr Pos Zygosity P-value Variant Reads Total Reads Ref Bases Var Bases Population Frequency Source Pop. freq. or DGV2 gain/loss coverage Since dbSNP Release # HGVS AA Distance from an exon boundary (AA coding variants only) Caveats Phase Sources | |
| 32 for(my $i = 0; $i < $#header; $i++){ | |
| 33 if($header[$i] eq "Chr"){ | |
| 34 $chr_column = $i; | |
| 35 } | |
| 36 elsif($header[$i] eq "DNA From"){ | |
| 37 $pos_column = $i; | |
| 38 } | |
| 39 elsif($header[$i] eq "Ref base"){ | |
| 40 $ref_column = $i; | |
| 41 } | |
| 42 elsif($header[$i] eq "Obs base"){ | |
| 43 $var_column = $i; | |
| 44 } | |
| 45 elsif($header[$i] eq "Total Reads"){ | |
| 46 $depth_column = $i; | |
| 47 } | |
| 48 elsif($header[$i] eq "Caveats"){ | |
| 49 $caveat_column = $i; | |
| 50 } | |
| 51 elsif($header[$i] eq "Protein HGVS"){ | |
| 52 $hgvs_aa_column = $i; | |
| 53 } | |
| 54 elsif($header[$i] eq "Zygosity"){ | |
| 55 $zygosity_column = $i; | |
| 56 } | |
| 57 elsif($header[$i] eq "Pop. freq."){ | |
| 58 $maf_column = $i; | |
| 59 } | |
| 60 elsif($header[$i] eq "Pop. freq. source"){ | |
| 61 $rsid_column = $i; | |
| 62 } | |
| 63 } | |
| 64 if(not defined $chr_column){ | |
| 65 die "Cannot find Chr header in $hgvs_file, aborting\n"; | |
| 66 } | |
| 67 if(not defined $pos_column){ | |
| 68 die "Cannot find Pos header in $hgvs_file, aborting\n"; | |
| 69 } | |
| 70 if(not defined $ref_column){ | |
| 71 die "Cannot find Ref Bases header in $hgvs_file, aborting\n"; | |
| 72 } | |
| 73 if(not defined $var_column){ | |
| 74 die "Cannot find Var Bases header in $hgvs_file, aborting\n"; | |
| 75 } | |
| 76 if(not defined $depth_column){ | |
| 77 die "Cannot find Total Reads header in $hgvs_file, aborting\n"; | |
| 78 } | |
| 79 if(not defined $caveat_column){ | |
| 80 die "Cannot find Caveats header in $hgvs_file, aborting\n"; | |
| 81 } | |
| 82 if(not defined $hgvs_aa_column){ | |
| 83 die "Cannot find HGVS AA header in $hgvs_file, aborting\n"; | |
| 84 } | |
| 85 if(not defined $zygosity_column){ | |
| 86 die "Cannot find Zygosity header in $hgvs_file, aborting\n"; | |
| 87 } | |
| 88 if(not defined $maf_column){ | |
| 89 die "Cannot find Pop. freq. header in $hgvs_file, aborting\n"; | |
| 90 } | |
| 91 if(not defined $rsid_column){ | |
| 92 die "Cannot find rsID header in $hgvs_file, aborting\n"; | |
| 93 } | |
| 94 | |
| 95 my %target_regions; | |
| 96 print STDERR "Reading in coding sequence definitions...\n" unless $quiet; | |
| 97 open(GTF, $coding_gtf) | |
| 98 or die "Cannot open coding sequence GTF file $coding_gtf for reading: $!\n"; | |
| 99 while(<GTF>){ | |
| 100 next if /^\s*#/; | |
| 101 tr/\r//d; | |
| 102 chomp; | |
| 103 my @fields = split /\t/, $_; | |
| 104 next unless $fields[2] eq "CDS"; | |
| 105 if(not exists $target_regions{$fields[0]}){ | |
| 106 $target_regions{$fields[0]} = []; | |
| 107 } | |
| 108 my $chr = $target_regions{$fields[0]}; | |
| 109 for my $pos ($fields[3]..$fields[4]){ | |
| 110 $target_regions{$fields[0]}->[$pos] = "C"; | |
| 111 } | |
| 112 } | |
| 113 close(GTF); | |
| 114 | |
| 115 print STDERR "Reading in targeted sequence definitions...\n" unless $quiet; | |
| 116 my $intersection_count = 0; | |
| 117 my $targeted_total = 0; | |
| 118 my $non_coding_target_total = 0; | |
| 119 open(BED, $target_bed) | |
| 120 or die "Cannot open target regions BED file $target_bed for reading: $!\n"; | |
| 121 while(<BED>){ | |
| 122 next if /^\s*#/; | |
| 123 tr/\r//d; | |
| 124 chomp; | |
| 125 my @fields = split /\t/, $_; | |
| 126 $targeted_total += $fields[2]-$fields[1]+1; | |
| 127 | |
| 128 if(not exists $target_regions{$fields[0]}){ | |
| 129 $target_regions{$fields[0]} = []; | |
| 130 } | |
| 131 my $chr = $target_regions{$fields[0]}; | |
| 132 for my $pos ($fields[1]..$fields[2]){ | |
| 133 if(defined $target_regions{$fields[0]}->[$pos]){ | |
| 134 $intersection_count++; | |
| 135 } | |
| 136 else{ | |
| 137 $target_regions{$fields[0]}->[$pos] = "T"; | |
| 138 $non_coding_target_total++; | |
| 139 } | |
| 140 } | |
| 141 } | |
| 142 close(BED); | |
| 143 | |
| 144 print STDERR "Reading in coverage information...\n" unless $quiet; | |
| 145 open(STATS, $stats_file) | |
| 146 or die "Cannot open $stats_file for reading: $!\n"; | |
| 147 my $total_bases_lt_10x; | |
| 148 my $total_bases_lt_20x; | |
| 149 while(<STATS>){ | |
| 150 if(/^Total bases with less than 10-fold coverage\t(\d+)/){ | |
| 151 $total_bases_lt_10x = $1; | |
| 152 } | |
| 153 elsif(/^Total bases with less than 20-fold coverage\t(\d+)/){ | |
| 154 $total_bases_lt_20x = $1; | |
| 155 } | |
| 156 } | |
| 157 close(STATS); | |
| 158 my $total_bases_under_consideration_10 = $targeted_total-$non_coding_target_total-$total_bases_lt_10x*(1-$non_coding_target_total/$targeted_total); | |
| 159 my $total_bases_under_consideration_20 = $targeted_total-$non_coding_target_total-$total_bases_lt_20x*(1-$non_coding_target_total/$targeted_total); | |
| 160 my $total_noncoding_bases_under_consideration_10 = $non_coding_target_total-$total_bases_lt_10x*($non_coding_target_total/$targeted_total); | |
| 161 my $total_noncoding_bases_under_consideration_20 = $non_coding_target_total-$total_bases_lt_20x*($non_coding_target_total/$targeted_total); | |
| 162 | |
| 163 print STDERR "Processing called SNPs...\n" unless $quiet; | |
| 164 my %seen; | |
| 165 my $coding_transition_count_10 = 0; # A->G, G->A, C->T, T->C, i.e. effect of deamination of 5'-methyl C to uracil | |
| 166 my $coding_transition_count_20 = 0; | |
| 167 my $coding_transversion_count_10 = 0; # A <-> C, A <-> T, G <-> C or G <-> T | |
| 168 my $coding_transversion_count_20 = 0; | |
| 169 my $coding_snp_count_10 = 0; | |
| 170 my $noncoding_transition_count_10 = 0; | |
| 171 my $noncoding_transition_count_20 = 0; | |
| 172 my $noncoding_transversion_count_10 = 0; | |
| 173 my $noncoding_transversion_count_20 = 0; | |
| 174 my $synonymous_snp_count_10 = 0; | |
| 175 my $snp_count_10 = 0; | |
| 176 my $autosomal_snp_count_10 = 0; | |
| 177 my $novel_snp_count_10 = 0; | |
| 178 my $homo_snp_count_10 = 0; | |
| 179 my $non_coding_snp_count_10 = 0; | |
| 180 my $coding_snp_count_20 = 0; | |
| 181 my $synonymous_snp_count_20 = 0; | |
| 182 my $snp_count_20 = 0; | |
| 183 my $autosomal_snp_count_20 = 0; | |
| 184 my $novel_snp_count_20 = 0; | |
| 185 my $homo_snp_count_20 = 0; | |
| 186 my $non_coding_snp_count_20 = 0; | |
| 187 | |
| 188 # Okay, put all of the protein-coding lines at the start so that if any transcript for a | |
| 189 # gene says it's protein coding, that'll be the state recorded for the SNP. | |
| 190 my @datalines = <TAB>; | |
| 191 | |
| 192 my $tot_snps = 0; | |
| 193 for(@datalines){ | |
| 194 chomp; | |
| 195 my @F = split /\t/, $_; | |
| 196 | |
| 197 my $newbases = $F[$var_column]; | |
| 198 my $refbases = $F[$ref_column]; | |
| 199 next unless length($newbases) eq length($refbases); #SNPs and MNPs only | |
| 200 for (my $i = 0; $i < length($newbases); $i++){ | |
| 201 my $newbase = substr($newbases, $i, 1); | |
| 202 my $refbase = substr($refbases, $i, 1); | |
| 203 next if $refbase eq $newbase; # ref in the middle of a phased MNP | |
| 204 | |
| 205 next if exists $seen{"$F[$chr_column]:$F[$pos_column]:$newbase"}; # seen SNP already | |
| 206 $seen{"$F[$chr_column]:$F[$pos_column]:$newbase"} = 1; | |
| 207 | |
| 208 $tot_snps++; | |
| 209 next unless $F[$depth_column] >= 10 and $F[$caveat_column] =~ /^(;?[^;]+auto set to \d\.\d+|)$/; # only look at areas with no caveats (besides auto-set allele frequencies) | |
| 210 | |
| 211 $snp_count_10++; | |
| 212 $snp_count_20++ if $F[$depth_column] >= 20; | |
| 213 if($F[$hgvs_aa_column] eq "NA"){ #HGVS protein field | |
| 214 if(defined $target_regions{$F[$chr_column]}->[$F[$pos_column]]){ | |
| 215 if($target_regions{$F[$chr_column]}->[$F[$pos_column]] eq "C"){ | |
| 216 #print STDERR "Counting $F[18] as alternate coding targeted\n"; | |
| 217 $coding_snp_count_10++; | |
| 218 $coding_snp_count_20++ if $F[$depth_column] >= 20; | |
| 219 $homo_snp_count_10++ if $F[$zygosity_column] =~ /homozygote/; | |
| 220 $homo_snp_count_20++ if $F[$zygosity_column] =~ /homozygote/ and $F[$depth_column] >= 20; | |
| 221 } | |
| 222 elsif($target_regions{$F[$chr_column]}->[$F[$pos_column]] eq "T"){ | |
| 223 #print STDERR "Counting $F[18] as non-coding targeted\n"; | |
| 224 $non_coding_snp_count_10++; | |
| 225 $non_coding_snp_count_20++ if $F[$depth_column] >= 20; | |
| 226 } | |
| 227 # else non-target flanking | |
| 228 else{ | |
| 229 #print STDERR "Ignoring $F[1]:$F[2] as flanking targeted areas (but shouldn't be here)\n"; | |
| 230 } | |
| 231 } | |
| 232 #else non-target flanking | |
| 233 else{ | |
| 234 #print STDERR "Ignoring $F[1]:$F[2] as flanking targeted areas\n"; | |
| 235 } | |
| 236 if($refbase eq "C" and $newbase eq "T" or | |
| 237 $refbase eq "T" and $newbase eq "C" or | |
| 238 $refbase eq "A" and $newbase eq "G" or | |
| 239 $refbase eq "G" and $newbase eq "A"){ | |
| 240 $noncoding_transition_count_10++; | |
| 241 $noncoding_transition_count_20++ if $F[$depth_column] >= 20; | |
| 242 } | |
| 243 else{ | |
| 244 $noncoding_transversion_count_10++; | |
| 245 $noncoding_transversion_count_20++ if $F[$depth_column] >= 20; | |
| 246 } | |
| 247 next; | |
| 248 } | |
| 249 elsif($F[$hgvs_aa_column] =~ /^p\..\d+=$/){ | |
| 250 $synonymous_snp_count_10++; | |
| 251 $synonymous_snp_count_20++ if $F[$depth_column] >= 20; | |
| 252 } | |
| 253 #print STDERR "Counting $F[18] as coding targeted\n"; | |
| 254 if($F[$chr_column] !~ /X/ and $F[$chr_column] !~ /Y/){ | |
| 255 $autosomal_snp_count_10++; | |
| 256 $autosomal_snp_count_20++ if $F[$depth_column] >= 20; | |
| 257 $homo_snp_count_10++ if $F[$zygosity_column] =~ /homozygote/; | |
| 258 $homo_snp_count_20++ if $F[$zygosity_column] =~ /homozygote/ and $F[$depth_column] >= 20; | |
| 259 } | |
| 260 $novel_snp_count_10++ if $F[$rsid_column] eq "novel" and $F[$maf_column] eq "NA"; | |
| 261 $novel_snp_count_20++ if $F[$rsid_column] eq "novel" and $F[$maf_column] eq "NA" and $F[$depth_column] >= 20; | |
| 262 $coding_snp_count_10++; | |
| 263 $coding_snp_count_20++ if $F[$depth_column] >= 20; | |
| 264 if($refbase eq "C" and $newbase eq "T" or | |
| 265 $refbase eq "T" and $newbase eq "C" or | |
| 266 $refbase eq "A" and $newbase eq "G" or | |
| 267 $refbase eq "G" and $newbase eq "A"){ | |
| 268 $coding_transition_count_10++; | |
| 269 $coding_transition_count_20++ if $F[$depth_column] >= 20; | |
| 270 } | |
| 271 else{ | |
| 272 $coding_transversion_count_10++; | |
| 273 $coding_transversion_count_20++ if $F[$depth_column] >= 20; | |
| 274 } | |
| 275 } # end for each new base | |
| 276 } | |
| 277 | |
| 278 open(SUMMARY, ">$output") | |
| 279 or die "Cannot open $output for writing: $!\n"; | |
| 280 printf SUMMARY "Measure\tActual\tIdeal (human)\n"; | |
| 281 printf SUMMARY "Non-coding SNPs observed %% rate (in the ~%d target non-coding bases with >= 10x coverage)\t%.4f\t0.1\n", $total_noncoding_bases_under_consideration_10, $non_coding_snp_count_10/$total_noncoding_bases_under_consideration_10*100; | |
| 282 printf SUMMARY "Non-coding SNPs observed %% rate (in the ~%d target non-coding bases with >= 20x coverage)\t%.4f\t0.1\n", $total_noncoding_bases_under_consideration_20, $non_coding_snp_count_20/$total_noncoding_bases_under_consideration_20*100; | |
| 283 printf SUMMARY "Total coding region of interest\t\t%d\n", $intersection_count; | |
| 284 printf SUMMARY "Total SNP count (10x)\t%d\t%d\n", $coding_snp_count_10, $total_bases_under_consideration_10*0.00048; | |
| 285 printf SUMMARY "Total SNP count (20x)\t%d\t%d\n", $coding_snp_count_20, $total_bases_under_consideration_20*0.00048; | |
| 286 printf SUMMARY "Coding SNPs observed %% rate (in the ~%d target coding bases with >= 10x coverage)\t%.4f\t0.048\n", $total_bases_under_consideration_10, $coding_snp_count_10/$total_bases_under_consideration_10*100; | |
| 287 printf SUMMARY "Coding SNPs observed %% rate (in the ~%d target coding bases with >= 20x coverage)\t%.4f\t0.048\n", $total_bases_under_consideration_20, $coding_snp_count_20/$total_bases_under_consideration_20*100; | |
| 288 printf SUMMARY "Non-synonymous SNPs observed %% rate (10x)\t%.2f\t45\n", ($coding_snp_count_10-$synonymous_snp_count_10)/$coding_snp_count_10*100; | |
| 289 printf SUMMARY "Non-synonymous SNPs observed %% rate (20x)\t%.2f\t45\n", ($coding_snp_count_20-$synonymous_snp_count_20)/$coding_snp_count_20*100; | |
| 290 # 37% comes from 0.59 homo het ratio report for 20 human genomes in doi:10.1371/journal.pgen.1001111 | |
| 291 printf SUMMARY "Homo SNPs observed %% rate (10x, autosomal chromosomes)\t%.4f\t37\n", $homo_snp_count_10/$autosomal_snp_count_10*100; | |
| 292 printf SUMMARY "Homo SNPs observed %% rate (20x, autosomal chromosomes)\t%.4f\t37\n", $homo_snp_count_20/$autosomal_snp_count_20*100; | |
| 293 printf SUMMARY "Novel SNP observed %% rate (10x)\t%.4f\t<1\n", $novel_snp_count_10/$snp_count_10*100; | |
| 294 printf SUMMARY "Novel SNP observed %% rate (20x)\t%.4f\t<1\n", $novel_snp_count_20/$snp_count_20*100; | |
| 295 printf SUMMARY "Coding SNPs transition:transversion (10x)\t%.2f\t3\n", $coding_transition_count_10/$coding_transversion_count_10 if $coding_transversion_count_10; | |
| 296 printf SUMMARY "Coding SNPs transition:transversion (20x)\t%.2f\t3\n", $coding_transition_count_20/$coding_transversion_count_20 if $coding_transversion_count_20; | |
| 297 printf SUMMARY "Non-coding SNPs transition:transversion (10x)\t%.2f\t2.1\n", $noncoding_transition_count_10/$noncoding_transversion_count_10 if $noncoding_transversion_count_10; | |
| 298 printf SUMMARY "Non-coding SNPs transition:transversion (20x)\t%.2f\t2.1\n", $noncoding_transition_count_20/$noncoding_transversion_count_20 if $noncoding_transversion_count_20; | |
| 299 printf SUMMARY "All SNPs transition:transversion (10x)\t%.2f\tNA\n", ($noncoding_transition_count_10+$coding_transition_count_10)/($noncoding_transversion_count_10+$coding_transversion_count_10) if $noncoding_transversion_count_10 or $coding_transversion_count_10; | |
| 300 printf SUMMARY "All SNPs transition:transversion (20x)\t%.2f\tNA\n", ($noncoding_transition_count_20+$coding_transition_count_20)/($noncoding_transversion_count_20+$coding_transversion_count_20) if $noncoding_transversion_count_20 or $noncoding_transversion_count_20; | |
| 301 close(SUMMARY); |
