annotate snp_reports @ 0:14e1cf728269 default tip

init commit
author Yusuf Ali <ali@yusuf.email>
date Wed, 25 Mar 2015 13:42:26 -0600
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1 #!/usr/bin/env perl
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
3 use strict;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
4 use warnings;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
5
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
6 if(@ARGV == 1 and $ARGV[0] eq "-v"){
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
7 print "Version 1.0\n";
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
8 exit;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
9 }
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
10
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
11 my $quiet = 0;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
12 if(@ARGV and $ARGV[0] =~ /^-q(?:uiet)?$/){
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
13 $quiet = 1;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
14 shift @ARGV;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
15 }
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
16
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
17 @ARGV == 5 or die "Usage: $0 [-q(uiet)] <snp table> <targets.bed> <coding.gtf> <coverage stats summary.txt> <output table>\n";
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
18
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
19 my $hgvs_file = $ARGV[0];
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
20 my $target_bed = $ARGV[1];
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
21 my $coding_gtf = $ARGV[2];
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
22 my $stats_file = $ARGV[3];
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
23 my $output = $ARGV[4];
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
24
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
25 open(TAB, $hgvs_file)
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
26 or die "Cannot open SNP HGVS file $hgvs_file for reading: $!\n";
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
27 my $header = <TAB>; # header
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
28 chomp $header;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
29 my @header = split /\t/, $header;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
30 my ($chr_column, $pos_column, $ref_column, $var_column, $depth_column, $caveat_column, $hgvs_aa_column, $zygosity_column, $rsid_column, $maf_column);
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
31 # Transcript type Transcript length Transcript HGVS cDNA Strand Chr Pos Zygosity P-value Variant Reads Total Reads Ref Bases Var Bases Population Frequency Source Pop. freq. or DGV2 gain/loss coverage Since dbSNP Release # HGVS AA Distance from an exon boundary (AA coding variants only) Caveats Phase Sources
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
32 for(my $i = 0; $i < $#header; $i++){
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
33 if($header[$i] eq "Chr"){
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
34 $chr_column = $i;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
35 }
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
36 elsif($header[$i] eq "DNA From"){
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
37 $pos_column = $i;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
38 }
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
39 elsif($header[$i] eq "Ref base"){
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
40 $ref_column = $i;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
41 }
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
42 elsif($header[$i] eq "Obs base"){
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
43 $var_column = $i;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
44 }
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
45 elsif($header[$i] eq "Total Reads"){
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
46 $depth_column = $i;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
47 }
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
48 elsif($header[$i] eq "Caveats"){
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
49 $caveat_column = $i;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
50 }
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
51 elsif($header[$i] eq "Protein HGVS"){
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
52 $hgvs_aa_column = $i;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
53 }
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
54 elsif($header[$i] eq "Zygosity"){
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
55 $zygosity_column = $i;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
56 }
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
57 elsif($header[$i] eq "Pop. freq."){
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
58 $maf_column = $i;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
59 }
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
60 elsif($header[$i] eq "Pop. freq. source"){
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
61 $rsid_column = $i;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
62 }
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
63 }
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
64 if(not defined $chr_column){
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
65 die "Cannot find Chr header in $hgvs_file, aborting\n";
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
66 }
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
67 if(not defined $pos_column){
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
68 die "Cannot find Pos header in $hgvs_file, aborting\n";
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
69 }
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
70 if(not defined $ref_column){
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
71 die "Cannot find Ref Bases header in $hgvs_file, aborting\n";
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
72 }
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
73 if(not defined $var_column){
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
74 die "Cannot find Var Bases header in $hgvs_file, aborting\n";
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
75 }
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
76 if(not defined $depth_column){
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
77 die "Cannot find Total Reads header in $hgvs_file, aborting\n";
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
78 }
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
79 if(not defined $caveat_column){
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
80 die "Cannot find Caveats header in $hgvs_file, aborting\n";
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
81 }
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
82 if(not defined $hgvs_aa_column){
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
83 die "Cannot find HGVS AA header in $hgvs_file, aborting\n";
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
84 }
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
85 if(not defined $zygosity_column){
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
86 die "Cannot find Zygosity header in $hgvs_file, aborting\n";
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
87 }
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
88 if(not defined $maf_column){
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
89 die "Cannot find Pop. freq. header in $hgvs_file, aborting\n";
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
90 }
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
91 if(not defined $rsid_column){
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
92 die "Cannot find rsID header in $hgvs_file, aborting\n";
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
93 }
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
94
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
95 my %target_regions;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
96 print STDERR "Reading in coding sequence definitions...\n" unless $quiet;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
97 open(GTF, $coding_gtf)
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
98 or die "Cannot open coding sequence GTF file $coding_gtf for reading: $!\n";
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
99 while(<GTF>){
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
100 next if /^\s*#/;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
101 tr/\r//d;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
102 chomp;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
103 my @fields = split /\t/, $_;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
104 next unless $fields[2] eq "CDS";
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
105 if(not exists $target_regions{$fields[0]}){
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
106 $target_regions{$fields[0]} = [];
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
107 }
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
108 my $chr = $target_regions{$fields[0]};
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
109 for my $pos ($fields[3]..$fields[4]){
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
110 $target_regions{$fields[0]}->[$pos] = "C";
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
111 }
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
112 }
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
113 close(GTF);
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
114
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
115 print STDERR "Reading in targeted sequence definitions...\n" unless $quiet;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
116 my $intersection_count = 0;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
117 my $targeted_total = 0;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
118 my $non_coding_target_total = 0;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
119 open(BED, $target_bed)
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
120 or die "Cannot open target regions BED file $target_bed for reading: $!\n";
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
121 while(<BED>){
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
122 next if /^\s*#/;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
123 tr/\r//d;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
124 chomp;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
125 my @fields = split /\t/, $_;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
126 $targeted_total += $fields[2]-$fields[1]+1;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
127
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
128 if(not exists $target_regions{$fields[0]}){
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
129 $target_regions{$fields[0]} = [];
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
130 }
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
131 my $chr = $target_regions{$fields[0]};
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
132 for my $pos ($fields[1]..$fields[2]){
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
133 if(defined $target_regions{$fields[0]}->[$pos]){
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
134 $intersection_count++;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
135 }
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
136 else{
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
137 $target_regions{$fields[0]}->[$pos] = "T";
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
138 $non_coding_target_total++;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
139 }
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
140 }
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
141 }
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
142 close(BED);
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
143
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
144 print STDERR "Reading in coverage information...\n" unless $quiet;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
145 open(STATS, $stats_file)
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
146 or die "Cannot open $stats_file for reading: $!\n";
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
147 my $total_bases_lt_10x;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
148 my $total_bases_lt_20x;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
149 while(<STATS>){
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
150 if(/^Total bases with less than 10-fold coverage\t(\d+)/){
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
151 $total_bases_lt_10x = $1;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
152 }
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
153 elsif(/^Total bases with less than 20-fold coverage\t(\d+)/){
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
154 $total_bases_lt_20x = $1;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
155 }
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
156 }
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
157 close(STATS);
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
158 my $total_bases_under_consideration_10 = $targeted_total-$non_coding_target_total-$total_bases_lt_10x*(1-$non_coding_target_total/$targeted_total);
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
159 my $total_bases_under_consideration_20 = $targeted_total-$non_coding_target_total-$total_bases_lt_20x*(1-$non_coding_target_total/$targeted_total);
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
160 my $total_noncoding_bases_under_consideration_10 = $non_coding_target_total-$total_bases_lt_10x*($non_coding_target_total/$targeted_total);
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
161 my $total_noncoding_bases_under_consideration_20 = $non_coding_target_total-$total_bases_lt_20x*($non_coding_target_total/$targeted_total);
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
162
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
163 print STDERR "Processing called SNPs...\n" unless $quiet;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
164 my %seen;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
165 my $coding_transition_count_10 = 0; # A->G, G->A, C->T, T->C, i.e. effect of deamination of 5'-methyl C to uracil
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
166 my $coding_transition_count_20 = 0;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
167 my $coding_transversion_count_10 = 0; # A <-> C, A <-> T, G <-> C or G <-> T
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
168 my $coding_transversion_count_20 = 0;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
169 my $coding_snp_count_10 = 0;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
170 my $noncoding_transition_count_10 = 0;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
171 my $noncoding_transition_count_20 = 0;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
172 my $noncoding_transversion_count_10 = 0;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
173 my $noncoding_transversion_count_20 = 0;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
174 my $synonymous_snp_count_10 = 0;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
175 my $snp_count_10 = 0;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
176 my $autosomal_snp_count_10 = 0;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
177 my $novel_snp_count_10 = 0;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
178 my $homo_snp_count_10 = 0;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
179 my $non_coding_snp_count_10 = 0;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
180 my $coding_snp_count_20 = 0;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
181 my $synonymous_snp_count_20 = 0;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
182 my $snp_count_20 = 0;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
183 my $autosomal_snp_count_20 = 0;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
184 my $novel_snp_count_20 = 0;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
185 my $homo_snp_count_20 = 0;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
186 my $non_coding_snp_count_20 = 0;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
187
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
188 # Okay, put all of the protein-coding lines at the start so that if any transcript for a
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
189 # gene says it's protein coding, that'll be the state recorded for the SNP.
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
190 my @datalines = <TAB>;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
191
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
192 my $tot_snps = 0;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
193 for(@datalines){
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
194 chomp;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
195 my @F = split /\t/, $_;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
196
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
197 my $newbases = $F[$var_column];
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
198 my $refbases = $F[$ref_column];
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
199 next unless length($newbases) eq length($refbases); #SNPs and MNPs only
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
200 for (my $i = 0; $i < length($newbases); $i++){
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
201 my $newbase = substr($newbases, $i, 1);
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
202 my $refbase = substr($refbases, $i, 1);
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
203 next if $refbase eq $newbase; # ref in the middle of a phased MNP
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
204
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
205 next if exists $seen{"$F[$chr_column]:$F[$pos_column]:$newbase"}; # seen SNP already
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
206 $seen{"$F[$chr_column]:$F[$pos_column]:$newbase"} = 1;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
207
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
208 $tot_snps++;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
209 next unless $F[$depth_column] >= 10 and $F[$caveat_column] =~ /^(;?[^;]+auto set to \d\.\d+|)$/; # only look at areas with no caveats (besides auto-set allele frequencies)
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
210
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
211 $snp_count_10++;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
212 $snp_count_20++ if $F[$depth_column] >= 20;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
213 if($F[$hgvs_aa_column] eq "NA"){ #HGVS protein field
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
214 if(defined $target_regions{$F[$chr_column]}->[$F[$pos_column]]){
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
215 if($target_regions{$F[$chr_column]}->[$F[$pos_column]] eq "C"){
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
216 #print STDERR "Counting $F[18] as alternate coding targeted\n";
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
217 $coding_snp_count_10++;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
218 $coding_snp_count_20++ if $F[$depth_column] >= 20;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
219 $homo_snp_count_10++ if $F[$zygosity_column] =~ /homozygote/;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
220 $homo_snp_count_20++ if $F[$zygosity_column] =~ /homozygote/ and $F[$depth_column] >= 20;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
221 }
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
222 elsif($target_regions{$F[$chr_column]}->[$F[$pos_column]] eq "T"){
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
223 #print STDERR "Counting $F[18] as non-coding targeted\n";
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
224 $non_coding_snp_count_10++;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
225 $non_coding_snp_count_20++ if $F[$depth_column] >= 20;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
226 }
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
227 # else non-target flanking
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
228 else{
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
229 #print STDERR "Ignoring $F[1]:$F[2] as flanking targeted areas (but shouldn't be here)\n";
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
230 }
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
231 }
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
232 #else non-target flanking
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
233 else{
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
234 #print STDERR "Ignoring $F[1]:$F[2] as flanking targeted areas\n";
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
235 }
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
236 if($refbase eq "C" and $newbase eq "T" or
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
237 $refbase eq "T" and $newbase eq "C" or
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
238 $refbase eq "A" and $newbase eq "G" or
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
239 $refbase eq "G" and $newbase eq "A"){
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
240 $noncoding_transition_count_10++;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
241 $noncoding_transition_count_20++ if $F[$depth_column] >= 20;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
242 }
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
243 else{
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
244 $noncoding_transversion_count_10++;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
245 $noncoding_transversion_count_20++ if $F[$depth_column] >= 20;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
246 }
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
247 next;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
248 }
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
249 elsif($F[$hgvs_aa_column] =~ /^p\..\d+=$/){
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
250 $synonymous_snp_count_10++;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
251 $synonymous_snp_count_20++ if $F[$depth_column] >= 20;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
252 }
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
253 #print STDERR "Counting $F[18] as coding targeted\n";
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
254 if($F[$chr_column] !~ /X/ and $F[$chr_column] !~ /Y/){
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
255 $autosomal_snp_count_10++;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
256 $autosomal_snp_count_20++ if $F[$depth_column] >= 20;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
257 $homo_snp_count_10++ if $F[$zygosity_column] =~ /homozygote/;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
258 $homo_snp_count_20++ if $F[$zygosity_column] =~ /homozygote/ and $F[$depth_column] >= 20;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
259 }
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
260 $novel_snp_count_10++ if $F[$rsid_column] eq "novel" and $F[$maf_column] eq "NA";
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
261 $novel_snp_count_20++ if $F[$rsid_column] eq "novel" and $F[$maf_column] eq "NA" and $F[$depth_column] >= 20;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
262 $coding_snp_count_10++;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
263 $coding_snp_count_20++ if $F[$depth_column] >= 20;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
264 if($refbase eq "C" and $newbase eq "T" or
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
265 $refbase eq "T" and $newbase eq "C" or
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
266 $refbase eq "A" and $newbase eq "G" or
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
267 $refbase eq "G" and $newbase eq "A"){
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
268 $coding_transition_count_10++;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
269 $coding_transition_count_20++ if $F[$depth_column] >= 20;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
270 }
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
271 else{
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
272 $coding_transversion_count_10++;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
273 $coding_transversion_count_20++ if $F[$depth_column] >= 20;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
274 }
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
275 } # end for each new base
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
276 }
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
277
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
278 open(SUMMARY, ">$output")
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
279 or die "Cannot open $output for writing: $!\n";
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
280 printf SUMMARY "Measure\tActual\tIdeal (human)\n";
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
281 printf SUMMARY "Non-coding SNPs observed %% rate (in the ~%d target non-coding bases with >= 10x coverage)\t%.4f\t0.1\n", $total_noncoding_bases_under_consideration_10, $non_coding_snp_count_10/$total_noncoding_bases_under_consideration_10*100;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
282 printf SUMMARY "Non-coding SNPs observed %% rate (in the ~%d target non-coding bases with >= 20x coverage)\t%.4f\t0.1\n", $total_noncoding_bases_under_consideration_20, $non_coding_snp_count_20/$total_noncoding_bases_under_consideration_20*100;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
283 printf SUMMARY "Total coding region of interest\t\t%d\n", $intersection_count;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
284 printf SUMMARY "Total SNP count (10x)\t%d\t%d\n", $coding_snp_count_10, $total_bases_under_consideration_10*0.00048;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
285 printf SUMMARY "Total SNP count (20x)\t%d\t%d\n", $coding_snp_count_20, $total_bases_under_consideration_20*0.00048;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
286 printf SUMMARY "Coding SNPs observed %% rate (in the ~%d target coding bases with >= 10x coverage)\t%.4f\t0.048\n", $total_bases_under_consideration_10, $coding_snp_count_10/$total_bases_under_consideration_10*100;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
287 printf SUMMARY "Coding SNPs observed %% rate (in the ~%d target coding bases with >= 20x coverage)\t%.4f\t0.048\n", $total_bases_under_consideration_20, $coding_snp_count_20/$total_bases_under_consideration_20*100;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
288 printf SUMMARY "Non-synonymous SNPs observed %% rate (10x)\t%.2f\t45\n", ($coding_snp_count_10-$synonymous_snp_count_10)/$coding_snp_count_10*100;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
289 printf SUMMARY "Non-synonymous SNPs observed %% rate (20x)\t%.2f\t45\n", ($coding_snp_count_20-$synonymous_snp_count_20)/$coding_snp_count_20*100;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
290 # 37% comes from 0.59 homo het ratio report for 20 human genomes in doi:10.1371/journal.pgen.1001111
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
291 printf SUMMARY "Homo SNPs observed %% rate (10x, autosomal chromosomes)\t%.4f\t37\n", $homo_snp_count_10/$autosomal_snp_count_10*100;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
292 printf SUMMARY "Homo SNPs observed %% rate (20x, autosomal chromosomes)\t%.4f\t37\n", $homo_snp_count_20/$autosomal_snp_count_20*100;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
293 printf SUMMARY "Novel SNP observed %% rate (10x)\t%.4f\t<1\n", $novel_snp_count_10/$snp_count_10*100;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
294 printf SUMMARY "Novel SNP observed %% rate (20x)\t%.4f\t<1\n", $novel_snp_count_20/$snp_count_20*100;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
295 printf SUMMARY "Coding SNPs transition:transversion (10x)\t%.2f\t3\n", $coding_transition_count_10/$coding_transversion_count_10 if $coding_transversion_count_10;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
296 printf SUMMARY "Coding SNPs transition:transversion (20x)\t%.2f\t3\n", $coding_transition_count_20/$coding_transversion_count_20 if $coding_transversion_count_20;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
297 printf SUMMARY "Non-coding SNPs transition:transversion (10x)\t%.2f\t2.1\n", $noncoding_transition_count_10/$noncoding_transversion_count_10 if $noncoding_transversion_count_10;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
298 printf SUMMARY "Non-coding SNPs transition:transversion (20x)\t%.2f\t2.1\n", $noncoding_transition_count_20/$noncoding_transversion_count_20 if $noncoding_transversion_count_20;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
299 printf SUMMARY "All SNPs transition:transversion (10x)\t%.2f\tNA\n", ($noncoding_transition_count_10+$coding_transition_count_10)/($noncoding_transversion_count_10+$coding_transversion_count_10) if $noncoding_transversion_count_10 or $coding_transversion_count_10;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
300 printf SUMMARY "All SNPs transition:transversion (20x)\t%.2f\tNA\n", ($noncoding_transition_count_20+$coding_transition_count_20)/($noncoding_transversion_count_20+$coding_transversion_count_20) if $noncoding_transversion_count_20 or $noncoding_transversion_count_20;
14e1cf728269 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
301 close(SUMMARY);