annotate kappa_report @ 0:2d601bd04c93 default tip

initial commit
author Yusuf Ali <ali@yusuf.email>
date Wed, 25 Mar 2015 13:28:12 -0600
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1 #!/usr/bin/env perl
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
3 use strict;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
4 use warnings;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
5
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
6
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
7 my $quiet = 0;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
8 if(@ARGV and $ARGV[0] =~ /^-q/){
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
9 $quiet = 1;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
10 shift @ARGV;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
11 }
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
12 # Calculates Cohen's kappa measure for inter-annotator agreement, here for two genotype callers
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
13 # SPECIAL ATTENTION MUST BE PAID to include or exclude calls using depth of coverage information, and only look at targeted regions (e.g. exome analysis)
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
14 # Also, the probability of a non-call must be incorporated, as there is going to be 99.9% agreement by default due to the low rate of
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
15 # SNPs in the genome in human, for example.
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
16 @ARGV == 9 or die "Usage: $0 [-q(uiet)] <exome targets.bed> <coding regions.gtf> <genotypes1.hgvs.txt> <bam1> <coverage cutoff1> <genotypes2.hgvs.txt> <bam2> <coverage cutoff2> <output report.txt>\n";
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
17
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
18 my $targets_file = shift @ARGV;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
19 my $coding_file = shift @ARGV;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
20 my $calls1_file = shift @ARGV;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
21 my $bam_file1 = shift @ARGV;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
22 my $calls1_min_depth = shift @ARGV;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
23 my $calls2_file = shift @ARGV;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
24 my $bam_file2 = shift @ARGV;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
25 my $calls2_min_depth = shift @ARGV;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
26 my $output_file = shift @ARGV;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
27
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
28 my %reporting;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
29 print STDERR "Reading in target regions list...\n" unless $quiet;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
30 open(BED, $targets_file)
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
31 or die "Cannot open target BED file $targets_file for reading: $!\n";
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
32 while(<BED>){
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
33 next if /^(\s*#|track=)/;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
34 chomp;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
35 my @fields = split /\t/, $_;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
36
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
37 if(not exists $reporting{$fields[0]}){
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
38 $reporting{$fields[0]} = [[$fields[1], $fields[2]]];
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
39 next;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
40 }
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
41 push @{$reporting{$fields[0]}}, [$fields[1], $fields[2]];
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
42 }
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
43 close(BED);
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
44 for my $c (keys %reporting){
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
45 $reporting{$c} = [sort {$a->[0] <=> $b->[0]} @{$reporting{$c}}];
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
46 }
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
47
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
48 my %coding;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
49 print STDERR "Reading in coding regions list...\n" unless $quiet;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
50 open(GTF, $coding_file)
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
51 or die "Cannot open $coding_file for reading: $!\n";
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
52 while(<GTF>){
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
53 next if /^\s*#/;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
54 my @fields = split /\t/, $_;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
55
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
56 if($fields[2] eq "CDS"){
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
57 if(not exists $coding{$fields[0]}){
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
58 $coding{$fields[0]} = [[$fields[3], $fields[4]]];
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
59 next;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
60 }
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
61 push @{$coding{$fields[0]}}, [$fields[3], $fields[4]];
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
62 }
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
63 }
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
64 close(GTF);
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
65 for my $c (keys %coding){
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
66 $coding{$c} = [sort {$a->[0] <=> $b->[0]} @{$coding{$c}}];
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
67 }
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
68
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
69 print STDERR "Reading in genotype calls...\n" unless $quiet;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
70 open(OUT, ">$output_file")
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
71 or die "Cannot open $output_file for writing: $!\n";
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
72 my ($calls1, $subpar_calls1) = retrieve_calls($calls1_file, \%reporting, \%coding, $calls1_min_depth);
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
73 my ($calls2, $subpar_calls2) = retrieve_calls($calls2_file, \%reporting, \%coding, $calls2_min_depth);
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
74
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
75 my $total_calls1 = 0;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
76 for my $chr (keys %$calls1){
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
77 $total_calls1 += scalar(keys %{$calls1->{$chr}});
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
78 }
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
79 my $total_calls2 = 0;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
80 for my $chr (keys %$calls2){
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
81 $total_calls2 += scalar(keys %{$calls2->{$chr}});
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
82 }
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
83
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
84 print OUT "Total Method 1 calls\t$total_calls1\n";
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
85 print OUT "Total Method 2 calls\t$total_calls2\n";
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
86 print STDERR "Comparing genotype calls...\n" unless $quiet;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
87 # Go through the BAM file in all targeted locations to see if there is a reference or variant call (HGVS file only shows variants)
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
88 my $bases_diff = 0;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
89 my $bases_same = 0;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
90 my $zygosities_diff = 0;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
91 my $zygosities_same = 0;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
92 for my $chr (keys %$calls1){
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
93 for my $pos (keys %{$calls1->{$chr}}){
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
94 if(exists $calls2->{$chr}->{$pos}){
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
95 # same variant base
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
96 if($calls2->{$chr}->{$pos}->[0] eq $calls1->{$chr}->{$pos}->[0]){
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
97 $bases_same++;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
98 if($calls1->{$chr}->{$pos}->[1] eq $calls2->{$chr}->{$pos}->[1]){
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
99 $zygosities_same++;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
100 }
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
101 else{
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
102 $zygosities_diff++;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
103 }
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
104 }
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
105 # diff variant base
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
106 else{
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
107 $bases_diff++;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
108 }
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
109 delete $calls1->{$chr}->{$pos}; # so in the end all that's left is method-1-only calls in the hash
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
110 delete $calls2->{$chr}->{$pos}; # so in the end all that's left is method-2-only calls in the hash
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
111 delete $subpar_calls1->{$chr}->{$pos} if exists $subpar_calls1->{$chr}; # so in the end all that's left is method-1-only calls in the hash
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
112 delete $subpar_calls2->{$chr}->{$pos} if exists $subpar_calls2->{$chr}; # so in the end all that's left is method-2-only calls in the hash
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
113 }
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
114 }
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
115 }
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
116
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
117 print OUT "Same calls\t$zygosities_same\n";
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
118 print OUT "Diff zygosity\t$zygosities_diff\n";
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
119 print OUT "Diff bases\t$bases_diff\n";
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
120
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
121 my $covbed = "$$.bed";
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
122 my $num_to_check = 0;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
123 open(TMPCOV, ">$covbed")
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
124 or die "Cannot open temporary BED file $covbed for samtools: $!\n";
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
125 for my $chr (keys %$calls1){
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
126 for my $pos (keys %{$calls1->{$chr}}){
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
127 if(exists $subpar_calls2->{$chr} and exists $subpar_calls2->{$chr}->{$pos}){
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
128 next; #exclude counting positions with sufficent coverage in one analysis, but not the other
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
129 }
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
130 else{
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
131 $num_to_check++;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
132 print TMPCOV "$chr\t$pos\n";
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
133 }
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
134 }
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
135 }
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
136 close(TMPCOV);
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
137
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
138 print STDERR "Checking $num_to_check read depths for missing calls in $bam_file2...\n" unless $quiet;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
139 my $remaining_calls1_homo = 0;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
140 my $remaining_calls1_het = 0;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
141 my $remaining_calls1_novel = 0;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
142 my $remaining_calls1_novel_homo = 0;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
143 open(SAM, "samtools mpileup -l $covbed -I $bam_file2 2>/dev/null |")
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
144 or die "Cannot run samtools: $!\n";
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
145 while(<SAM>){
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
146 my @B = split /\t/, $_;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
147 my $depth = @B > 1 ? $B[3] : 0;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
148
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
149 # The two methods may have used different patch versions of the same genome. Check if there was a no-call due to different reference bases.
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
150 my $called_base = @B > 1 ? $B[2] : "N";
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
151 if($depth and $called_base eq $calls1->{$B[0]}->{$B[1]}->[0]){
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
152 next;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
153 }
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
154
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
155 if($depth >= $calls2_min_depth){ # have good coverage in sample 2, but didn't make a SNP call like in sample 1
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
156 if($calls1->{$B[0]}->{$B[1]}->[1] =~ /het/){
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
157 $remaining_calls1_het++;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
158 }
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
159 else{
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
160 $remaining_calls1_homo++;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
161 }
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
162 if($calls1->{$B[0]}->{$B[1]}->[3] eq "NA" or $calls1->{$B[0]}->{$B[1]}->[3] < 0.001){
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
163 $remaining_calls1_novel++;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
164 $remaining_calls1_novel_homo++ if $calls1->{$B[0]}->{$B[1]}->[1] !~ /het/;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
165 }
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
166 else{
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
167 #print STDERR "Got method 1 only non-novel call: ", join("/", @{$calls1->{$B[0]}->{$B[1]}}), "\n";
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
168 }
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
169 }
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
170 }
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
171 print OUT "Method 1 only\t",($remaining_calls1_homo+$remaining_calls1_het),"\n";
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
172 printf OUT "Method 1 only, het %%\t%.1f\n",$remaining_calls1_het/($remaining_calls1_homo+$remaining_calls1_het)*100;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
173 printf OUT "Method 1 only, novel %%\t%.1f\n",$remaining_calls1_novel/($remaining_calls1_homo+$remaining_calls1_het)*100;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
174 printf OUT "Method 1 only, novel, homo %%\t%.1f\n",$remaining_calls1_novel_homo/$remaining_calls1_novel*100;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
175
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
176 $num_to_check = 0;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
177 open(TMPCOV, ">$covbed")
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
178 or die "Cannot open temporary BED file $covbed for samtools: $!\n";
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
179 for my $chr (keys %$calls2){
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
180 for my $pos (keys %{$calls2->{$chr}}){
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
181 if(exists $subpar_calls1->{$chr} and exists $subpar_calls1->{$chr}->{$pos}){
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
182 next; #exclude counting positions with sufficent coverage in one analysis, but not the other
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
183 }
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
184 else{
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
185 $num_to_check++;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
186 print TMPCOV "$chr\t$pos\n";
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
187 }
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
188 }
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
189 }
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
190 close(TMPCOV);
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
191
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
192 print STDERR "Checking $num_to_check read depths for missing calls in $bam_file1...\n" unless $quiet;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
193 my $remaining_calls2_het = 0;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
194 my $remaining_calls2_homo = 0;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
195 my $remaining_calls2_novel = 0;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
196 my $remaining_calls2_novel_homo = 0;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
197 open(SAM, "samtools mpileup -l $covbed -I $bam_file1 2>/dev/null |")
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
198 or die "Cannot run samtools: $!\n";
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
199 while(<SAM>){
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
200 my @B = split /\t/, $_;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
201 my $depth = @B > 1 ? $B[3] : 0;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
202
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
203 # The two methods may have used different patch versions of the same genome. Check if there was a no-call due to different reference bases.
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
204 my $called_base = @B > 1 ? $B[2] : "N";
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
205 if($depth and $called_base eq $calls2->{$B[0]}->{$B[1]}->[0]){
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
206 next;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
207 }
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
208
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
209 if($depth >= $calls1_min_depth){ # have good coverage in sample 1, but didn't make a SNP call like in sample 2
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
210 if($calls2->{$B[0]}->{$B[1]}->[1] =~ /het/){
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
211 $remaining_calls2_het++;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
212 }
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
213 else{
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
214 $remaining_calls2_homo++;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
215 }
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
216 if($calls2->{$B[0]}->{$B[1]}->[3] eq "NA" or $calls2->{$B[0]}->{$B[1]}->[3] < 0.001){
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
217 $remaining_calls2_novel++;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
218 $remaining_calls2_novel_homo++ if $calls2->{$B[0]}->{$B[1]}->[1] !~ /het/;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
219 }
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
220 }
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
221 }
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
222
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
223 print OUT "Method 2 only\t",($remaining_calls2_homo+$remaining_calls2_het),"\n";
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
224 printf OUT "Method 2 only, het %%\t%.1f\n",$remaining_calls2_het/($remaining_calls2_homo+$remaining_calls2_het)*100;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
225 printf OUT "Method 2 only, novel %%\t%.1f\n",$remaining_calls2_novel/($remaining_calls2_homo+$remaining_calls2_het)*100;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
226 printf OUT "Method 2 only, novel, homo %%\t%.1f\n",$remaining_calls2_novel_homo/$remaining_calls2_novel*100;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
227 my $total_calls = $zygosities_same+$zygosities_diff+$bases_diff+$remaining_calls1_homo+$remaining_calls1_het+$remaining_calls2_homo+$remaining_calls2_het;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
228 printf OUT "Total coding sequence genotype concordance%%\t%.1f\n", $zygosities_same/$total_calls*100;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
229 printf OUT "Total coding sequence zygosity concordance%%\t%.1f\n", 100-$zygosities_diff/$total_calls*100;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
230
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
231 unlink $covbed;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
232
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
233 sub retrieve_calls{
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
234 my ($hgvs_file, $reporting, $coding, $coverage_cutoff) = @_;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
235 my %calls;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
236 my %bad_coverage;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
237 open(HGVS, $hgvs_file)
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
238 or die "Cannot open the first genotypes file $hgvs_file for reading: $!\n";
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
239 <HGVS>; # header
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
240 while(<HGVS>){
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
241 my @F = split /\t/, $_;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
242 my $HGVS = $F[2];
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
243 next unless $HGVS =~ />([ACGT])/; # only look at SNPs
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
244 my $new_base = $1;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
245 $new_base =~ tr/ACGT/TGCA/ if $F[3] eq "-"; # rev comp cDNA HGVS
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
246 my $chr = $F[4];
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
247 my $pos = $F[5];
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
248 my $zygosity = $F[6];
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
249 my $coverage = $F[9];
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
250 my $maf = $F[14];
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
251 if($coverage_cutoff > $coverage or $F[16] ne ""){
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
252 $bad_coverage{$chr} = {} unless exists $bad_coverage{$chr};
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
253 $bad_coverage{$chr}->{$pos} = [$new_base, $zygosity, $coverage, $maf] unless exists $bad_coverage{$chr}->{$pos};
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
254 next;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
255 }
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
256
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
257 if(exists $reporting->{$chr}){ # in a region reported in the annotation, and it's protein coding?
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
258 my $in_region = 0;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
259 my $arrayref = $reporting->{$chr};
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
260 for(my $search_index = find_earliest_index($pos, $arrayref);$search_index <= $#{$arrayref}; $search_index++){
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
261 my $interval = $arrayref->[$search_index];
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
262 if($pos >= $interval->[0] and $pos <= $interval->[1]){
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
263 $in_region = 1; last;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
264 }
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
265 last if $pos < $interval->[0];
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
266 }
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
267 next unless $in_region;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
268 $in_region = 0;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
269 $arrayref = $coding->{$chr};
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
270 for(my $search_index = find_earliest_index($pos, $arrayref);$search_index <= $#{$arrayref}; $search_index++){
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
271 my $interval = $arrayref->[$search_index];
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
272 if($pos >= $interval->[0] and $pos <= $interval->[1]){
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
273 $in_region = 1; last;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
274 }
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
275 last if $pos < $interval->[0];
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
276 }
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
277 next unless $in_region;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
278 }
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
279
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
280 if(not exists $calls{$chr}){
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
281 $calls{$chr} = {};
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
282 }
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
283 next if exists $calls{$chr}->{$pos};
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
284 $calls{$chr}->{$pos} = [$new_base, $zygosity, $coverage, $maf];
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
285 }
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
286 close(HGVS);
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
287 return (\%calls, \%bad_coverage);
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
288 }
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
289
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
290
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
291 sub find_earliest_index{
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
292 # employs a binary search to find the smallest index that must be the starting point of a search of [start,end] elements sorted in an array by start
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
293 my ($query, $array) = @_;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
294
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
295 return 0 if $query < $array->[0]->[0];
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
296
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
297 my ($left, $right, $prevCenter) = (0, $#$array, -1);
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
298
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
299 while(1) {
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
300 my $center = int (($left + $right)/2);
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
301
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
302 my $cmp = $query <=> $array->[$center]->[0] || ($center == 0 || $query != $array->[$center-1]->[0] ? 0 : -1);
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
303
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
304 return $center if $cmp == 0;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
305 if ($center == $prevCenter) {
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
306 return $left;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
307 }
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
308 $right = $center if $cmp < 0;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
309 $left = $center if $cmp > 0;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
310 $prevCenter = $center;
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
311 }
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
312 }
2d601bd04c93 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
313