Mercurial > repos > yusuf > poor_gene_coverage
comparison vcf2hgvs_table @ 0:7cdd13ff182a default tip
initial commit
| author | Yusuf Ali <ali@yusuf.email> |
|---|---|
| date | Wed, 25 Mar 2015 15:49:28 -0600 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:7cdd13ff182a |
|---|---|
| 1 #!/usr/bin/env perl | |
| 2 | |
| 3 BEGIN{ | |
| 4 my $prog_dir = `dirname $0`; | |
| 5 chomp $prog_dir; | |
| 6 push @INC, $prog_dir; # so DisjointSets.pm can be found no matter the working directory | |
| 7 } | |
| 8 | |
| 9 use DisjointSets; # homebrew module | |
| 10 use Bio::DB::Sam; # for FastA reference pulls | |
| 11 use Bio::SeqUtils; | |
| 12 use Bio::Tools::CodonTable; | |
| 13 use Statistics::Zed; | |
| 14 use Getopt::Long; | |
| 15 use Set::IntervalTree; | |
| 16 use strict; | |
| 17 use warnings; | |
| 18 use vars qw($min_prop $zed $codonTable $default_transl_table %transl_except %internal_prop %dbsnp_info %chr2variant_locs %chr2dbsnp_vcf_lines %chr2internal_vcf_lines %chr2caveats %chr2phase @snvs $fasta_index $max_args $quiet); | |
| 19 | |
| 20 if(@ARGV == 1 and $ARGV[0] eq "-v"){ | |
| 21 print "Version 1.0\n"; | |
| 22 exit; | |
| 23 } | |
| 24 | |
| 25 #$max_args = `getconf ARG_MAX`; # largest number of args you can send to a system command (enviroment included, see limits.h) | |
| 26 #chomp $max_args; | |
| 27 $max_args = 4096; # if not defined $max_args or $max_args < 1; # the minimum since System V | |
| 28 $max_args -= 50; | |
| 29 | |
| 30 # find out if a variant appears in the user provided data | |
| 31 sub internal_prop($$$$){ | |
| 32 my ($chr,$pos,$ref,$variant) = @_; | |
| 33 | |
| 34 my $key = "$chr:$pos:$ref:$variant"; | |
| 35 if(exists $internal_prop{$key}){ | |
| 36 return $internal_prop{$key}; | |
| 37 } | |
| 38 | |
| 39 #print STDERR "Checking if internal_prop for $key exists: "; | |
| 40 if(exists $chr2internal_vcf_lines{$chr}->{$pos}){ | |
| 41 for(@{$chr2internal_vcf_lines{$chr}->{$pos}}){ | |
| 42 my @fields = split /\t/, $_; | |
| 43 if($pos == $fields[1] and length($fields[3]) == length($ref) and $fields[4] eq $variant){ | |
| 44 #print STDERR "yes\n"; | |
| 45 if(/MAF=(\d\.\d+)/){ | |
| 46 $internal_prop{$key} = $1; # change from percent to proportion | |
| 47 return $1; | |
| 48 } | |
| 49 } | |
| 50 } | |
| 51 } | |
| 52 else{ | |
| 53 #print STDERR "no\n"; | |
| 54 } | |
| 55 | |
| 56 $internal_prop{$key} = "NA"; | |
| 57 return "NA"; | |
| 58 } | |
| 59 | |
| 60 # find out if a variant appears in the NCBI's dbSNP | |
| 61 sub dbsnp_info($$$$){ | |
| 62 my ($chr,$pos,$ref,$variant) = @_; | |
| 63 | |
| 64 my $key = "$chr:$pos:$ref:$variant"; | |
| 65 if(exists $dbsnp_info{$key}){ | |
| 66 return @{$dbsnp_info{$key}}; | |
| 67 } | |
| 68 | |
| 69 if(exists $chr2dbsnp_vcf_lines{$chr}->{$pos}){ | |
| 70 #print STDERR "Checking existing SNP data for $chr:$pos -> ", join("\n", @{$chr2dbsnp_vcf_lines{$chr}->{$pos}}), "\n"; | |
| 71 for(@{$chr2dbsnp_vcf_lines{$chr}->{$pos}}){ | |
| 72 my @fields = split /\t/, $_; | |
| 73 for my $var (split /,/, $fields[4]){ | |
| 74 # Allows for different reference seqs between dbSNP and input, assuming patches only | |
| 75 if(length($fields[3]) == length($ref) and ($var eq $variant or $ref eq $var and $variant eq $fields[3])){ | |
| 76 my ($freq, $subpop) = ("",""); | |
| 77 $freq = $1 if $fields[7] =~ /(?:\A|;)MMAF=(0\.\d+)(?:;|\Z)/; | |
| 78 $subpop = $1 if $fields[7] =~ /(?:\A|;)MMAF_SRC=(\S+?)(?:;|\Z)/; | |
| 79 $dbsnp_info{$key} = [$subpop, $freq || "NA", $fields[2]]; | |
| 80 return @{$dbsnp_info{$key}}; | |
| 81 } | |
| 82 } | |
| 83 } | |
| 84 } | |
| 85 $dbsnp_info{$key} = ["novel", "NA", "NA"]; | |
| 86 return @{$dbsnp_info{$key}}; | |
| 87 } | |
| 88 | |
| 89 sub record_snv{ | |
| 90 my $line = join("", @_); | |
| 91 push @snvs, $line; | |
| 92 | |
| 93 my @fields = split /\t/, $line; | |
| 94 my $prop_info_key = $fields[9]; | |
| 95 my ($chr,$pos,$ref,$variant) = split /:/, $prop_info_key; | |
| 96 $chr2variant_locs{$chr} = {} unless exists $chr2variant_locs{$chr}; | |
| 97 return unless $ref; # ref not defined for CNVs | |
| 98 # Need to grab whole range for MNPs | |
| 99 for(my $i = 0; $i < length($ref); $i++){ | |
| 100 $chr2variant_locs{$chr}->{$pos+$i} = 1; | |
| 101 } | |
| 102 } | |
| 103 | |
| 104 sub retrieve_vcf_lines($$$){ | |
| 105 my ($dbsnp_file, $internal_snp_file, $chr) = @_; | |
| 106 | |
| 107 my (%dbsnp_lines, %internal_snp_lines); | |
| 108 | |
| 109 if(not defined $dbsnp_file or not exists $chr2variant_locs{$chr}){ | |
| 110 return ({}, {}, {}, {}); # no data requested for this chromosome | |
| 111 } | |
| 112 | |
| 113 # build up the request | |
| 114 my @tabix_regions; | |
| 115 my @var_locs = keys %{$chr2variant_locs{$chr}}; | |
| 116 # sort by variant start location | |
| 117 for my $var_loc (sort {$a <=> $b} @var_locs){ | |
| 118 push @tabix_regions, $chr.":".$var_loc."-".$var_loc; | |
| 119 } | |
| 120 for(my $i = 0; $i <= $#tabix_regions; $i += $max_args){ # chunkify tabix request if too many for the system to handle | |
| 121 my $end = $i + $max_args > $#tabix_regions ? $#tabix_regions : $i + $max_args; | |
| 122 my $regions = "'".join("' '", @tabix_regions[$i..$end])."'"; | |
| 123 # From file is very slow for some reason | |
| 124 #my $regions_file = "/tmp/vcf2hgvs_$$.bed"; | |
| 125 #open(REQ_BED, ">$regions_file") | |
| 126 # or die "Cannot open $regions_file for writing: $!\n"; | |
| 127 #print REQ_BED join("\n", @tabix_regions), "\n"; | |
| 128 #close(REQ_BED); | |
| 129 | |
| 130 # retrieve the data | |
| 131 die "Cannot find dbSNP VCF file $dbsnp_file\n" if not -e $dbsnp_file; | |
| 132 | |
| 133 open(VCF, "tabix $dbsnp_file $regions |") | |
| 134 or die "Cannot run tabix on $dbsnp_file (args ".substr($regions, 0, length($regions)>100? 100 : length($regions))."): $!\n"; | |
| 135 while(<VCF>){ | |
| 136 #if(/^(\S+\t(\d+)(?:\t\S+){6})/ and grep {$_ eq $2} @var_locs){ # take only main columns to save room, if possible | |
| 137 if(/^(\S+\t(\d+)(?:\t\S+){6})/ and exists $chr2variant_locs{$chr}->{$2}){ # take only main columns to save room, if possible | |
| 138 $dbsnp_lines{$2} = [] unless exists $dbsnp_lines{$2}; | |
| 139 push @{$dbsnp_lines{$2}}, $1; | |
| 140 } | |
| 141 } | |
| 142 close(VCF); | |
| 143 | |
| 144 if($internal_snp_file){ | |
| 145 die "Cannot find internal VCF file $internal_snp_file\n" if not -e $internal_snp_file; | |
| 146 open(VCF, "tabix $internal_snp_file $regions |") | |
| 147 or die "Cannot run tabix on $internal_snp_file: $!\n"; | |
| 148 while(<VCF>){ | |
| 149 #if(/^(\S+\t(\d+)(?:\t\S+){6})/ and grep {$_ eq $2} @var_locs){ # take only main columns to save room, if possible | |
| 150 if(/^(\S+\t(\d+)(?:\t\S+){5})/ and exists $chr2variant_locs{$chr}->{$2}){ # take only main columns to save room, if possible | |
| 151 $internal_snp_lines{$2} = [] unless exists $internal_snp_lines{$2}; | |
| 152 push @{$internal_snp_lines{$2}}, $1; | |
| 153 } | |
| 154 } | |
| 155 close(VCF); | |
| 156 } | |
| 157 } | |
| 158 | |
| 159 #unlink $regions_file; | |
| 160 | |
| 161 return (\%dbsnp_lines, \%internal_snp_lines); | |
| 162 } | |
| 163 | |
| 164 sub prop_info_key{ | |
| 165 my($chr,$pos,$ref,$variant,$exon_edge_dist) = @_; | |
| 166 | |
| 167 $chr =~ s/^chr//; | |
| 168 if($chr eq "M"){ | |
| 169 $chr = "MT"; # NCBI uses different name for mitochondrial chromosome | |
| 170 $pos-- if $pos >= 3107; # also, doesn't keep the old positioning (historical) | |
| 171 } | |
| 172 return join(":", $chr,$pos,$ref,$variant, ($exon_edge_dist ? $exon_edge_dist : "")); | |
| 173 } | |
| 174 | |
| 175 sub prop_info($$$){ | |
| 176 my($snpfile,$internal_snps_file,$prop_info_key) = @_; | |
| 177 | |
| 178 my($chr,$pos,$ref,$variant) = split /:/, $prop_info_key; | |
| 179 | |
| 180 # is this the first call for this chromosome? If so, retrieve the VCF lines for it en masse | |
| 181 if(not exists $chr2dbsnp_vcf_lines{$chr}){ | |
| 182 ($chr2dbsnp_vcf_lines{$chr}, $chr2internal_vcf_lines{$chr}) = retrieve_vcf_lines($snpfile,$internal_snps_file,$chr); | |
| 183 } | |
| 184 my $internal_maf = 0; | |
| 185 if($internal_snps_file){ | |
| 186 $internal_maf = internal_prop($chr,$pos,$ref,$variant); | |
| 187 $internal_maf = 0 if $internal_maf eq "NA"; | |
| 188 } | |
| 189 | |
| 190 my @results = dbsnp_info($chr,$pos,$ref,$variant); | |
| 191 | |
| 192 # Not all entries have a proportion in dbSNP | |
| 193 return $internal_snps_file ? ($ref, $variant, @results, $internal_maf) : ($ref, $variant, @results); | |
| 194 } | |
| 195 | |
| 196 #offset a given HGVS nomenclature position (single position only) by a given number of bases | |
| 197 sub hgvs_plus($$){ | |
| 198 my ($hgvs, $offset) = @_; | |
| 199 if($hgvs =~ /^(\S+)(-\d+)(.*)/){ | |
| 200 # all negative | |
| 201 if($2+$offset<0){ | |
| 202 return $1.($2+$offset).$3; | |
| 203 } | |
| 204 # switches to positive, need to mod | |
| 205 else{ | |
| 206 return $1+($2+$offset); | |
| 207 } | |
| 208 } | |
| 209 elsif($hgvs =~ /^(\S+)\+(\d+)(.*)/){ | |
| 210 # all positive | |
| 211 if($2+$offset>0){ | |
| 212 return $1."+".($2+$offset).$3; | |
| 213 } | |
| 214 # switches to negative, need to mod | |
| 215 else{ | |
| 216 return $1+($2+$offset); | |
| 217 } | |
| 218 } | |
| 219 elsif($hgvs =~ /^(-?\d+)(.*)/){ | |
| 220 # special case if offset spans -/+ since there is no position 0 | |
| 221 if($1 < 0 and $1+$offset >= 0){ | |
| 222 $offset++; | |
| 223 } | |
| 224 elsif($1 > 0 and $1+$offset <= 0){ | |
| 225 $offset--; | |
| 226 } | |
| 227 return ($1+$offset).$2; | |
| 228 } | |
| 229 else{ | |
| 230 die "Cannot convert $hgvs to a new offset ($offset), only single base position nomenclature is currently supported\n"; | |
| 231 } | |
| 232 } | |
| 233 | |
| 234 # offset a given position by a given number of bases, | |
| 235 # taking into account that if the new offset crosses the threshold in the last argument, | |
| 236 # HGVS boundary nomenclature has to be introduced | |
| 237 sub hgvs_plus_exon($$$){ | |
| 238 my ($pos, $offset, $boundary) = @_; | |
| 239 | |
| 240 # special case if offset spans -/+ since there is no position 0 | |
| 241 if($pos =~ /^(-?\d+)(.*)/){ | |
| 242 if($1 < 0 and $1+$offset >= 0){ | |
| 243 $offset++; | |
| 244 } | |
| 245 elsif($1 > 0 and $1+$offset <= 0){ | |
| 246 $offset--; | |
| 247 } | |
| 248 } | |
| 249 my $new_pos = $pos + $offset; | |
| 250 if($new_pos > $boundary and $pos <= $boundary){ | |
| 251 # just moved into an intron 3' | |
| 252 $new_pos = $boundary."+".($new_pos-$boundary); | |
| 253 } | |
| 254 elsif($new_pos < $boundary and $pos >= $boundary){ | |
| 255 # just moved into an intron 5' | |
| 256 $new_pos = $boundary.($new_pos-$boundary); | |
| 257 } | |
| 258 return $new_pos; | |
| 259 } | |
| 260 | |
| 261 # given a nucleotide position, calculates the AA there (assumes coding region) | |
| 262 sub getCodonFromSeq($$$$){ | |
| 263 my ($chr_ref, $location, $frame_offset, $strand) = @_; | |
| 264 | |
| 265 my $codon; | |
| 266 if($strand eq "+"){ | |
| 267 $codon = substr($$chr_ref, $location-1-$frame_offset, 3); | |
| 268 } | |
| 269 else{ | |
| 270 $codon = substr($$chr_ref, $location-3+$frame_offset, 3); | |
| 271 $codon = reverse($codon); | |
| 272 $codon =~ tr/ACGTacgt/TGCAtgca/; | |
| 273 } | |
| 274 return $codon; | |
| 275 } | |
| 276 | |
| 277 sub getCodonFromSeqIndex($$$$){ | |
| 278 my ($chr, $location, $frame_offset, $strand) = @_; | |
| 279 | |
| 280 my $codon; | |
| 281 if($strand eq "+"){ | |
| 282 $codon = $fasta_index->fetch($chr.":".($location-$frame_offset)."-".($location-$frame_offset+2)); | |
| 283 } | |
| 284 else{ | |
| 285 $codon = $fasta_index->fetch($chr.":".($location-2+$frame_offset)."-".($location+$frame_offset)); | |
| 286 $codon = reverse($codon); | |
| 287 $codon =~ tr/ACGTacgt/TGCAtgca/; | |
| 288 } | |
| 289 return $codon; | |
| 290 } | |
| 291 | |
| 292 sub getAAFromSeq($$$$$){ | |
| 293 return $_[4]->translate(getCodonFromSeq($_[0], $_[1], $_[2], $_[3])); | |
| 294 } | |
| 295 | |
| 296 sub getAAFromSeqIndex($$$$$){ | |
| 297 # convert codon to AA | |
| 298 if(exists $transl_except{"$_[0]:$_[1]"}){ | |
| 299 return $transl_except{"$_[0]:$_[1]"}; | |
| 300 } | |
| 301 else{ | |
| 302 return $_[4]->translate(getCodonFromSeqIndex($_[0], $_[1], $_[2], $_[3])); | |
| 303 } | |
| 304 } | |
| 305 | |
| 306 sub hgvs_protein{ | |
| 307 my ($chr, $location, $ref, $variant, $cdna_pos, $strand, $transl_table) = @_; | |
| 308 | |
| 309 if(substr($ref,0,1) eq substr($variant,0,1)){ | |
| 310 substr($ref,0,1) = ""; | |
| 311 substr($variant,0,1) = ""; | |
| 312 $location++; | |
| 313 if($strand eq "-"){ | |
| 314 $cdna_pos--; | |
| 315 } | |
| 316 else{ | |
| 317 $cdna_pos++; | |
| 318 } | |
| 319 } | |
| 320 | |
| 321 if($cdna_pos !~ /^\d+/){ | |
| 322 die "Aborting: got illegal cDNA position ($cdna_pos) for protein HGVS conversion of position ", | |
| 323 "$location, ref $ref, variant $variant. Please correct the program code.\n"; | |
| 324 } | |
| 325 # Get the correct frame for the protein translation, to know what codons are affected | |
| 326 my $aapos = int(($cdna_pos-1)/3)+1; | |
| 327 | |
| 328 # does it destroy the start codon? | |
| 329 if($cdna_pos < 4){ # assumes animal codon usage | |
| 330 return "p.0?"; # indicates start codon missing, unsure of effect | |
| 331 } | |
| 332 | |
| 333 my $table = $transl_table ne $default_transl_table ? # non standard translation table requested | |
| 334 Bio::Tools::CodonTable->new(-id=>$transl_table) : $codonTable; | |
| 335 | |
| 336 my $frame_offset = ($cdna_pos-1)%3; | |
| 337 my $origAA = getAAFromSeqIndex($chr, $location, $frame_offset, $strand, $table); | |
| 338 # take 100000 bp on either side for translation context of variant seq | |
| 339 my $five_prime_buffer = $location < 10000 ? $location-1 : 10000; | |
| 340 my $mutSeq = $fasta_index->fetch($chr.":".($location-$five_prime_buffer)."-".($location+10000)); | |
| 341 | |
| 342 # substitute all of the immediately adjacent variants in phase with this one to get the correct local effect | |
| 343 substr($mutSeq, $five_prime_buffer, length($ref)) = $variant; | |
| 344 | |
| 345 # does it cause a frameshift? | |
| 346 my $length_diff = length($variant)-length($ref); | |
| 347 if($length_diff%3){ # insertion or deletion not a multiple of three | |
| 348 my $fs_codon = getCodonFromSeq(\$mutSeq, $five_prime_buffer+1, $frame_offset, $strand); | |
| 349 my $ext = 0; | |
| 350 my $newAA; | |
| 351 do{ | |
| 352 $ext++; | |
| 353 # The "NA"s below make it so that we don't pick up any translation exceptions from the original reference annotation | |
| 354 if($strand eq "+"){ | |
| 355 $newAA = getAAFromSeq(\$mutSeq, $five_prime_buffer+1+$ext*3, $frame_offset, $strand, $table); | |
| 356 } | |
| 357 else{ | |
| 358 $newAA = getAAFromSeq(\$mutSeq, $five_prime_buffer+1-$ext*3, $frame_offset, $strand, $table); | |
| 359 } | |
| 360 } while($newAA ne "*"); | |
| 361 | |
| 362 return "p.".$origAA.$aapos.$table->translate($fs_codon)."fs*$ext"; | |
| 363 } | |
| 364 | |
| 365 # does it cause a stop codon to be lost? | |
| 366 if($origAA eq "*"){ | |
| 367 my $stopChangeCodon = getCodonFromSeq(\$mutSeq, $five_prime_buffer+1, $frame_offset, $strand); | |
| 368 # still a stop after the mutation (ignore translation exceptions) | |
| 369 if($table->is_ter_codon($stopChangeCodon)){ | |
| 370 return "p.*$aapos="; | |
| 371 } | |
| 372 # calculate the new stop, assuming there aren't mutations downstream in candidate stop codons | |
| 373 my $ext = 0; | |
| 374 my $newCodon; | |
| 375 do{ | |
| 376 if($strand eq "+"){ | |
| 377 $newCodon = getCodonFromSeq(\$mutSeq, $five_prime_buffer+1+(++$ext*3), $frame_offset, $strand); | |
| 378 } | |
| 379 else{ | |
| 380 $newCodon = getCodonFromSeq(\$mutSeq, $five_prime_buffer+1-(++$ext*3), $frame_offset, $strand); | |
| 381 } | |
| 382 } while(not $table->is_ter_codon($newCodon)); | |
| 383 | |
| 384 return "p.*".$aapos.$table->translate($stopChangeCodon)."ext*".$ext; | |
| 385 } | |
| 386 | |
| 387 # if we get this far, it's a "regular" AA level change | |
| 388 my $origAAs = ""; | |
| 389 for(my $i = 0; $i < length($ref)+$frame_offset; $i+=3){ | |
| 390 my $oldAA = getAAFromSeqIndex($chr, $location+$i, $frame_offset, $strand, $table); | |
| 391 if($strand eq "+"){ | |
| 392 $origAAs .= $oldAA; | |
| 393 } | |
| 394 else{ | |
| 395 $origAAs = $oldAA . $origAAs; | |
| 396 } | |
| 397 } | |
| 398 my $newAAs = ""; | |
| 399 for(my $i = 0; $i < length($variant)+$frame_offset; $i+=3){ | |
| 400 # NA means we don't take translation exceptions from the original | |
| 401 my $newAA = getAAFromSeq(\$mutSeq, $five_prime_buffer+1+$i, $frame_offset, $strand, $table); | |
| 402 if($strand eq "+"){ | |
| 403 $newAAs .= $newAA; | |
| 404 } | |
| 405 else{ | |
| 406 $newAAs = $newAA . $newAAs; | |
| 407 } | |
| 408 } | |
| 409 | |
| 410 # silent | |
| 411 if($origAAs eq $newAAs){ | |
| 412 return "p.".$origAAs.$aapos."="; | |
| 413 } | |
| 414 | |
| 415 # minimize the difference if there are leading or trailing AAs the same | |
| 416 my $delLength = length($ref); | |
| 417 while(substr($newAAs, 0, 1) eq substr($origAAs, 0, 1)){ | |
| 418 $newAAs = substr($newAAs, 1); | |
| 419 $origAAs = substr($origAAs, 1); | |
| 420 $location+=3; | |
| 421 $delLength-=3; | |
| 422 $aapos++; | |
| 423 } | |
| 424 while(substr($newAAs, -1) eq substr($origAAs, -1)){ | |
| 425 $newAAs = substr($newAAs, 0, length($newAAs)-1); | |
| 426 $origAAs = substr($origAAs, 0, length($origAAs)-1); | |
| 427 } | |
| 428 | |
| 429 # insertion | |
| 430 if(length($origAAs) == 0){ | |
| 431 my $insAAs = getAAFromSeqIndex($chr,$location-3,$frame_offset,$strand,$table).($aapos-1)."_". | |
| 432 getAAFromSeqIndex($chr,$location,$frame_offset,$strand,$table); | |
| 433 return "p.".$insAAs.$aapos."ins".$newAAs; | |
| 434 } | |
| 435 # deletion | |
| 436 elsif(length($newAAs) == 0){ | |
| 437 my $delAAs; | |
| 438 if(length($origAAs) == 1){ | |
| 439 $delAAs = getAAFromSeqIndex($chr,$location,$frame_offset,$strand,$table).$aapos; # single AA deletion | |
| 440 } | |
| 441 else{ # deleting a stretch | |
| 442 if($strand eq "+"){ | |
| 443 my $endPoint = $location+$delLength-1; | |
| 444 $delAAs = getAAFromSeqIndex($chr,$location,$frame_offset,$strand,$table).$aapos."_". | |
| 445 getAAFromSeqIndex($chr,$endPoint,$frame_offset,$strand,$table).($aapos+int(($delLength-1)/3)); | |
| 446 } | |
| 447 else{ | |
| 448 my $endPoint = $location-$delLength+1; | |
| 449 $delAAs = getAAFromSeqIndex($chr,$endPoint,$frame_offset,$strand,$table).($aapos-int(($delLength-1)/3))."_". | |
| 450 getAAFromSeqIndex($chr,$location,$frame_offset,$strand,$table).$aapos; | |
| 451 } | |
| 452 } | |
| 453 return "p.".$delAAs."del"; | |
| 454 } | |
| 455 else{ | |
| 456 # substitution | |
| 457 if(length($origAAs) == 1 and length($newAAs) == 1){ | |
| 458 return "p.".$origAAs.$aapos.$newAAs; | |
| 459 } | |
| 460 # indel | |
| 461 elsif(length($origAAs) != 1){ | |
| 462 # convert ref stretch into range syntax | |
| 463 if($strand eq "+"){ | |
| 464 $origAAs = substr($origAAs, 0, 1).$aapos."_".substr($origAAs, -1).($aapos+length($origAAs)-1); | |
| 465 } | |
| 466 else{ | |
| 467 $origAAs = substr($origAAs, 0, 1).($aapos-length($origAAs)+1)."_".substr($origAAs, -1).$aapos; | |
| 468 } | |
| 469 } | |
| 470 return "p.".$origAAs."delins".$newAAs; | |
| 471 } | |
| 472 return ("NA", ""); | |
| 473 } | |
| 474 | |
| 475 sub z2p{ | |
| 476 if(not defined $zed){ | |
| 477 $zed = new Statistics::Zed; | |
| 478 } | |
| 479 my $p = $zed->z2p(value => $_[0]); | |
| 480 return $p < 0.0000000001 ? 0 : $p; | |
| 481 } | |
| 482 sub gq2p{ | |
| 483 return $_[0] > 200 ? 0 : 10**($_[0]/-10); | |
| 484 } | |
| 485 | |
| 486 my ($multi_phased, $min_depth, $flanking_bases, $dbsnp, $internal_snp, $genename_bed_file, $dir_1000G, $dir_esp6500, $min_pvalue, $mappability_file, $reference_file, $samtools_phasing_file, $exons_file, $input_file, $output_file, $cnv_file, $dgv_file, $which_chr, $enrichment_regions_file, $rare_variant_prop); | |
| 487 $multi_phased = 0; | |
| 488 $min_depth = 2; | |
| 489 $flanking_bases = 30; | |
| 490 $min_pvalue = 0.01; | |
| 491 $min_prop = 0.14; | |
| 492 $rare_variant_prop = 0.05; | |
| 493 $input_file = "-"; # STDIN by default | |
| 494 $output_file = "-"; # STDOUT by default | |
| 495 $default_transl_table = "1"; # assumes NCBI 'Standard' table, unless it is an argument to the script... | |
| 496 &GetOptions("d=i" => \$min_depth, | |
| 497 "f=i" => \$flanking_bases, | |
| 498 "s=s" => \$dbsnp, | |
| 499 "t=s" => \$dir_1000G, | |
| 500 "n=s" => \$dir_esp6500, | |
| 501 "u=s" => \$internal_snp, | |
| 502 "q" => \$quiet, | |
| 503 "p=f" => \$min_pvalue, | |
| 504 "h=f" => \$min_prop, | |
| 505 "m=s" => \$mappability_file, | |
| 506 "r=s" => \$reference_file, | |
| 507 "z=s" => \$samtools_phasing_file, | |
| 508 "e=s" => \$exons_file, | |
| 509 "i=s" => \$input_file, | |
| 510 "c=s" => \$cnv_file, | |
| 511 "g=s" => \$dgv_file, | |
| 512 "b=s" => \$genename_bed_file, | |
| 513 "w=s" => \$which_chr, | |
| 514 "o=s" => \$output_file, | |
| 515 "a=i" => \$default_transl_table, | |
| 516 "v=f" => \$rare_variant_prop, | |
| 517 "x=s" => \$enrichment_regions_file); # if enrichment regions are specified, variants without a transcript model but in these ranges will be reported | |
| 518 | |
| 519 if(($input_file ne "/dev/null" and not defined $reference_file) or | |
| 520 not defined $exons_file or | |
| 521 (defined $cnv_file and not defined $dgv_file)){ | |
| 522 die "Usage: $0 [-v(ersion)] [-q(uiet)] [-w(hich) contig_to_report (default is all)] [-d(epth of variant reads req'd) #] [-v(ariant max freq to count as rare)] [-f(lanking exon bases to report) #] [-p(robability of random genotype, maximum to report) 0.#]\n", | |
| 523 " [-h(et proportion of variant reads, minimum to report) 0.#] [-c(opy number) variants_file.bed -g(enomic structural) variants_control_db.txt.gz] [-z file_containing_samtools_phase_output.txt]\n", | |
| 524 " [-t(housand) genomes_integrated_vcfs_gz_dir] [-n ESP6500_dir] [-u(ser) specified_population.vcf.gz] [-m(appability) crg_file.bed]\n", | |
| 525 " [-x enrichment_regions_file.bed] [-a(mino) acid translation table number from NCBI]\n", | |
| 526 " [-i(nput) genotypes.vcf <-r(eference) sequence_file.fasta>] [-o(utput) hgvs_file.tsv] [-s(np) database_from_ncbi.vcf.gz]\n", | |
| 527 " <-b(ed) file of named gene regions.bed> <-e(xons) file.gtf>\n\n", | |
| 528 "Input gz files must be indexed with Tabix.\nDefault input is STDIN, default output is STDOUT. Note: if -c is specified, polyploidies are are assume to be proximal. Other defaults: -d 2, -v 0.05, -f 30, -p 0.01, -h 0.14 -a 1\nReference sequence is not strictly necessary if only CNV are being annotated.\n"; | |
| 529 } | |
| 530 | |
| 531 print STDERR "Considering $flanking_bases flanking bases for variants as well\n" unless $quiet; | |
| 532 | |
| 533 $codonTable = new Bio::Tools::CodonTable(id => $default_transl_table); | |
| 534 | |
| 535 my %enrichment_regions; | |
| 536 # Note, we assume the regions are non-overlapping | |
| 537 if(defined $enrichment_regions_file){ | |
| 538 print STDERR "Loading enrichment regions...\n" unless $quiet; | |
| 539 open(BED, $enrichment_regions_file) | |
| 540 or die "Cannot open $enrichment_regions_file for reading: $!\n"; | |
| 541 while(<BED>){ | |
| 542 chomp; | |
| 543 my @F = split /\t/, $_; | |
| 544 $enrichment_regions{$F[0]} = [] if not exists $enrichment_regions{$F[0]}; | |
| 545 push @{$enrichment_regions{$F[0]}}, [$F[1], $F[2]]; | |
| 546 } | |
| 547 close(BED); | |
| 548 } | |
| 549 for my $chr (keys %enrichment_regions){ # sort by start | |
| 550 $enrichment_regions{$chr} = [sort {$a->[0] <=> $b->[0]} @{$enrichment_regions{$chr}}]; | |
| 551 } | |
| 552 | |
| 553 if(defined $reference_file){ | |
| 554 print STDERR "Scanning reference FastA info\n" unless $quiet; | |
| 555 if(not -e $reference_file){ | |
| 556 die "Reference FastA file ($reference_file) does not exist.\n"; | |
| 557 } | |
| 558 if(not -e $reference_file.".fai" and not -w dirname($reference_file)){ | |
| 559 die "Reference FastA file ($reference_file) is not indexed, and the directory is not writable.\n"; | |
| 560 } | |
| 561 $fasta_index = Bio::DB::Sam::Fai->load($reference_file); | |
| 562 } | |
| 563 | |
| 564 my %chr2mappability; | |
| 565 if(defined $mappability_file){ | |
| 566 print STDERR "Reading in mappability data\n" unless $quiet; | |
| 567 my ($nmer) = $mappability_file =~ /(\d+).*?$/; | |
| 568 die "Cannot determine nmer from nmer file name $mappability_file, aborting\n" unless $nmer; | |
| 569 open(MAP, $mappability_file) | |
| 570 or die "Cannot open mappability data file $mappability_file for reading: $!\n"; | |
| 571 <MAP>; # header | |
| 572 while(<MAP>){ | |
| 573 next if /^#/; | |
| 574 chomp; | |
| 575 my @F = split /\t/, $_; | |
| 576 my $x = int(1/$F[3]+0.5); | |
| 577 $chr2mappability{$F[0]} = Set::IntervalTree->new() if not exists $chr2mappability{$F[0]}; | |
| 578 $chr2mappability{$F[0]}->insert("non-unique mapping region (x$x)", $F[1], $F[2]+$nmer-1); | |
| 579 } | |
| 580 close(MAP); | |
| 581 } | |
| 582 | |
| 583 # Is phasing data provided? | |
| 584 if(defined $samtools_phasing_file){ | |
| 585 print STDERR "Reading in phasing data\n" unless $quiet; | |
| 586 open(PHASE, $samtools_phasing_file) | |
| 587 or die "Cannot open phasing data file $samtools_phasing_file for reading: $!\n"; | |
| 588 my $phase_range; | |
| 589 while(<PHASE>){ | |
| 590 if(/^PS/){ | |
| 591 chomp; | |
| 592 my @F = split /\t/, $_; | |
| 593 $phase_range = "$F[2]-$F[3]"; | |
| 594 } | |
| 595 if(/^M[12]/){ | |
| 596 chomp; | |
| 597 my @F = split /\t/, $_; | |
| 598 #ignore strange cases where haplotype reference has no cases (weird samtools call) | |
| 599 next if $F[9] == 0 or $F[7] == 0; | |
| 600 my $chr = $F[1]; | |
| 601 next if defined $which_chr and not $chr eq $which_chr; | |
| 602 my $pos = $F[3]; | |
| 603 #print STDERR "Recording phase for $chr:$pos:$F[4] , $chr:$pos:$F[5] as A-$chr:$phase_range and B-$chr:$phase_range\n" if $pos == 12907379; | |
| 604 if(($F[10]+$F[8])/($F[9]+$F[7]) >= $min_prop){ # error meets reporting threshold | |
| 605 $chr2caveats{"$chr:$pos"} .= "; " if exists $chr2caveats{"$chr:$pos"}; | |
| 606 $chr2caveats{"$chr:$pos"} .= "inconsistent haplotype phasing"; | |
| 607 } | |
| 608 else{ # appears to be a genuine phasing | |
| 609 $chr2phase{"$chr:$pos:$F[4]"} = "A-$chr:$phase_range"; # grouping for haplotype | |
| 610 $chr2phase{"$chr:$pos:$F[5]"} = "B-$chr:$phase_range"; # grouping for haplotype | |
| 611 } | |
| 612 } | |
| 613 } | |
| 614 close(PHASE); | |
| 615 } | |
| 616 | |
| 617 # Check the VCF file to see if contains phase data | |
| 618 open(VCFIN, $input_file) | |
| 619 or die "Cannot open $input_file for reading: $!\n"; | |
| 620 my $phase_chr = ""; | |
| 621 my @phase_dataA; | |
| 622 my @phase_dataB; | |
| 623 while(<VCFIN>){ | |
| 624 if(/^\s*(?:#|$)/){ # blank or hash comment | |
| 625 next; | |
| 626 } | |
| 627 my @F = split /\t/, $_; | |
| 628 next if exists $chr2caveats{"$F[0]:$F[1]"} and $chr2caveats{"$F[0]:$F[1]"} =~ /inconsistent haplotype phasing/; | |
| 629 # | indicates phased | |
| 630 if($F[8] =~ m(^(\d+)\|(\d+):)){ | |
| 631 next if $1 eq $2; # not useful to us (actually would mess up phase combining later on), but is provided sometimes | |
| 632 # start of a phasing block | |
| 633 if($phase_chr eq ""){ | |
| 634 $phase_chr = $F[0]; | |
| 635 } | |
| 636 my @vars = split /,/, $F[4]; | |
| 637 if($1 > @vars){ | |
| 638 die "Invalid VCF file (line #$.): First haplotype listed as $1, but only ", scalar(@vars), " variants were provided (", join(",", @vars), "\n"; | |
| 639 } | |
| 640 if($2 > @vars){ | |
| 641 die "Invalid VCF file (line #$.): Second haplotype listed as $1, but only ", scalar(@vars), " variants were provided (", join(",", @vars), "\n"; | |
| 642 } | |
| 643 unshift @vars, $F[3]; | |
| 644 push @phase_dataA, [$F[1], $vars[$1]]; | |
| 645 push @phase_dataB, [$F[1], $vars[$2]]; | |
| 646 } | |
| 647 # non phased het call, ends any phasing block there might be | |
| 648 elsif($F[8] =~ m(^0/1)){ | |
| 649 # Did we just finish a phased block? If so, output it. | |
| 650 if(@phase_dataA > 1){ | |
| 651 my $phase_def = "G-$phase_chr:".$phase_dataA[0]->[0]."-".$phase_dataA[$#phase_dataA]->[0]; | |
| 652 for my $d (@phase_dataA){ | |
| 653 my ($p, $v) = @$d; | |
| 654 if(exists $chr2phase{"$phase_chr:$p:$v"}){ | |
| 655 $chr2phase{"$phase_chr:$p:$v"} .= ",$phase_def"; | |
| 656 $multi_phased ||= 1; | |
| 657 } | |
| 658 else{ | |
| 659 $chr2phase{"$phase_chr:$p:$v"} = $phase_def; | |
| 660 } | |
| 661 } | |
| 662 $phase_def = "H-$phase_chr:".$phase_dataB[0]->[0]."-".$phase_dataB[$#phase_dataB]->[0]; | |
| 663 for my $d (@phase_dataB){ | |
| 664 my ($p, $v) = @$d; | |
| 665 if(exists $chr2phase{"$phase_chr:$p:$v"}){ | |
| 666 $chr2phase{"$phase_chr:$p:$v"} = ",$phase_def"; | |
| 667 $multi_phased ||= 1; | |
| 668 } | |
| 669 else{ | |
| 670 $chr2phase{"$phase_chr:$p:$v"} = $phase_def; | |
| 671 } | |
| 672 } | |
| 673 } | |
| 674 if($phase_chr ne ""){ | |
| 675 $phase_chr = ""; | |
| 676 @phase_dataA = (); | |
| 677 @phase_dataB = (); | |
| 678 } | |
| 679 } | |
| 680 } | |
| 681 | |
| 682 print STDERR "Reading in feature GTF data..." unless $quiet; | |
| 683 my %feature_range; # chr => transcript_id => [[genomic_exon_start,genomic_exon_end,cdna_start_pos],...] | |
| 684 my %feature_intervaltree; # chr => transcript_id => [[genomic_exon_start,genomic_exon_end,cdna_start_pos],...] | |
| 685 my %feature_strand; # transcript_id => +|- | |
| 686 my $feature_count = 0; | |
| 687 my %feature_min; | |
| 688 my %feature_max; | |
| 689 my %feature_cds_min; | |
| 690 my %feature_cds_max; | |
| 691 my %feature_contig; | |
| 692 my %feature_length; | |
| 693 my %feature_type; | |
| 694 my %feature_transl_table; # note alternate translation table usage | |
| 695 my %chr_read; | |
| 696 open(GTF, $exons_file) | |
| 697 or die "Cannot open $exons_file for reading: $!\n"; | |
| 698 while(<GTF>){ | |
| 699 next if /^\s*#/; | |
| 700 my @fields = split /\t/, $_; | |
| 701 next if defined $which_chr and $fields[0] ne $which_chr and "chr$fields[0]" ne $which_chr and $fields[0] ne "chr$which_chr"; | |
| 702 | |
| 703 if($fields[2] eq "exon" or $fields[2] eq "CDS"){ | |
| 704 next unless $fields[$#fields] =~ /transcript_id \"(.*?)\"/o; | |
| 705 my $parent = $1; | |
| 706 if(not $quiet and not exists $chr_read{$fields[0]}){ | |
| 707 print STDERR " $fields[0]"; | |
| 708 $chr_read{$fields[0]} = 1; | |
| 709 } | |
| 710 if(not exists $feature_strand{$parent}){ | |
| 711 $feature_strand{$parent} = $fields[6]; | |
| 712 $feature_contig{$parent} = $fields[0]; | |
| 713 if($fields[$#fields] =~ /transcript_type \"(.*?)\"/){ | |
| 714 $feature_type{$parent} = $1; | |
| 715 } | |
| 716 else{ | |
| 717 $feature_type{$parent} = "NA"; | |
| 718 } | |
| 719 } | |
| 720 if($fields[2] eq "CDS"){ | |
| 721 #print STDERR "CDS value for $parent is $fields[2]..$fields[3]\n"; | |
| 722 if(not exists $feature_cds_min{$parent} or $fields[3] < $feature_cds_min{$parent}){ | |
| 723 $feature_cds_min{$parent} = $fields[3]; | |
| 724 } | |
| 725 if(not exists $feature_cds_max{$parent} or $fields[4] > $feature_cds_max{$parent}){ | |
| 726 $feature_cds_max{$parent} = $fields[4]; | |
| 727 } | |
| 728 if($fields[$#fields] =~ /transl_table \"(\d+)\"/){ | |
| 729 $feature_transl_table{$parent} = $1; #assume one translation table per CDS, which should be reasonable | |
| 730 } | |
| 731 while($fields[$#fields] =~ /transl_except \"pos:(\S+?),aa:(\S+?)\"/g){ | |
| 732 my $pos = $1; | |
| 733 my $new_aa = $2; # needs to change from three letter code to 1 | |
| 734 if($new_aa =~ /^ter/i){ # can be funny so have special case (allows TERM, etc.) | |
| 735 $new_aa = "*"; | |
| 736 } | |
| 737 elsif(length($new_aa) == 3){ | |
| 738 $new_aa = Bio::SeqUtils->new()->seq3in($new_aa); | |
| 739 } | |
| 740 if($pos =~ /^(\d+)\.\.(\d+)/){ | |
| 741 for my $p ($1..$2){ | |
| 742 $transl_except{"$fields[0]:$p"} = $new_aa; | |
| 743 } | |
| 744 } | |
| 745 else{ | |
| 746 $transl_except{"$fields[0]:$pos"} = $new_aa; | |
| 747 } | |
| 748 } | |
| 749 next; | |
| 750 } | |
| 751 if(not exists $feature_min{$parent} or $fields[3] < $feature_min{$parent}){ | |
| 752 $feature_min{$parent} = $fields[3]; | |
| 753 } | |
| 754 if(not exists $feature_max{$parent} or $fields[4] > $feature_max{$parent}){ | |
| 755 $feature_max{$parent} = $fields[4]; | |
| 756 } | |
| 757 | |
| 758 $feature_count++; | |
| 759 if(not exists $feature_range{$fields[0]}){ | |
| 760 $feature_range{$fields[0]} = {}; # Chr => {parentID => [start,stop]} | |
| 761 $feature_intervaltree{$fields[0]} = Set::IntervalTree->new(); | |
| 762 } | |
| 763 if(not exists $feature_range{$fields[0]}->{$parent}){ | |
| 764 $feature_range{$fields[0]}->{$parent} = []; | |
| 765 } | |
| 766 push @{$feature_range{$fields[0]}->{$parent}}, [$fields[3],$fields[4]]; | |
| 767 $feature_intervaltree{$fields[0]}->insert($parent, $fields[3], $fields[4]+1); # ranges need to have positive length for module to work properly | |
| 768 $feature_length{$parent} += $fields[4]-$fields[3]+1; | |
| 769 } | |
| 770 } | |
| 771 close(GTF); | |
| 772 print STDERR "\nFound $feature_count exons on ", scalar(keys %feature_range), " contigs in the GTF file\n" unless $quiet; | |
| 773 | |
| 774 for my $contig (keys %feature_range){ | |
| 775 for my $parent (keys %{$feature_range{$contig}}){ | |
| 776 # sort by subrange start | |
| 777 my @feature_ranges = sort {$a->[0] <=> $b->[0]} @{$feature_range{$contig}->{$parent}}; | |
| 778 $feature_range{$contig}->{$parent} = \@feature_ranges; | |
| 779 $feature_range{"chr".$contig}->{$parent} = \@feature_ranges if not $contig =~ /^chr/; | |
| 780 $feature_range{$1}->{$parent} = \@feature_ranges if $contig =~ /^chr(\S+)/; | |
| 781 } | |
| 782 } | |
| 783 | |
| 784 # Calculate the cDNA position of the leftmost (reference strand) base for each exon | |
| 785 for my $contig (keys %feature_range){ | |
| 786 for my $parent (keys %{$feature_range{$contig}}){ | |
| 787 my @feature_ranges = @{$feature_range{$contig}->{$parent}}; | |
| 788 if($feature_strand{$parent} eq "-"){ | |
| 789 # set up utr offset for correct CDS coordinates | |
| 790 my $feature_offset = 0; | |
| 791 for(my $i = $#feature_ranges; $i >= 0; $i--){ | |
| 792 last if not $feature_cds_max{$parent}; | |
| 793 # exon is completely 5' of the start | |
| 794 if($feature_ranges[$i]->[0] > $feature_cds_max{$parent}){ | |
| 795 $feature_offset -= $feature_ranges[$i]->[1]-$feature_ranges[$i]->[0]+1; | |
| 796 } | |
| 797 # exon with the cds start | |
| 798 elsif($feature_ranges[$i]->[1] >= $feature_cds_max{$parent} and | |
| 799 $feature_ranges[$i]->[0] <= $feature_cds_max{$parent}){ | |
| 800 $feature_offset += $feature_cds_max{$parent} - $feature_ranges[$i]->[1]; | |
| 801 last; | |
| 802 } | |
| 803 else{ | |
| 804 die "The CDS for $parent (on negative strand) ends downstream ", | |
| 805 "($feature_cds_max{$parent}) of the an exon", | |
| 806 " (", $feature_ranges[$i]->[0], "), which is illogical. Please revise the GFF file provided.\n"; | |
| 807 } | |
| 808 } | |
| 809 for(my $i = $#feature_ranges; $i >= 0; $i--){ | |
| 810 $feature_offset += $feature_ranges[$i]->[1]-$feature_ranges[$i]->[0]+1; | |
| 811 $feature_ranges[$i]->[2] = $feature_offset-1; | |
| 812 } | |
| 813 } | |
| 814 else{ # positive strand | |
| 815 # set up utr offset for correct CDS coordinates | |
| 816 my $feature_offset = 0; | |
| 817 for(my $i = 0; $i <= $#feature_ranges; $i++){ | |
| 818 last if not $feature_cds_min{$parent}; | |
| 819 # All 5' utr exon | |
| 820 if($feature_ranges[$i]->[1] < $feature_cds_min{$parent}){ | |
| 821 $feature_offset -= $feature_ranges[$i]->[1]-$feature_ranges[$i]->[0]+1; | |
| 822 } | |
| 823 # exon with the cds start | |
| 824 elsif($feature_ranges[$i]->[1] >= $feature_cds_min{$parent} and | |
| 825 $feature_ranges[$i]->[0] <= $feature_cds_min{$parent}){ | |
| 826 $feature_offset -= $feature_cds_min{$parent} - $feature_ranges[$i]->[0]; | |
| 827 last; | |
| 828 } | |
| 829 else{ | |
| 830 die "The CDS for $parent starts upstream ($feature_cds_min{$parent}) of the first exon", | |
| 831 " (", $feature_ranges[$i]->[0], "), which is illogical. Please revise the GFF file provided.\n"; | |
| 832 } | |
| 833 } | |
| 834 # assign cDNA coords for each exon to the third array element | |
| 835 for(my $i = 0; $i <= $#feature_ranges; $i++){ | |
| 836 $feature_ranges[$i]->[2] = $feature_offset; | |
| 837 $feature_offset += $feature_ranges[$i]->[1]-$feature_ranges[$i]->[0]+1; | |
| 838 } | |
| 839 } | |
| 840 } | |
| 841 } | |
| 842 | |
| 843 print STDERR "Reading in gene name definitions...\n" unless $quiet; | |
| 844 die "Data file $genename_bed_file does not exist, aborting.\n" if not -e $genename_bed_file; | |
| 845 my %gene_ids; | |
| 846 open(TAB, $genename_bed_file) | |
| 847 or die "Cannot open gene name BED file $genename_bed_file for reading: $!\n"; | |
| 848 while(<TAB>){ | |
| 849 chomp; | |
| 850 # format should be "chr start stop gene_name ..." | |
| 851 my @fields = split /\t/, $_; | |
| 852 next if $#fields < 3; | |
| 853 my $c = $fields[0]; | |
| 854 if(not exists $gene_ids{$c}){ | |
| 855 $gene_ids{$c} = Set::IntervalTree->new(); | |
| 856 } | |
| 857 $gene_ids{$c}->insert($fields[3], $fields[1], $fields[2]); | |
| 858 } | |
| 859 | |
| 860 # Print output header | |
| 861 open(OUT, ">$output_file") | |
| 862 or die "Cannot open $output_file for writing: $!\n"; | |
| 863 | |
| 864 print OUT join("\t", "Feature type", "Transcript length", "Selected transcript", "Transcript HGVS", "Strand", "Chr", "DNA From", "DNA To", "Zygosity", "P-value", "Variant Reads", "Total Reads", | |
| 865 "Ref base", "Obs base", "Pop. freq. source", "Pop. freq.", "Variant DB ID"), "\t", | |
| 866 ($internal_snp ? "Internal pop. freq.\t" : ""), | |
| 867 join("\t", "Protein HGVS", "Closest exon junction (AA coding variants)", "Gene Name", "Caveats", "Phase", "Num rare variants in gene (MAF <= $rare_variant_prop)", "Num rare coding and splice site variants in gene (MAF <= $rare_variant_prop)"),"\n"; | |
| 868 | |
| 869 # If there is CNV data, load it. | |
| 870 # BED columns should be chr start stop caveats ploidy . ignored ignored r,g,b | |
| 871 # The dot means the strand doesn't matter. | |
| 872 # where the first five fields are required, others optional | |
| 873 # where r,g,b is overloaded with father,mother ploidies and "b" is integer representing affected status logical AND (father bit mask 1, mother bit mask 2) | |
| 874 if(defined $cnv_file){ | |
| 875 print STDERR "Reading in CNV data...\n" unless $quiet; | |
| 876 open(CNV, $cnv_file) | |
| 877 or die "Cannot open $cnv_file for reading: $!\n"; | |
| 878 while(<CNV>){ | |
| 879 chomp; | |
| 880 my @F = split /\t/, $_, -1; | |
| 881 if(@F < 5){ | |
| 882 print STDERR "Skipping unparseable line ($cnv_file #$.): $_\n"; | |
| 883 next; | |
| 884 } | |
| 885 my $ploidy = $F[4]; | |
| 886 my $cnv_chr = $F[0]; | |
| 887 next if defined $which_chr and $cnv_chr ne $which_chr and "chr$cnv_chr" ne $which_chr and $cnv_chr ne "chr$which_chr"; | |
| 888 my $cnv_start = $F[1]; | |
| 889 my $cnv_end = $F[2]; | |
| 890 my $p_value = "NA"; | |
| 891 if($F[3] =~ s/p-value=(\S+?)(?:;|$)//){ | |
| 892 $p_value = $1; | |
| 893 next if $min_pvalue < $p_value; | |
| 894 } | |
| 895 | |
| 896 # Report a variant line for each gene that is found in this CNV | |
| 897 my $target_parents = $feature_intervaltree{$cnv_chr}->fetch($cnv_start, $cnv_end+1); | |
| 898 | |
| 899 my $caveats = ""; | |
| 900 if(@F == 9){ | |
| 901 my @parents_ploidy = split /,/, $F[8]; | |
| 902 if($parents_ploidy[2] == 0){ # neither parent affected | |
| 903 if($ploidy < $parents_ploidy[0] and $ploidy < $parents_ploidy[1]){ | |
| 904 if($ploidy > 2){ | |
| 905 $caveats = "Polyploidy is less severe than in either unaffected parents"; | |
| 906 } | |
| 907 # else: no caveats, this offspring has fewer copies than normally observed, or in unaffected parents | |
| 908 elsif($ploidy < 2){ | |
| 909 if($parents_ploidy[0] == 2 and $parents_ploidy[1] == 2){ | |
| 910 $caveats = "De novo copy loss, unaffected parents are diploid"; | |
| 911 } | |
| 912 else{ | |
| 913 $caveats = "Copy loss is greater than in either unaffected parent"; | |
| 914 } | |
| 915 } | |
| 916 } | |
| 917 elsif($ploidy >= $parents_ploidy[0] and $ploidy <= $parents_ploidy[1] or | |
| 918 $ploidy >= $parents_ploidy[1] and $ploidy <= $parents_ploidy[0]){ | |
| 919 $caveats = "Aneuploidy likely inherited from an unaffected parent"; | |
| 920 } | |
| 921 elsif($ploidy > $parents_ploidy[0] and $ploidy > $parents_ploidy[1]){ | |
| 922 if($parents_ploidy[0] > 2){ | |
| 923 if($parents_ploidy[1] > 2){ | |
| 924 $caveats = "Lower polyploidy already exists in both unaffected parents"; | |
| 925 } | |
| 926 else{ | |
| 927 $caveats = "Lower polyploidy already exists in unaffected father"; | |
| 928 } | |
| 929 } | |
| 930 else{ | |
| 931 if($parents_ploidy[1] > 2){ | |
| 932 $caveats = "Lower polyploidy already exists in unaffected mother"; | |
| 933 } | |
| 934 # else no caveats, because both parents are "normal", yet we have polyploidy in the offspring | |
| 935 else{ | |
| 936 $caveats = "De novo polyploidy, unaffected parents are diploid"; | |
| 937 } | |
| 938 } | |
| 939 } | |
| 940 # else | |
| 941 else{ | |
| 942 die "Oops! Error in program logic...how did we get here (unaffected parents)? $_"; | |
| 943 } | |
| 944 } | |
| 945 elsif($parents_ploidy[2] == 1){ # father affected | |
| 946 if($ploidy == $parents_ploidy[1]){ # just like unaffected Mom | |
| 947 if($ploidy > 2){ | |
| 948 if($ploidy == $parents_ploidy[0]){ | |
| 949 $caveats = "Same polyploidy present in both affected and unaffected parents"; | |
| 950 } | |
| 951 else{ | |
| 952 $caveats = "Polyploidy inherited from unaffected mother"; | |
| 953 } | |
| 954 } | |
| 955 elsif($ploidy < 2){ | |
| 956 if($ploidy == $parents_ploidy[0]){ | |
| 957 $caveats = "Same copy loss in both affected and unaffected parents"; | |
| 958 } | |
| 959 else{ | |
| 960 $caveats = "Copy loss is shared with unaffected mother"; | |
| 961 } | |
| 962 } | |
| 963 else{ | |
| 964 if($ploidy == $parents_ploidy[0]){ | |
| 965 # Why was this even reported? parents and child have diploid status... | |
| 966 next; | |
| 967 } | |
| 968 $caveats = "Diploidy is shared with unaffected mother"; | |
| 969 } | |
| 970 } | |
| 971 elsif($ploidy > 2){ # polyploidy | |
| 972 if($parents_ploidy[0] == 2){ | |
| 973 if($parents_ploidy[1] > 2){ | |
| 974 $caveats = "Unaffected mother has polyploidy (".$parents_ploidy[1]."x), but affected father is diploid"; | |
| 975 } | |
| 976 elsif($parents_ploidy[1] == 2){ | |
| 977 $caveats = "Both unaffected mother and affected father are diploid"; | |
| 978 } | |
| 979 else{ | |
| 980 $caveats = "Affected father is diploid, unaffected mother has copy loss (".$parents_ploidy[1]."x)"; | |
| 981 } | |
| 982 } | |
| 983 elsif($parents_ploidy[0] < 2){ | |
| 984 $caveats = "Polyploidy found, but affected father had copy loss (".$parents_ploidy[0]."x)"; | |
| 985 } | |
| 986 elsif($ploidy < $parents_ploidy[1]){ | |
| 987 $caveats = "Polyploidy is less severe than in unaffected mother (".$parents_ploidy[1]."x), or affected father (".$parents_ploidy[0]."x)"; | |
| 988 } | |
| 989 # past here the ploidy is great than in the unaffected mother | |
| 990 elsif($parents_ploidy[1] < 2){ | |
| 991 $caveats = "Polyploidy is also severe in affected father (".$parents_ploidy[0]."x), but unaffected mother actually had copy loss (". $parents_ploidy[1]. "x)"; | |
| 992 } | |
| 993 elsif($parents_ploidy[1] == 2){ | |
| 994 $caveats = "Polyploidy is also severe in affected father (".$parents_ploidy[0]."x), and mother is diploid"; | |
| 995 } | |
| 996 elsif($ploidy < $parents_ploidy[0]){ | |
| 997 $caveats = "Polyploidy is less severe than in affected father (".$parents_ploidy[0]."x), but more severe than unaffected mother (". $parents_ploidy[1]. "x)"; | |
| 998 } | |
| 999 elsif($ploidy > $parents_ploidy[0]){ | |
| 1000 $caveats = "Polyploidy is more severe than in affected father (".$parents_ploidy[0]."x)"; | |
| 1001 } | |
| 1002 else{ | |
| 1003 $caveats = "Polyploidy is as severe as in affected father"; | |
| 1004 } | |
| 1005 } | |
| 1006 elsif($ploidy == 2){ | |
| 1007 # Don't report diploid status, any funny recombination should show up in large indel analysis | |
| 1008 next; | |
| 1009 } | |
| 1010 else{ # copies < 2 | |
| 1011 if($ploidy == $parents_ploidy[0]){ | |
| 1012 if($ploidy > $parents_ploidy[1]){ | |
| 1013 $caveats = "Copy loss is the same as affected father, but less than unaffected mother (". $parents_ploidy[1]. "x)"; | |
| 1014 } | |
| 1015 else{ | |
| 1016 $caveats = "Copy loss is as severe as in affected father"; | |
| 1017 } | |
| 1018 } | |
| 1019 elsif($ploidy > $parents_ploidy[0]){ | |
| 1020 if($ploidy > $parents_ploidy[1]){ | |
| 1021 if($parents_ploidy[1] == 0 and $parents_ploidy[0] == 0){ | |
| 1022 $caveats = "Poor mapping, or Mendelian inheritence violation is severe: no copies of region in either parent, but present in offspring"; | |
| 1023 } | |
| 1024 elsif($ploidy == 2){ | |
| 1025 next; # child got best of both parents, ignore from CNV standpoint (may still have SNPs of course, or translocation, etc.) | |
| 1026 } | |
| 1027 else{ | |
| 1028 $caveats = "Copy loss is less severe than in unaffected mother (".$parents_ploidy[1]."x), or affected father (".$parents_ploidy[0]."x)"; | |
| 1029 } | |
| 1030 } | |
| 1031 # else: child has less copies than unaffected mom, but more than affected Dad | |
| 1032 else{ | |
| 1033 if($parents_ploidy[1] > 2){ | |
| 1034 $caveats = "Copy loss was more severe in affected father (".$parents_ploidy[0]."x), but unaffected mother had polyploidy (".$parents_ploidy[1]."x)"; | |
| 1035 } | |
| 1036 elsif($parents_ploidy[1] == 2){ | |
| 1037 $caveats = "Copy loss was more severe in affected father (".$parents_ploidy[0]."x), but unaffected mother was diploid"; | |
| 1038 } | |
| 1039 else{ # unaffected has loss | |
| 1040 $caveats = "Copy loss is more severe than unaffect mother (".$parents_ploidy[1]."x), but less severe than affected father (".$parents_ploidy[0]."x)"; | |
| 1041 } | |
| 1042 } | |
| 1043 } | |
| 1044 # past here, ploidy is less than affected father | |
| 1045 elsif($parents_ploidy[1] > 2){ | |
| 1046 $caveats = "Copy loss is more severe than affected father (".$parents_ploidy[0]."x), and unaffected mother had polyploidy (".$parents_ploidy[1]."x)"; | |
| 1047 } | |
| 1048 elsif($parents_ploidy[1] == 2){ | |
| 1049 $caveats = "Copy loss is more severe than in affected father (".$parents_ploidy[0]."x)"; | |
| 1050 } | |
| 1051 else{ | |
| 1052 $caveats = "Copy loss is more severe than in both unaffect mother (".$parents_ploidy[1]."x), and affected father (".$parents_ploidy[0]."x)"; | |
| 1053 } | |
| 1054 } | |
| 1055 } | |
| 1056 elsif($parents_ploidy[2] == 2){ # mother affected | |
| 1057 if($ploidy == $parents_ploidy[0]){ # just like unaffected Dad | |
| 1058 if($ploidy > 2){ | |
| 1059 if($ploidy == $parents_ploidy[1]){ | |
| 1060 $caveats = "Same polyploidy present in both affected and unaffected parents"; | |
| 1061 } | |
| 1062 else{ | |
| 1063 $caveats = "Polyploidy inherited from unaffected father"; | |
| 1064 } | |
| 1065 } | |
| 1066 elsif($ploidy < 2){ | |
| 1067 if($ploidy == $parents_ploidy[1]){ | |
| 1068 $caveats = "Same copy loss in both affected and unaffected parents"; | |
| 1069 } | |
| 1070 else{ | |
| 1071 $caveats = "Copy loss is shared with unaffected father"; | |
| 1072 } | |
| 1073 } | |
| 1074 else{ | |
| 1075 if($ploidy == $parents_ploidy[1]){ | |
| 1076 # Why was this even reported? parents and child have diploid status... | |
| 1077 next; | |
| 1078 } | |
| 1079 $caveats = "Diploidy is shared with unaffected father"; | |
| 1080 } | |
| 1081 } | |
| 1082 elsif($ploidy > 2){ # polyploidy | |
| 1083 if($parents_ploidy[1] == 2){ | |
| 1084 if($parents_ploidy[0] > 2){ | |
| 1085 $caveats = "Unaffected father has polyploidy (".$parents_ploidy[0]."x), but affected mother is diploid"; | |
| 1086 } | |
| 1087 elsif($parents_ploidy[0] == 2){ | |
| 1088 $caveats = "Both unaffected father and affected mother are diploid"; | |
| 1089 } | |
| 1090 else{ | |
| 1091 $caveats = "Affected mother is diploid, unaffected father has copy loss (".$parents_ploidy[1]."x)"; | |
| 1092 } | |
| 1093 } | |
| 1094 elsif($parents_ploidy[1] < 2){ | |
| 1095 $caveats = "Polyploidy found, but affected mother had copy loss (".$parents_ploidy[1]."x)"; | |
| 1096 } | |
| 1097 elsif($ploidy < $parents_ploidy[0]){ | |
| 1098 $caveats = "Polyploidy is less severe than in unaffected father (".$parents_ploidy[0]."x), or affected mother (".$parents_ploidy[1]."x)"; | |
| 1099 } | |
| 1100 # past here the ploidy is great than in the unaffected father | |
| 1101 elsif($parents_ploidy[0] < 2){ | |
| 1102 $caveats = "Polyploidy is also severe in affected mother (".$parents_ploidy[1]."x), but unaffected father actually had copy loss (". $parents_ploidy[0]. "x)"; | |
| 1103 } | |
| 1104 elsif($parents_ploidy[0] == 2){ | |
| 1105 $caveats = "Polyploidy is also severe in affected mother (".$parents_ploidy[1]."x), and unaffected father is diploid"; | |
| 1106 } | |
| 1107 elsif($ploidy < $parents_ploidy[1]){ | |
| 1108 $caveats = "Polyploidy is less severe than in affected mother (".$parents_ploidy[1]."x), but more severe than unaffected father (". $parents_ploidy[0]. "x)"; | |
| 1109 } | |
| 1110 elsif($ploidy > $parents_ploidy[1]){ | |
| 1111 $caveats = "Polyploidy is more severe than in affected mother (".$parents_ploidy[1]."x)"; | |
| 1112 } | |
| 1113 else{ | |
| 1114 $caveats = "Polyploidy is as severe as in affected mother"; | |
| 1115 } | |
| 1116 } | |
| 1117 elsif($ploidy == 2){ | |
| 1118 # Don't report diploid status, any funny recombination should show up in large indel analysis | |
| 1119 next; | |
| 1120 } | |
| 1121 else{ # copies < 2 | |
| 1122 if($ploidy == $parents_ploidy[1]){ | |
| 1123 if($ploidy > $parents_ploidy[0]){ | |
| 1124 $caveats = "Copy loss is the same as affected mother, but less than unaffected father (". $parents_ploidy[0]. "x)"; | |
| 1125 } | |
| 1126 else{ | |
| 1127 $caveats = "Copy loss is as severe as in affected mother"; | |
| 1128 } | |
| 1129 } | |
| 1130 elsif($ploidy > $parents_ploidy[1]){ | |
| 1131 if($ploidy > $parents_ploidy[0]){ | |
| 1132 if($parents_ploidy[1] == 0 and $parents_ploidy[0] == 0){ | |
| 1133 $caveats = "Poor mapping, or Mendelian inheritence violation is severe: no copies of region in either parent, but present in offspring"; | |
| 1134 } | |
| 1135 elsif($ploidy == 2){ | |
| 1136 next; # child got best of both parents, ignore from CNV standpoint (may still have SNPs of course, or translocation, etc.) | |
| 1137 } | |
| 1138 else{ | |
| 1139 $caveats = "Copy loss is less severe than in unaffected father (".$parents_ploidy[0]."x), or affected mother (".$parents_ploidy[1]."x)"; | |
| 1140 } | |
| 1141 } | |
| 1142 # else: child has less copies than unaffected Dad, but more than affected Mom | |
| 1143 else{ | |
| 1144 if($parents_ploidy[0] > 2){ | |
| 1145 $caveats = "Copy loss was more severe in affected mother (".$parents_ploidy[1]."x), but unaffected father had polyploidy (".$parents_ploidy[0]."x)"; | |
| 1146 } | |
| 1147 elsif($parents_ploidy[0] == 2){ | |
| 1148 $caveats = "Copy loss was more severe in affected mother (".$parents_ploidy[1]."x), but unaffected father was diploid"; | |
| 1149 } | |
| 1150 else{ # unaffected has loss | |
| 1151 $caveats = "Copy loss is more severe than unaffect father (".$parents_ploidy[0]."x), but less severe than affected mother (".$parents_ploidy[1]."x)"; | |
| 1152 } | |
| 1153 } | |
| 1154 } | |
| 1155 # past here, ploidy is less than affected mother | |
| 1156 elsif($parents_ploidy[0] > 2){ | |
| 1157 $caveats = "Copy loss is more severe than affected mother (".$parents_ploidy[1]."x), and unaffected father had polyploidy (".$parents_ploidy[0]."x)"; | |
| 1158 } | |
| 1159 elsif($parents_ploidy[0] == 2){ | |
| 1160 $caveats = "Copy loss is more severe than in affected mother (".$parents_ploidy[1]."x)"; | |
| 1161 } | |
| 1162 else{ | |
| 1163 $caveats = "Copy loss is more severe than in both unaffect father (".$parents_ploidy[0]."x), and affected mother (".$parents_ploidy[1]."x)"; | |
| 1164 } | |
| 1165 } | |
| 1166 } | |
| 1167 | |
| 1168 } | |
| 1169 if($F[3] and $F[3] ne "-"){ # prexisting caveat from CNV caller | |
| 1170 if(defined $caveats){ | |
| 1171 $caveats .= "; $F[3]" unless $caveats =~ /\b$F[3]\b/; | |
| 1172 } | |
| 1173 else{ | |
| 1174 $caveats = $F[3]; | |
| 1175 } | |
| 1176 } | |
| 1177 | |
| 1178 # Sort by start for consistency | |
| 1179 my @target_parents = sort {$feature_range{$cnv_chr}->{$a}->[0]->[0] <=> $feature_range{$cnv_chr}->{$b}->[0]->[0]} @$target_parents; | |
| 1180 | |
| 1181 for my $target_parent (@target_parents){ | |
| 1182 my $target_caveats = $caveats; | |
| 1183 my $strand = $feature_strand{$target_parent}; | |
| 1184 # report the gain/loss of each gene separately, for simplicity in downstream analysis | |
| 1185 my $cnv_exon_start = 10000000000; # genomic coords | |
| 1186 my $cnv_exon_end = 0; | |
| 1187 my $cnv_cdna_start = 0; # cDNA coords | |
| 1188 my $cnv_cdna_end = 0; | |
| 1189 my $off5 = 0; # border outside the exon? | |
| 1190 my $off3 = 0; | |
| 1191 my @feature_ranges = @{$feature_range{$cnv_chr}->{$target_parent}}; | |
| 1192 # find the first and last exons in the gene that are inside the CNV | |
| 1193 for my $subregion (@feature_ranges){ | |
| 1194 # exon overlaps CNV? | |
| 1195 if($subregion->[0] <= $cnv_end and $subregion->[1] >= $cnv_start){ | |
| 1196 if($cnv_exon_start > $subregion->[0]){ | |
| 1197 if($cnv_start < $subregion->[0]){ | |
| 1198 $cnv_exon_start = $subregion->[0]; $off5 = 1; | |
| 1199 $cnv_cdna_start = $subregion->[2]; | |
| 1200 } | |
| 1201 else{ | |
| 1202 $cnv_exon_start = $cnv_start; $off5 = 0; | |
| 1203 $cnv_cdna_start = $subregion->[2]+($strand eq "-" ? $subregion->[0]-$cnv_start: $cnv_start-$subregion->[0]); | |
| 1204 } | |
| 1205 } | |
| 1206 if($cnv_exon_end < $subregion->[1]){ | |
| 1207 if($cnv_end > $subregion->[1]){ | |
| 1208 $cnv_exon_end = $subregion->[1]; $off3 = 1; | |
| 1209 $cnv_cdna_end = $subregion->[2]+($strand eq "-" ? $subregion->[0]-$subregion->[1] : $subregion->[1]-$subregion->[0]); | |
| 1210 } | |
| 1211 else{ | |
| 1212 $cnv_exon_end = $cnv_end; $off3 = 0; | |
| 1213 $cnv_cdna_end = $subregion->[2]+($strand eq "-" ? $subregion->[0]-$cnv_end : $cnv_end-$subregion->[0]); | |
| 1214 } | |
| 1215 } | |
| 1216 } | |
| 1217 } | |
| 1218 | |
| 1219 my $ends_internally = 0; | |
| 1220 if($cnv_exon_end == 0){ # ends inside the exon | |
| 1221 $cnv_exon_end = $cnv_end; | |
| 1222 $ends_internally = 1; | |
| 1223 } | |
| 1224 # See if it's in the structural variant database | |
| 1225 my @gain_coverage; $#gain_coverage = $cnv_exon_end-$cnv_exon_start; # preallocate blanks | |
| 1226 my @loss_coverage; $#loss_coverage = $cnv_exon_end-$cnv_exon_start; # preallocate blanks | |
| 1227 my $dgv_loss_id; # report the DGV entry that covers most of the observed structural variant | |
| 1228 my $dgv_loss_length = 0; # report the DGV entry that covers most of the observed structural variant | |
| 1229 my $dgv_gain_id; # report the DGV entry that covers most of the observed structural variant | |
| 1230 my $dgv_gain_length = 0; # report the DGV entry that covers most of the observed structural variant | |
| 1231 my $gains; | |
| 1232 my $losses; | |
| 1233 my $dgv_chr = $cnv_chr; | |
| 1234 $dgv_chr =~ s/^chr//; # no prefix in DGV | |
| 1235 #open(DGV, "tabix $dgv_file $dgv_chr:$cnv_exon_start-$cnv_exon_end |") # check out CNV in this gene model region | |
| 1236 # or die "Cannot run tabix: $!\n"; | |
| 1237 open(DGV, "/dev/null"); | |
| 1238 while(<DGV>){ | |
| 1239 my @C = split /\t/, $_; | |
| 1240 next if $C[4] ne "CNV"; # todo: handle indels? | |
| 1241 my $dgv_start = $C[2]; | |
| 1242 my $dgv_end = $C[3]; | |
| 1243 my $dgv_direction = $C[5]; | |
| 1244 my $gain_cov_count = 0; | |
| 1245 my $loss_cov_count = 0; | |
| 1246 if($dgv_direction eq "Gain"){ | |
| 1247 for(my $i = ($dgv_start < $cnv_exon_start ? $cnv_exon_start : $dgv_start); $i <= $dgv_end and $i <= $cnv_exon_end; $i++){ | |
| 1248 $gain_coverage[$i-$cnv_exon_start] = 1 unless defined $gain_coverage[$i-$cnv_exon_start]; | |
| 1249 $gain_cov_count++; | |
| 1250 } | |
| 1251 } | |
| 1252 elsif($dgv_direction eq "Loss"){ | |
| 1253 for(my $i = ($dgv_start < $cnv_exon_start ? $cnv_exon_start : $dgv_start); $i <= $dgv_end and $i <= $cnv_exon_end; $i++){ | |
| 1254 $loss_coverage[$i-$cnv_exon_start] = 1 unless defined $loss_coverage[$i-$cnv_exon_start]; | |
| 1255 $loss_cov_count++; | |
| 1256 } | |
| 1257 } | |
| 1258 if($dgv_direction eq "Gain" and $gain_cov_count > $dgv_gain_length){ | |
| 1259 $dgv_gain_id = $C[0]; | |
| 1260 $dgv_gain_length = $gain_cov_count; | |
| 1261 } | |
| 1262 if($dgv_direction eq "Loss" and $loss_cov_count > $dgv_loss_length){ | |
| 1263 $dgv_loss_id = $C[0]; | |
| 1264 $dgv_loss_length = $loss_cov_count; | |
| 1265 } | |
| 1266 } | |
| 1267 close(DGV); | |
| 1268 | |
| 1269 my $gain_coverage = 0; | |
| 1270 for my $count (@gain_coverage){ | |
| 1271 $gain_coverage++ if defined $count; | |
| 1272 } | |
| 1273 $gain_coverage = sprintf "%.3f", $gain_coverage/($cnv_exon_end-$cnv_exon_start+1); # make it a proportion | |
| 1274 my $loss_coverage = 0; | |
| 1275 for my $count (@loss_coverage){ | |
| 1276 $loss_coverage++ if defined $count; | |
| 1277 } | |
| 1278 $loss_coverage = sprintf "%.3f", $loss_coverage/($cnv_exon_end-$cnv_exon_start+1); # make it a proportion | |
| 1279 | |
| 1280 my $src = "DGV"; | |
| 1281 my $dgv_id = "NA"; | |
| 1282 my $dgv_caveat; | |
| 1283 my $dgv_coverage; | |
| 1284 if($ploidy > 2){ | |
| 1285 if(not defined $dgv_gain_id){ | |
| 1286 if(defined $dgv_loss_id){ | |
| 1287 $dgv_id = sprintf "%s/%.3f", $dgv_loss_id, $dgv_loss_length/($cnv_exon_end-$cnv_exon_start+1); | |
| 1288 $dgv_caveat = "; No gains are known in healthy controls, the DGV2 ID reported is for a loss in the same area"; | |
| 1289 $dgv_coverage = $loss_coverage; | |
| 1290 } | |
| 1291 else{ | |
| 1292 $dgv_id = "novel"; | |
| 1293 $dgv_coverage = "NA"; | |
| 1294 $src = "NA"; | |
| 1295 } | |
| 1296 } | |
| 1297 else{ | |
| 1298 $dgv_id = sprintf "%s/%.3f", $dgv_gain_id, $dgv_gain_length/($cnv_exon_end-$cnv_exon_start+1); | |
| 1299 $dgv_coverage = $gain_coverage; | |
| 1300 } | |
| 1301 } | |
| 1302 elsif($ploidy < 2){ | |
| 1303 if(not defined $dgv_loss_id){ | |
| 1304 if(defined $dgv_gain_id){ | |
| 1305 $dgv_id = sprintf "%s/%.3f", $dgv_gain_id, $dgv_gain_length/($cnv_exon_end-$cnv_exon_start+1); | |
| 1306 $dgv_caveat = "; No losses are known in healthy controls, the DGV2 ID reported is for a gain in the same area"; | |
| 1307 $dgv_coverage = $gain_coverage; | |
| 1308 } | |
| 1309 else{ | |
| 1310 $dgv_id = "novel"; | |
| 1311 $dgv_coverage = "NA"; | |
| 1312 $src = "NA"; | |
| 1313 } | |
| 1314 } | |
| 1315 else{ | |
| 1316 $dgv_id = sprintf "%s/%.3f", $dgv_loss_id, $dgv_loss_length/($cnv_exon_end-$cnv_exon_start+1); | |
| 1317 $dgv_coverage = $loss_coverage; | |
| 1318 } | |
| 1319 } | |
| 1320 | |
| 1321 my $non_coding = 0; | |
| 1322 if(not exists $feature_cds_max{$target_parent} or not defined $feature_cds_max{$target_parent} or $feature_cds_max{$target_parent} eq ""){ | |
| 1323 $non_coding = 1; | |
| 1324 } | |
| 1325 $target_caveats .= $dgv_caveat if defined $dgv_caveat and $dgv_id ne "novel" and $target_caveats !~ /\Q$dgv_caveat\E/; | |
| 1326 #print "Recorded $cnv_chr:$cnv_start caveat $caveats\n"; | |
| 1327 # if it doesn't overlap an exon, we need to find out which two exons it's between | |
| 1328 if($ends_internally){ | |
| 1329 my $intron_found = 0; | |
| 1330 for(my $i = 0; $i < $#feature_ranges; $i++){ | |
| 1331 if($feature_ranges[$i]->[1] < $cnv_start and $feature_ranges[$i+1]->[0] > $cnv_end){ | |
| 1332 if($ploidy > 2){ # gain | |
| 1333 if($strand eq "-"){ | |
| 1334 record_snv("$target_parent\t", | |
| 1335 ($non_coding ? "g.$cnv_start\_$cnv_end" : | |
| 1336 "c.".($feature_ranges[$i+1]->[2])."+".($feature_ranges[$i+1]->[0]-$cnv_end)."_".($feature_ranges[$i+1]->[2]+1)."-".($cnv_start-$feature_ranges[$i]->[1])), | |
| 1337 # pos Zygosity P-value Variant Reads Total Reads Ref Bases Var Bases Population Frequency Source Pop. freq. or DGV2 gain/loss coverage dbSNP or DGV2 ID | |
| 1338 "[".($ploidy-1)."]\t$strand\t$cnv_chr\t$cnv_start\t$cnv_end\tNA\t$p_value\tNA\tNA\t", | |
| 1339 "NA\tNA\t$src\t$dgv_coverage\t$dgv_id\tNA\tNA\t".range2genes($cnv_chr,$cnv_start,$cnv_end)."\t$target_caveats\t\n"); | |
| 1340 } | |
| 1341 else{ | |
| 1342 record_snv("$target_parent\t", | |
| 1343 ($non_coding ? "g.$cnv_start\_$cnv_end" : | |
| 1344 "c.".($feature_ranges[$i+1]->[2]-1)."+".($cnv_start-$feature_ranges[$i]->[1])."_".$feature_ranges[$i+1]->[2]."-".($feature_ranges[$i+1]->[0]-$cnv_end)), | |
| 1345 "[".($ploidy-1)."]\t$strand\t$cnv_chr\t$cnv_start\t$cnv_end\tNA\t$p_value\tNA\tNA\t", | |
| 1346 "NA\tNA\t$src\t$dgv_coverage\t$dgv_id\tNA\tNA\t".range2genes($cnv_chr,$cnv_start,$cnv_end)."\t$target_caveats\t\n"); | |
| 1347 } | |
| 1348 } | |
| 1349 else{ # loss | |
| 1350 if($strand eq "-"){ | |
| 1351 record_snv("$target_parent\t", | |
| 1352 ($non_coding ? "g.$cnv_start\_$cnv_end" : | |
| 1353 "c.".($feature_ranges[$i+1]->[2])."+".($feature_ranges[$i+1]->[0]-$cnv_end)."_".($feature_ranges[$i+1]->[2]+1)."-".($cnv_start-$feature_ranges[$i]->[1])), | |
| 1354 "del\t$strand\t$cnv_chr\t$cnv_start\t$cnv_end\t", ($ploidy == 1 ? "heterozygote" : "homozygote"), "\t$p_value\tNA\tNA\t", | |
| 1355 "NA\tNA\t$src\t$dgv_coverage\t$dgv_id\tNA\tNA\t".range2genes($cnv_chr,$cnv_start,$cnv_end)."\t$target_caveats\t\n"); | |
| 1356 } | |
| 1357 else{ | |
| 1358 record_snv("$target_parent\t", | |
| 1359 ($non_coding ? "g.$cnv_start\_$cnv_end" : | |
| 1360 "c.".($feature_ranges[$i+1]->[2]-1)."+".($cnv_start-$feature_ranges[$i]->[1])."_".$feature_ranges[$i+1]->[2]."-".($feature_ranges[$i+1]->[0]-$cnv_end)), | |
| 1361 "del\t$strand\t$cnv_chr\t$cnv_start\t$cnv_end\t", ($ploidy == 1 ? "heterozygote" : "homozygote"), "\t$p_value\tNA\tNA\t", | |
| 1362 "NA\tNA\t$src\t$dgv_coverage\t$dgv_id\tNA\tNA\t".range2genes($cnv_chr,$cnv_start,$cnv_end)."\t$target_caveats\t\n"); | |
| 1363 } | |
| 1364 } | |
| 1365 $intron_found = 1; last; | |
| 1366 } | |
| 1367 } | |
| 1368 warn "Logic error: CNV overlaps a gene ($target_parent), but is neither intronic nor exonic. Offending CNV was $_\n" unless $intron_found; | |
| 1369 next; | |
| 1370 } | |
| 1371 if($strand eq "-"){ | |
| 1372 my $tmp = $cnv_cdna_start; | |
| 1373 $cnv_cdna_start = $cnv_cdna_end; | |
| 1374 $cnv_cdna_end = $tmp; | |
| 1375 } | |
| 1376 # Make the location label pretty descriptive | |
| 1377 my $cnv_phase = ""; | |
| 1378 if($cnv_exon_start > $cnv_start or $cnv_exon_end < $cnv_end){ | |
| 1379 $cnv_phase = "CNV-$cnv_chr:$cnv_start-$cnv_end"; # Use phase to note that it's part of a bigger CNV than just the range of this feature | |
| 1380 } | |
| 1381 # if we get here, we're in a gained/deleted exon category | |
| 1382 # CNVs are fuzzy-edged (as opposed to large indels), so produce HGVS syntax that reflect this | |
| 1383 if($ploidy > 2){ # gain | |
| 1384 # find the exons encompassed by the CNV. NOTE that we assume that polyploidies are proximal | |
| 1385 record_snv("$target_parent\t", | |
| 1386 ($non_coding ? "g.".($cnv_exon_start > $cnv_start ? "$cnv_exon_start-?" : $cnv_start)."_".($cnv_exon_end < $cnv_end ? "$cnv_exon_end+?" : $cnv_end) : | |
| 1387 "c.$cnv_cdna_start".($off5?"-?":"")."_$cnv_cdna_end".($off3?"+?":"")), | |
| 1388 "[".($ploidy-1)."]\t$strand\t$cnv_chr\t$cnv_exon_start\t$cnv_exon_end\tNA\t$p_value\tNA\tNA\t", | |
| 1389 "NA\tNA\t$src\t$dgv_coverage\t$dgv_id\tNA\tNA\t".range2genes($cnv_chr,$cnv_start,$cnv_end)."\t$target_caveats\t$cnv_phase\n"); | |
| 1390 } | |
| 1391 else{ # loss | |
| 1392 #translate genome coordinates into cDNA coordinates | |
| 1393 record_snv("$target_parent\t", | |
| 1394 ($non_coding ? "g.".($cnv_exon_start > $cnv_start ? "$cnv_exon_start-?" : $cnv_start)."_".($cnv_exon_end < $cnv_end ? "$cnv_exon_end+?" : $cnv_end) : | |
| 1395 "c.$cnv_cdna_start".($off5?"-?":"")."_$cnv_cdna_end".($off3?"+?":"")), | |
| 1396 "del\t$strand\t$cnv_chr\t$cnv_exon_start\t$cnv_exon_end\t", ($ploidy == 1 ? "heterozygote" : "homozygote"), "\t$p_value\tNA\tNA\t", | |
| 1397 "NA\tNA\t$src\t$dgv_coverage\t$dgv_id\tNA\tNA\t".range2genes($cnv_chr,$cnv_start,$cnv_end)."\t$target_caveats\t$cnv_phase\n"); | |
| 1398 } | |
| 1399 } | |
| 1400 } | |
| 1401 close(CNV); | |
| 1402 | |
| 1403 } | |
| 1404 | |
| 1405 | |
| 1406 #sort genes by start, then longest if tied | |
| 1407 my %rc = qw(A T T A G C C G N N S W W S K M M K Y R R Y X X); | |
| 1408 print STDERR "Processing variant calls..." unless $quiet; | |
| 1409 %chr_read = (); | |
| 1410 open(VCFIN, $input_file) | |
| 1411 or die "Cannot open $input_file for reading: $!\n"; | |
| 1412 while(<VCFIN>){ | |
| 1413 if(/^\s*(?:#|$)/){ # blank or hash comment lines | |
| 1414 next; | |
| 1415 } | |
| 1416 chomp; | |
| 1417 my @fields = split /\t/, $_; | |
| 1418 | |
| 1419 next unless exists $feature_range{$fields[0]}; | |
| 1420 if(not $quiet and not exists $chr_read{$fields[0]}){ | |
| 1421 print STDERR " $fields[0]"; | |
| 1422 $chr_read{$fields[0]} = 1; | |
| 1423 #print STDERR "(not in reference file!)" unless exists $feature_range{$fields[0]}; | |
| 1424 } | |
| 1425 | |
| 1426 next if $fields[4] eq "<NON_REF>"; # GVCF background stuff | |
| 1427 next if $fields[9] eq "./." or $fields[9] eq "."; # no call | |
| 1428 my $chr = $fields[0]; | |
| 1429 next if defined $which_chr and $chr ne $which_chr and $chr ne "chr$which_chr" and "chr$chr" ne $which_chr; | |
| 1430 print STDERR "passes chr and field # test" if grep /dataset_7684.dat/, @ARGV; | |
| 1431 $chr = "chr$chr" if $chr !~ /^chr/; | |
| 1432 $fields[8] =~ s/\s+$//; | |
| 1433 my @values = split /:/, $fields[9]; | |
| 1434 #print STDERR join(" / ", @values), "\n" if $. == 84; | |
| 1435 my @keys = split /:/, $fields[8]; | |
| 1436 my $zygosity = "n/a"; | |
| 1437 my $quality = "n/a"; | |
| 1438 my $read_depth = "n/a"; | |
| 1439 my $variant_depths = "n/a"; | |
| 1440 for(my $i = 0; $i <= $#keys and $i <= $#values; $i++){ # values max index check because some genotypers are nasty and don't provide as many fields as they say they will | |
| 1441 if($keys[$i] eq "GT"){ | |
| 1442 if($values[$i] =~ /\./ or $values[$i] =~ /0\/0/){ # one genotype not described | |
| 1443 $zygosity = "none"; | |
| 1444 last; | |
| 1445 } | |
| 1446 else{ # genotypes described | |
| 1447 $zygosity = $values[$i] =~ /[02]/ ? "heterozygote" : "homozygote"; | |
| 1448 } | |
| 1449 } | |
| 1450 elsif($keys[$i] eq "DNM_CONFIG" and $zygosity eq "n/a"){ # hack | |
| 1451 $zygosity = $values[$i] =~ /^(.)\1/ ? "homozygote" : "heterozygote"; | |
| 1452 } | |
| 1453 elsif($keys[$i] eq "GQ" and $values[$i] ne "."){ | |
| 1454 #print "Checking GQ (index $i) $values[$i] gq2p\n" if $. == 84; | |
| 1455 $quality = gq2p($values[$i]); | |
| 1456 } | |
| 1457 elsif($keys[$i] eq "RD"){ # from some tools like denovo variant finders | |
| 1458 $read_depth = $values[$i]; | |
| 1459 } | |
| 1460 elsif($keys[$i] eq "DP"){ | |
| 1461 $read_depth = $values[$i]; | |
| 1462 } | |
| 1463 # the frequency of the variant can go by many names, here we have freebayes (A* are new and old versions) and atlas2_indel | |
| 1464 elsif($keys[$i] eq "AA" or $keys[$i] eq "VR" or $keys[$i] eq "AO"){ | |
| 1465 $variant_depths = $values[$i]; | |
| 1466 } | |
| 1467 # here we have GATK variant freq of form ref#,var# | |
| 1468 elsif($keys[$i] eq "AD"){ | |
| 1469 $variant_depths = $values[$i]; | |
| 1470 $variant_depths =~ s/^\d+,//; | |
| 1471 } | |
| 1472 else{ | |
| 1473 #print STDERR "Ignoring field $keys[$i]\n"; | |
| 1474 } | |
| 1475 } | |
| 1476 next if $zygosity eq "none"; # GVCF non-ref for example or when multiple samples are reported simultaneously | |
| 1477 $quality = z2p($1) if $fields[7] =~ /Z=(\d+\.\d+)/; | |
| 1478 if($quality eq "n/a" and $fields[5] ne "."){ | |
| 1479 $quality = gq2p($fields[5]); | |
| 1480 } | |
| 1481 if($fields[5] eq "0" and $fields[6] eq "PASS"){ # not qual and a PASS in the filter column | |
| 1482 $quality = 1; | |
| 1483 } | |
| 1484 elsif($quality ne "n/a" and $quality > $min_pvalue){ # p-value cutoff | |
| 1485 #print "Checking call quality $fields[5] gq2p\n" if $. == 84; | |
| 1486 next unless gq2p($fields[5]) <= $min_pvalue; # in some cases, programs like FreeBayes give low genotype quality such as when a call is borderline homo/het | |
| 1487 # in these cases it would be silly to reject the call if their are many supporting reads. | |
| 1488 } | |
| 1489 | |
| 1490 # Some tools like GATK don't report number of variant reads...infer from other data if possible | |
| 1491 if($variant_depths eq "n/a"){ | |
| 1492 my @key_value_pairs = split /;/, $fields[7]; | |
| 1493 for my $key_value_pair (@key_value_pairs){ | |
| 1494 if($key_value_pair !~ /^(.*?)=(.*)$/){ | |
| 1495 next; | |
| 1496 #next if $key_value_pair eq "INDEL"; # samtools peculiarity | |
| 1497 #die "Key-value pair field (column #8) does not have the format key=value for entry $key_value_pair (line #$. of ), please fix the VCF file\n"; | |
| 1498 } | |
| 1499 if($1 eq "AB"){ # GATK: for het calls, AB is ref/(ref+var), only one variant reported per line | |
| 1500 $variant_depths = ""; | |
| 1501 for my $ab (split /,/, $2){ | |
| 1502 $variant_depths .= int((1-$ab)*$read_depth).","; | |
| 1503 } | |
| 1504 chop $variant_depths; | |
| 1505 } | |
| 1506 elsif($1 eq "MLEAC"){ | |
| 1507 } | |
| 1508 elsif($1 eq "DP4"){ # samtools: high-quality ref-forward bases, ref-reverse, alt-forward and alt-reverse bases | |
| 1509 my @rds = split /,/, $2; | |
| 1510 $variant_depths = $rds[2]+$rds[3]; | |
| 1511 $read_depth = $rds[0]+$rds[1]+$variant_depths; | |
| 1512 if($fields[4] =~ /,/){ # samtools doesn't break down compound het calls into individual frequencies | |
| 1513 my $num_alt_genotypes = $fields[4] =~ tr/,/,/; | |
| 1514 $num_alt_genotypes++; | |
| 1515 my $even_prop = sprintf "%.2f", $variant_depths/$read_depth/$num_alt_genotypes; | |
| 1516 $variant_depths = join(",", ($even_prop) x $num_alt_genotypes); | |
| 1517 if(not exists $chr2caveats{"$chr:$fields[1]"}){ | |
| 1518 $chr2caveats{"$chr:$fields[1]"} = "compound het var freq n/a in orig VCF file, auto set to $even_prop"; | |
| 1519 } | |
| 1520 else{ | |
| 1521 $chr2caveats{"$chr:$fields[1]"} .= "; compound het var freq n/a in orig VCF file, auto set to $even_prop"; | |
| 1522 } | |
| 1523 } | |
| 1524 } | |
| 1525 } | |
| 1526 } | |
| 1527 if($variant_depths eq "n/a"){ # usually homo var calls, can only assume all reads are variant | |
| 1528 if($zygosity eq "homozygote"){ | |
| 1529 $variant_depths = $read_depth; | |
| 1530 if($read_depth ne "n/a"){ | |
| 1531 if(not exists $chr2caveats{"$chr:$fields[1]"}){ | |
| 1532 $chr2caveats{"$chr:$fields[1]"} = "homo var freq n/a in orig VCF file, auto set to 1.0"; | |
| 1533 } | |
| 1534 else{ | |
| 1535 $chr2caveats{"$chr:$fields[1]"} = "; homo var freq n/a in orig VCF file, auto set to 1.0"; | |
| 1536 } | |
| 1537 } | |
| 1538 } | |
| 1539 else{ | |
| 1540 if($read_depth ne "n/a"){ | |
| 1541 $variant_depths = int($read_depth/2); | |
| 1542 if(not exists $chr2caveats{"$chr:$fields[1]"}){ | |
| 1543 $chr2caveats{"$chr:$fields[1]"} = "het var freq n/a in orig VCF file, auto set to 0.5"; | |
| 1544 } | |
| 1545 else{ | |
| 1546 $chr2caveats{"$chr:$fields[1]"} = "; het var freq n/a in orig VCF file, auto set to 0.5"; | |
| 1547 } | |
| 1548 } | |
| 1549 else{ | |
| 1550 $variant_depths = $read_depth; | |
| 1551 } | |
| 1552 } | |
| 1553 } | |
| 1554 | |
| 1555 my $target_parents = $feature_intervaltree{$chr}->fetch($fields[1]-$flanking_bases, $fields[1]+length($fields[3])+$flanking_bases); | |
| 1556 # Last ditch, if not in a gene model, is it at least in an enrichment region? | |
| 1557 if(not @$target_parents){ | |
| 1558 next if not exists $enrichment_regions{$chr}; | |
| 1559 my $regions_ref = $enrichment_regions{$chr}; | |
| 1560 my $location = $fields[1]; | |
| 1561 my $strand = "+"; # for lack of a better choice | |
| 1562 for(my $i = find_earliest_index($location-$flanking_bases, $regions_ref); | |
| 1563 $i < $#$regions_ref and $location <= $regions_ref->[$i]->[1]+$flanking_bases; | |
| 1564 $i++){ | |
| 1565 next unless $regions_ref->[$i]->[0]-$flanking_bases <= $location and $regions_ref->[$i]->[1]+$flanking_bases >= $location; | |
| 1566 | |
| 1567 my $feature_name = "enrichment_target_$chr:".$regions_ref->[$i]->[0]."-".$regions_ref->[$i]->[1]; | |
| 1568 $feature_type{$feature_name} = "misc_enrichment_kit_target"; | |
| 1569 $feature_length{$feature_name} = $regions_ref->[$i]->[1]-$regions_ref->[$i]->[0]+1; | |
| 1570 my @variants = split /,/, $fields[4]; | |
| 1571 my @variant_depths = split /,/, $variant_depths; | |
| 1572 my $ref = uc($fields[3]); | |
| 1573 for(my $j = 0; $j <= $#variants; $j++){ | |
| 1574 my $variant = $variants[$j]; | |
| 1575 next if $variant eq "<NON_REF>" or $variant_depths[$j] eq "0"; # GVCF stuff | |
| 1576 my $variant_depth = $variant_depths[$j]; | |
| 1577 if($min_prop){ | |
| 1578 next unless $variant_depth >= $min_depth and $read_depth ne "n/a" and $variant_depth/$read_depth >= $min_prop; | |
| 1579 } | |
| 1580 if(length($ref) == 1 and length($variant) == 1){ # SNP | |
| 1581 record_snv("$feature_name\tg.$location", | |
| 1582 "$ref>$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | |
| 1583 join("\t",prop_info_key($chr,$location,$ref,$variant)),"\tNA\n"); | |
| 1584 } | |
| 1585 elsif(length($ref) == 1 and length($variant) > 1){ # insertion | |
| 1586 record_snv("$feature_name\tg.$location\_",($location+1), | |
| 1587 "ins",substr($variant, 1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | |
| 1588 join("\t",prop_info_key($chr,$location,$ref,$variant)),"\tNA\n"); | |
| 1589 } | |
| 1590 elsif(length($variant) == 1 and length($ref) > 1){ # deletion | |
| 1591 record_snv("$feature_name\tg.$location\_",($location+length($ref)-1), | |
| 1592 "del",substr($ref, 1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | |
| 1593 join("\t",prop_info_key($chr,$location,$ref,$variant)),"\tNA\n"); | |
| 1594 } | |
| 1595 else{ # indel | |
| 1596 record_snv("$feature_name\tg.$location\_",($location+length($ref)-1), | |
| 1597 "delins$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | |
| 1598 join("\t",prop_info_key($chr,$location,$ref,$variant)),"\tNA\n"); | |
| 1599 } | |
| 1600 } # end for variants | |
| 1601 next; # process next record, we've done all we can with a non-coding-gene SNP | |
| 1602 } | |
| 1603 } | |
| 1604 | |
| 1605 for my $target_parent (@$target_parents){ | |
| 1606 | |
| 1607 print STDERR "Checking parent $target_parent for on $chr:$fields[1] $fields[3] -> $fields[4]\n" if grep /dataset_7684.dat/, @ARGV; | |
| 1608 my @feature_ranges = @{$feature_range{$chr}->{$target_parent}}; | |
| 1609 # Calculate the position of the change within the feature range position | |
| 1610 my $strand = $feature_strand{$target_parent}; | |
| 1611 my $trans_table = exists $feature_transl_table{$target_parent} ? $feature_transl_table{$target_parent} : $default_transl_table; | |
| 1612 $fields[4]=~tr/"//d; # sometime strangely surroundsed by quotes | |
| 1613 my @variants = split /,/, $fields[4]; | |
| 1614 my @variant_depths = split /,/, $variant_depths; | |
| 1615 my @refs = (uc($fields[3])) x scalar(@variants); | |
| 1616 my @locations = ($fields[1]) x scalar(@variants); | |
| 1617 | |
| 1618 for(my $j = 0; $j <= $#variants; $j++){ | |
| 1619 my $ref = $refs[$j]; | |
| 1620 my $location = $locations[$j]; | |
| 1621 my $feature_offset = 0; | |
| 1622 my $feature_num = 0; | |
| 1623 my $variant = uc($variants[$j]); | |
| 1624 next if $variant eq "<NON_REF>" or $variant_depths[$j] eq "0"; # GVCF stuff | |
| 1625 my $variant_depth = $variant_depths[$j] || "n/a"; | |
| 1626 #print STDERR "Evaluating target parent $target_parent for $chr:$fields[1]-".($fields[1]+length($fields[3]))." -> ",join("/", @$target_parents) , "\n" if $fields[1] == 982994; | |
| 1627 | |
| 1628 # Break down MNPs into individual SNPs that are phased (TODO: skip if it's an inversion? would require amalgamating SNPs for tools that call them individually, phased :-P) | |
| 1629 if(length($variant) > 1 and length($variant) == length($ref)){ | |
| 1630 my @subvariants; | |
| 1631 my @subrefs; | |
| 1632 my @sublocations; | |
| 1633 my $phase_range = $location."-".($location+length($ref)-1); | |
| 1634 for(my $k = 0; $k < length($variant); $k++){ | |
| 1635 my $r = substr($ref, $k, 1); | |
| 1636 my $v = substr($variant, $k, 1); | |
| 1637 if($r ne $v){ | |
| 1638 push @subvariants, $v; | |
| 1639 push @subrefs, $r; | |
| 1640 push @sublocations, $location+$k; | |
| 1641 } | |
| 1642 elsif(@variants == 1){ | |
| 1643 next; # homo ref call | |
| 1644 } | |
| 1645 if($zygosity eq "heterozygote"){ | |
| 1646 # need to ignore case where a homozygous call (variant or ref) is included in a double non-ref het MNP | |
| 1647 if(@variants > 1){ | |
| 1648 my $homo = 1; | |
| 1649 for(my $l = 0; $l <= $#variants; $l++){ # using loop in case we handle oligoploid genomes in the future | |
| 1650 if(length($variants[$l]) <= $k or substr($variants[$l], $k, 1) ne $v){ | |
| 1651 $homo = 0; | |
| 1652 last; | |
| 1653 } | |
| 1654 } | |
| 1655 next if $homo; | |
| 1656 } | |
| 1657 my $phase_key = "$chr:".($location+$k).":$v"; | |
| 1658 my $phase_label = "M$j-$chr:$phase_range"; | |
| 1659 if(exists $chr2phase{$phase_key}){ | |
| 1660 if($chr2phase{$phase_key} !~ /$phase_label/){ | |
| 1661 $chr2phase{$phase_key} .= ",$phase_label"; | |
| 1662 } | |
| 1663 } | |
| 1664 else{ | |
| 1665 $chr2phase{$phase_key} = $phase_label; | |
| 1666 } | |
| 1667 } | |
| 1668 } | |
| 1669 # recycle this MNP variant loop to start processing the individual SNPs | |
| 1670 splice(@refs, $j, 1, @subrefs); | |
| 1671 splice(@variants, $j, 1, @subvariants); | |
| 1672 splice(@locations, $j, 1, @sublocations); | |
| 1673 splice(@variant_depths, $j, 1, ($variant_depth) x scalar(@subvariants)); | |
| 1674 $j--; | |
| 1675 next; | |
| 1676 } | |
| 1677 | |
| 1678 if($min_prop != 0 and $variant_depth eq "n/a" or $variant_depth eq "."){ | |
| 1679 print STDERR "Could not parse variant depth from $_\n" unless $quiet; | |
| 1680 next; | |
| 1681 } | |
| 1682 next unless $min_prop == 0 or $min_prop and $variant_depth >= $min_depth and $read_depth ne "n/a" and $variant_depth/$read_depth >= $min_prop; | |
| 1683 if($zygosity eq "NA"){ | |
| 1684 # make the call ourselves | |
| 1685 $zygosity = $variant_depths/$read_depth > 1-$min_prop ? "homozygote" : "heterozygote"; | |
| 1686 } | |
| 1687 # e.g. chr22 47857671 . CAAAG AAGAT,AAAAG 29.04 . | |
| 1688 if(length($variant) > 1 and length($variant) == length($ref)){ | |
| 1689 for(my $k = 0; $k < length($variant); $k++){ | |
| 1690 my $fixed_variant = $variant; | |
| 1691 substr($fixed_variant, $k, 1) = substr($ref, $k, 1); | |
| 1692 if($fixed_variant eq $ref){ # single base difference at base k between the two | |
| 1693 $ref = substr($ref, $k, 1); | |
| 1694 $variant = substr($variant, $k, 1); | |
| 1695 $location += $k; | |
| 1696 last; | |
| 1697 } | |
| 1698 } | |
| 1699 } | |
| 1700 | |
| 1701 # samtools reports indels with long common tails, causing non-canonical HGVS reporting and a problem when looking up the variant in dbSNP etc. | |
| 1702 # remove common tails to variant calls in order to try to rectify this | |
| 1703 else{ | |
| 1704 while(length($ref) > 1 and length($variant) > 1 and substr($ref, length($ref)-1) eq substr($variant, length($variant)-1)){ | |
| 1705 chop $ref; chop $variant; | |
| 1706 } | |
| 1707 } | |
| 1708 | |
| 1709 # See if a caveat should be added because the indel was in a polyhomomer region | |
| 1710 if(length($ref) > length($variant) and index($ref, $variant) == 0 and $fasta_index->fetch("$chr:".($location+1)."-".($location+length($ref)+1)) =~ /^([ACGT])\1+$/i){ | |
| 1711 if(not exists $chr2caveats{"$chr:$location"}){ | |
| 1712 $chr2caveats{"$chr:$location"} = "poly".uc($1)." region deletion"; | |
| 1713 } | |
| 1714 elsif($chr2caveats{"$chr:$location"} !~ /poly/){ | |
| 1715 $chr2caveats{"$chr:$location"} .= "; poly".uc($1)." region deletion"; | |
| 1716 } | |
| 1717 } | |
| 1718 elsif(length($ref) < length($variant) and index($variant, $ref) == 0 and substr($variant, 1) =~ /^([ACGT])\1+$/i){ | |
| 1719 if(not exists $chr2caveats{"$chr:$location"}){ | |
| 1720 $chr2caveats{"$chr:$location"} .= "poly".uc($1)." region insertion"; | |
| 1721 } | |
| 1722 elsif($chr2caveats{"$chr:$location"} !~ /poly/){ | |
| 1723 $chr2caveats{"$chr:$location"} .= "; poly".uc($1)." region insertion"; | |
| 1724 } | |
| 1725 } | |
| 1726 | |
| 1727 # Not a protein-coding gene? Report genomic cooordinates for HGVS | |
| 1728 if(not exists $feature_cds_max{$target_parent} or not defined $feature_cds_max{$target_parent} or $feature_cds_max{$target_parent} eq ""){ | |
| 1729 if(length($ref) == 1 and length($variant) == 1){ # SNP | |
| 1730 record_snv("$target_parent\tg.$location", | |
| 1731 "$ref>$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | |
| 1732 join("\t",prop_info_key($chr,$location,$ref,$variant)),"\tNA\n"); | |
| 1733 } | |
| 1734 elsif(length($ref) == 1 and length($variant) > 1){ # insertion | |
| 1735 record_snv("$target_parent\tg.$location\_",($location+1), | |
| 1736 "ins",substr($variant, 1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | |
| 1737 join("\t",prop_info_key($chr,$location,$ref,$variant)),"\tNA\n"); | |
| 1738 } | |
| 1739 elsif(length($variant) == 1 and length($ref) > 1){ # deletion | |
| 1740 record_snv("$target_parent\tg.$location\_",($location+length($ref)-1), | |
| 1741 "del",substr($ref, 1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | |
| 1742 join("\t",prop_info_key($chr,$location,$ref,$variant)),"\tNA\n"); | |
| 1743 } | |
| 1744 else{ # indel | |
| 1745 record_snv("$target_parent\tg.$location\_",($location+length($ref)-1), | |
| 1746 "delins$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | |
| 1747 join("\t",prop_info_key($chr,$location,$ref,$variant)),"\tNA\n"); | |
| 1748 } | |
| 1749 next; # process next record, we've done all we can with a non-coding-gene SNP | |
| 1750 } | |
| 1751 | |
| 1752 if($strand eq "-"){ | |
| 1753 # set up utr offset for correct CDS coordinates | |
| 1754 for(my $i = $#feature_ranges; $i >= 0; $i--){ | |
| 1755 # exon is completely 5' of the start | |
| 1756 if($feature_ranges[$i]->[0] > $feature_cds_max{$target_parent}){ | |
| 1757 #print STDERR "Whole 5' UTR exon $i: ",$feature_ranges[$i]->[1]-$feature_ranges[$i]->[0]+1,"\n"; | |
| 1758 $feature_offset -= $feature_ranges[$i]->[1]-$feature_ranges[$i]->[0]+1; | |
| 1759 } | |
| 1760 # exon with the cds start | |
| 1761 elsif($feature_ranges[$i]->[1] >= $feature_cds_max{$target_parent} and | |
| 1762 $feature_ranges[$i]->[0] <= $feature_cds_max{$target_parent}){ | |
| 1763 #print STDERR "Start codon in exon $i: ", $feature_cds_max{$target_parent} - $feature_ranges[$i]->[1],"\n"; | |
| 1764 $feature_offset += $feature_cds_max{$target_parent} - $feature_ranges[$i]->[1]; | |
| 1765 last; | |
| 1766 } | |
| 1767 else{ | |
| 1768 die "The CDS for $target_parent (on negative strand) ends downstream ", | |
| 1769 "($feature_cds_max{$target_parent}) of the an exon", | |
| 1770 " (", $feature_ranges[$i]->[0], "), which is illogical. Please revise the GFF file provided.\n"; | |
| 1771 } | |
| 1772 } | |
| 1773 for(my $i = $#feature_ranges; $i >= 0; $i--){ | |
| 1774 my $feature = $feature_ranges[$i]; | |
| 1775 # in the 3' UTR region of the gene | |
| 1776 if($location < $feature_cds_min{$target_parent}){ | |
| 1777 my $feature_exon = 0; | |
| 1778 $feature = $feature_ranges[$feature_exon]; | |
| 1779 while($location > $feature->[1]+$flanking_bases and | |
| 1780 $feature_exon < $#feature_ranges){ | |
| 1781 $feature = $feature_ranges[++$feature_exon]; # find the 3' utr exon in which the variant is located | |
| 1782 } | |
| 1783 # utr exons passed entirely | |
| 1784 my $stop_exon = $feature_exon; | |
| 1785 while($feature_ranges[$stop_exon]->[1] < $feature_cds_min{$target_parent}){ | |
| 1786 $stop_exon++; | |
| 1787 } | |
| 1788 my $post_offset = $feature_cds_min{$target_parent}-$feature_ranges[$stop_exon]->[0]; # offset from the stop codon in the final coding exon | |
| 1789 while($stop_exon > $feature_exon){ | |
| 1790 $post_offset += $feature_ranges[$stop_exon]->[1]-$feature_ranges[$stop_exon]->[0]+1; | |
| 1791 $stop_exon--; | |
| 1792 } | |
| 1793 | |
| 1794 my $pos = $feature->[1]-$location+1+$post_offset; | |
| 1795 my $junction_dist; | |
| 1796 if($location < $feature->[0]){ # after a UTR containing exon? set as .*DD+DD | |
| 1797 $junction_dist = ($feature->[0]-$location); | |
| 1798 $pos = ($post_offset+$feature->[1]-$feature->[0]+1)."+".$junction_dist; | |
| 1799 } | |
| 1800 elsif($location > $feature->[1]){ # before a total UTR exon? set as .*DD-DD | |
| 1801 $junction_dist = -($location-$feature->[1]); | |
| 1802 $pos = ($post_offset+1).$junction_dist; | |
| 1803 } | |
| 1804 else{ # in the UTR exon | |
| 1805 if($location - $feature->[0] < $feature->[1] - $location){ # location is closer to exon donor site | |
| 1806 $junction_dist = -($location - $feature->[0]+1); # +1 for HGVS syntax compatibility (there is no position 0, direct skip from -1 to +1) | |
| 1807 } | |
| 1808 else{ | |
| 1809 $junction_dist = $feature->[1] - $location +1; | |
| 1810 } | |
| 1811 } | |
| 1812 if(length($ref) == 1 and length($variant) == 1){ | |
| 1813 my $rc = join("",map({$rc{$_}} split(//,reverse($variant)))); | |
| 1814 # 3' UTR SNP | |
| 1815 record_snv("$target_parent\tc.*$pos", | |
| 1816 "$rc{$ref}>$rc\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | |
| 1817 #"$ref>$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | |
| 1818 join("\t",prop_info_key($chr,$location,$ref,$variant,$junction_dist)),"\tNA\n"); | |
| 1819 } | |
| 1820 elsif(length($ref) == 1 and length($variant) > 1 and substr($variant, 0, 1) eq $ref){ | |
| 1821 my $rc = join("",map({$rc{$_}} split(//,reverse(substr($variant,1))))); | |
| 1822 # 3' UTR insertion | |
| 1823 record_snv("$target_parent\tc.*", | |
| 1824 hgvs_plus($pos,-1),"_*",$pos, | |
| 1825 "ins$rc\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | |
| 1826 #"ins",substr($variant,1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | |
| 1827 join("\t", prop_info_key($chr,$location,$ref,$variant,$junction_dist)),"\tNA\n"); | |
| 1828 } | |
| 1829 elsif(length($ref) > 1 and length($variant) == 1 and substr($ref, 0, 1) eq $variant){ | |
| 1830 my $rc = join("",map({$rc{$_}} split(//,reverse($ref)))); | |
| 1831 my $delBases = substr($rc,0,length($rc)-1); | |
| 1832 if(length($ref) == 2){ | |
| 1833 # 3' UTR single base deletion | |
| 1834 record_snv("$target_parent\tc.*",hgvs_plus($pos,-1), | |
| 1835 "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | |
| 1836 join("\t", prop_info_key($chr,$location,$ref,$variant,$junction_dist)),"\tNA\n"); | |
| 1837 } | |
| 1838 else{ | |
| 1839 # longer 3' UTR deletion | |
| 1840 record_snv("$target_parent\tc.*", | |
| 1841 hgvs_plus($pos,-length($ref)+1),"_*",hgvs_plus($pos, -1), | |
| 1842 "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | |
| 1843 join("\t", prop_info_key($chr,$location,$ref,$variant,$junction_dist)),"\tNA\n"); | |
| 1844 } | |
| 1845 } | |
| 1846 else{ | |
| 1847 my $rc = join("",map({$rc{$_}} split(//,reverse($variant)))); | |
| 1848 if($rc eq $ref and length($variant) > 3){ | |
| 1849 # 3' UTR inversion | |
| 1850 record_snv("$target_parent\tc.*", | |
| 1851 hgvs_plus($pos,-length($ref)+1),"_*",$pos, | |
| 1852 "inv\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | |
| 1853 join("\t", prop_info_key($chr,$location,$ref,$variant,$junction_dist)),"\tNA\n"); | |
| 1854 last; | |
| 1855 } | |
| 1856 | |
| 1857 # complex substitution in 3' UTR | |
| 1858 # Will break if stop codon is modified | |
| 1859 record_snv("$target_parent\tc.*", | |
| 1860 hgvs_plus($pos,-length($ref)+1),"_*", $pos, | |
| 1861 "delins$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | |
| 1862 join("\t", prop_info_key($chr,$location,$ref,$variant,$junction_dist)),"\tNA\n"); | |
| 1863 } | |
| 1864 last; | |
| 1865 } | |
| 1866 # in the feature | |
| 1867 elsif($location >= $feature->[0] and $location <= $feature->[1]){ | |
| 1868 my $pos = $feature_offset+$feature->[1]-$location+1; | |
| 1869 if($location > $feature_cds_max{$target_parent}){ #since there is no position 0, the pos is in UTR, subtract one | |
| 1870 $pos = hgvs_plus($pos, -1); | |
| 1871 } | |
| 1872 my $first_exon_base = $feature_offset+1; | |
| 1873 my $exon_edge_dist = $feature->[1]-$location+1; # equiv of HGVS +X or -X intron syntax, but for exons | |
| 1874 $exon_edge_dist = $feature->[0]-$location-1 if abs($feature->[0]-$location-1) < $exon_edge_dist; # pick closer of donor and acceptor sites | |
| 1875 #print STDERR "Got ", $feature->[1]-$location+1, "vs. ", $feature->[0]-$location-1, ": chose $exon_edge_dist\n"; | |
| 1876 if(length($ref) == 1 and length($variant) == 1){ | |
| 1877 # SNP | |
| 1878 record_snv("$target_parent\tc.", | |
| 1879 $pos, | |
| 1880 "$rc{$ref}>$rc{$variant}\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | |
| 1881 join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t", | |
| 1882 ($pos < 1 ? "NA" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n"); | |
| 1883 #($pos < 1 ? "NA" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n"); | |
| 1884 } | |
| 1885 elsif(length($ref) == 1 and length($variant) > 1 and substr($variant, 0, 1) eq $ref){ | |
| 1886 my $rc = join("",map({$rc{$_}} split(//,reverse($variant)))); | |
| 1887 my $insBases = substr($rc,1); | |
| 1888 # insertion | |
| 1889 record_snv("$target_parent\tc.", | |
| 1890 hgvs_plus_exon($pos, -1, $first_exon_base),"_",$pos,"ins$insBases", | |
| 1891 "\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | |
| 1892 join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t", | |
| 1893 ($pos < 1 ? "NA" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n"); | |
| 1894 #($pos < 1 ? "NA" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n"); | |
| 1895 } | |
| 1896 elsif(length($ref) > 1 and length($variant) == 1 and substr($ref, 0, 1) eq $variant){ | |
| 1897 my $rc = join("",map({$rc{$_}} split(//,reverse($ref)))); | |
| 1898 my $delBases = substr($rc,0,length($rc)-1); | |
| 1899 # single nucleotide deletion | |
| 1900 if(length($ref) == 2){ | |
| 1901 record_snv("$target_parent\tc.", | |
| 1902 hgvs_plus_exon($pos, -1, $first_exon_base), | |
| 1903 "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | |
| 1904 join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t", | |
| 1905 ($pos-1 < 1 ? "NA" : $pos-1 < $first_exon_base ? "p.?" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n"); | |
| 1906 #($pos-1 < 1 ? "NA" : $pos-1 < $first_exon_base ? "p.?" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n"); | |
| 1907 } | |
| 1908 # longer deletion | |
| 1909 else{ | |
| 1910 $exon_edge_dist = $feature->[1]-$location-length($ref)+1 if abs($feature->[1]-$location-length($ref)+1) < $exon_edge_dist; | |
| 1911 record_snv("$target_parent\tc.", | |
| 1912 hgvs_plus_exon($pos, -length($ref)+1, $first_exon_base),"_", | |
| 1913 hgvs_plus_exon($pos, -1, $first_exon_base), | |
| 1914 "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | |
| 1915 join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t", | |
| 1916 ($pos-1 < 1 ? "NA" : $pos-length($ref)+1 < $first_exon_base ? "p.?" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n"); | |
| 1917 #($pos-1 < 1 ? "NA" : $pos-length($ref)+1 < $first_exon_base ? "p.?" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n"); | |
| 1918 } | |
| 1919 } | |
| 1920 else{ | |
| 1921 # complex substitution | |
| 1922 $exon_edge_dist = $feature->[1]-$location-length($ref)+1 if abs($feature->[1]-$location-length($ref)+1) < $exon_edge_dist; | |
| 1923 my $rc = join("",map({$rc{$_}} split(//,reverse($variant)))); | |
| 1924 if($rc eq $variant and length($variant) > 3){ | |
| 1925 # inversion | |
| 1926 record_snv("$target_parent\tc.", | |
| 1927 hgvs_plus_exon($pos,-length($ref)+1,$first_exon_base),"_", | |
| 1928 $pos, | |
| 1929 "inv", | |
| 1930 "\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | |
| 1931 join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t", | |
| 1932 ($pos < 1 ? "NA" : $pos-length($ref)+1 < $first_exon_base ? "p.?" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n"); | |
| 1933 | |
| 1934 last; | |
| 1935 } | |
| 1936 record_snv("$target_parent\tc.", | |
| 1937 hgvs_plus_exon($pos,-length($ref)+1,$first_exon_base),"_", | |
| 1938 $pos, | |
| 1939 "delins$rc", | |
| 1940 "\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | |
| 1941 join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t", | |
| 1942 ($pos < 1 ? "NA" : $pos-length($ref)+1 < $first_exon_base ? "p.?" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n"); | |
| 1943 #($pos < 1 ? "NA" : $pos-length($ref)+1 < $first_exon_base ? "p.?" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n"); | |
| 1944 } | |
| 1945 last; | |
| 1946 } | |
| 1947 # 5' of feature (on negative strand) | |
| 1948 elsif($location > $feature->[1]){ | |
| 1949 if(length($ref) == 1 and length($variant) == 1){ | |
| 1950 # intronic SNP | |
| 1951 if($i == $#feature_ranges or $feature->[1]-$location >= -1*$flanking_bases){ | |
| 1952 # Closer to acceptor site | |
| 1953 record_snv("$target_parent\tc.",$feature_offset+1, | |
| 1954 ($feature->[1]-$location), | |
| 1955 "$rc{$ref}>$rc{$variant}\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | |
| 1956 #"$ref>$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | |
| 1957 join("\t", prop_info_key($chr,$location,$ref,$variant, $feature->[1]-$location)),"\tNA\n"); | |
| 1958 } | |
| 1959 else{ | |
| 1960 # Closer to donor site | |
| 1961 record_snv("$target_parent\tc.",$feature_offset,"+", | |
| 1962 ($feature_ranges[$i+1]->[0]-$location), | |
| 1963 "$rc{$ref}>$rc{$variant}\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | |
| 1964 #"$ref>$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | |
| 1965 join("\t", prop_info_key($chr,$location,$ref,$variant, $feature_ranges[$i+1]->[0]-$location)),"\tNA\n"); | |
| 1966 } | |
| 1967 } | |
| 1968 elsif(length($ref) == 1 and length($variant) > 1 and substr($variant, 0, 1) eq $ref){ | |
| 1969 my $rc = join("",map({$rc{$_}} split(//,reverse(substr($variant,1))))); | |
| 1970 if($i == $#feature_ranges or $feature->[1]-$location >= -1*$flanking_bases){ | |
| 1971 # intronic insertion near acceptor | |
| 1972 my $pos = ($feature_offset+1).($feature->[1]-$location); | |
| 1973 record_snv("$target_parent\tc.", | |
| 1974 hgvs_plus($pos,-1),"_",$pos, | |
| 1975 "ins", | |
| 1976 $rc,"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | |
| 1977 #substr($variant, 1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | |
| 1978 join("\t", prop_info_key($chr,$location,$ref,$variant,$feature->[1]-$location-1)),"\tNA\n"); | |
| 1979 } | |
| 1980 else{ | |
| 1981 # intronic insertion near donor | |
| 1982 my $pos = $feature_offset."+".($feature_ranges[$i+1]->[0]-$location); | |
| 1983 record_snv("$target_parent\tc.", | |
| 1984 hgvs_plus($pos,-1),"_",$pos, | |
| 1985 "ins", | |
| 1986 $rc,"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | |
| 1987 #substr($variant, 1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | |
| 1988 join("\t", prop_info_key($chr,$location,$ref,$variant,$feature_ranges[$i+1]->[0]-$location+1)),"\tNA\n"); | |
| 1989 } | |
| 1990 } | |
| 1991 elsif(length($ref) > 1 and length($variant) == 1 and substr($ref, 0, 1) eq $variant){ | |
| 1992 # intronic deletion | |
| 1993 # single nucleotide deletion | |
| 1994 my $rc = reverse($ref); | |
| 1995 $rc=~tr/ACGT/TGCA/; | |
| 1996 my $delBases = substr($rc, 0, length($rc)-1); | |
| 1997 if(length($ref) == 2){ | |
| 1998 # single intronic deletion near acceptor | |
| 1999 if($i == $#feature_ranges or $feature->[1]-$location >= -1*$flanking_bases){ | |
| 2000 my $off = $feature->[1]-$location-1; | |
| 2001 record_snv("$target_parent\tc.", | |
| 2002 ($feature_offset+1),$off, | |
| 2003 "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | |
| 2004 join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\t",($off >= -2 ? "p.?" : "NA"),"\n"); | |
| 2005 } | |
| 2006 # single intronic deletion near donor | |
| 2007 else{ | |
| 2008 my $pos = $feature_offset; | |
| 2009 my $off = $feature_ranges[$i+1]->[0]-$location+1; | |
| 2010 record_snv("$target_parent\tc.", | |
| 2011 hgvs_plus_exon($pos, $off, $pos), | |
| 2012 "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | |
| 2013 join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\t",($off <= 2 ? "p.?" : "NA"),"\n"); | |
| 2014 } | |
| 2015 } | |
| 2016 # longer deletion | |
| 2017 else{ | |
| 2018 if($i == $#feature_ranges or $feature->[1]-$location >= -1*$flanking_bases){ | |
| 2019 # long intronic deletion near acceptor | |
| 2020 my $off = $feature->[1]-$location-1; | |
| 2021 my $pos = ($feature_offset+1).$off; | |
| 2022 record_snv("$target_parent\tc.", | |
| 2023 hgvs_plus($pos,-length($ref)+2),"_",$pos, | |
| 2024 "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | |
| 2025 join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\t",($off >= -2 ? "p.?" : "NA"),"\n"); | |
| 2026 } | |
| 2027 else{ | |
| 2028 # long intronic deletion near donor | |
| 2029 my $off = $feature_ranges[$i+1]->[0]-$location+1; | |
| 2030 my $pos = ($feature_offset)."+".$off; | |
| 2031 record_snv("$target_parent\tc.", | |
| 2032 $pos,"_",hgvs_plus($pos,-length($ref)-1), | |
| 2033 "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | |
| 2034 join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\t",($off-length($ref)+1 <= 2 ? "p.?" : "NA"),"\n"); | |
| 2035 } | |
| 2036 last; | |
| 2037 } | |
| 2038 } | |
| 2039 else{ | |
| 2040 my $rc = reverse($ref); | |
| 2041 $rc=~tr/ACGT/TGCA/; | |
| 2042 if($rc eq $variant and length($variant) > 3){ | |
| 2043 # intronic inversion near acceptor site | |
| 2044 if($i == $#feature_ranges or $feature->[1]-$location >= -1*$flanking_bases){ | |
| 2045 my $pos = ($feature_offset+1).($feature->[1]-$location); | |
| 2046 record_snv("$target_parent\tc.", | |
| 2047 hgvs_plus($pos,-length($ref)+1),"_",$pos, | |
| 2048 "inv\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | |
| 2049 join("\t", prop_info_key($chr,$location,$ref,$variant,$feature->[1]-$location)),"\tNA\n"); | |
| 2050 } | |
| 2051 else{ | |
| 2052 my $pos = ($feature_offset)."+".($feature_ranges[$i+1]->[0]-$location); | |
| 2053 record_snv("$target_parent\tc.", | |
| 2054 $pos,"_",hgvs_plus($pos, length($ref)-1), | |
| 2055 "inv\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | |
| 2056 join("\t", prop_info_key($chr,$location,$ref,$variant,$feature_ranges[$i+1]->[0]-$location)),"\tNA\n"); | |
| 2057 } | |
| 2058 last; | |
| 2059 } | |
| 2060 $rc = reverse($variant); | |
| 2061 $rc=~tr/ACGT/TGCA/; | |
| 2062 # Intronic complex substitution | |
| 2063 if($i == $#feature_ranges or $feature->[1]-$location >= -1*$flanking_bases){ | |
| 2064 # complex intronic substitution near acceptor site | |
| 2065 my $pos = ($feature_offset+1).($feature->[1]-$location); | |
| 2066 record_snv("$target_parent\tc.", | |
| 2067 hgvs_plus($pos, -length($ref)+1),"_",$pos, | |
| 2068 "delins$rc\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | |
| 2069 #"delins$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | |
| 2070 join("\t", prop_info_key($chr,$location,$ref,$variant,$feature->[1]-$location)),"\tNA\n"); | |
| 2071 } | |
| 2072 else{ | |
| 2073 # complex intronic substitution near donor site | |
| 2074 my $pos = $feature_offset."+".($feature_ranges[$i+1]->[0]-$location); | |
| 2075 record_snv("$target_parent\tc.", | |
| 2076 $pos,"_",hgvs_plus($pos, length($ref)-1), | |
| 2077 "delins$rc\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | |
| 2078 #"delins$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | |
| 2079 join("\t", prop_info_key($chr,$location,$ref,$variant,$feature_ranges[$i+1]->[0]-$location)),"\tNA\n"); | |
| 2080 } | |
| 2081 } | |
| 2082 last; | |
| 2083 } | |
| 2084 else{ | |
| 2085 #print STDERR "Offset going from ", $feature_offset, " to ", $feature_offset+$feature->[1]-$feature->[0]+1,"\n"; | |
| 2086 $feature_offset += $feature->[1]-$feature->[0]+1; | |
| 2087 #print STDERR "Set feature offset to $feature_offset by adding ",$feature->[1],"-", $feature->[0],"+1\n"; | |
| 2088 } | |
| 2089 } | |
| 2090 } | |
| 2091 else{ | |
| 2092 # forward strand | |
| 2093 | |
| 2094 # set up utr offset for correct CDS coordinates | |
| 2095 for(my $i = 0; $i <= $#feature_ranges; $i++){ | |
| 2096 # All 5' utr exon | |
| 2097 if($feature_ranges[$i]->[1] < $feature_cds_min{$target_parent}){ | |
| 2098 $feature_offset -= $feature_ranges[$i]->[1]-$feature_ranges[$i]->[0]+1; | |
| 2099 } | |
| 2100 # exon with the cds start | |
| 2101 elsif($feature_ranges[$i]->[1] >= $feature_cds_min{$target_parent} and | |
| 2102 $feature_ranges[$i]->[0] <= $feature_cds_min{$target_parent}){ | |
| 2103 $feature_offset -= $feature_cds_min{$target_parent} - $feature_ranges[$i]->[0]; | |
| 2104 last; | |
| 2105 } | |
| 2106 else{ | |
| 2107 die "The CDS for $target_parent starts upstream ($feature_cds_max{$target_parent}) of the first exon", | |
| 2108 " (", $feature_ranges[$i]->[0], "), which is illogical. Please revise the GFF file provided.\n"; | |
| 2109 } | |
| 2110 } | |
| 2111 for(my $i = 0; $i <= $#feature_ranges; $i++){ | |
| 2112 my $feature = $feature_ranges[$i]; | |
| 2113 # 3' of last coding position | |
| 2114 if($location > $feature_cds_max{$target_parent}){ | |
| 2115 # find the exon with the stop codon | |
| 2116 while($feature->[1] < $feature_cds_max{$target_parent}){ | |
| 2117 $feature = $feature_ranges[++$i]; | |
| 2118 } | |
| 2119 my $post_offset = $feature->[0]-$feature_cds_max{$target_parent}; | |
| 2120 while($location > $feature->[1]+$flanking_bases and | |
| 2121 $i < $#feature_ranges){ | |
| 2122 $post_offset += $feature->[1]-$feature->[0]+1; | |
| 2123 $feature = $feature_ranges[++$i]; # find the 3' utr exon in which the variant is located | |
| 2124 } | |
| 2125 my $pos = $location-$feature->[0]+$post_offset; | |
| 2126 #print STDERR "Positive strand: got $pos for $location, exon #$i of $#feature_ranges, post_offset is $post_offset\n" if $location-$feature->[1] > $flanking_bases; | |
| 2127 my $off; | |
| 2128 if($location > $feature->[1]){ # after a UTR containing exon? set as .*DD+DD | |
| 2129 $off = $location-$feature->[1]; | |
| 2130 $pos = ($post_offset+$feature->[1]-$feature->[0]+1)."+".$off; | |
| 2131 } | |
| 2132 elsif($location < $feature->[0]){ # before a total UTR exon? set as .*DD-DD | |
| 2133 $off = -($feature->[0]-$location); | |
| 2134 $pos = ($post_offset+1).$off; | |
| 2135 } | |
| 2136 else{ | |
| 2137 if($location-$feature->[0] < $feature->[1]-$location){ | |
| 2138 $off = $location-$feature->[0]+1; # +1 since HGVS skips right from -1 to +1 at exon boundary | |
| 2139 } | |
| 2140 else{ | |
| 2141 $off = $location-$feature->[1]-1; # will be negative | |
| 2142 } | |
| 2143 } | |
| 2144 if(length($ref) == 1 and length($variant) == 1){ | |
| 2145 # 3' UTR SNP | |
| 2146 record_snv("$target_parent\tc.*$pos", | |
| 2147 "$ref>$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | |
| 2148 join("\t", prop_info_key($chr,$location,$ref,$variant, $off)),"\tNA\n"); | |
| 2149 } | |
| 2150 elsif(length($ref) == 1 and length($variant) > 1 and substr($variant, 0, 1) eq $ref){ | |
| 2151 # 3' UTR insertion | |
| 2152 record_snv("$target_parent\tc.*", | |
| 2153 hgvs_plus($pos,1),"_*",hgvs_plus($pos,2), | |
| 2154 "ins",substr($variant,1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | |
| 2155 join("\t", prop_info_key($chr,$location,$ref,$variant, $off)),"\tNA\n"); | |
| 2156 } | |
| 2157 elsif(length($ref) > 1 and length($variant) == 1 and substr($ref, 0, 1) eq $variant){ | |
| 2158 my $delBases = substr($ref, 1); | |
| 2159 if(length($ref) == 2){ | |
| 2160 # 3' UTR single base deletion | |
| 2161 record_snv("$target_parent\tc.*",hgvs_plus($pos,1), | |
| 2162 "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | |
| 2163 join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\tNA\n"); | |
| 2164 } | |
| 2165 else{ | |
| 2166 # longer 3' UTR deletion | |
| 2167 record_snv("$target_parent\tc.*", | |
| 2168 hgvs_plus($pos,1),"_*",hgvs_plus($pos,length($ref)-1), | |
| 2169 "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | |
| 2170 join("\t", prop_info_key($chr,$location,$ref,$variant, $off)),"\tNA\n"); | |
| 2171 } | |
| 2172 } | |
| 2173 else{ | |
| 2174 my $rc = reverse($ref); | |
| 2175 $rc=~tr/ACGT/TGCA/; | |
| 2176 if($rc eq $variant and length($variant) > 3){ | |
| 2177 # 3' UTR inversion | |
| 2178 record_snv("$target_parent\tc.*$pos", | |
| 2179 "_*",hgvs_plus($pos,length($ref)-1), | |
| 2180 "inv\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | |
| 2181 join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\tNA\n"); | |
| 2182 last; | |
| 2183 } | |
| 2184 # complex substitution in 3' UTR | |
| 2185 record_snv("$target_parent\tc.*$pos", | |
| 2186 "_*",hgvs_plus($pos,length($ref)-1), | |
| 2187 "delins$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | |
| 2188 join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\tNA\n"); | |
| 2189 } | |
| 2190 last; | |
| 2191 } | |
| 2192 # in the exon | |
| 2193 elsif($location >= $feature->[0] and $location <= $feature->[1]){ | |
| 2194 my $pos = $feature_offset+$location-$feature->[0]+1; | |
| 2195 my $last_exon_base = $feature_offset+$feature->[1]-$feature->[0]+1; | |
| 2196 my $exon_edge_dist = $location-$feature->[0]+1; # equiv of HGVS +X or -X intron syntax, but for exons | |
| 2197 $exon_edge_dist = $location-$feature->[1]-1 if abs($location-$feature->[1]-1) < $exon_edge_dist; # pick closer of donor and acceptor sites | |
| 2198 #print STDERR "Got ", $location-$feature->[0]+1, "vs. ", $location-$feature->[1]-1, ": chose $exon_edge_dist\n"; | |
| 2199 if($location < $feature_cds_min{$target_parent}){ #since there is no position 0, the pos is in UTR, subtract one | |
| 2200 $pos = hgvs_plus($pos, -1); | |
| 2201 } | |
| 2202 if(length($ref) == 1 and length($variant) == 1){ | |
| 2203 # SNP | |
| 2204 record_snv("$target_parent\tc.$pos", | |
| 2205 "$ref>$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | |
| 2206 join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t", | |
| 2207 ($pos < 1 ? "NA" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n"); | |
| 2208 #($pos < 1 ? "NA" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n"); | |
| 2209 } | |
| 2210 elsif(length($ref) == 1 and length($variant) > 1 and substr($variant, 0, 1) eq $ref){ | |
| 2211 # insertion | |
| 2212 record_snv("$target_parent\tc.$pos", | |
| 2213 "_",hgvs_plus_exon($pos,1,$last_exon_base),"ins", | |
| 2214 substr($variant, 1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | |
| 2215 join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t", | |
| 2216 ($pos < 1 ? "NA" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n"); | |
| 2217 #($pos < 1 ? "NA" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n"); | |
| 2218 } | |
| 2219 elsif(length($ref) > 1 and length($variant) == 1 and substr($ref, 0, 1) eq $variant){ | |
| 2220 my $delBases = substr($ref, 1); | |
| 2221 # single nucleotide deletion | |
| 2222 if(length($delBases) == 1){ | |
| 2223 record_snv("$target_parent\tc.", | |
| 2224 hgvs_plus_exon($pos,1,$last_exon_base), | |
| 2225 "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | |
| 2226 join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t", | |
| 2227 ($pos < 1 or $pos > $last_exon_base ? "NA" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n"); | |
| 2228 #($pos < 1 or $pos > $last_exon_base ? "NA" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n"); | |
| 2229 } | |
| 2230 # longer deletion | |
| 2231 else{ | |
| 2232 $exon_edge_dist = $feature->[1]-$location-length($ref)-1 if abs($feature->[1]-$location-length($ref)-1) < $exon_edge_dist; | |
| 2233 record_snv("$target_parent\tc.", | |
| 2234 hgvs_plus_exon($pos,1,$last_exon_base),"_", | |
| 2235 hgvs_plus_exon($pos,length($ref)-1,$last_exon_base), | |
| 2236 "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | |
| 2237 join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t", | |
| 2238 ($pos+length($ref)-1 > $last_exon_base ? "p.?" : $pos < 1 or $pos > $last_exon_base ? "NA" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n"); | |
| 2239 #($pos+length($ref)-1 > $last_exon_base ? "p.?" : $pos < 1 or $pos > $last_exon_base ? "NA" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n"); | |
| 2240 } | |
| 2241 } | |
| 2242 else{ | |
| 2243 $exon_edge_dist = $feature->[1]-$location-length($ref)-1 if abs($feature->[1]-$location-length($ref)-1) < $exon_edge_dist; | |
| 2244 my $rc = reverse($ref); | |
| 2245 $rc=~tr/ACGT/TGCA/; | |
| 2246 if($rc eq $variant and length($variant) > 3){ | |
| 2247 # inversion | |
| 2248 record_snv("$target_parent\tc.$pos", | |
| 2249 "_",hgvs_plus_exon($pos,length($ref)-1, $last_exon_base), | |
| 2250 "inv\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | |
| 2251 join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t", | |
| 2252 ($pos+length($ref)-1 > $last_exon_base ? "p.?" : $pos < 1 or $pos > $last_exon_base ? "NA" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n"); | |
| 2253 #($pos+length($ref)-1 > $last_exon_base ? "p.?" : $pos < 1 or $pos > $last_exon_base ? "NA" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n"); | |
| 2254 last; | |
| 2255 } | |
| 2256 # complex substitution | |
| 2257 record_snv("$target_parent\tc.$pos", | |
| 2258 "_",hgvs_plus_exon($pos, length($ref)-1, $last_exon_base), | |
| 2259 "delins$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | |
| 2260 join("\t", prop_info_key($chr,$location,$ref,$variant,$exon_edge_dist)),"\t", | |
| 2261 ($pos+length($ref)-1 > $last_exon_base ? "p.?" : $pos < 1 or $pos > $last_exon_base ? "NA" : hgvs_protein($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n"); | |
| 2262 #($pos+length($ref)-1 > $last_exon_base ? "p.?" : $pos < 1 or $pos > $last_exon_base ? "NA" : hgvs_protein_key($chr,$location,$ref,$variant,$pos,$strand,$trans_table)),"\n"); | |
| 2263 } | |
| 2264 last; | |
| 2265 } | |
| 2266 # 5' of feature | |
| 2267 elsif($location < $feature->[0]){ | |
| 2268 if(length($ref) == 1 and length($variant) == 1){ | |
| 2269 # intronic SNP | |
| 2270 if($i != 0 and $location-$feature->[0] < -1*$flanking_bases){ | |
| 2271 # Closer to donor site | |
| 2272 record_snv("$target_parent\tc.",$feature_offset,"+", | |
| 2273 ($location-$feature_ranges[$i-1]->[1]), | |
| 2274 "$ref>$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | |
| 2275 join("\t", prop_info_key($chr,$location,$ref,$variant,$location-$feature_ranges[$i-1]->[1])),"\tNA\n"); | |
| 2276 } | |
| 2277 else{ | |
| 2278 # Closer to acceptor site | |
| 2279 record_snv("$target_parent\tc.",$feature_offset+1, | |
| 2280 ($location-$feature->[0]), | |
| 2281 "$ref>$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | |
| 2282 join("\t", prop_info_key($chr,$location,$ref,$variant,$location-$feature->[0])),"\tNA\n"); | |
| 2283 } | |
| 2284 } | |
| 2285 elsif(length($ref) == 1 and length($variant) > 1 and substr($variant, 0, 1) eq $ref){ | |
| 2286 if($i != 0 and $location-$feature->[0] < -1*$flanking_bases){ | |
| 2287 # intronic insertion near donor | |
| 2288 my $pos = $feature_offset."+".($location-$feature_ranges[$i-1]->[1]); | |
| 2289 record_snv("$target_parent\tc.", | |
| 2290 $pos,"_",hgvs_plus($pos,1), | |
| 2291 "ins", | |
| 2292 substr($variant, 1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | |
| 2293 join("\t", prop_info_key($chr,$location,$ref,$variant,$location-$feature_ranges[$i-1]->[1])),"\tNA\n"); | |
| 2294 } | |
| 2295 else{ | |
| 2296 # intronic insertion near acceptor | |
| 2297 my $pos = ($feature_offset+1).($location-$feature->[0]); | |
| 2298 record_snv("$target_parent\tc.", | |
| 2299 $pos,"_",hgvs_plus($pos,1), | |
| 2300 "ins", | |
| 2301 substr($variant, 1),"\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | |
| 2302 join("\t", prop_info_key($chr,$location,$ref,$variant,$location-$feature->[0])),"\tNA\n"); | |
| 2303 } | |
| 2304 } | |
| 2305 elsif(length($ref) > 1 and length($variant) == 1 and substr($ref, 0, 1) eq $variant){ | |
| 2306 # intronic deletion | |
| 2307 # single nucleotide deletion | |
| 2308 my $delBases = substr($ref, 1); | |
| 2309 if(length($ref) == 2){ | |
| 2310 # single intronic deletion near donor | |
| 2311 if($i != 0 and $location-$feature->[0] < -1*$flanking_bases){ | |
| 2312 my $off = $location-$feature_ranges[$i-1]->[1]+1; | |
| 2313 record_snv("$target_parent\tc.", | |
| 2314 $feature_offset,"+",$off, | |
| 2315 "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | |
| 2316 join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\t",($off <= 2 ? "p.?" : "NA"),"\n"); | |
| 2317 } | |
| 2318 # single intronic deletion near acceptor | |
| 2319 else{ | |
| 2320 my $pos = ($feature_offset+1); | |
| 2321 my $off = $location-$feature->[0]; | |
| 2322 record_snv("$target_parent\tc.", | |
| 2323 hgvs_plus_exon($pos, $off, $pos), | |
| 2324 "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | |
| 2325 join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\t",($off >= -2 ? "p.?" : "NA"),"\n"); | |
| 2326 } | |
| 2327 } | |
| 2328 # longer deletion | |
| 2329 else{ | |
| 2330 if($i != 0 and $location-$feature->[0] < -1*$flanking_bases){ | |
| 2331 # long intronic deletion near donor | |
| 2332 my $off = $location-$feature_ranges[$i-1]->[1]+1; | |
| 2333 my $pos = $feature_offset."+".$off; | |
| 2334 record_snv("$target_parent\tc.", | |
| 2335 $pos,"_",hgvs_plus($pos,length($ref)-2), | |
| 2336 "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | |
| 2337 join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\t",($off <= 2 ? "p.?" : "NA"),"\n"); | |
| 2338 } | |
| 2339 else{ | |
| 2340 # long intronic deletion near acceptor | |
| 2341 my $off = $location-$feature->[0]+1; | |
| 2342 my $pos = ($feature_offset+1).$off; | |
| 2343 record_snv("$target_parent\tc.", | |
| 2344 $pos,"_",hgvs_plus($pos,length($ref)-2), | |
| 2345 "del$delBases\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | |
| 2346 join("\t", prop_info_key($chr,$location,$ref,$variant,$off)),"\t",($off+length($ref)-2 >= -2 ? "p.?" : "NA"),"\n"); | |
| 2347 } | |
| 2348 } | |
| 2349 } | |
| 2350 else{ | |
| 2351 my $rc = reverse($ref); | |
| 2352 $rc=~tr/ACGT/TGCA/; | |
| 2353 if($rc eq $variant and length($variant) > 3){ | |
| 2354 # intronic inversion near donor site | |
| 2355 if($i != 0 and $location-$feature->[0] < -1*$flanking_bases){ | |
| 2356 my $pos = $feature_offset."+".($location-$feature_ranges[$i-1]->[1]); | |
| 2357 record_snv("$target_parent\tc.", | |
| 2358 $pos,"_",hgvs_plus($pos,length($ref)-1), | |
| 2359 "inv\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | |
| 2360 join("\t", prop_info_key($chr,$location,$ref,$variant,$location-$feature_ranges[$i-1]->[1])),"\tNA\n"); | |
| 2361 } | |
| 2362 else{ | |
| 2363 my $pos = ($feature_offset+1).($location-$feature->[0]); | |
| 2364 record_snv("$target_parent\tc.", | |
| 2365 $pos,"_",hgvs_plus($pos, length($ref)-1), | |
| 2366 "inv\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | |
| 2367 join("\t", prop_info_key($chr,$location,$ref,$variant,$location-$feature->[0])),"\tNA\n"); | |
| 2368 } | |
| 2369 last; | |
| 2370 } | |
| 2371 # Intronic complex substitution | |
| 2372 # Note: sub maybe have comma in it to denote two possible variants | |
| 2373 if($i != 0 and $location-$feature->[0] < -1*$flanking_bases){ | |
| 2374 # complex intronic substitution near donor site | |
| 2375 my $pos = $feature_offset."+".($location-$feature_ranges[$i-1]->[1]); | |
| 2376 record_snv("$target_parent\tc.", | |
| 2377 $pos,"_",hgvs_plus($pos, length($ref)-1), | |
| 2378 "delins$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | |
| 2379 join("\t", prop_info_key($chr,$location,$ref,$variant,$location-$feature_ranges[$i-1]->[1])),"\tNA\n"); | |
| 2380 } | |
| 2381 else{ | |
| 2382 # complex intronic substitution near acceptor site | |
| 2383 my $pos = ($feature_offset+1).($location-$feature->[0]); | |
| 2384 record_snv("$target_parent\tc.", | |
| 2385 $pos,"_",hgvs_plus($pos, length($ref)-1), | |
| 2386 "delins$variant\t$strand\t$chr\t$location\t$zygosity\t$quality\t$variant_depth\t$read_depth\t", | |
| 2387 join("\t", prop_info_key($chr,$location,$ref,$variant,$location-$feature->[0])),"\tNA\n"); | |
| 2388 } | |
| 2389 } | |
| 2390 last; | |
| 2391 } | |
| 2392 else{ | |
| 2393 # feature is past this exon | |
| 2394 $feature_offset += $feature->[1]-$feature->[0]+1; | |
| 2395 } | |
| 2396 } | |
| 2397 } | |
| 2398 } # for each variant on the line | |
| 2399 } # for each gene overlapping the site the VCF describes | |
| 2400 } # for each VCF line | |
| 2401 print STDERR "\n" unless $quiet; | |
| 2402 close(VCFIN); | |
| 2403 | |
| 2404 # Before we can start printing the variants, we need to look at the phase information and calculate the real haplotype HGVS changes | |
| 2405 #if(keys %chr2phase){ | |
| 2406 # Note that we could have samtools read-backed haplotype info, MNPs in the VCF, and pre-existing haplotypes in the input VCF (e.g. imputed or based on Mendelian inheritance where trios exist) | |
| 2407 # We need to create new disjoint sets of phased blocks from the (consistent) union of these data. | |
| 2408 # my $chr2phase2variants = combine_phase_data(\%chr2phase); | |
| 2409 | |
| 2410 # TODO: Calculate protein HGVS syntax for each variant, now that all phase data has been incorporated | |
| 2411 #for my $chr (keys %$chr2phase2variants){ | |
| 2412 # for my $phase (keys %{$chr2phase2variants{$chr}){ | |
| 2413 # # apply all phased changes to the reference chromosomal seq | |
| 2414 # my $phased_seq = $seq{$chr}; #reference | |
| 2415 # # sort the variants from 3' to 5' so that edits after indels don't need adjustment in their ref coordinate | |
| 2416 # my @sorted_variants = sort {my($a_pos) = $a =~ /:(\d+):/; my($b_pos) = $b =~ /:(\d+):/; $b_pos <=> $a_pos} @{$chr2phase2variants{$chr}->{$phase}}; | |
| 2417 # for my $variant (@sorted_variants){ | |
| 2418 # } | |
| 2419 # } | |
| 2420 #} | |
| 2421 #} | |
| 2422 | |
| 2423 # retrieve the MAF info en masse for each chromosome, as this is much more efficient | |
| 2424 my @out_lines; | |
| 2425 for my $snv (@snvs){ | |
| 2426 chomp $snv; | |
| 2427 my @fields = split /\t/, $snv; | |
| 2428 # For CNVs, all the fields are already filled out | |
| 2429 if(@fields > 13){ | |
| 2430 push @out_lines, join("\t", $feature_type{$fields[0]}, ($fields[0] =~ /\S/ ? $feature_length{$fields[0]} : "NA"), @fields); | |
| 2431 next; | |
| 2432 } | |
| 2433 my $variant_key = $fields[9]; | |
| 2434 $fields[9] = join("\t", prop_info($dbsnp,$internal_snp,$variant_key)); | |
| 2435 my $from = $fields[4]; | |
| 2436 my $chr_pos_key = $fields[3].":".$from; | |
| 2437 my $to = $fields[4]; # true for SNPs and insertions | |
| 2438 my @variant_key = split /:/, $variant_key; | |
| 2439 # For deletions and complex variants, calculate the affected reference genome range and set the 'to' | |
| 2440 if(length($variant_key[2]) > 1){ | |
| 2441 $to += length($variant_key[2])-1; | |
| 2442 } | |
| 2443 splice(@fields, 5, 0, $to); | |
| 2444 | |
| 2445 # Otherwise expand the key into the relevant MAF values | |
| 2446 $fields[0] =~ s/\/chr.*$//; # some transcript ids are repeated... we expect "id/chr#" in the GTF file to distinguish these, but should get rid of them at reporting time | |
| 2447 # the offset from the nearest exon border if coding | |
| 2448 push @fields, ($#variant_key > 3 ? $variant_key[4] : ""); | |
| 2449 # add gene name(s) | |
| 2450 push @fields, range2genes($fields[3], $from, $to+1); | |
| 2451 # add caveats | |
| 2452 my $c = $fields[3]; | |
| 2453 if(not exists $chr2mappability{$c}){ | |
| 2454 if($c =~ s/^chr//){ | |
| 2455 # nothing more | |
| 2456 } | |
| 2457 else{ | |
| 2458 $c = "chr$c"; | |
| 2459 } | |
| 2460 } | |
| 2461 my $mappability_caveats = exists $chr2mappability{$c} ? $chr2mappability{$c}->fetch($fields[4], $fields[4]+1) : []; | |
| 2462 if(ref $mappability_caveats eq "ARRAY" and @$mappability_caveats){ | |
| 2463 my %h; | |
| 2464 if(exists $chr2caveats{$chr_pos_key}){ | |
| 2465 if($chr2caveats{$chr_pos_key} !~ /non-unique/){ | |
| 2466 $chr2caveats{$chr_pos_key} = join("; ", grep {not $h{$_}++} @$mappability_caveats)."; ".$chr2caveats{$chr_pos_key}; | |
| 2467 } | |
| 2468 } | |
| 2469 else{ | |
| 2470 $chr2caveats{$chr_pos_key} = join("; ", grep {not $h{$_}++} @$mappability_caveats); | |
| 2471 } | |
| 2472 } | |
| 2473 push @fields, (exists $chr2caveats{$chr_pos_key} ? $chr2caveats{$chr_pos_key} : ""); | |
| 2474 # add phase | |
| 2475 push @fields, find_phase($variant_key); | |
| 2476 push @out_lines, join("\t", $feature_type{$fields[0]}, $feature_length{$fields[0]}, @fields); | |
| 2477 } | |
| 2478 | |
| 2479 # Now tabulate the rare variant numbers | |
| 2480 my %gene2rares; | |
| 2481 my %gene2aa_rares; | |
| 2482 for my $line (@out_lines){ | |
| 2483 my @F = split /\t/, $line, -1; | |
| 2484 if($F[15] eq "NA" or $F[15] < $rare_variant_prop and (!$internal_snp or $F[17] < $rare_variant_prop)){ | |
| 2485 my $gene_list = $internal_snp ? $F[20] : $F[19]; | |
| 2486 next unless defined $gene_list; | |
| 2487 for my $g (split /; /, $gene_list){ | |
| 2488 $gene2rares{$g}++; | |
| 2489 # Check the cDNA HGVS syntax for relevance | |
| 2490 if($F[3] =~ /c.\d+/ or # coding | |
| 2491 $F[3] =~ /c.\d+.*-[12]/ or # splicing acceptor | |
| 2492 $F[3] =~ /c.\d+\+[12345]/){ # splicing donor | |
| 2493 $gene2aa_rares{$g}++; | |
| 2494 } | |
| 2495 } | |
| 2496 } | |
| 2497 } | |
| 2498 | |
| 2499 # Print the results | |
| 2500 for my $line (@out_lines){ | |
| 2501 my @F = split /\t/, $line, -1; | |
| 2502 my $gene_list = $internal_snp ? $F[20] : $F[19]; | |
| 2503 if(not defined $gene_list){ | |
| 2504 print OUT join("\t", @F, "", ""), "\n"; next; | |
| 2505 } | |
| 2506 | |
| 2507 my $max_gene_rare = 0; | |
| 2508 my $max_gene_aa_rare = 0; | |
| 2509 for my $g (split /; /, $gene_list){ | |
| 2510 next unless exists $gene2rares{$g}; | |
| 2511 if($gene2rares{$g} > $max_gene_rare){ | |
| 2512 $max_gene_rare = $gene2rares{$g}; | |
| 2513 } | |
| 2514 next unless exists $gene2aa_rares{$g}; | |
| 2515 if($gene2aa_rares{$g} > $max_gene_aa_rare){ | |
| 2516 $max_gene_aa_rare = $gene2aa_rares{$g}; | |
| 2517 } | |
| 2518 } | |
| 2519 print OUT join("\t", @F, $max_gene_rare, $max_gene_aa_rare), "\n"; | |
| 2520 } | |
| 2521 close(OUT); | |
| 2522 | |
| 2523 sub range2genes{ | |
| 2524 my ($c, $from, $to) = @_; | |
| 2525 if(not exists $gene_ids{$c}){ | |
| 2526 if($c =~ s/^chr//){ | |
| 2527 # nothing more | |
| 2528 } | |
| 2529 else{ | |
| 2530 $c = "chr$c"; | |
| 2531 } | |
| 2532 } | |
| 2533 if(exists $gene_ids{$c}){ | |
| 2534 my %have; | |
| 2535 return join("; ", grep {not $have{$_}++} @{$gene_ids{$c}->fetch($from, $to+1)}); | |
| 2536 } | |
| 2537 else{ | |
| 2538 return ""; | |
| 2539 } | |
| 2540 } | |
| 2541 sub combine_phase_data{ | |
| 2542 my ($phases) = @_; # map from variant to its phase data | |
| 2543 | |
| 2544 # Create a reverse mapping from phase regions to their variants | |
| 2545 my %chr2phase_region2variants; | |
| 2546 my @variants = keys %$phases; | |
| 2547 for my $variant (@variants){ | |
| 2548 my ($chr) = $variant =~ /^\S+?-(\S+):/; | |
| 2549 $chr2phase_region2variants{$chr} = {} if not exists $chr2phase_region2variants{$chr}; | |
| 2550 for my $phase_region (split /,/, $phases->{$variant}){ | |
| 2551 $chr2phase_region2variants{$chr}->{$phase_region} = [] if not exists $chr2phase_region2variants{$chr}->{$phase_region}; | |
| 2552 push @{$chr2phase_region2variants{$phase_region}}, $variant; | |
| 2553 } | |
| 2554 } | |
| 2555 | |
| 2556 # Now for each phased block known so far, see if any variant in it is also part of another block | |
| 2557 # If so, do a union since phasing is both transitive and commutative. | |
| 2558 # The quickest way to do this is to check for overlapping intervals, then check for common members amongst those that do overlap | |
| 2559 for my $chr (keys %chr2phase_region2variants){ | |
| 2560 my @ordered_phase_regions = sort {my($a_pos) = $a =~ /:(\d+)/; my($b_pos) = $b =~ /:(\d+)/; $a_pos <=> $b_pos} keys %{$chr2phase_region2variants{$chr}}; | |
| 2561 my $sets = new DisjointSets(scalar(@ordered_phase_regions)); | |
| 2562 | |
| 2563 for (my $i = 0; $i < $#ordered_phase_regions; $i++){ | |
| 2564 my ($start, $stop, $variant) = $ordered_phase_regions[$i]; | |
| 2565 for (my $j = $i+1; $j <= $#ordered_phase_regions; $j++){ | |
| 2566 my ($start2, $stop2, $variant2) = $ordered_phase_regions[$j]; | |
| 2567 if($start2 > $stop){ # won't overlap any regions after this in the sorted list | |
| 2568 last; | |
| 2569 } | |
| 2570 # If we get here, it is implicit that $stop >= $start2 and $start < $stop2, i.e. there is overlap | |
| 2571 # Now check if there is a shared variant (otherwise we might erroneously join blocks from different physical chromosomal arms) | |
| 2572 my $have_shared_variant = 0; | |
| 2573 my $overlapping_phase_region = $ordered_phase_regions[$j]; | |
| 2574 for my $variant (@{$chr2phase_region2variants{$chr}->{$ordered_phase_regions[$i]}}){ | |
| 2575 if($phases->{$variant} =~ /\b$overlapping_phase_region\b/){ | |
| 2576 $have_shared_variant = 1; last; | |
| 2577 } | |
| 2578 } | |
| 2579 # sanity check that there aren't conflicting variants in the new block (i.e. two different variants in the same position) | |
| 2580 my %pos2base; | |
| 2581 my $have_conflicting_variant = 0; | |
| 2582 for my $variant (@{$chr2phase_region2variants{$chr}->{$ordered_phase_regions[$i]}}, @{$chr2phase_region2variants{$chr}->{$ordered_phase_regions[$j]}}){ | |
| 2583 my ($pos, $base) = $variant =~ /(\d+):(.+?)$/; | |
| 2584 if(exists $pos2base{$pos} and $pos2base{$pos} ne $base){ | |
| 2585 # conflict, note with a caveat | |
| 2586 if(exists $chr2caveats{"$chr:$pos"}){ | |
| 2587 $chr2caveats{"$chr:$pos"} .= "; inconsistent haplotype phasing" unless $chr2caveats{"$chr:$pos"} =~ /inconsistent haplotype phasing/; | |
| 2588 } | |
| 2589 else{ | |
| 2590 $chr2caveats{"$chr:$pos"} = "inconsistent haplotype phasing"; | |
| 2591 } | |
| 2592 $have_conflicting_variant ||= 1; | |
| 2593 } | |
| 2594 elsif(not exists $pos2base{$pos}){ | |
| 2595 $pos2base{$pos} = $base; | |
| 2596 } | |
| 2597 } | |
| 2598 | |
| 2599 $sets->union($i+1, $j+1) if $have_shared_variant and not $have_conflicting_variant; # indexes are one-based for sets rather than 0-based | |
| 2600 } | |
| 2601 } | |
| 2602 my $phase_sets = $sets->sets; #disjoint haplotype sets | |
| 2603 my %region_counts; | |
| 2604 for my $phase_set (@$phase_sets){ | |
| 2605 next if scalar(@$phase_set) == 1; # No change to existing phase region (is disjoint from all others) | |
| 2606 # Generate a new haploblock to replace the old ones that are being merged | |
| 2607 my $merged_start = 10000000000; | |
| 2608 my $merged_end = 0; | |
| 2609 for my $ordered_phase_region_index (@$phase_set){ | |
| 2610 my ($start, $end) = $ordered_phase_regions[$ordered_phase_region_index-1] =~ /(\d+)-(\d+)$/; | |
| 2611 $merged_start = $start if $start < $merged_start; | |
| 2612 $merged_end = $end if $end > $merged_end; | |
| 2613 } | |
| 2614 # At the start of the region is a unique prefix so we can tell the arms apart if two haploblocks have the exact same boundary | |
| 2615 my $region_count = $region_counts{"$merged_start-$merged_end"}++; | |
| 2616 my $merged_haploblock_name = "Y$region_count-$chr:$merged_start-$merged_end"; | |
| 2617 # Assign this new name to overwrite the premerge values for each variant contained within | |
| 2618 for my $ordered_phase_region_index (@$phase_set){ | |
| 2619 for my $variant (@{$chr2phase_region2variants{$chr}->{$ordered_phase_regions[$ordered_phase_region_index-1]}}){ # incl. one-based set correction in 0-based array index | |
| 2620 print STDERR "Merging $variant from ", $phases->{$variant}, " into new block $merged_haploblock_name\n"; | |
| 2621 $phases->{$variant} = $merged_haploblock_name; | |
| 2622 } | |
| 2623 } | |
| 2624 } | |
| 2625 # TODO: if there are overlapping phase blocks still, but with different variants in the same position, we can infer that they are on the opposite strands... | |
| 2626 } | |
| 2627 } | |
| 2628 | |
| 2629 # Sees if the positions of the variant are in the range of a phased haplotype, returning which allele it belongs to | |
| 2630 sub find_phase{ | |
| 2631 my ($chr,$pos,$ref,$variant) = split /:/, $_[0]; | |
| 2632 return "" if length($ref) != length($variant); # Can only deal with SNPs (and broken down MNPs) for now | |
| 2633 for(my $i = 0; $i < length($ref); $i++){ | |
| 2634 my $key = "$chr:".($pos+$i).":".substr($variant, $i, 1); | |
| 2635 #print STDERR "Checking phase for $key\n" if $pos == 12907379; | |
| 2636 if(exists $chr2phase{$key}){ | |
| 2637 #print STDERR "returning phase data $chr2phase{$key}\n"; | |
| 2638 return $chr2phase{$key}; | |
| 2639 } | |
| 2640 elsif(exists $chr2phase{"chr".$key}){ | |
| 2641 #print STDERR "returning phase data ", $chr2phase{"chr".$key}, "\n"; | |
| 2642 return $chr2phase{"chr".$key}; | |
| 2643 } | |
| 2644 } | |
| 2645 return ""; | |
| 2646 } | |
| 2647 | |
| 2648 sub find_earliest_index{ | |
| 2649 # employs a binary search to find the smallest index that must be the starting point of a search of [start,end] elements sorted in an array by start | |
| 2650 my ($query, $array) = @_; | |
| 2651 | |
| 2652 return 0 if $query < $array->[0]->[0]; | |
| 2653 | |
| 2654 my ($left, $right, $prevCenter) = (0, $#$array, -1); | |
| 2655 | |
| 2656 while(1){ | |
| 2657 my $center = int (($left + $right)/2); | |
| 2658 | |
| 2659 my $cmp = $query <=> $array->[$center]->[0] || ($center == 0 || $query != $array->[$center-1]->[0] ? 0 : -1); | |
| 2660 | |
| 2661 return $center if $cmp == 0; | |
| 2662 if ($center == $prevCenter) { | |
| 2663 return $left; | |
| 2664 } | |
| 2665 $right = $center if $cmp < 0; | |
| 2666 $left = $center if $cmp > 0; | |
| 2667 $prevCenter = $center; | |
| 2668 } | |
| 2669 } | |
| 2670 |
