| 0 | 1 #!/usr/bin/env perl | 
|  | 2 | 
|  | 3 use Bio::DB::Sam; | 
|  | 4 use DBI; | 
|  | 5 use IPC::Open2; | 
|  | 6 use Set::IntervalTree; | 
|  | 7 use File::Basename; | 
|  | 8 use strict; | 
|  | 9 use warnings; | 
|  | 10 use vars qw($quiet %chr2variant_locs %polyphen_info %chr2polyphen_vcf_lines %chr2gerp_vcf_lines %range2genes $fasta_index $UNAFFECTED $tmpfile); | 
|  | 11 | 
|  | 12 $UNAFFECTED = -299999; # sentinel for splice site predictions | 
|  | 13 | 
|  | 14 if(@ARGV == 1 and $ARGV[0] eq "-v"){ | 
|  | 15   print "Version 1.0\n"; | 
|  | 16   exit; | 
|  | 17 } | 
|  | 18 | 
|  | 19 # configuration file stuff | 
|  | 20 my %config; | 
|  | 21 my $dirname = dirname(__FILE__); | 
|  | 22 my $tool_dir = shift @ARGV; | 
|  | 23 if(not -e "$tool_dir/hgvs_annotate.loc"){ | 
|  | 24   system("cp $dirname/tool-data/hgvs_annotate.loc $tool_dir/hgvs_annotate.loc"); | 
|  | 25 } | 
|  | 26 open CONFIG, '<', "$tool_dir/hgvs_annotate.loc"; | 
|  | 27 while(<CONFIG>){ | 
|  | 28   next if $_ =~ /^#/; | 
|  | 29   (my $key,my $value) = split(/\s+/, $_); | 
|  | 30   $config{$key} = $value; | 
|  | 31 } | 
|  | 32 my $dbs_dir = $config{"dbs_dir"}; | 
|  | 33 close CONFIG; | 
|  | 34 | 
|  | 35 $quiet = 0; | 
|  | 36 if(@ARGV and $ARGV[0] =~ /^-q(?:uiet)?$/){ | 
|  | 37   $quiet = 1; | 
|  | 38   shift @ARGV; | 
|  | 39 } | 
|  | 40 | 
|  | 41 @ARGV == 9 or @ARGV == 10 or die "Usage: $0 -q <SIFT dir> <POLYPHEN2 file> <GERP dir> <tissuedb file> <pathway file> <interpro domains.bed> <omim morbidmap> <input hgvs table.txt> <output hgvs table.annotated.txt> [reference.fasta]\n Where if the reference fasta is provided, we predict cryptic splicing effects (using MaxEntScan and GeneSplicer)\n"; | 
|  | 42 | 
|  | 43 my $sift_dir = $dbs_dir .  shift @ARGV; | 
|  | 44 my $polyphen_file = $dbs_dir . shift @ARGV; | 
|  | 45 my $gerp_dir = $dbs_dir . shift @ARGV; | 
|  | 46 my $tissue_file = $dbs_dir . shift @ARGV; | 
|  | 47 my $pathway_file = $dbs_dir . shift @ARGV; | 
|  | 48 my $interpro_bed_file = shift @ARGV; | 
|  | 49 my $morbidmap_file = shift @ARGV; | 
|  | 50 my $hgvs_table_file = shift @ARGV; | 
|  | 51 my $outfile = shift @ARGV; | 
|  | 52 my $predict_cryptic_splicings = @ARGV ? shift @ARGV : undef; | 
|  | 53 my $flanking_bases = 500; # assume this is ROI around a gene | 
|  | 54 my @lines; | 
|  | 55 | 
|  | 56 sub polyphen_gerp_info($$$){ | 
|  | 57     my($gerp_dir,$polyphen_file,$info_key) = @_; | 
|  | 58 | 
|  | 59     my($chr,$pos,$ref,$variant) = split /:/, $info_key; | 
|  | 60 | 
|  | 61     # is this the first call for this chromosome? If so, retrieve the VCF lines for it en masse | 
|  | 62     if(not exists $chr2polyphen_vcf_lines{$chr}){ | 
|  | 63       ($chr2polyphen_vcf_lines{$chr}, $chr2gerp_vcf_lines{$chr}) = retrieve_vcf_lines($gerp_dir,$polyphen_file,$chr); | 
|  | 64     } | 
|  | 65 | 
|  | 66     my @results; | 
|  | 67     if(not $ref or not $variant){ # can only get data for SNPs | 
|  | 68         @results = qw(NA NA NA); | 
|  | 69     } | 
|  | 70     #print STDERR "ref = $ref and variant = $variant for  $info_key\n"; | 
|  | 71     elsif(length($ref) == 1 and length($variant) == 1){ | 
|  | 72       #print STDERR "Grabbing poly/gerp for $chr,$pos,$ref,$variant\n"; | 
|  | 73       @results = polyphen_info($chr,$pos,$variant); | 
|  | 74       $results[2] = gerp_info($chr,$pos); | 
|  | 75     } | 
|  | 76 | 
|  | 77     # It may be that a novel MNP variant is a bunch of known SNPs in a row.  In this case, | 
|  | 78     # report the list of variants | 
|  | 79     elsif(length($ref) == length($variant) and $ref ne "NA"){ | 
|  | 80         my (@poly_scores, @poly_calls, @gerp_scores); | 
|  | 81         for(my $i = 0; $i < length($ref); $i++){ | 
|  | 82             my $refbase = substr($ref, $i, 1); | 
|  | 83             my $newbase = substr($variant, $i, 1); | 
|  | 84             if($newbase eq $refbase){ | 
|  | 85               push @poly_scores, "-"; | 
|  | 86               push @poly_calls, "-"; | 
|  | 87               push @gerp_scores, "-"; | 
|  | 88               next; | 
|  | 89             } | 
|  | 90             my @base_results = polyphen_info($chr,$pos+$i,$refbase,$newbase); | 
|  | 91             push @poly_scores, $base_results[0]; | 
|  | 92             push @poly_calls, $base_results[1]; | 
|  | 93             $base_results[2] = gerp_info($chr,$pos+$i); | 
|  | 94             push @gerp_scores, $base_results[2]; | 
|  | 95         } | 
|  | 96         $results[0] = join(",", @poly_scores); | 
|  | 97         $results[1] = join(",", @poly_calls); | 
|  | 98         $results[2] = join(",", @gerp_scores); | 
|  | 99     } | 
|  | 100     else{ | 
|  | 101         @results = qw(NA NA NA); | 
|  | 102     } | 
|  | 103 | 
|  | 104     return @results; | 
|  | 105 } | 
|  | 106 | 
|  | 107 sub get_variant_key($$$$){ | 
|  | 108   # params chr pos ref variant | 
|  | 109   my $c = $_[0]; | 
|  | 110   $c =~ s/^chr//; | 
|  | 111   my $prop_info_key = join(":", $c, @_[1..3]); | 
|  | 112   $chr2variant_locs{$c} = [] unless exists $chr2variant_locs{$c}; | 
|  | 113   push @{$chr2variant_locs{$c}}, $_[1]; | 
|  | 114   return $prop_info_key; | 
|  | 115 } | 
|  | 116 | 
|  | 117 sub retrieve_vcf_lines($$$){ | 
|  | 118   my ($gerp_dir, $polyphen_file, $chr) = @_; | 
|  | 119 | 
|  | 120   my (%polyphen_lines, %gerp_lines); | 
|  | 121 | 
|  | 122   if(not exists $chr2variant_locs{$chr}){ | 
|  | 123     return ({}, {}); # no data requested for this chromosome | 
|  | 124   } | 
|  | 125 | 
|  | 126   # build up the request | 
|  | 127   my @tabix_regions; | 
|  | 128   my @var_locs = grep /^\d+$/, @{$chr2variant_locs{$chr}}; # grep keeps point mutations only | 
|  | 129   if(not @var_locs){ | 
|  | 130     return ({}, {}); # no point mutations | 
|  | 131   } | 
|  | 132 | 
|  | 133   # sort by variant start location | 
|  | 134   for my $var_loc (sort {$a <=> $b} @var_locs){ | 
|  | 135     push @tabix_regions, $chr.":".$var_loc."-".$var_loc; | 
|  | 136   } | 
|  | 137   my $regions = join(" ", @tabix_regions); | 
|  | 138 | 
|  | 139   # retrieve the data | 
|  | 140   die "Cannot find Polyphen2 VCF file $polyphen_file\n" if not -e $polyphen_file; | 
|  | 141   open(VCF, "tabix $polyphen_file $regions |") | 
|  | 142     or die "Cannot run tabix on $polyphen_file: $!\n"; | 
|  | 143   while(<VCF>){ | 
|  | 144     if(/^\S+\t(\d+)/ and grep {$_ eq $1} @var_locs){ # take only main columns to save room, if possible | 
|  | 145       chomp; | 
|  | 146       $polyphen_lines{$1} = [] unless exists $polyphen_lines{$1}; | 
|  | 147       push @{$polyphen_lines{$1}}, $_; | 
|  | 148     } | 
|  | 149   } | 
|  | 150   close(VCF); | 
|  | 151 | 
|  | 152   my $vcf_file = "$gerp_dir/chr$chr.tab.gz"; | 
|  | 153   if(not -e $vcf_file){ | 
|  | 154     warn "Cannot find GERP VCF file $vcf_file\n" if not $quiet; | 
|  | 155     return ({}, {}); | 
|  | 156   } | 
|  | 157   open(VCF, "tabix $vcf_file $regions |") | 
|  | 158     or die "Cannot run tabix on $vcf_file: $!\n"; | 
|  | 159   while(<VCF>){ | 
|  | 160     if(/^(\S+\t(\d+)(?:\t\S+){2})/ and grep {$_ eq $2} @var_locs){ # take only main columns to save room, if possible | 
|  | 161       $gerp_lines{$2} = [] unless exists $gerp_lines{$2}; | 
|  | 162       push @{$gerp_lines{$2}}, $1; | 
|  | 163     } | 
|  | 164   } | 
|  | 165   close(VCF); | 
|  | 166 | 
|  | 167   return (\%polyphen_lines, \%gerp_lines); | 
|  | 168 } | 
|  | 169 | 
|  | 170 sub polyphen_info($$$){ | 
|  | 171   my ($chr, $pos, $variant_base) = @_; | 
|  | 172 | 
|  | 173   my $key = "$chr:$pos:$variant_base"; | 
|  | 174   if(exists $polyphen_info{$key}){ | 
|  | 175       return @{$polyphen_info{$key}}; | 
|  | 176   } | 
|  | 177 | 
|  | 178   #print STDERR "Checking for polyphen data for $chr, $pos, $variant_base\n"; | 
|  | 179   if(exists $chr2polyphen_vcf_lines{$chr}->{$pos}){ | 
|  | 180     for(@{$chr2polyphen_vcf_lines{$chr}->{$pos}}){ | 
|  | 181       #print STDERR "Checking $chr, $pos, $variant_base against $_"; | 
|  | 182       my @fields = split /\t/, $_; | 
|  | 183       if($fields[4] eq $variant_base){ | 
|  | 184 	  if($fields[6] eq "D"){ | 
|  | 185 	      $polyphen_info{$key} = [$fields[5],"deleterious"]; | 
|  | 186               last; | 
|  | 187 	  } | 
|  | 188 	  elsif($fields[6] eq "P"){ | 
|  | 189 	      $polyphen_info{$key} = [$fields[5],"poss. del."]; | 
|  | 190               last; | 
|  | 191 	  } | 
|  | 192 	  elsif($fields[6] eq "B"){ | 
|  | 193 	      $polyphen_info{$key} = [$fields[5],"benign"]; | 
|  | 194               last; | 
|  | 195 	  } | 
|  | 196 	  else{ | 
|  | 197 	      $polyphen_info{$key} = [$fields[5],$fields[6]]; | 
|  | 198               last; | 
|  | 199 	  } | 
|  | 200       } | 
|  | 201     } | 
|  | 202     if(not exists $polyphen_info{$key}){ | 
|  | 203       $polyphen_info{$key} = ["NA", "NA"]; | 
|  | 204     } | 
|  | 205   } | 
|  | 206   else{ | 
|  | 207     $polyphen_info{$key} = ["NA", "NA"]; | 
|  | 208   } | 
|  | 209   return @{$polyphen_info{$key}}; | 
|  | 210 } | 
|  | 211 | 
|  | 212 sub gerp_info($$){ | 
|  | 213     my ($chr,$pos) = @_; | 
|  | 214 | 
|  | 215     if(exists $chr2gerp_vcf_lines{$chr}->{$pos}){ | 
|  | 216       for(@{$chr2gerp_vcf_lines{$chr}->{$pos}}){ | 
|  | 217         my @fields = split /\t/, $_; | 
|  | 218         return $fields[3]; | 
|  | 219       } | 
|  | 220     } | 
|  | 221     return "NA"; | 
|  | 222 } | 
|  | 223 | 
|  | 224 sub predict_splicing{ | 
|  | 225   my ($chr, $pos, $ref, $var, $strand, $exon_edge_distance, $splice_type) = @_; | 
|  | 226 | 
|  | 227   my $disruption_level = 0; | 
|  | 228   my @disruption_details; | 
|  | 229   # The methods being used only look for up to a 30 base context, so only | 
|  | 230   # need to check for obliterated acceptors/donors if they are nearby | 
|  | 231   $exon_edge_distance-- if $exon_edge_distance > 0; # +1 is actually in the right spot, so decrement | 
|  | 232   my $edge_offset = $strand eq "-" ? -1*$exon_edge_distance : $exon_edge_distance; # flip offset sign if the call was on the negative strand (since +/- was relative to the transcript, not the genome coordinates) | 
|  | 233   my $annotated_detected = 0; | 
|  | 234   my ($genesplicer_orig, $genesplicer_orig_pos) = call_genesplicer($splice_type, $chr, $strand, $pos-$edge_offset, $ref); | 
|  | 235   if($genesplicer_orig != 0){ | 
|  | 236     $annotated_detected = 1; | 
|  | 237   } | 
|  | 238   my ($maxentscan_orig, $maxentscan_orig_pos) = call_maxentscan($splice_type, $chr, $strand, $pos-$edge_offset, $ref); | 
|  | 239   if(not $annotated_detected and $maxentscan_orig){ | 
|  | 240     $annotated_detected = 1; | 
|  | 241   } | 
|  | 242   my ($genesplicer_mod, $genesplicer_mod_pos) = call_genesplicer($splice_type, $chr, $strand, $pos-$edge_offset, $ref, $var, $exon_edge_distance); | 
|  | 243   if($genesplicer_mod != $UNAFFECTED){ # was the mod within the range where this software could detect up or downstream effects on the annotated splice site? | 
|  | 244     if($genesplicer_orig_pos != $genesplicer_mod_pos){ | 
|  | 245       $disruption_level+=0.75; | 
|  | 246       if($genesplicer_mod_pos == -1){ | 
|  | 247         push @disruption_details, "Variant obliterates GeneSplicer predicted splice site from the reference sequence ($chr:$genesplicer_orig_pos, orig score $genesplicer_orig)"; | 
|  | 248       } | 
|  | 249       elsif($genesplicer_orig_pos != -1){ | 
|  | 250         push @disruption_details, "Variant moves GeneSplicer preferred splice site from annotated location $chr:$genesplicer_orig_pos to $chr:$genesplicer_mod_pos (scores: orig=$genesplicer_orig, new=$genesplicer_mod)"; | 
|  | 251       } | 
|  | 252     } | 
|  | 253     elsif($genesplicer_orig-$genesplicer_mod > 1){ | 
|  | 254       $disruption_level+=0.5; | 
|  | 255       push @disruption_details, "Variant significantly lowers GeneSplicer score for original $splice_type splice site located at $chr:$genesplicer_mod_pos ($genesplicer_mod vs. $genesplicer_orig)"; | 
|  | 256     } | 
|  | 257     elsif($genesplicer_orig > $genesplicer_mod){ | 
|  | 258       $disruption_level+=0.25; | 
|  | 259       push @disruption_details, "Variant slightly lowers GeneSplicer score for original $splice_type splice site located at $chr:$genesplicer_mod_pos ($genesplicer_mod vs. $genesplicer_orig)"; | 
|  | 260     } | 
|  | 261     $genesplicer_orig = $genesplicer_mod; # so that splice site comparison vs. cryptic is accurate | 
|  | 262   } | 
|  | 263   my ($maxentscan_mod, $maxentscan_mod_pos) = call_maxentscan($splice_type, $chr, $strand, $pos-$edge_offset, $ref, $var, $exon_edge_distance); | 
|  | 264   if($maxentscan_mod != $UNAFFECTED){ | 
|  | 265     if($maxentscan_mod_pos != $maxentscan_orig_pos){ | 
|  | 266       $disruption_level+=1; | 
|  | 267       if($maxentscan_mod_pos == -1){ | 
|  | 268         push @disruption_details, "Variant obliterates MaxEntScan predicted splice site from the reference sequence ($chr:$maxentscan_orig_pos, orig score $maxentscan_orig)"; | 
|  | 269       } | 
|  | 270       elsif($maxentscan_orig_pos != -1){ | 
|  | 271         push @disruption_details, "Variant moves MaxEntScan preferred splice site from annotated location $chr:$maxentscan_orig_pos to $chr:$maxentscan_mod_pos (scores: orig=$maxentscan_orig, new=$maxentscan_mod)"; | 
|  | 272       } | 
|  | 273     } | 
|  | 274     elsif($maxentscan_orig-$maxentscan_mod > 1){ | 
|  | 275       $disruption_level+=0.5; | 
|  | 276       push @disruption_details, "Variant significantly lowers MaxEntScan score for original $splice_type splice site located at $chr:$maxentscan_mod_pos ($maxentscan_mod vs. $maxentscan_orig)"; | 
|  | 277     } | 
|  | 278     elsif($maxentscan_orig>$maxentscan_mod){ | 
|  | 279       $disruption_level+=0.25; | 
|  | 280       push @disruption_details, "Variant slightly lowers MaxEntScan score for original $splice_type splice site located at $chr:$maxentscan_mod_pos ($maxentscan_mod vs. $maxentscan_orig)"; | 
|  | 281     } | 
|  | 282     $maxentscan_orig = $maxentscan_mod; | 
|  | 283   } | 
|  | 284 | 
|  | 285   # Regardless of location, see if there is an increased chance of creating a cryptic splicing site | 
|  | 286   my ($genesplicer_control, $genesplicer_control_pos) = call_genesplicer($splice_type, $chr, $strand, $pos, $ref); | 
|  | 287   my ($genesplicer_cryptic, $genesplicer_cryptic_pos)  = call_genesplicer($splice_type, $chr, $strand, $pos, $ref, $var); | 
|  | 288   if(defined $genesplicer_orig and $genesplicer_control < $genesplicer_orig and $genesplicer_orig < $genesplicer_cryptic){ | 
|  | 289     $disruption_level+=0.75; | 
|  | 290     push @disruption_details, "Variant causes GeneSplicer score for a potential cryptic $splice_type splice site at $chr:$genesplicer_cryptic_pos to become higher than the annotated splice site $chr:$genesplicer_orig_pos (orig=$genesplicer_orig vs. var=$genesplicer_cryptic)"; | 
|  | 291   } | 
|  | 292   if($genesplicer_control - $genesplicer_cryptic < -1){ | 
|  | 293     $disruption_level+=0.5; | 
|  | 294     push @disruption_details, "Variant raises GeneSplicer score for a potential cryptic $splice_type splice site (orig=$genesplicer_control vs. var=$genesplicer_cryptic)"; | 
|  | 295   } | 
|  | 296   my ($maxentscan_control, $maxentscan_control_pos) = call_maxentscan($splice_type, $chr, $strand, $pos, $ref); | 
|  | 297   my ($maxentscan_cryptic, $maxentscan_cryptic_pos) = call_maxentscan($splice_type, $chr, $strand, $pos, $ref, $var); | 
|  | 298   if(defined $maxentscan_orig and $maxentscan_control < $maxentscan_orig and $maxentscan_orig < $maxentscan_cryptic){ | 
|  | 299     $disruption_level++; | 
|  | 300     push @disruption_details, "Variant causes MaxEntScan score for a potential cryptic $splice_type splice site at $chr:$maxentscan_cryptic_pos to become higher than the annotated splice site $chr:$maxentscan_orig_pos (orig=$maxentscan_orig vs. var=$maxentscan_cryptic)"; | 
|  | 301   } | 
|  | 302   if($maxentscan_control - $maxentscan_cryptic < -1 and $maxentscan_cryptic > 0){ | 
|  | 303     $disruption_level+=0.5; | 
|  | 304     push @disruption_details, "Variant raises MaxEntScan score for a potential cryptic $splice_type splice site (orig=$maxentscan_control vs. var=$maxentscan_cryptic)"; | 
|  | 305   } | 
|  | 306 | 
|  | 307   # If neither method predicted the annotated donor site, and the alternate site isn't predicted either, note that so the user doesn't think the mod is definite okay | 
|  | 308   if(not @disruption_details and $maxentscan_orig < 0 and $genesplicer_orig < 0){ | 
|  | 309     $disruption_level+=0.5; | 
|  | 310     push @disruption_details, "Neither the annotated splice site nor any potential cryptic site introduced by the variant are predicted by either GeneSplicer or MaxEntScan, therefore the alt splicing potential for this variant should be evaluated manually"; | 
|  | 311   } | 
|  | 312 | 
|  | 313   return ($disruption_level, join("; ", @disruption_details)); | 
|  | 314 } | 
|  | 315 | 
|  | 316 sub call_genesplicer{ | 
|  | 317   my($splice_type, $chr, $strand, $pos, $ref, $var, $var_offset) = @_; | 
|  | 318 | 
|  | 319   if($splice_type eq "acceptor"){ | 
|  | 320     $pos += $strand eq  "-" ? 2: -2; | 
|  | 321   } | 
|  | 322   my $num_flanking = 110; # seems to croak if less than 160bp provided | 
|  | 323   my $indel_size = defined $var ? length($var) - length($ref) : 0; # - is del, + is ins | 
|  | 324   my $site_window = defined $var_offset ? 2 : ($indel_size > 0 ? 29+$indel_size : 29); # exact position if looking for exisitng site effect, otherwise 10 or 10 + the insert size | 
|  | 325   my $cmd = "genesplicer"; | 
|  | 326   my $start = $pos - $num_flanking; | 
|  | 327   $start = 1 if $start < 1; | 
|  | 328   my $stop = $pos + $num_flanking + ($indel_size < 0 ? -1*$indel_size:0); # add extra bases if a del so we still have 2*$num_flanking after the edit | 
|  | 329   $stop++; # end coordinate is exclusive | 
|  | 330   return ($UNAFFECTED, -1) if defined $var_offset and ($pos+$var_offset < $start or $pos+$var_offset > $stop); # variant is outside of range we can detect affect on original splice site | 
|  | 331   my $seq = $fasta_index->fetch("$chr:$start-$stop"); | 
|  | 332   if(defined $var){ | 
|  | 333     if(defined $var_offset){ # substitution away from the site of interest | 
|  | 334       substr($seq, $num_flanking+$var_offset, length($ref)) = $var; | 
|  | 335     } | 
|  | 336     else{ # substitution at the site of interest | 
|  | 337       substr($seq, $num_flanking, length($ref)) = $var; | 
|  | 338     } | 
|  | 339   } | 
|  | 340   if($strand eq "-"){ | 
|  | 341     $seq = reverse($seq); | 
|  | 342     $seq =~ tr/ACGTacgt/TGCAtgca/; | 
|  | 343   } | 
|  | 344   $tmpfile = "$$.fasta"; | 
|  | 345   open(TMP, ">$tmpfile") or die "Could not open $tmpfile for writing: $!\n"; | 
|  | 346   print TMP ">$chr $strand $pos $ref",($var?" $var":""), ($var_offset?" $var_offset":""),"\n$seq\n"; | 
|  | 347   substr($seq, $num_flanking, 2) = lc(substr($seq, $num_flanking, 2)); | 
|  | 348   #print STDERR "$chr $strand $pos $ref $var $var_offset: $seq\n"; | 
|  | 349   close(TMP); | 
|  | 350   #print STDERR "$cmd $tmpfile /export/achri_data/programs/GeneSplicer/human\n"; | 
|  | 351   #<STDIN>; | 
|  | 352   open(GS, "$cmd $tmpfile human 2> /dev/null|") | 
|  | 353     or die "Could not run $cmd: $!\n"; | 
|  | 354   my $highest_score = 0; | 
|  | 355   my $highest_position = -1; | 
|  | 356   while(<GS>){ | 
|  | 357     # End5 End3 Score "confidence" splice_site_type | 
|  | 358     # E.g. : | 
|  | 359     # 202 203 2.994010 Medium donor | 
|  | 360     chomp; | 
|  | 361     my @F = split /\s+/, $_; | 
|  | 362     next if $F[1] < $F[0]; # wrong strand | 
|  | 363     if($splice_type eq $F[4]){ | 
|  | 364       if(abs($F[0]-$num_flanking-1) <= $site_window){ # right type and approximate location | 
|  | 365         #print STDERR "*$_\n"; | 
|  | 366         if($F[2] > $highest_score){ | 
|  | 367           $highest_score = $F[2]; | 
|  | 368           # last bit accounts for indels in the position offset | 
|  | 369           $highest_position = $pos + ($strand eq "-" ? -1: 1)*(-$num_flanking + $F[0] - 1) + (($F[0] <= $num_flanking && $strand eq "+" or $F[0] >= $num_flanking && $strand eq "-") ? -$indel_size : 0); | 
|  | 370         } | 
|  | 371       } | 
|  | 372     } | 
|  | 373     #else{print STDERR $_,"\n";} | 
|  | 374   } | 
|  | 375   close(GS); | 
|  | 376   return ($highest_score, $highest_position); | 
|  | 377 } | 
|  | 378 | 
|  | 379 sub call_maxentscan{ | 
|  | 380   my($splice_type, $chr, $strand, $pos, $ref, $var, $var_offset) = @_; | 
|  | 381 | 
|  | 382   my $indel_size = defined $var ? length($var) - length($ref) : 0; # - is del, + is ins | 
|  | 383   my $cmd; | 
|  | 384   my ($num_flanking5, $num_flanking3); | 
|  | 385   my ($region5, $region3); | 
|  | 386   my ($start, $stop); | 
|  | 387   my $highest_score = -9999; | 
|  | 388   my $highest_position = -1; | 
|  | 389   # note that donor and acceptor are very different beasts for MaxEntScan | 
|  | 390   if($splice_type eq "acceptor"){ | 
|  | 391     if($strand eq "-"){ | 
|  | 392       $pos += 2; | 
|  | 393     } | 
|  | 394     else{ | 
|  | 395       $pos -= 2; | 
|  | 396     } | 
|  | 397     $cmd = "score3.pl"; | 
|  | 398     $num_flanking5 = 18; | 
|  | 399     $num_flanking3 = 4; | 
|  | 400   } | 
|  | 401   else{ # donor | 
|  | 402     $cmd = "score5.pl"; | 
|  | 403     $num_flanking5 = 3; | 
|  | 404     $num_flanking3 = 5; | 
|  | 405   } | 
|  | 406   # when var_offset is defined, we are looking only at the original splice site so no need to look in the general region | 
|  | 407   $region5 = defined $var_offset ? 0 : $num_flanking5; # when var_offset is defined, we are looking only at the original splice site so no need to look in the general region | 
|  | 408   $region3 = defined $var_offset ? 0 : $num_flanking3; | 
|  | 409   if($strand eq "-"){ | 
|  | 410     $start = $pos - $num_flanking3 - $region3; | 
|  | 411     $stop = $pos + $num_flanking5 + $region5 + ($indel_size < 0 ? -1*$indel_size:0); | 
|  | 412     #print STDERR "Fetching $chr:$start-$stop ($pos - $num_flanking3 - $region3)\n"; | 
|  | 413   } | 
|  | 414   else{ | 
|  | 415     $start = $pos - $num_flanking5 - $region5; | 
|  | 416     $stop = $pos + $num_flanking3 + $region3 + ($indel_size < 0 ? -1*$indel_size:0); # add extra bases if a del so we still have 23 or 9 bases as the case may be | 
|  | 417     #print STDERR "Fetching $chr:$start-$stop ($pos - $num_flanking5 - $region5)\n"; | 
|  | 418   } | 
|  | 419   if(defined $var_offset and ( | 
|  | 420      $strand eq "+" and ($pos+$var_offset < $start or $pos+$var_offset > $stop) or | 
|  | 421      $strand eq "-" and ($pos-$var_offset < $start or $pos-$var_offset > $stop))){ # variant is outside of range we can detect affect on original splice site | 
|  | 422     return ($UNAFFECTED, -1) | 
|  | 423   } | 
|  | 424 | 
|  | 425   my $seq = $fasta_index->fetch("$chr:$start-$stop"); | 
|  | 426   my @subseqs; | 
|  | 427   if(defined $var){ | 
|  | 428       if(defined $var_offset){ # substitution away from the site of interest | 
|  | 429         #print STDERR "Subbing in string of length ", length($seq), "near splice site $ref -> $var at ", ($strand eq "-" ? "$region3+$num_flanking3-$var_offset":"$region5+$num_flanking5+$var_offset"), "\n"; | 
|  | 430         substr($seq, ($strand eq "-" ? $region3+$num_flanking3-$var_offset:$region5+$num_flanking5+$var_offset), length($ref)) = $var; | 
|  | 431       } | 
|  | 432       else{ # substitution at the site of interest | 
|  | 433         substr($seq, ($strand eq "-" ? $region3+$num_flanking3:$region5+$num_flanking5), length($ref)) = $var; | 
|  | 434       } | 
|  | 435   } | 
|  | 436   if($strand eq "-"){ | 
|  | 437       $seq = reverse($seq); | 
|  | 438       $seq =~ tr/ACGTacgt/TGCAtgca/; | 
|  | 439   } | 
|  | 440   # get all of the 23 or 9 base pair subsequences, depending on whether it's donors or acceptors we are looking for | 
|  | 441   for(my $i = 0; $i+$num_flanking5+$num_flanking3+1 <= length($seq); $i++){ | 
|  | 442       push @subseqs, substr($seq, $i, $num_flanking5+$num_flanking3+1); | 
|  | 443   } | 
|  | 444   my $pid = open2(\*CHILD_OUT, \*CHILD_IN, "$cmd -"); | 
|  | 445   $pid or die "Could not run $cmd: $!\n"; | 
|  | 446   print CHILD_IN join("\n",@subseqs); | 
|  | 447   close(CHILD_IN); | 
|  | 448   while(<CHILD_OUT>){ | 
|  | 449       chomp; | 
|  | 450       # out is just "seq[tab]score" | 
|  | 451       my @F = split /\s+/, $_; | 
|  | 452       if($F[1] > $highest_score){ | 
|  | 453         #print STDERR ">"; | 
|  | 454         $highest_score = $F[1]; | 
|  | 455         # since we printed the subseqs in order, $. (containing the line number of the prediction output) gives us the offset in the original seq | 
|  | 456         $highest_position = $start + ($strand eq "-" ? length($seq) - $num_flanking5 - $. : $num_flanking5 + $. -1)+(($. <= $num_flanking5+$region5 && $strand eq "+" or $. >= $num_flanking5+$region5 && $strand eq "-") ? -$indel_size : 0); | 
|  | 457       } | 
|  | 458       #print STDERR $_,"\n"; | 
|  | 459   } | 
|  | 460   close(CHILD_OUT); | 
|  | 461   waitpid($pid, 0); | 
|  | 462 | 
|  | 463   return ($highest_score, $highest_position); | 
|  | 464 } | 
|  | 465 | 
|  | 466 sub get_range_info($$$$){ | 
|  | 467    my ($chr2ranges, $chr, $start, $stop) = @_; | 
|  | 468 | 
|  | 469    if(not exists $chr2ranges->{$chr}){ | 
|  | 470      return {}; | 
|  | 471    } | 
|  | 472    my $range_key = "$chr:$start:$stop"; | 
|  | 473 | 
|  | 474    # Use a binary search to find the leftmost range of interest | 
|  | 475    my $left_index = 0; | 
|  | 476    my $right_index = $#{$chr2ranges->{$chr}}; | 
|  | 477    my $cursor = int($right_index/2); | 
|  | 478    while($left_index != $right_index and $left_index != $cursor and $right_index != $cursor){ | 
|  | 479      # need to go left | 
|  | 480      if($chr2ranges->{$chr}->[$cursor]->[0] >= $start){ | 
|  | 481        $right_index = $cursor; | 
|  | 482        $cursor = int(($left_index+$cursor)/2); | 
|  | 483      } | 
|  | 484      # need to go to the right | 
|  | 485      elsif($chr2ranges->{$chr}->[$cursor]->[1] <= $stop){ | 
|  | 486        $left_index = $cursor; | 
|  | 487        $cursor = int(($right_index+$cursor)/2); | 
|  | 488      } | 
|  | 489      else{ | 
|  | 490        $right_index = $cursor; | 
|  | 491        $cursor--; # in the range, go left until not in the range any more | 
|  | 492      } | 
|  | 493    } | 
|  | 494 | 
|  | 495    my %names; | 
|  | 496    for (; $cursor <= $#{$chr2ranges->{$chr}}; $cursor++){ | 
|  | 497       my $range_ref = $chr2ranges->{$chr}->[$cursor]; | 
|  | 498       if($range_ref->[0] > $stop){ | 
|  | 499         last; | 
|  | 500       } | 
|  | 501       elsif($range_ref->[0] <= $stop and $range_ref->[1] >= $start){ | 
|  | 502         $names{$range_ref->[2]}++; | 
|  | 503       } | 
|  | 504    } | 
|  | 505 | 
|  | 506    #print STDERR "Names for range $chr:$start-$stop are ", join("/", keys %names), "\n"; | 
|  | 507    return \%names; | 
|  | 508 } | 
|  | 509 | 
|  | 510 if(not -d $sift_dir){ | 
|  | 511     die "The specified SIFT directory $sift_dir does not exist\n"; | 
|  | 512 } | 
|  | 513 | 
|  | 514 print STDERR "Reading in OMIM morbid map...\n" unless $quiet; | 
|  | 515 die "Data file $morbidmap_file does not exist, aborting.\n" if not -e $morbidmap_file; | 
|  | 516 my %gene2morbids; | 
|  | 517 open(TAB, $morbidmap_file) | 
|  | 518   or die "Cannot open OMIM morbid map file $morbidmap_file for reading: $!\n"; | 
|  | 519 while(<TAB>){ | 
|  | 520   my @fields = split /\|/, $_; | 
|  | 521   #print STDERR "got ", ($#fields+1), " fields in $_\n"; | 
|  | 522   #my @fields = split /\|/, $_; | 
|  | 523   my $morbid = $fields[0]; | 
|  | 524   $morbid =~ s/\s*\(\d+\)$//; # get rid of trailing parenthetical number | 
|  | 525   my $omim_id = $fields[2]; | 
|  | 526   for my $genename (split /, /, $fields[1]){ | 
|  | 527     if(not exists $gene2morbids{$genename}){ | 
|  | 528       $gene2morbids{$genename}  = "OMIM:$omim_id $morbid"; | 
|  | 529     } | 
|  | 530     else{ | 
|  | 531       $gene2morbids{$genename} .= "; $morbid"; | 
|  | 532     } | 
|  | 533   } | 
|  | 534 } | 
|  | 535 close(TAB); | 
|  | 536 | 
|  | 537 print STDERR "Reading in interpro mappings...\n" unless $quiet; | 
|  | 538 die "Data file $interpro_bed_file does not exist, aborting.\n" if not -e $interpro_bed_file; | 
|  | 539 my %interpro_ids; | 
|  | 540 open(TAB, $interpro_bed_file) | 
|  | 541   or die "Cannot open gene name BED file $interpro_bed_file for reading: $!\n"; | 
|  | 542 while(<TAB>){ | 
|  | 543   chomp; | 
|  | 544   # format should be "chr start stop interpro desc..." | 
|  | 545   my @fields = split /\t/, $_; | 
|  | 546   my $c = $fields[0]; | 
|  | 547   if(not exists $interpro_ids{$c}){ | 
|  | 548      $interpro_ids{$c} = [] unless exists $interpro_ids{$c}; | 
|  | 549      $interpro_ids{"chr".$c} = [] unless $c =~ /^chr/; | 
|  | 550      $interpro_ids{$1} = [] if $c =~ /^chr(\S+)/; | 
|  | 551   } | 
|  | 552   my $dataref = [@fields[1..3]]; | 
|  | 553   push @{$interpro_ids{$c}}, $dataref; | 
|  | 554   push @{$interpro_ids{"chr".$c}}, $dataref unless $c =~ /^chr/; | 
|  | 555   push @{$interpro_ids{$1}}, $dataref if $c =~ /^chr(\S+)/; | 
|  | 556 } | 
|  | 557 close(TAB); | 
|  | 558 print STDERR "Sorting interpro matches per contig...\n" unless $quiet; | 
|  | 559 for my $chr (keys %interpro_ids){ | 
|  | 560   $interpro_ids{$chr} = [sort {$a->[0] <=> $b->[0]} @{$interpro_ids{$chr}}]; | 
|  | 561 } | 
|  | 562 | 
|  | 563 # {uc(gene_name) => space separated info} | 
|  | 564 print STDERR "Reading in tissue info...\n" unless $quiet; | 
|  | 565 my %tissues; | 
|  | 566 my %tissue_desc; | 
|  | 567 open(TISSUE, $tissue_file) | 
|  | 568     or die "Cannot open $tissue_file for reading: $!\n"; | 
|  | 569 while(<TISSUE>){ | 
|  | 570     chomp; | 
|  | 571     my @fields = split /\t/, $_; | 
|  | 572     my $gname = uc($fields[1]); | 
|  | 573     $tissue_desc{$gname} = $fields[2]; | 
|  | 574     next unless $#fields == 4; | 
|  | 575     my @tis = split /;\s*/, $fields[4]; | 
|  | 576     pop @tis; | 
|  | 577     if(@tis < 6){ | 
|  | 578 	$tissues{$gname} = join(">", @tis); | 
|  | 579     } | 
|  | 580     elsif(@tis < 24){ | 
|  | 581 	$tissues{$gname} = join(">", @tis[0..5]).">..."; | 
|  | 582     } | 
|  | 583     else{ | 
|  | 584 	$tissues{$gname} = join(">", @tis[0..(int($#tis/5))]).">..."; | 
|  | 585     } | 
|  | 586 } | 
|  | 587 close(TISSUE); | 
|  | 588 | 
|  | 589 print STDERR "Loading pathway info...\n" unless $quiet; | 
|  | 590 my %pathways; | 
|  | 591 my %pathway_ids; | 
|  | 592 open(PATHWAY, $pathway_file) | 
|  | 593     or die "Cannot open $pathway_file for reading: $!\n"; | 
|  | 594 while(<PATHWAY>){ | 
|  | 595     chomp; | 
|  | 596     my @fields = split /\t/, $_; | 
|  | 597     next unless @fields >= 4; | 
|  | 598     for my $gname (split /\s+/, $fields[1]){ | 
|  | 599       $gname = uc($gname); | 
|  | 600       if(not exists $pathways{$gname}){ | 
|  | 601 	  $pathway_ids{$gname} = $fields[2]; | 
|  | 602 	  $pathways{$gname} = $fields[3]; | 
|  | 603       } | 
|  | 604       else{ | 
|  | 605 	  $pathway_ids{$gname} .= "; ".$fields[2]; | 
|  | 606 	  $pathways{$gname} .= "; ".$fields[3]; | 
|  | 607       } | 
|  | 608    } | 
|  | 609 } | 
|  | 610 close(PATHWAY); | 
|  | 611 | 
|  | 612 print STDERR "Loading SIFT indices...\n" unless $quiet; | 
|  | 613 # Read in the database table list | 
|  | 614 my %chr_tables; | 
|  | 615 open(DBLIST, "$sift_dir/bins.list") | 
|  | 616     or die "Cannot open $sift_dir/bins.list for reading: $!\n"; | 
|  | 617 while(<DBLIST>){ | 
|  | 618     chomp; | 
|  | 619     my @fields = split /\t/, $_; | 
|  | 620     if(not exists $chr_tables{$fields[1]}){ | 
|  | 621 	$chr_tables{$fields[1]} = {}; | 
|  | 622     } | 
|  | 623     my $chr = $fields[1]; | 
|  | 624     $chr = "chr$chr" unless $chr =~ /^chr/; | 
|  | 625     $chr_tables{$chr}->{"$chr\_$fields[2]_$fields[3]"} = [$fields[2],$fields[3]]; | 
|  | 626 } | 
|  | 627 close(DBLIST); | 
|  | 628 | 
|  | 629 #Connect to database | 
|  | 630 my $db_gene = DBI->connect("dbi:SQLite:dbname=$sift_dir/Human_Supp.sqlite","","",{RaiseError=>1, AutoCommit=>1}); | 
|  | 631 | 
|  | 632 my $db_chr; | 
|  | 633 my $fr_stmt; # retrieval for frameshift | 
|  | 634 my $snp_stmt; # retrieval for SNP | 
|  | 635 my $cur_chr = ""; | 
|  | 636 my $cur_max_pos; | 
|  | 637 | 
|  | 638 if(defined $predict_cryptic_splicings){ | 
|  | 639   if(not -e $predict_cryptic_splicings){ | 
|  | 640     die "Reference FastA file for splicing predictions, $predict_cryptic_splicings, does not exist.\n"; | 
|  | 641   } | 
|  | 642   if(not -e $predict_cryptic_splicings.".fai" and not -w dirname($predict_cryptic_splicings)){ | 
|  | 643     die "Reference FastA file for splicing predictions, $predict_cryptic_splicings, is not indexed, and the directory is not writable.\n"; | 
|  | 644   } | 
|  | 645   print STDERR "Loading reference FastA index...\n" unless $quiet; | 
|  | 646   $fasta_index = Bio::DB::Sam::Fai->load($predict_cryptic_splicings); | 
|  | 647 } | 
|  | 648 | 
|  | 649 print STDERR "Annotating variants...\n" unless $quiet; | 
|  | 650 # Assume contigs and positions in order | 
|  | 651 print STDERR "Processing file $hgvs_table_file...\n" unless $quiet; | 
|  | 652 open(HGVS, $hgvs_table_file) | 
|  | 653     or die "Cannot open HGVS file $hgvs_table_file for reading: $!\n"; | 
|  | 654 # Output file for annotated snps and frameshifts | 
|  | 655 my $header = <HGVS>; # header | 
|  | 656 chomp $header; | 
|  | 657 my @headers = split /\t/, $header; | 
|  | 658 my ($chr_column, $pos_column, $ref_column, $alt_column, $alt_cnt_column, $tot_cnt_column, $ftype_column, $dbid_column, $maf_src_column, $maf_column, $internal_freq_column, | 
|  | 659     $cdna_hgvs_column, $aa_hgvs_column, $transcript_column, $transcript_length_column, $strand_column, $zygosity_column, $splice_dist_column, $caveats_column, $phase_column, | 
|  | 660     $srcs_column, $pvalue_column, $to_column, $genename_column, $rares_column, $rares_header); | 
|  | 661 my %req_columns = ( | 
|  | 662 	"Chr" => \$chr_column, | 
|  | 663 	"DNA From" => \$pos_column, | 
|  | 664 	"DNA To" => \$to_column, | 
|  | 665 	"Gene Name" => \$genename_column, | 
|  | 666 	"Ref base" => \$ref_column, | 
|  | 667 	"Obs base" => \$alt_column, | 
|  | 668 	"Variant Reads" => \$alt_cnt_column, | 
|  | 669 	"Total Reads" => \$tot_cnt_column, | 
|  | 670 	"Feature type" => \$ftype_column, | 
|  | 671 	"Variant DB ID" => \$dbid_column, | 
|  | 672 	"Pop. freq. source" => \$maf_src_column, | 
|  | 673 	"Pop. freq." => \$maf_column, | 
|  | 674 	"Transcript HGVS" => \$cdna_hgvs_column, | 
|  | 675 	"Protein HGVS" => \$aa_hgvs_column, | 
|  | 676         "Selected transcript" => \$transcript_column, | 
|  | 677 	"Transcript length" => \$transcript_length_column, | 
|  | 678 	"Strand" => \$strand_column, | 
|  | 679 	"Zygosity" => \$zygosity_column, | 
|  | 680 	"Closest exon junction (AA coding variants)" => \$splice_dist_column, | 
|  | 681 	"Caveats" => \$caveats_column, | 
|  | 682 	"Phase" => \$phase_column, | 
|  | 683         "P-value" => \$pvalue_column); | 
|  | 684 &load_header_columns(\%req_columns, \@headers); | 
|  | 685 for(my $i = 0; $i <= $#headers; $i++){ | 
|  | 686   # Optional columns go here | 
|  | 687   if($headers[$i] eq "Sources"){ | 
|  | 688     $srcs_column = $i; | 
|  | 689   } | 
|  | 690   elsif($headers[$i] eq "Internal pop. freq."){ | 
|  | 691     $internal_freq_column = $i; | 
|  | 692   } | 
|  | 693   elsif($headers[$i] =~ /^Num rare variants/){ | 
|  | 694     $rares_column = $i; | 
|  | 695     $rares_header = $headers[$i]; | 
|  | 696   } | 
|  | 697 } | 
|  | 698 open(OUT, ">$outfile") | 
|  | 699     or die "Cannot open $outfile for writing: $!\n"; | 
|  | 700 print OUT join("\t", "Chr", "DNA From", "DNA To", "Ref base", "Obs base", "% ref read","% variant read", | 
|  | 701 	   "Variant Reads", "Total Reads", "Feature type","Variant type", "Variant context", "Variant DB ID", "Pop. freq. source", | 
|  | 702 	   "Pop. freq."), (defined $internal_freq_column ? "\tInternal pop. freq.\t" : "\t"), join("\t", "SIFT Score", "Polyphen2 Score", "Polyphen2 call", "GERP score", | 
|  | 703 	   "Gene Name", "Transcript HGVS", | 
|  | 704 	   "Protein HGVS", "Zygosity","Closest exon junction (AA coding variants)","Splicing alteration potential","Splicing alteration details", | 
|  | 705 	   "OMIM Morbid Map","Tissue expression", "Pathways", "Pathway refs", | 
|  | 706 	   "Gene Function", "Spanning InterPro Domains", "P-value", "Caveats", "Phase", | 
|  | 707            "Selected transcript", "Transcript length", "Strand"), | 
|  | 708            (defined $rares_column ? "\t$rares_header" : ""), | 
|  | 709            (defined $srcs_column ? "\tSources" : ""), "\n"; | 
|  | 710 while(<HGVS>){ | 
|  | 711     chomp; | 
|  | 712     my @fields = split /\t/, $_, -1; | 
|  | 713     my $mode; | 
|  | 714     my $protein_hgvs = $fields[$aa_hgvs_column]; | 
|  | 715     my $refSeq = $fields[$ref_column]; | 
|  | 716     my $newBase = $fields[$alt_column]; | 
|  | 717     my $splicing_potential = "NA"; | 
|  | 718     my $splicing_details = "NA"; | 
|  | 719     if(defined $predict_cryptic_splicings){ | 
|  | 720       my $distance = $fields[$splice_dist_column]; # only for coding variants | 
|  | 721       my $site_type; | 
|  | 722       if($distance eq "NA"){ | 
|  | 723       } | 
|  | 724       elsif($distance){ | 
|  | 725         $site_type = $distance < 0 ? "donor" : "acceptor"; # exon context | 
|  | 726       } | 
|  | 727       else{ | 
|  | 728         if($fields[$cdna_hgvs_column] =~ /\d+([\-+]\d+)/){ # get from the cDNA HGVS otherwise | 
|  | 729           $distance = $1; | 
|  | 730           $distance =~ s/^\+//; | 
|  | 731           $site_type = $distance > 0 ? "donor" : "acceptor"; # intron context | 
|  | 732         } | 
|  | 733       } | 
|  | 734       # only do splicing predictions for novel or rare variants, since these predictions are expensive | 
|  | 735       if($distance and $distance ne "NA" and ($fields[$maf_column] eq "NA" or $fields[$maf_column] <= 0.01)){ | 
|  | 736         #print STDERR "Running prediction for rare variant ($fields[3], MAF=$fields[14]): $fields[5], $fields[6], $refSeq, $newBase, $fields[4], $distance, $site_type\n"; | 
|  | 737         ($splicing_potential, $splicing_details) = &predict_splicing($fields[$chr_column], $fields[$pos_column], $refSeq, $newBase, $fields[$strand_column], $distance, $site_type); | 
|  | 738       } | 
|  | 739     } | 
|  | 740     my $lookupBase; | 
|  | 741     if($fields[$strand_column] eq "-"){ | 
|  | 742       if($refSeq ne "NA"){ | 
|  | 743         $refSeq = reverse($refSeq); | 
|  | 744         $refSeq =~ tr/ACGTacgt/TGCAtgca/; | 
|  | 745       } | 
|  | 746       if($newBase ne "NA"){ | 
|  | 747         $newBase = reverse($newBase); | 
|  | 748         $newBase =~ tr/ACGTacgt/TGCAtgca/; | 
|  | 749         $newBase =~ s(TN/)(NA/); | 
|  | 750         $newBase =~ s(/TN)(/NA); | 
|  | 751       } | 
|  | 752     } | 
|  | 753     if($fields[$cdna_hgvs_column] =~ /^g\..*?(?:ins|delins|>)([ACGTN]+)$/){ | 
|  | 754       $mode = "snps"; | 
|  | 755     } | 
|  | 756     elsif($fields[$cdna_hgvs_column] =~ /^g\..*?del([ACGTN]+)$/){ | 
|  | 757       $mode = "snps"; | 
|  | 758     } | 
|  | 759     elsif($fields[$cdna_hgvs_column] =~ /^c\.\d+_\d+(?:[\-+]\d+)?ins([ACGTN]+)$/ or $fields[$cdna_hgvs_column] =~ /^c\.\d+(?:[\-+]\d+)?_\d+ins([ACGTN]+)$/){ | 
|  | 760         if(not length($1)%3){ | 
|  | 761             $mode = "snps"; | 
|  | 762         } | 
|  | 763         else{ | 
|  | 764             $mode = "frameshifts"; | 
|  | 765         } | 
|  | 766     } | 
|  | 767     elsif($fields[$cdna_hgvs_column] =~ /^c\.(\d+)(?:[\-+]\d+)?_(\d+)(?:[\-+]\d+)?delins([ACGTN]+)$/){ | 
|  | 768 	if(not (length($3)-$2+$1-1)%3){ | 
|  | 769 	    $mode = "snps"; | 
|  | 770 	} | 
|  | 771 	else{ | 
|  | 772 	    $mode = "frameshifts"; | 
|  | 773 	} | 
|  | 774     } | 
|  | 775     elsif($fields[$cdna_hgvs_column] =~ /^c\.\d+[+\-]\d+_\d+[+\-]\d+delins([ACGTN]+)$/ or | 
|  | 776           $fields[$cdna_hgvs_column] =~ /^c\.[\-*]\d+(?:[\-+]\d+)?_[\-*]\d+(?:[\-+]\d+)?del([ACGTN]*)$/ or | 
|  | 777           $fields[$cdna_hgvs_column] =~ /^c\.[\-*]\d+(?:[\-+]\d+)?del([ACGTN]*)$/ or | 
|  | 778           $fields[$cdna_hgvs_column] =~ /^c\.\d+[\-+]\d+del([ACGTN]*)$/ or | 
|  | 779           $fields[$cdna_hgvs_column] =~ /^c\.[\-*]\d+(?:[\-+]\d+)?_[\-*]\d+(?:[\-+]\d+)?(?:ins|delins)?([ACGTN]+)$/){ | 
|  | 780 	$mode = "snps"; | 
|  | 781     } | 
|  | 782     elsif($fields[$cdna_hgvs_column] =~ /^c\.([-*]?\d+)del(\S+)/){ | 
|  | 783         my $dist = $1; | 
|  | 784         $dist =~ s/^\*//; | 
|  | 785         if(abs($dist) < 4){ | 
|  | 786           $mode = "snps"; | 
|  | 787         } | 
|  | 788         else{ | 
|  | 789           $mode = "frameshifts"; | 
|  | 790         } | 
|  | 791     } | 
|  | 792     elsif($fields[$cdna_hgvs_column] =~ /^c\.\d+[\-+]\d+_\d+[\-+]\d+ins(\S*)/ or | 
|  | 793           $fields[$cdna_hgvs_column] =~ /^c\.\d+[\-+]\d+_\d+[\-+]\d+del(\S*)/){ | 
|  | 794         $mode = "snps"; | 
|  | 795     } | 
|  | 796     elsif($fields[$cdna_hgvs_column] =~ /^c\.(-?\d+)_(\d+)(?:\+\d+)?del(\S+)/ or | 
|  | 797           $fields[$cdna_hgvs_column] =~ /^c\.(\d+)(?:\-\d+)?_(\d+)del(\S+)/){ | 
|  | 798 	if(not ($2-$1+1)%3){ | 
|  | 799 	    $mode = "snps"; | 
|  | 800 	} | 
|  | 801 	else{ | 
|  | 802 	    $mode = "frameshifts"; | 
|  | 803 	} | 
|  | 804     } | 
|  | 805     elsif($fields[$cdna_hgvs_column] =~ /^c\.\d+_\d+ins([ACGTN]+)$/){ | 
|  | 806 	if(not length($1)%3){ | 
|  | 807 	    $mode = "snps"; | 
|  | 808 	} | 
|  | 809 	else{ | 
|  | 810 	    $mode = "frameshifts"; | 
|  | 811 	} | 
|  | 812     } | 
|  | 813     # special case of shifted start codon | 
|  | 814     elsif($fields[$cdna_hgvs_column] =~ /^c\.-1_1+ins([ACGTN]+)$/ or | 
|  | 815           $fields[$cdna_hgvs_column] =~ /inv$/){ | 
|  | 816         $mode = "snps"; | 
|  | 817     } | 
|  | 818     elsif($fields[$cdna_hgvs_column] =~ /[ACGTN]>([ACGTN])$/){ | 
|  | 819 	# todo: handle multiple consecutive substitutions, will have to run polyphen separately | 
|  | 820         $lookupBase = $newBase; | 
|  | 821 	if($fields[$strand_column] eq "-"){ # revert to positive strand variant for SIFT index | 
|  | 822 	    $lookupBase =~ tr/ACGTacgt/TGCAtgca/; | 
|  | 823         } | 
|  | 824 	$mode = "snps"; | 
|  | 825     } | 
|  | 826     else{ | 
|  | 827 	$mode = "snps"; | 
|  | 828     } | 
|  | 829     if($fields[$chr_column] ne $cur_chr){ | 
|  | 830 	#Prepared DB connection on per-chromosome basis | 
|  | 831 	$cur_chr = $fields[$chr_column]; | 
|  | 832 	$cur_chr =~ s/^chr//; | 
|  | 833 	$db_chr = DBI->connect("dbi:SQLite:dbname=$sift_dir/Human_CHR$cur_chr.sqlite", | 
|  | 834 			       "", | 
|  | 835 			       "", | 
|  | 836 			       { RaiseError => 1, AutoCommit => 1 }) unless not -e "$sift_dir/Human_CHR$cur_chr.sqlite"; | 
|  | 837 | 
|  | 838 	$cur_max_pos = -1; | 
|  | 839     } | 
|  | 840     if($fields[$pos_column] =~ /^\d+$/ and $cur_max_pos < $fields[$pos_column]){ | 
|  | 841 	undef $fr_stmt; | 
|  | 842 	undef $snp_stmt; | 
|  | 843         my $c = $fields[$chr_column]; | 
|  | 844         $c = "chr$c" if $c !~ /^chr/; | 
|  | 845 	for my $chr_table_key (keys %{$chr_tables{$c}}){ | 
|  | 846 	    my $chr_table_range = $chr_tables{$c}->{$chr_table_key}; | 
|  | 847 	    if($chr_table_range->[0] <= $fields[$pos_column] and $chr_table_range->[1] >= $fields[$pos_column]){ | 
|  | 848 		$fr_stmt = $db_chr->prepare("select SCORE". | 
|  | 849 					    " from $chr_table_key where COORD1 = ?"); | 
|  | 850 		$snp_stmt = $db_chr->prepare("select SCORE". | 
|  | 851 					     " from $chr_table_key where COORD1 = ? AND NT2 = ?"); | 
|  | 852 		$cur_max_pos = $chr_table_range->[1]; | 
|  | 853 		last; | 
|  | 854 	    } | 
|  | 855 	} | 
|  | 856 	if(not defined $fr_stmt and not $quiet){ | 
|  | 857 	    warn "Got request for position not in range of SIFT ($c:$fields[$pos_column])\n"; | 
|  | 858 	} | 
|  | 859     } | 
|  | 860 | 
|  | 861     my @cols; | 
|  | 862     if(not $fr_stmt){ | 
|  | 863        @cols = ("NA"); | 
|  | 864     } | 
|  | 865     elsif($mode eq "frameshifts"){ | 
|  | 866 	$fr_stmt->execute($fields[$pos_column]); | 
|  | 867 	@cols = $fr_stmt->fetchrow_array(); | 
|  | 868     } | 
|  | 869     else{ | 
|  | 870 	$snp_stmt->execute($fields[$pos_column]-1, $lookupBase); | 
|  | 871 	@cols = $snp_stmt->fetchrow_array(); | 
|  | 872     } | 
|  | 873     # See if the change is in a CDS, and if it's non-synonymous | 
|  | 874     my ($type, $location) = ("silent", "intron"); | 
|  | 875     my $start = $fields[$pos_column]; | 
|  | 876     my $stop = $fields[$to_column]; | 
|  | 877     my $interpro = get_range_info(\%interpro_ids, $fields[$chr_column], $start, $stop); | 
|  | 878     my $domains = join("; ", sort keys %$interpro); | 
|  | 879 | 
|  | 880     my $sift_score = "NA"; | 
|  | 881     my $variant_key = "NA"; | 
|  | 882     my $transcript_hgvs = $fields[$cdna_hgvs_column]; | 
|  | 883     if($transcript_hgvs =~ /\.\d+[ACGTN]>/ or $transcript_hgvs =~ /\.\d+(?:_|del|ins)/ or $transcript_hgvs =~ /[+\-]\?/){ | 
|  | 884 	$location = "exon"; | 
|  | 885     } | 
|  | 886     elsif($transcript_hgvs =~ /\.\d+[\-+](\d+)/){ | 
|  | 887 	if($1 < 21){ | 
|  | 888 	    $type = "splice"; | 
|  | 889 	} | 
|  | 890 	else{ | 
|  | 891 	    $type = "intronic"; | 
|  | 892 	} | 
|  | 893 	$location = "splice site"; | 
|  | 894     } | 
|  | 895     elsif($transcript_hgvs =~ /\.\*\d+/){ | 
|  | 896 	$location = "3'UTR"; | 
|  | 897     } | 
|  | 898     elsif($transcript_hgvs =~ /\.-\d+/){ | 
|  | 899 	$location = "5'UTR"; | 
|  | 900     } | 
|  | 901     if($mode eq "frameshifts"){ | 
|  | 902 	$type = "frameshift"; | 
|  | 903     } | 
|  | 904     else{ | 
|  | 905 	if(not defined $protein_hgvs){ | 
|  | 906             $type = "unknown"; | 
|  | 907         } | 
|  | 908         elsif($protein_hgvs eq "NA"){ | 
|  | 909 	    $type = "non-coding"; | 
|  | 910 	} | 
|  | 911 	elsif($protein_hgvs =~ /=$/){ | 
|  | 912 	    $type = "silent"; | 
|  | 913 	} | 
|  | 914 	elsif($protein_hgvs =~ /\d+\*/){ # nonsense | 
|  | 915 	    $type = "nonsense"; | 
|  | 916 	} | 
|  | 917 	elsif($protein_hgvs =~ /ext/){ # nonsense | 
|  | 918 	    $type = "nonstop"; | 
|  | 919 	} | 
|  | 920         elsif($protein_hgvs eq "p.0?" or $protein_hgvs eq "p.?"){ | 
|  | 921             $type = "unknown"; | 
|  | 922         } | 
|  | 923 	else{ | 
|  | 924 	    $type = "missense"; | 
|  | 925 	} | 
|  | 926 | 
|  | 927 	$sift_score = $cols[0] || "NA"; | 
|  | 928 | 
|  | 929         # Record the need to fetch VCF polyphen and gerp data later (en masse, for efficiency) | 
|  | 930         # Use newBase instead of lookupBase for PolyPhen since the orientation is always relative to the transcript | 
|  | 931 	$variant_key = get_variant_key($fields[$chr_column], $fields[$pos_column], $fields[$ref_column], $newBase); | 
|  | 932         #print STDERR "Variant key is $variant_key\n"; | 
|  | 933     } | 
|  | 934     my $transcript_type = $fields[$ftype_column]; | 
|  | 935     my $transcript_length = $fields[$transcript_length_column]; | 
|  | 936     my $transcript_id = $fields[$transcript_column]; # sift score is derived from this transcript | 
|  | 937     my $transcript_strand = $fields[$strand_column]; | 
|  | 938     my $zygosity = $fields[$zygosity_column]; | 
|  | 939     my $rsID = $fields[$dbid_column]; | 
|  | 940     my $popFreq = $fields[$maf_column]; | 
|  | 941     my $internalFreq; | 
|  | 942     $internalFreq = $fields[$internal_freq_column] if defined $internal_freq_column; | 
|  | 943     my $popFreqSrc = $fields[$maf_src_column]; | 
|  | 944 | 
|  | 945     # Get the gene name and omim info | 
|  | 946     my %seen; | 
|  | 947     my @gene_names = split /; /, $fields[$genename_column]; | 
|  | 948     my $morbid = join("; ", grep {/\S/ and not $seen{$_}++} map({$gene2morbids{$_} or ""} @gene_names)); | 
|  | 949     my $tissue = join("; ", grep {/\S/ and not $seen{$_}++} map({$tissues{$_} or ""} @gene_names)); | 
|  | 950     my $function = join("; ", grep {/\S/ and not $seen{$_}++} map({$tissue_desc{$_} or ""} @gene_names)); | 
|  | 951     my $pathways = join("; ", grep {/\S/ and not $seen{$_}++} map({$pathways{$_} or ""} @gene_names)); | 
|  | 952     my $pathway_ids = join("; ", grep {/\S/ and not $seen{$_}++} map({$pathway_ids{$_} or ""} @gene_names)); | 
|  | 953 | 
|  | 954     # It may be that the calls and freqs are multiple concatenated values, e.g. "X; Y" | 
|  | 955     my @tot_bases = split /; /,$fields[$tot_cnt_column]; | 
|  | 956     my @var_bases = split /; /,$fields[$alt_cnt_column]; | 
|  | 957     my (@pct_nonvar, @pct_var); | 
|  | 958     for (my $i = 0; $i <= $#tot_bases; $i++){ | 
|  | 959         push @pct_nonvar, $tot_bases[$i] =~ /^(NA|0)$/ ? $tot_bases[$i] : int(($tot_bases[$i]-$var_bases[$i])/$tot_bases[$i]*100+0.5); | 
|  | 960         push @pct_var, $tot_bases[$i] =~ /^(NA|0)$/ ? $tot_bases[$i] : int($var_bases[$i]/$tot_bases[$i]*100+0.5); | 
|  | 961     } | 
|  | 962 | 
|  | 963     push @lines, [ | 
|  | 964 		   $fields[$chr_column], # chr | 
|  | 965 		   $fields[$pos_column], # pos | 
|  | 966 		   $fields[$to_column], | 
|  | 967 		   $fields[$ref_column], | 
|  | 968 		   $fields[$alt_column], | 
|  | 969 		   join("; ", @pct_nonvar), # pct ref | 
|  | 970 		   join("; ", @pct_var),  # pct var | 
|  | 971 		   $fields[$alt_cnt_column], # num var | 
|  | 972 		   $fields[$tot_cnt_column], # num reads | 
|  | 973 		   $transcript_type, # protein_coding, pseudogene, etc. | 
|  | 974 		   $type, | 
|  | 975 		   $location, | 
|  | 976 		   $rsID, | 
|  | 977 		   $popFreqSrc, | 
|  | 978 		   (defined $internal_freq_column ? ($popFreq,$internalFreq) : ($popFreq)), | 
|  | 979 		   $sift_score, | 
|  | 980 		   $variant_key, # 16th field | 
|  | 981 		   join("; ", @gene_names), | 
|  | 982 		   $transcript_hgvs, | 
|  | 983 		   $protein_hgvs, | 
|  | 984 		   $zygosity, | 
|  | 985                    $fields[$splice_dist_column], # distance to exon edge | 
|  | 986                    $splicing_potential, | 
|  | 987                    $splicing_details, | 
|  | 988                    $morbid, | 
|  | 989 		   $tissue, | 
|  | 990 		   $pathways, | 
|  | 991 		   $pathway_ids, | 
|  | 992 		   $function, | 
|  | 993                    $domains, | 
|  | 994                    $fields[$pvalue_column], | 
|  | 995                    $fields[$caveats_column], # caveats | 
|  | 996                    (defined $rares_column ? ($fields[$rares_column],$transcript_id) : ($transcript_id)), | 
|  | 997 		   $transcript_length, | 
|  | 998                    $transcript_strand, | 
|  | 999 		   (defined $srcs_column ? ($fields[$phase_column],$fields[$srcs_column]) : ($fields[$phase_column]))]; | 
|  | 1000 } | 
|  | 1001 | 
|  | 1002 print STDERR "Adding Polyphen2 and GERP info...\n" unless $quiet; | 
|  | 1003 # retrieve the polyphen and gerp info en masse for each chromosome, as this is much more efficient | 
|  | 1004 my $vkey_col = 16+(defined $internal_freq_column ? 1 : 0); | 
|  | 1005 for my $line_fields (@lines){ | 
|  | 1006   # expand the key into the relevant MAF values | 
|  | 1007   $line_fields->[$vkey_col] = join("\t", polyphen_gerp_info($gerp_dir,$polyphen_file,$line_fields->[$vkey_col])); # fields[$vkey_col] is variant key | 
|  | 1008   print OUT join("\t", @{$line_fields}), "\n"; | 
|  | 1009 } | 
|  | 1010 close(OUT); | 
|  | 1011 unlink $tmpfile if defined $tmpfile; | 
|  | 1012 | 
|  | 1013 sub load_header_columns{ | 
|  | 1014   my ($reqs_hash_ref, $headers_array_ref) = @_; | 
|  | 1015   my %unfulfilled; | 
|  | 1016   for my $field_name (keys %$reqs_hash_ref){ | 
|  | 1017     $unfulfilled{$field_name} = 1; | 
|  | 1018   } | 
|  | 1019   for(my $i = 0; $i <= $#{$headers_array_ref}; $i++){ | 
|  | 1020     for my $req_header_name (keys %$reqs_hash_ref){ | 
|  | 1021       if($req_header_name eq $headers_array_ref->[$i]){ | 
|  | 1022         ${$reqs_hash_ref->{$req_header_name}} = $i; | 
|  | 1023         delete $unfulfilled{$req_header_name}; | 
|  | 1024         last; | 
|  | 1025       } | 
|  | 1026     } | 
|  | 1027   } | 
|  | 1028   if(keys %unfulfilled){ | 
|  | 1029     die "Aborting. Could not find headers in the input file for the following required fields: ", join(", ", sort keys %unfulfilled), "\n"; | 
|  | 1030   } | 
|  | 1031 } |