| 0 | 1 #!/usr/bin/env perl | 
|  | 2 | 
|  | 3 use strict; | 
|  | 4 use warnings; | 
|  | 5 | 
|  | 6 # This script adds sample names to the last column of an HGVS file if the coverage for that position was insufficient to make the call the other samples had. | 
|  | 7 @ARGV == 8 or die "Usage: $0 <hgvs_in.txt> <hgvs_out.txt> <sample name1> <sample1.bam> <homo FN coverage cutoff> <het coverage FN cutoff> <min map quality> <min base quality>\n"; | 
|  | 8 my $hgvs_in_file = shift @ARGV; | 
|  | 9 my $hgvs_out_file = shift @ARGV; | 
|  | 10 my $sample_name = shift @ARGV; | 
|  | 11 my $bam_file = shift @ARGV; | 
|  | 12 my $homo_fn = shift @ARGV; | 
|  | 13 my $het_fn = shift @ARGV; | 
|  | 14 my $min_maq = shift @ARGV; # 30 (a.k.a. p < 0.001) | 
|  | 15 my $min_baq = shift @ARGV; # 20 (a.k.a. p < 0.01) | 
|  | 16 | 
|  | 17 open(HGVS, $hgvs_in_file) | 
|  | 18   or die "Cannot open $hgvs_in_file for reading: $!\n"; | 
|  | 19 my $header_line = <HGVS>; | 
|  | 20 chomp $header_line; | 
|  | 21 my @headers = split /\t/, $header_line; | 
|  | 22 my ($chr_column, $pos_column, $ref_column, $alt_column, $zygosity_column, $alt_cnt_column, $tot_cnt_column, $srcs_column); | 
|  | 23 for(my $i = 0; $i <= $#headers; $i++){ | 
|  | 24   if($headers[$i] eq "Chr"){ | 
|  | 25     $chr_column = $i; | 
|  | 26   } | 
|  | 27   elsif($headers[$i] eq "DNA From"){ | 
|  | 28     $pos_column = $i; | 
|  | 29   } | 
|  | 30   elsif($headers[$i] eq "Ref base"){ | 
|  | 31     $ref_column = $i; | 
|  | 32   } | 
|  | 33   elsif($headers[$i] eq "Obs base"){ | 
|  | 34     $alt_column = $i; | 
|  | 35   } | 
|  | 36   elsif($headers[$i] eq "Zygosity"){ | 
|  | 37     $zygosity_column = $i; | 
|  | 38   } | 
|  | 39   elsif($headers[$i] eq "Variant Reads"){ | 
|  | 40     $alt_cnt_column = $i; | 
|  | 41   } | 
|  | 42   elsif($headers[$i] eq "Total Reads"){ | 
|  | 43     $tot_cnt_column = $i; | 
|  | 44   } | 
|  | 45   elsif($headers[$i] eq "Sources"){ | 
|  | 46     $srcs_column = $i; | 
|  | 47   } | 
|  | 48 } | 
|  | 49 die "Could not find header Chr in $hgvs_in_file, aborting." if not defined $chr_column; | 
|  | 50 die "Could not find header 'DNA From' in $hgvs_in_file, aborting." if not defined $pos_column; | 
|  | 51 die "Could not find header 'Ref base' in $hgvs_in_file, aborting." if not defined $ref_column; | 
|  | 52 die "Could not find header 'Obs base' in $hgvs_in_file, aborting." if not defined $alt_column; | 
|  | 53 die "Could not find header 'Variant Reads' in $hgvs_in_file, aborting." if not defined $alt_cnt_column; | 
|  | 54 die "Could not find header 'Total Reads' in $hgvs_in_file, aborting." if not defined $tot_cnt_column; | 
|  | 55 die "Could not find header Zygosity in $hgvs_in_file, aborting." if not defined $zygosity_column; | 
|  | 56 die "Could not find header Sources in $hgvs_in_file, aborting." if not defined $srcs_column; | 
|  | 57 open(OUT, ">$hgvs_out_file") | 
|  | 58   or die "Cannot open $hgvs_out_file for writing: $!\n"; | 
|  | 59 print OUT $header_line, "\n"; # header line | 
|  | 60 my ($last_mpileup, $last_key); | 
|  | 61 while(<HGVS>){ | 
|  | 62   chomp; | 
|  | 63   my @F = split /\t/, $_; | 
|  | 64   if(index($F[$#F], $sample_name) != -1){ | 
|  | 65     print OUT "$_\n";  # keep line as-is | 
|  | 66     next; | 
|  | 67   } | 
|  | 68   else{ | 
|  | 69     # Not in a call right now | 
|  | 70     # If there are reads supporting the call, add the sample name to the list (last column) | 
|  | 71     # If there are no reads supporting the call, but coverage is below threshold, put a ~ in front of the sample name | 
|  | 72     my $chr = $F[$chr_column]; | 
|  | 73     my $pos = $F[$pos_column]; | 
|  | 74     my $ref = $F[$ref_column]; | 
|  | 75     my $alt = $F[$alt_column]; | 
|  | 76 | 
|  | 77     # some tools report SNPs with leading or trailing matched ref seqs, so trim as necessary | 
|  | 78     if(length($ref) > 1 and length($ref) == length($alt)){ | 
|  | 79       while(length($ref) and substr($ref, 0, 1) eq substr($alt, 0, 1)){ | 
|  | 80         substr($ref, 0, 1) = ""; | 
|  | 81         substr($alt, 0, 1) = ""; | 
|  | 82         $pos++; | 
|  | 83       } | 
|  | 84       while(length($ref) and substr($ref, -1) eq substr($alt, -1)){ | 
|  | 85         substr($ref, -1) = ""; | 
|  | 86         substr($alt, -1) = ""; | 
|  | 87       } | 
|  | 88     } | 
|  | 89 | 
|  | 90     # (todo: what about compound het?) | 
|  | 91     my $zygosity = $F[$zygosity_column]; | 
|  | 92     # see if the coverage is below a called zygosity threshold | 
|  | 93     my $end = $pos; | 
|  | 94     # comment out next bit if we assume that reads have all indels left aligned | 
|  | 95     #if($alt ne "NA" and $ref ne "NA" and length($alt) > length($ref)){ | 
|  | 96     #  $end += length($alt) - length($ref); | 
|  | 97     #} | 
|  | 98     my $mpileup; | 
|  | 99     if(defined $last_key and $last_key eq "$chr:$pos:$ref:$alt"){ # used cached response if this is a repeated input line | 
|  | 100       $mpileup = $last_mpileup; | 
|  | 101     } | 
|  | 102     else{ | 
|  | 103       $mpileup = `samtools mpileup -q $min_maq -Q $min_baq -r "$chr:$pos-$end" $bam_file 2>&1`; | 
|  | 104       $last_mpileup = $mpileup; | 
|  | 105       $last_key = "$chr:$pos:$ref:$alt"; | 
|  | 106     } | 
|  | 107 | 
|  | 108     $ref = "" if $ref eq "NA"; | 
|  | 109     $alt = "" if $alt eq "NA"; | 
|  | 110     # lines look something like chr10	74100774	N	164	G$ggGgg$GgGgGgggggG$ggggggggGggGGGgg... | 
|  | 111     # deletions are -1N, while insertions are +1g | 
|  | 112     for my $line (split /\n/, $mpileup){ | 
|  | 113       my @M = split /\t/, $line; | 
|  | 114       next if $M[0] ne $chr; # header or message stuff | 
|  | 115       my $depth = $M[3]; | 
|  | 116       if($depth == 0){ # No reads at all | 
|  | 117         $F[$srcs_column] .= "; ~$sample_name";  # add this sample to the last column | 
|  | 118         $F[$zygosity_column] .= "; low"; # zygosity unknown | 
|  | 119         $F[$alt_cnt_column] .= "; 0"; # alt count | 
|  | 120         $F[$tot_cnt_column] .= "; 0"; # tot count | 
|  | 121         #print OUT join("\t", @F), "\n"; | 
|  | 122         last; | 
|  | 123       } | 
|  | 124       my %base_count; | 
|  | 125       while($M[4] =~ /[ACGT]|\^.|(-\d+|\+\d+)/gi){ | 
|  | 126         my $call = uc($&);  # upper case version of letter | 
|  | 127         if($1){ # call is -1n, or +3gcc, etc. | 
|  | 128            if(substr($1, 0, 1) eq "+"){ # insertion | 
|  | 129              my $offset = substr($1, 1); | 
|  | 130              $call .= uc(substr($M[4], pos($M[4]), $offset)); # append the letters (not captures in the regex above) to the number | 
|  | 131              pos($M[4]) += $offset; # skip the global search ahead of the letters just consumed | 
|  | 132            } | 
|  | 133         } | 
|  | 134         elsif($call eq '$' or length($call) > 1){ # the weird ^Y kind of call | 
|  | 135            next; # ignore | 
|  | 136         } | 
|  | 137         $base_count{$call}++; | 
|  | 138       } | 
|  | 139       my $base_count = 0; | 
|  | 140       if(length($ref) == 1 and length($alt) == 1){ #SNP | 
|  | 141         if(exists $base_count{$alt}){ | 
|  | 142           $base_count = $base_count{$alt}; | 
|  | 143         } | 
|  | 144       } | 
|  | 145       elsif(length($ref) > length($alt)){ #del | 
|  | 146         my $del_length = length($alt)-length($ref); #  e.g. -2 | 
|  | 147         if(exists $base_count{$del_length}){ | 
|  | 148           $base_count = $base_count{$del_length}; | 
|  | 149         } | 
|  | 150       } | 
|  | 151       elsif(length($ref) < length($alt)){ #ins | 
|  | 152         my $ins_call = "+".(length($alt)-length($ref)).substr($alt, length($ref));  # e.g. +3AGT | 
|  | 153         if(exists $base_count{$ins_call}){ | 
|  | 154           $base_count = $base_count{$ins_call}; | 
|  | 155         } | 
|  | 156       } | 
|  | 157       else{ | 
|  | 158         # MNP variant? | 
|  | 159         #warn "Cannot check $ref -> $alt: not a SNP insertion or deletion\n"; | 
|  | 160         next; | 
|  | 161       } | 
|  | 162       if(defined $base_count and $base_count and $depth/($base_count+1) < $het_fn){ # at least the min het prop (rounded up) of bases agreeing with the (missed in this sample) alt call | 
|  | 163           $F[$srcs_column] .= "; +$sample_name";  # add this sample to the last column | 
|  | 164           $F[$zygosity_column] .= $base_count/$depth < 0.78 ? "; heterozygote" : ($depth < $het_fn ? "; low" : "; homozygote"); # zygosity | 
|  | 165           $F[$alt_cnt_column] .= "; ".$base_count; # alt count | 
|  | 166           $F[$tot_cnt_column] .= "; $depth"; # tot count | 
|  | 167       } | 
|  | 168       elsif($zygosity =~ /het/ and $depth < $het_fn){ | 
|  | 169           $F[$srcs_column].= "; ~$sample_name"; | 
|  | 170           $F[$zygosity_column] .= $depth < $homo_fn ? "; low" : "; heterozygote"; # zygosity | 
|  | 171           $F[$alt_cnt_column] .= "; $base_count"; | 
|  | 172           $F[$tot_cnt_column] .= "; $depth"; # tot count | 
|  | 173       } | 
|  | 174       elsif($depth < $homo_fn){ | 
|  | 175           $F[$srcs_column].= "; ~$sample_name"; | 
|  | 176           $F[$zygosity_column] .= "; low"; # zygosity | 
|  | 177           $F[$alt_cnt_column] .= "; $base_count"; | 
|  | 178           $F[$tot_cnt_column] .= "; $depth"; # tot count | 
|  | 179       } | 
|  | 180     } | 
|  | 181     print OUT join("\t", @F), "\n"; | 
|  | 182   } | 
|  | 183 } | 
|  | 184 close(HGVS); | 
|  | 185 close(OUT); |