Mercurial > repos > yusuf > hgvs_vcf
comparison hgvs_to_vcf @ 0:138d81f259c8 default tip
intial commit
| author | Yusuf Ali <ali@yusuf.email> |
|---|---|
| date | Wed, 25 Mar 2015 13:38:09 -0600 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:138d81f259c8 |
|---|---|
| 1 #!/usr/bin/env perl | |
| 2 | |
| 3 use strict; | |
| 4 use warnings; | |
| 5 | |
| 6 # Converts a HGVS tab delimited table into a VCF file for processing by tools that need that format. | |
| 7 | |
| 8 @ARGV == 4 or die "Usage: $0 <input hgvs.txt> <output.vcf> <sample id> <reference genome>\n"; | |
| 9 | |
| 10 open(HGVS, $ARGV[0]) | |
| 11 or die "Cannot open $ARGV[0] for reading: $!\n"; | |
| 12 my ($chr_column, $pos_column, $ref_column, $alt_column, $alt_cnt_column, $tot_cnt_column, $dbid_column, $cdna_hgvs_column, | |
| 13 $aa_hgvs_column, $transcript_column, $zygosity_column, $caveats_column, $phase_column, $pvalue_column, $genename_column); | |
| 14 my $headers = <HGVS>; | |
| 15 chomp $headers; | |
| 16 my @headers = split /\t/, $headers; | |
| 17 my %req_columns = ( | |
| 18 "Chr" => \$chr_column, | |
| 19 "DNA From" => \$pos_column, | |
| 20 "Gene Name" => \$genename_column, | |
| 21 "Ref base" => \$ref_column, | |
| 22 "Obs base" => \$alt_column, | |
| 23 "Variant Reads" => \$alt_cnt_column, | |
| 24 "Total Reads" => \$tot_cnt_column, | |
| 25 "Variant DB ID" => \$dbid_column, | |
| 26 "Transcript HGVS" => \$cdna_hgvs_column, | |
| 27 "Protein HGVS" => \$aa_hgvs_column, | |
| 28 "Selected transcript" => \$transcript_column, | |
| 29 "Zygosity" => \$zygosity_column, | |
| 30 "Caveats" => \$caveats_column, | |
| 31 "Phase" => \$phase_column, | |
| 32 "P-value" => \$pvalue_column); | |
| 33 &load_header_columns(\%req_columns, \@headers); | |
| 34 my $phasing = "none"; | |
| 35 while(<HGVS>){ | |
| 36 my @F = split /\t/, $_; | |
| 37 if($F[$phase_column] ne ""){ | |
| 38 $phasing = "partial"; | |
| 39 last; | |
| 40 } | |
| 41 } | |
| 42 close(HGVS); | |
| 43 open(HGVS, $ARGV[0]) | |
| 44 or die "Cannot open $ARGV[0] for reading: $!\n"; | |
| 45 | |
| 46 open(VCFOUT, ">$ARGV[1]") | |
| 47 or die "Cannot open $ARGV[1] for writing: $!\n"; | |
| 48 my ($sec, $min, $hr, $day, $month, $year) = localtime(); | |
| 49 $year += 1900; | |
| 50 $month = "0$month" if $month < 10; | |
| 51 $day = "0$day" if $day < 10; | |
| 52 $, = " "; | |
| 53 print VCFOUT <<END; | |
| 54 ##fileformat=VCFv4.1 | |
| 55 ##fileDate=$year$month$day | |
| 56 ##source=hgvs_to_vcf | |
| 57 ##reference=$ARGV[3] | |
| 58 ##phasing=$phasing | |
| 59 ##commandline="@ARGV" | |
| 60 ##FILTER=<ID=CAVEATS,Description="The variant call is possibly a false positive due to non-unique mapping, homopolymer stretch, etc."> | |
| 61 ##INFO=<ID=GENE,Number=A,Type=String,Description="List of genes that have transcripts overlapping the variant region"> | |
| 62 ##INFO=<ID=HGVS,Number=A,Type=String,Description="Effect in HGVS syntax (protein version if available, otherwise cDNA or genomic)"> | |
| 63 ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> | |
| 64 ##FORMAT=<ID=GQ,Number=1,Type=Float,Description="Genotype Quality, the Phred-scaled marginal (or unconditional) probability of the called genotype"> | |
| 65 ##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth"> | |
| 66 ##FORMAT=<ID=RA,Number=1,Type=Integer,Description="Reference allele observation count"> | |
| 67 ##FORMAT=<ID=AA,Number=1,Type=Integer,Description="Alternate allele observation count"> | |
| 68 #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT $ARGV[2] | |
| 69 END | |
| 70 my $log10 = log(10); | |
| 71 <HGVS>; # get rid of header line | |
| 72 while(<HGVS>){ | |
| 73 chomp; | |
| 74 my @F = split /\t/, $_; | |
| 75 my $filter = $F[$caveats_column] ne "" ? "CAVEATS" : "PASS"; | |
| 76 my $ref = $F[$ref_column]; # always on + strand | |
| 77 my $alt = $F[$alt_column]; | |
| 78 my $qual = $F[$pvalue_column]; # to be coverted to MAQ | |
| 79 if($qual == 0){ | |
| 80 $qual = 100; | |
| 81 } | |
| 82 else{ | |
| 83 $qual = -10*log($qual)/$log10; # log10 because log is actually ln in Perl | |
| 84 } | |
| 85 my $genotype = $F[$zygosity_column] =~ /homo/ ? "1/1" : "0/1"; # todo: handle compound het SNP | |
| 86 my $transcript = $F[$transcript_column]; | |
| 87 $transcript =~ s/;.*//; | |
| 88 my $genenames = $F[$genename_column]; | |
| 89 $genenames =~ s/;\s*/,/; | |
| 90 | |
| 91 print VCFOUT join("\t", $F[$chr_column], $F[$pos_column], ($F[$dbid_column] =~ /rs/ ? $F[$dbid_column] : "."), $ref, $alt, $qual, $filter, | |
| 92 ($F[$genename_column] ne "" ? "GENE:".$F[$genename_column].";" : "")."HGVS:$transcript,".($F[$aa_hgvs_column] ne "NA" ? $F[$aa_hgvs_column]:$F[$cdna_hgvs_column]), | |
| 93 "GT:GQ:RA:AA", "$genotype:$qual:".($F[$tot_cnt_column]-$F[$alt_cnt_column]).":".$F[$alt_cnt_column]),"\n"; | |
| 94 } | |
| 95 | |
| 96 sub load_header_columns{ | |
| 97 my ($reqs_hash_ref, $headers_array_ref) = @_; | |
| 98 my %unfulfilled; | |
| 99 for my $field_name (keys %$reqs_hash_ref){ | |
| 100 $unfulfilled{$field_name} = 1; | |
| 101 } | |
| 102 for(my $i = 0; $i <= $#{$headers_array_ref}; $i++){ | |
| 103 for my $req_header_name (keys %$reqs_hash_ref){ | |
| 104 if($req_header_name eq $headers_array_ref->[$i]){ | |
| 105 ${$reqs_hash_ref->{$req_header_name}} = $i; | |
| 106 delete $unfulfilled{$req_header_name}; | |
| 107 last; | |
| 108 } | |
| 109 } | |
| 110 } | |
| 111 if(keys %unfulfilled){ | |
| 112 die "Aborting. Could not find headers in the input file for the following required fields: ", join(", ", sort keys %unfulfilled), "\n"; | |
| 113 } | |
| 114 } | |
| 115 |
