# HG changeset patch # User Yusuf Ali # Date 1427312289 21600 # Node ID 138d81f259c873863a76008277f614bb7853b9d6 intial commit diff -r 000000000000 -r 138d81f259c8 HGVS2VCF.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/HGVS2VCF.xml Wed Mar 25 13:38:09 2015 -0600 @@ -0,0 +1,19 @@ + + + + echo 1.0.0 + hgvs_to_vcf $in_hgvs_table $output_vcf $sample_name hg19 + + + + + + + + + + + + + This tools converts the tabular format used to describe variants and associate annotation at ACHRI into a standard Variant Calls Format (VCF) version 4.1 file, which is accepted by many genetic analysis programs. + diff -r 000000000000 -r 138d81f259c8 hgvs_to_vcf --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/hgvs_to_vcf Wed Mar 25 13:38:09 2015 -0600 @@ -0,0 +1,115 @@ +#!/usr/bin/env perl + +use strict; +use warnings; + +# Converts a HGVS tab delimited table into a VCF file for processing by tools that need that format. + +@ARGV == 4 or die "Usage: $0 \n"; + +open(HGVS, $ARGV[0]) + or die "Cannot open $ARGV[0] for reading: $!\n"; +my ($chr_column, $pos_column, $ref_column, $alt_column, $alt_cnt_column, $tot_cnt_column, $dbid_column, $cdna_hgvs_column, + $aa_hgvs_column, $transcript_column, $zygosity_column, $caveats_column, $phase_column, $pvalue_column, $genename_column); +my $headers = ; +chomp $headers; +my @headers = split /\t/, $headers; +my %req_columns = ( + "Chr" => \$chr_column, + "DNA From" => \$pos_column, + "Gene Name" => \$genename_column, + "Ref base" => \$ref_column, + "Obs base" => \$alt_column, + "Variant Reads" => \$alt_cnt_column, + "Total Reads" => \$tot_cnt_column, + "Variant DB ID" => \$dbid_column, + "Transcript HGVS" => \$cdna_hgvs_column, + "Protein HGVS" => \$aa_hgvs_column, + "Selected transcript" => \$transcript_column, + "Zygosity" => \$zygosity_column, + "Caveats" => \$caveats_column, + "Phase" => \$phase_column, + "P-value" => \$pvalue_column); +&load_header_columns(\%req_columns, \@headers); +my $phasing = "none"; +while(){ + my @F = split /\t/, $_; + if($F[$phase_column] ne ""){ + $phasing = "partial"; + last; + } +} +close(HGVS); +open(HGVS, $ARGV[0]) + or die "Cannot open $ARGV[0] for reading: $!\n"; + +open(VCFOUT, ">$ARGV[1]") + or die "Cannot open $ARGV[1] for writing: $!\n"; +my ($sec, $min, $hr, $day, $month, $year) = localtime(); +$year += 1900; +$month = "0$month" if $month < 10; +$day = "0$day" if $day < 10; +$, = " "; +print VCFOUT < +##INFO= +##INFO= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT $ARGV[2] +END +my $log10 = log(10); +; # get rid of header line +while(){ + chomp; + my @F = split /\t/, $_; + my $filter = $F[$caveats_column] ne "" ? "CAVEATS" : "PASS"; + my $ref = $F[$ref_column]; # always on + strand + my $alt = $F[$alt_column]; + my $qual = $F[$pvalue_column]; # to be coverted to MAQ + if($qual == 0){ + $qual = 100; + } + else{ + $qual = -10*log($qual)/$log10; # log10 because log is actually ln in Perl + } + my $genotype = $F[$zygosity_column] =~ /homo/ ? "1/1" : "0/1"; # todo: handle compound het SNP + my $transcript = $F[$transcript_column]; + $transcript =~ s/;.*//; + my $genenames = $F[$genename_column]; + $genenames =~ s/;\s*/,/; + + print VCFOUT join("\t", $F[$chr_column], $F[$pos_column], ($F[$dbid_column] =~ /rs/ ? $F[$dbid_column] : "."), $ref, $alt, $qual, $filter, + ($F[$genename_column] ne "" ? "GENE:".$F[$genename_column].";" : "")."HGVS:$transcript,".($F[$aa_hgvs_column] ne "NA" ? $F[$aa_hgvs_column]:$F[$cdna_hgvs_column]), + "GT:GQ:RA:AA", "$genotype:$qual:".($F[$tot_cnt_column]-$F[$alt_cnt_column]).":".$F[$alt_cnt_column]),"\n"; +} + +sub load_header_columns{ + my ($reqs_hash_ref, $headers_array_ref) = @_; + my %unfulfilled; + for my $field_name (keys %$reqs_hash_ref){ + $unfulfilled{$field_name} = 1; + } + for(my $i = 0; $i <= $#{$headers_array_ref}; $i++){ + for my $req_header_name (keys %$reqs_hash_ref){ + if($req_header_name eq $headers_array_ref->[$i]){ + ${$reqs_hash_ref->{$req_header_name}} = $i; + delete $unfulfilled{$req_header_name}; + last; + } + } + } + if(keys %unfulfilled){ + die "Aborting. Could not find headers in the input file for the following required fields: ", join(", ", sort keys %unfulfilled), "\n"; + } +} +