comparison dibayes_gff2vcf @ 0:18d965813efc default tip

intial commit
author Yusuf Ali <ali@yusuf.email>
date Wed, 25 Mar 2015 13:37:21 -0600
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:18d965813efc
1 #!/usr/bin/env perl
2
3 use Statistics::Zed;
4 use strict;
5 use warnings;
6
7 if($ARGV[0] eq "-v"){
8 print "Version 1.0\n";
9 exit;
10 }
11
12 @ARGV == 2 or die "Usage: $0 <diBayes_snp_calls.gff3> <output.vcf>\n";
13
14 my $zed = new Statistics::Zed;
15 my $log_10_inv = 1/log(10);
16 open(GFF3IN, $ARGV[0])
17 or die "Cannot open input GFF file $ARGV[0] for reading: $!\n";
18 open(VCFOUT, ">$ARGV[1]")
19 or die "Cannot open output VCF file $ARGV[1] for writing: $!\n";
20
21 my @date = localtime();
22 my $date = ($date[5]+1900).($date[4] < 10 ? "0" : "").$date[4].($date[3] < 10 ? "0" : "").$date[3];
23
24 my $ref_file;
25 my $source;
26 while(<GFF3IN>){
27 last if /^#[^#]/; # end of file headers marked by lack of double hash
28 if(/source-version\s+(.*)$/){
29 $source = $1;
30 }
31 elsif(/genome-build\s+(.*)$/){
32 $ref_file = $1;
33 }
34 }
35
36 print VCFOUT <<END;
37 ##fileformat=VCFv4.0
38 ##fileDate=$date
39 ##source=$source
40 ##reference=$ref_file
41 ##INFO=<ID=Z,Number=1,Type=Float,Description="SNP z-score">
42 ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
43 ##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
44 ##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">
45 ##FORMAT=<ID=RR,Number=1,Type=Integer,Description="Reference Read Depth">
46 ##FORMAT=<ID=VR,Number=1,Type=Integer,Description="Major Variant Read Depth">
47 #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Unknown
48 END
49
50 while(<GFF3IN>){
51 next if /^#/;
52 chomp;
53 my @F = split /\t/, $_;
54 my $seq = $F[0];
55 my $pos = $F[3];
56 my $z_score = sprintf "%.4f", ($F[5] < 0.0000000001 ? "200" : $zed->p2z(value => $F[5]));
57 my $qual = $F[5] == 0 ? 100 : int(-10*log($F[5])*$log_10_inv);
58 my @info = split /;/, $F[8];
59 my $depth_reads = ".";
60 my $ref_base;
61 my @alleles;
62 my $ref_reads;
63 my $var_reads;
64 for my $tag (@info){
65 my ($key, $value) = split /=/, $tag;
66 if($key eq "coverage"){
67 $depth_reads = $value;
68 }
69 elsif($key eq "reference"){
70 $ref_base = $value;
71 }
72 elsif($key eq "allele-call"){
73 push @alleles, split /\//, $value;
74 }
75 elsif($key eq "refAlleleCounts"){
76 $ref_reads = $value;
77 }
78 elsif($key eq "novelAlleleCounts"){
79 $var_reads = $value; # note: when het for two non-ref alleles, this is a wierd value to be ignored
80 }
81 }
82
83 my $genotype = "0/1"; # default het for one non-ref allele
84 if(@alleles == 1){ # homo for one var allele
85 $genotype = "1/1";
86 }
87 elsif($ref_base eq $alleles[0]){
88 # het for one non-reference
89 shift @alleles;
90 }
91 elsif($ref_base eq $alleles[1]){
92 # het for one non-reference
93 pop @alleles;
94 }
95 else{
96 # het for two non-ref alleles
97 $genotype = "1/2";
98 $var_reads = int(($depth_reads-$ref_reads)/2); #diBayes needs guesstimate since GFF doesn't encode this properly
99 $var_reads = "$var_reads,$var_reads";
100 }
101
102 # Required VCF format is CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE_NAME...
103 print VCFOUT join("\t", $seq, $pos, ".", $ref_base, join(",", @alleles), $qual, ".", "Z=$z_score", "GT:VR:RR:DP:GQ",
104 "$genotype:$var_reads:$ref_reads:$depth_reads:."), "\n";
105 }
106 close(VCFOUT);
107 close(GFF3IN);