annotate dibayes_gff2vcf @ 0:18d965813efc default tip

intial commit
author Yusuf Ali <ali@yusuf.email>
date Wed, 25 Mar 2015 13:37:21 -0600
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1 #!/usr/bin/env perl
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
3 use Statistics::Zed;
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
4 use strict;
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
5 use warnings;
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
6
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
7 if($ARGV[0] eq "-v"){
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
8 print "Version 1.0\n";
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
9 exit;
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
10 }
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
11
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
12 @ARGV == 2 or die "Usage: $0 <diBayes_snp_calls.gff3> <output.vcf>\n";
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
13
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
14 my $zed = new Statistics::Zed;
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
15 my $log_10_inv = 1/log(10);
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
16 open(GFF3IN, $ARGV[0])
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
17 or die "Cannot open input GFF file $ARGV[0] for reading: $!\n";
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
18 open(VCFOUT, ">$ARGV[1]")
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
19 or die "Cannot open output VCF file $ARGV[1] for writing: $!\n";
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
20
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
21 my @date = localtime();
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
22 my $date = ($date[5]+1900).($date[4] < 10 ? "0" : "").$date[4].($date[3] < 10 ? "0" : "").$date[3];
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
23
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
24 my $ref_file;
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
25 my $source;
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
26 while(<GFF3IN>){
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
27 last if /^#[^#]/; # end of file headers marked by lack of double hash
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
28 if(/source-version\s+(.*)$/){
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
29 $source = $1;
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
30 }
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
31 elsif(/genome-build\s+(.*)$/){
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
32 $ref_file = $1;
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
33 }
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
34 }
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
35
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
36 print VCFOUT <<END;
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
37 ##fileformat=VCFv4.0
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
38 ##fileDate=$date
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
39 ##source=$source
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
40 ##reference=$ref_file
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
41 ##INFO=<ID=Z,Number=1,Type=Float,Description="SNP z-score">
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
42 ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
43 ##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
44 ##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
45 ##FORMAT=<ID=RR,Number=1,Type=Integer,Description="Reference Read Depth">
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
46 ##FORMAT=<ID=VR,Number=1,Type=Integer,Description="Major Variant Read Depth">
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
47 #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Unknown
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
48 END
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
49
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
50 while(<GFF3IN>){
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
51 next if /^#/;
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
52 chomp;
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
53 my @F = split /\t/, $_;
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
54 my $seq = $F[0];
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
55 my $pos = $F[3];
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
56 my $z_score = sprintf "%.4f", ($F[5] < 0.0000000001 ? "200" : $zed->p2z(value => $F[5]));
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
57 my $qual = $F[5] == 0 ? 100 : int(-10*log($F[5])*$log_10_inv);
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
58 my @info = split /;/, $F[8];
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
59 my $depth_reads = ".";
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
60 my $ref_base;
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
61 my @alleles;
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
62 my $ref_reads;
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
63 my $var_reads;
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
64 for my $tag (@info){
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
65 my ($key, $value) = split /=/, $tag;
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
66 if($key eq "coverage"){
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
67 $depth_reads = $value;
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
68 }
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
69 elsif($key eq "reference"){
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
70 $ref_base = $value;
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
71 }
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
72 elsif($key eq "allele-call"){
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
73 push @alleles, split /\//, $value;
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
74 }
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
75 elsif($key eq "refAlleleCounts"){
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
76 $ref_reads = $value;
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
77 }
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
78 elsif($key eq "novelAlleleCounts"){
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
79 $var_reads = $value; # note: when het for two non-ref alleles, this is a wierd value to be ignored
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
80 }
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
81 }
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
82
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
83 my $genotype = "0/1"; # default het for one non-ref allele
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
84 if(@alleles == 1){ # homo for one var allele
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
85 $genotype = "1/1";
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
86 }
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
87 elsif($ref_base eq $alleles[0]){
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
88 # het for one non-reference
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
89 shift @alleles;
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
90 }
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
91 elsif($ref_base eq $alleles[1]){
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
92 # het for one non-reference
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
93 pop @alleles;
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
94 }
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
95 else{
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
96 # het for two non-ref alleles
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
97 $genotype = "1/2";
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
98 $var_reads = int(($depth_reads-$ref_reads)/2); #diBayes needs guesstimate since GFF doesn't encode this properly
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
99 $var_reads = "$var_reads,$var_reads";
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
100 }
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
101
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
102 # Required VCF format is CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE_NAME...
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
103 print VCFOUT join("\t", $seq, $pos, ".", $ref_base, join(",", @alleles), $qual, ".", "Z=$z_score", "GT:VR:RR:DP:GQ",
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
104 "$genotype:$var_reads:$ref_reads:$depth_reads:."), "\n";
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
105 }
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
106 close(VCFOUT);
18d965813efc intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
107 close(GFF3IN);