Mercurial > repos > miller-lab > snp_analysis_conversion
comparison vcf2pgSnp.pl @ 2:35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
author | cathy |
---|---|
date | Tue, 28 May 2013 17:54:02 -0400 |
parents | |
children | edf12470a1a6 |
comparison
equal
deleted
inserted
replaced
1:1d8b23a21735 | 2:35c20b109be5 |
---|---|
1 #!/usr/bin/perl -w | |
2 use strict; | |
3 | |
4 #convert from a vcf file to a pgSnp file. | |
5 #frequency count = chromosome count | |
6 #either a single column/individual | |
7 #or all columns as a population | |
8 | |
9 my $in; | |
10 my $stCol = 9; | |
11 my $endCol; | |
12 if (@ARGV && scalar @ARGV == 2) { | |
13 $stCol = shift @ARGV; | |
14 $in = shift @ARGV; | |
15 if ($stCol eq 'all') { $stCol = 10; } | |
16 else { $endCol = $stCol; } | |
17 $stCol--; #go from 1 based to zero based column number | |
18 if ($stCol < 9) { | |
19 print "ERROR genotype fields don't start until column 10\n"; | |
20 exit; | |
21 } | |
22 }elsif (@ARGV && scalar @ARGV == 1) { | |
23 $in = shift @ARGV; | |
24 }elsif (@ARGV) { | |
25 print "usage: vcf2pgSnp.pl [indColNum default=all] file.vcf > file.pgSnp\n"; | |
26 exit; | |
27 } | |
28 | |
29 open(FH, $in) or die "Couldn't open $in, $!\n"; | |
30 while (<FH>) { | |
31 chomp; | |
32 if (/^\s*#/) { next; } #skip comments/headers | |
33 if (/^\s*$/) { next; } #skip blank lines | |
34 my @f = split(/\t/); | |
35 #chr pos1base ID refNt altNt[,|D#|Int] quality filter info format geno1 ... | |
36 my $a; | |
37 my %nt; | |
38 my %all; | |
39 my $cnt = 0; | |
40 my $var; | |
41 if ($f[3] eq 'N') { next; } #ignore ref=N | |
42 if ($f[4] =~ /[DI]/ or $f[3] =~ /[DI]/) { next; } #don't do microsatellite | |
43 #if ($f[4] =~ /[ACTG],[ACTG]/) { next; } #only do positions with single alternate | |
44 if ($f[6] && !($f[6] eq '.' or $f[6] eq 'PASS')) { next; } #filtered for some reason | |
45 my $ind = 0; | |
46 if ($f[8] ne 'GT') { #more than just genotype | |
47 my @t = split(/:/, $f[8]); | |
48 foreach (@t) { if ($_ eq 'GT') { last; } $ind++; } | |
49 if ($ind == 0 && $f[8] !~ /^GT/) { die "ERROR couldn't find genotype in format $f[8]\n"; } | |
50 } | |
51 #count 0's, 1's, 2's | |
52 if (!$endCol) { $endCol = $#f; } | |
53 foreach my $col ($stCol .. $endCol) { | |
54 if ($ind > 0) { | |
55 my @t = split(/:/, $f[$col]); | |
56 $f[$col] = $t[$ind] . ":"; #only keep genotype part | |
57 } | |
58 if ($f[$col] =~ /^(0|1|2).(0|1|2)/) { | |
59 $nt{$1}++; | |
60 $nt{$2}++; | |
61 }elsif ($f[$col] =~ /^(0|1|2):/) { #chrY or male chrX, single | |
62 $nt{$1}++; | |
63 } #else ignore | |
64 } | |
65 if (%nt) { | |
66 if ($f[0] !~ /chr/) { $f[0] = "chr$f[0]"; } | |
67 print "$f[0]\t", ($f[1]-1), "\t$f[1]\t"; #position info | |
68 my $cnt = scalar(keys %nt); | |
69 my $fr; | |
70 my $sc; | |
71 my $all; | |
72 if (exists $nt{0}) { | |
73 $all = uc($f[3]); | |
74 $fr = $nt{0}; | |
75 $sc = 0; | |
76 } | |
77 if (!exists $nt{0} && exists $nt{1}) { | |
78 if ($f[4] =~ /([ACTG]),?/) { | |
79 $all = $1; | |
80 $fr = $nt{1}; | |
81 $sc = 0; | |
82 }else { die "bad variant nt $f[4] for nt 1"; } | |
83 }elsif (exists $nt{1}) { | |
84 if ($f[4] =~ /([ACTG]),?/) { | |
85 $all .= '/' . $1; | |
86 $fr .= ",$nt{1}"; | |
87 $sc .= ",0"; | |
88 }else { die "bad variant nt $f[4] for nt 1"; } | |
89 } | |
90 if (exists $nt{2}) { | |
91 if ($f[4] =~ /^[ACTG],([ACTG]),?/) { | |
92 $all .= '/' . $1; | |
93 $fr .= ",$nt{2}"; | |
94 $sc .= ",0"; | |
95 }else { die "bad variant nt $f[4] for nt 2"; } | |
96 } | |
97 if (exists $nt{3}) { | |
98 if ($f[4] =~ /^[ACTG],[ACTG],([ACTG])/) { | |
99 $all .= '/' . $1; | |
100 $fr .= ",$nt{3}"; | |
101 $sc .= ",0"; | |
102 }else { die "bad variant nt $f[4] for nt 3"; } | |
103 } | |
104 if (exists $nt{4}) { | |
105 if ($f[4] =~ /^[ACTG],[ACTG],[ACTG],([ACTG])/) { | |
106 $all .= '/' . $1; | |
107 $fr .= ",$nt{4}"; | |
108 $sc .= ",0"; | |
109 }else { die "bad variant nt $f[4] for nt 4"; } | |
110 } | |
111 print "$all\t$cnt\t$fr\t$sc\n"; | |
112 } | |
113 } | |
114 close FH; | |
115 | |
116 exit; |