Mercurial > repos > devteam > vcf2pgsnp
comparison vcf2pgSnp.pl @ 0:5fca46616675 draft default tip
Imported from capsule None
| author | devteam |
|---|---|
| date | Mon, 28 Jul 2014 11:55:29 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:5fca46616675 |
|---|---|
| 1 #!/usr/bin/perl -w | |
| 2 use strict; | |
| 3 | |
| 4 #convert from a vcf file to a pgSnp file. | |
| 5 #frequency count = chromosome count | |
| 6 #either a single column/individual | |
| 7 #or all columns as a population | |
| 8 | |
| 9 my $in; | |
| 10 my $stCol = 9; | |
| 11 my $endCol; | |
| 12 if (@ARGV && scalar @ARGV == 2) { | |
| 13 $stCol = shift @ARGV; | |
| 14 $in = shift @ARGV; | |
| 15 if ($stCol eq 'all') { $stCol = 10; } | |
| 16 else { $endCol = $stCol; } | |
| 17 $stCol--; #go from 1 based to zero based column number | |
| 18 if ($stCol < 9) { | |
| 19 print "ERROR genotype fields don't start until column 10\n"; | |
| 20 exit; | |
| 21 } | |
| 22 }elsif (@ARGV && scalar @ARGV == 1) { | |
| 23 $in = shift @ARGV; | |
| 24 }elsif (@ARGV) { | |
| 25 print "usage: vcf2pgSnp.pl [indColNum default=all] file.vcf > file.pgSnp\n"; | |
| 26 exit; | |
| 27 } | |
| 28 | |
| 29 open(FH, $in) or die "Couldn't open $in, $!\n"; | |
| 30 while (<FH>) { | |
| 31 chomp; | |
| 32 if (/^\s*#/) { next; } #skip comments/headers | |
| 33 if (/^\s*$/) { next; } #skip blank lines | |
| 34 my @f = split(/\t/); | |
| 35 #chr pos1base ID refNt altNt[,|D#|Int] quality filter info format geno1 ... | |
| 36 my $a; | |
| 37 my %nt; | |
| 38 my %all; | |
| 39 my $cnt = 0; | |
| 40 my $var; | |
| 41 if ($f[3] eq 'N') { next; } #ignore ref=N | |
| 42 if ($f[4] =~ /[DI]/ or $f[3] =~ /[DI]/) { next; } #don't do microsatellite | |
| 43 #if ($f[4] =~ /[ACTG],[ACTG]/) { next; } #only do positions with single alternate | |
| 44 if ($f[6] && !($f[6] eq '.' or $f[6] eq 'PASS')) { next; } #filtered for some reason | |
| 45 my $ind = 0; | |
| 46 if ($f[8] ne 'GT') { #more than just genotype | |
| 47 my @t = split(/:/, $f[8]); | |
| 48 foreach (@t) { if ($_ eq 'GT') { last; } $ind++; } | |
| 49 if ($ind == 0 && $f[8] !~ /^GT/) { die "ERROR couldn't find genotype in format $f[8]\n"; } | |
| 50 } | |
| 51 #count 0's, 1's, 2's | |
| 52 if (!$endCol) { $endCol = $#f; } | |
| 53 foreach my $col ($stCol .. $endCol) { | |
| 54 if ($ind > 0) { | |
| 55 my @t = split(/:/, $f[$col]); | |
| 56 $f[$col] = $t[$ind] . ":"; #only keep genotype part | |
| 57 } | |
| 58 if ($f[$col] =~ /^(0|1|2).(0|1|2)/) { | |
| 59 $nt{$1}++; | |
| 60 $nt{$2}++; | |
| 61 }elsif ($f[$col] =~ /^(0|1|2):/) { #chrY or male chrX, single | |
| 62 $nt{$1}++; | |
| 63 } #else ignore | |
| 64 } | |
| 65 if (%nt) { | |
| 66 if ($f[0] !~ /chr/) { $f[0] = "chr$f[0]"; } | |
| 67 print "$f[0]\t", ($f[1]-1), "\t$f[1]\t"; #position info | |
| 68 my $cnt = scalar(keys %nt); | |
| 69 my $fr; | |
| 70 my $sc; | |
| 71 my $all; | |
| 72 if (exists $nt{0}) { | |
| 73 $all = uc($f[3]); | |
| 74 $fr = $nt{0}; | |
| 75 $sc = 0; | |
| 76 } | |
| 77 if (!exists $nt{0} && exists $nt{1}) { | |
| 78 if ($f[4] =~ /([ACTG]),?/) { | |
| 79 $all = $1; | |
| 80 $fr = $nt{1}; | |
| 81 $sc = 0; | |
| 82 }else { die "bad variant nt $f[4] for nt 1"; } | |
| 83 }elsif (exists $nt{1}) { | |
| 84 if ($f[4] =~ /([ACTG]),?/) { | |
| 85 $all .= '/' . $1; | |
| 86 $fr .= ",$nt{1}"; | |
| 87 $sc .= ",0"; | |
| 88 }else { die "bad variant nt $f[4] for nt 1"; } | |
| 89 } | |
| 90 if (exists $nt{2}) { | |
| 91 if ($f[4] =~ /^[ACTG],([ACTG]),?/) { | |
| 92 $all .= '/' . $1; | |
| 93 $fr .= ",$nt{2}"; | |
| 94 $sc .= ",0"; | |
| 95 }else { die "bad variant nt $f[4] for nt 2"; } | |
| 96 } | |
| 97 if (exists $nt{3}) { | |
| 98 if ($f[4] =~ /^[ACTG],[ACTG],([ACTG])/) { | |
| 99 $all .= '/' . $1; | |
| 100 $fr .= ",$nt{3}"; | |
| 101 $sc .= ",0"; | |
| 102 }else { die "bad variant nt $f[4] for nt 3"; } | |
| 103 } | |
| 104 if (exists $nt{4}) { | |
| 105 if ($f[4] =~ /^[ACTG],[ACTG],[ACTG],([ACTG])/) { | |
| 106 $all .= '/' . $1; | |
| 107 $fr .= ",$nt{4}"; | |
| 108 $sc .= ",0"; | |
| 109 }else { die "bad variant nt $f[4] for nt 4"; } | |
| 110 } | |
| 111 print "$all\t$cnt\t$fr\t$sc\n"; | |
| 112 } | |
| 113 } | |
| 114 close FH; | |
| 115 | |
| 116 exit; |
