Mercurial > repos > miller-lab > snp_analysis_conversion
annotate vcf2pgSnpMult.pl @ 3:edf12470a1a6 default tip
Bugfix from Belinda, in vcf2pgSnp.pl
author | Cathy Riemer <cathy+hg@bx.psu.edu> |
---|---|
date | Thu, 19 Mar 2015 12:06:34 -0400 |
parents | 35c20b109be5 |
children |
rev | line source |
---|---|
2
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
1 #!/usr/bin/perl -w |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
2 use strict; |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
3 |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
4 #convert from a vcf file to a pgSnp file with multiple sets of the allele |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
5 # specific columns |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
6 #frequency count = chromosome count |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
7 |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
8 my $in; |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
9 my $stCol = 9; |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
10 my $endCol; |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
11 if (@ARGV && scalar @ARGV == 1) { |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
12 $in = shift @ARGV; |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
13 }else { |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
14 print "usage: vcf2pgSnpMult.pl file.vcf > file.pgSnpMult\n"; |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
15 exit; |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
16 } |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
17 |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
18 if ($in =~ /.gz$/) { |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
19 open(FH, "zcat $in |") or die "Couldn't open $in, $!\n"; |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
20 }else { |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
21 open(FH, $in) or die "Couldn't open $in, $!\n"; |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
22 } |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
23 while (<FH>) { |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
24 chomp; |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
25 if (/^\s*#/) { next; } #skip comments/headers |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
26 if (/^\s*$/) { next; } #skip blank lines |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
27 my @f = split(/\t/); |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
28 #chr pos1base ID refNt altNt[,|D#|Int] quality filter info format geno1 ... |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
29 my $a; |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
30 my %nt; |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
31 my %all; |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
32 my $cnt = 0; |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
33 my $var; |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
34 if ($f[3] eq 'N') { next; } #ignore ref=N |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
35 if ($f[4] =~ /[DI]/ or $f[3] =~ /[DI]/) { next; } #don't do microsatellite |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
36 if ($f[6] && !($f[6] eq '.' or $f[6] eq 'PASS')) { next; } #filtered for some reason |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
37 my $ind = 0; |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
38 if ($f[8] ne 'GT') { #more than just genotype |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
39 my @t = split(/:/, $f[8]); |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
40 foreach (@t) { if ($_ eq 'GT') { last; } $ind++; } |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
41 if ($ind == 0 && $f[8] !~ /^GT/) { die "ERROR couldn't find genotype in format $f[8]\n"; } |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
42 } |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
43 if (!$endCol) { $endCol = $#f; } |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
44 #put f[3] => nt{0} and split f[4] for rest of nt{} |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
45 $nt{0} = $f[3]; |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
46 my @t = split(/,/, $f[4]); |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
47 for (my $i=0; $i<=$#t; $i++) { |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
48 my $j = $i + 1; |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
49 $nt{$j} = $t[$i]; |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
50 } |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
51 if ($f[0] !~ /chr/) { $f[0] = "chr$f[0]"; } |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
52 print "$f[0]\t", ($f[1]-1), "\t$f[1]"; #position info |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
53 foreach my $col ($stCol .. $endCol) { #add each individual (4 columns) |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
54 if ($ind > 0) { |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
55 my @t = split(/:/, $f[$col]); |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
56 $f[$col] = $t[$ind] . ":"; #only keep genotype part |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
57 } |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
58 print "\t"; |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
59 if ($f[$col] =~ /^(\d).(\d)/) { |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
60 my $a1 = $1; |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
61 my $a2 = $2; |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
62 if (!exists $nt{$a1}) { die "ERROR bad allele $a1 in $f[3] $f[4]\n"; } |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
63 if (!exists $nt{$a2}) { die "ERROR bad allele $a2 in $f[3] $f[4]\n"; } |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
64 if ($a1 eq $a2) { #homozygous |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
65 print "$nt{$a1}\t1\t2\t0"; |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
66 }else { #heterozygous |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
67 print "$nt{$a1}/$nt{$a2}\t2\t1,1\t0,0"; |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
68 } |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
69 }elsif ($f[$col] =~ /^(\d):/) { #chrY or male chrX, single |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
70 my $a1 = $1; |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
71 if (!exists $nt{$a1}) { die "ERROR bad allele $a1 in $f[3] $f[4]\n"; } |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
72 print "$nt{$a1}\t1\t1\t0"; |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
73 }else { #don't know how to parse |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
74 die "ERROR unknown genotype $f[$col]\n"; |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
75 } |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
76 } |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
77 print "\n"; #end this SNP |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
78 } |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
79 close FH; |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
80 |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
81 exit; |