annotate vcf2pgSnpMult.pl @ 3:edf12470a1a6 default tip

Bugfix from Belinda, in vcf2pgSnp.pl
author Cathy Riemer <cathy+hg@bx.psu.edu>
date Thu, 19 Mar 2015 12:06:34 -0400
parents 35c20b109be5
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
2
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
1 #!/usr/bin/perl -w
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
2 use strict;
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
3
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
4 #convert from a vcf file to a pgSnp file with multiple sets of the allele
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
5 # specific columns
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
6 #frequency count = chromosome count
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
7
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
8 my $in;
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
9 my $stCol = 9;
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
10 my $endCol;
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
11 if (@ARGV && scalar @ARGV == 1) {
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
12 $in = shift @ARGV;
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
13 }else {
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
14 print "usage: vcf2pgSnpMult.pl file.vcf > file.pgSnpMult\n";
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
15 exit;
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
16 }
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
17
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
18 if ($in =~ /.gz$/) {
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
19 open(FH, "zcat $in |") or die "Couldn't open $in, $!\n";
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
20 }else {
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
21 open(FH, $in) or die "Couldn't open $in, $!\n";
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
22 }
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
23 while (<FH>) {
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
24 chomp;
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
25 if (/^\s*#/) { next; } #skip comments/headers
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
26 if (/^\s*$/) { next; } #skip blank lines
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
27 my @f = split(/\t/);
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
28 #chr pos1base ID refNt altNt[,|D#|Int] quality filter info format geno1 ...
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
29 my $a;
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
30 my %nt;
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
31 my %all;
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
32 my $cnt = 0;
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
33 my $var;
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
34 if ($f[3] eq 'N') { next; } #ignore ref=N
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
35 if ($f[4] =~ /[DI]/ or $f[3] =~ /[DI]/) { next; } #don't do microsatellite
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
36 if ($f[6] && !($f[6] eq '.' or $f[6] eq 'PASS')) { next; } #filtered for some reason
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
37 my $ind = 0;
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
38 if ($f[8] ne 'GT') { #more than just genotype
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
39 my @t = split(/:/, $f[8]);
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
40 foreach (@t) { if ($_ eq 'GT') { last; } $ind++; }
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
41 if ($ind == 0 && $f[8] !~ /^GT/) { die "ERROR couldn't find genotype in format $f[8]\n"; }
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
42 }
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
43 if (!$endCol) { $endCol = $#f; }
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
44 #put f[3] => nt{0} and split f[4] for rest of nt{}
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
45 $nt{0} = $f[3];
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
46 my @t = split(/,/, $f[4]);
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
47 for (my $i=0; $i<=$#t; $i++) {
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
48 my $j = $i + 1;
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
49 $nt{$j} = $t[$i];
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
50 }
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
51 if ($f[0] !~ /chr/) { $f[0] = "chr$f[0]"; }
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
52 print "$f[0]\t", ($f[1]-1), "\t$f[1]"; #position info
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
53 foreach my $col ($stCol .. $endCol) { #add each individual (4 columns)
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
54 if ($ind > 0) {
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
55 my @t = split(/:/, $f[$col]);
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
56 $f[$col] = $t[$ind] . ":"; #only keep genotype part
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
57 }
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
58 print "\t";
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
59 if ($f[$col] =~ /^(\d).(\d)/) {
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
60 my $a1 = $1;
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
61 my $a2 = $2;
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
62 if (!exists $nt{$a1}) { die "ERROR bad allele $a1 in $f[3] $f[4]\n"; }
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
63 if (!exists $nt{$a2}) { die "ERROR bad allele $a2 in $f[3] $f[4]\n"; }
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
64 if ($a1 eq $a2) { #homozygous
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
65 print "$nt{$a1}\t1\t2\t0";
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
66 }else { #heterozygous
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
67 print "$nt{$a1}/$nt{$a2}\t2\t1,1\t0,0";
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
68 }
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
69 }elsif ($f[$col] =~ /^(\d):/) { #chrY or male chrX, single
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
70 my $a1 = $1;
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
71 if (!exists $nt{$a1}) { die "ERROR bad allele $a1 in $f[3] $f[4]\n"; }
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
72 print "$nt{$a1}\t1\t1\t0";
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
73 }else { #don't know how to parse
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
74 die "ERROR unknown genotype $f[$col]\n";
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
75 }
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
76 }
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
77 print "\n"; #end this SNP
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
78 }
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
79 close FH;
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
80
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
81 exit;