Mercurial > repos > miller-lab > genome_diversity
comparison gd_snp2vcf.pl @ 31:a631c2f6d913
Update to Miller Lab devshed revision 3c4110ffacc3
author | Richard Burhans <burhans@bx.psu.edu> |
---|---|
date | Fri, 20 Sep 2013 13:25:27 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
30:4188853b940b | 31:a631c2f6d913 |
---|---|
1 #!/usr/bin/perl -w | |
2 use strict; | |
3 | |
4 #convert from gd_snp file to vcf file (with dbSNP fields) | |
5 | |
6 #gd_snp table format: | |
7 #1. chr | |
8 #2. position (0 based) | |
9 #3. ref allele | |
10 #4. second allele | |
11 #5. overall quality | |
12 #foreach individual (6-9, 10-13, ...) | |
13 #a. count of allele in 3 | |
14 #b. count of allele in 4 | |
15 #c. genotype call (-1, or count of ref allele) | |
16 #d. quality of genotype call (quality of non-ref allele from masterVar) | |
17 | |
18 if (!@ARGV) { | |
19 print "usage: gd_snp2vcf.pl file.gd_snp[.gz|.bz2] -geno=8[,12:16,20...] -handle=HANDLE -batch=BATCHNAME -ref=REFERENCEID [-bioproj=XYZ -biosamp=ABC -population=POPID[,POPID2...] -chrCol=9 -posCol=9 ] > snpsForSubmission.vcf\n"; | |
20 exit; | |
21 } | |
22 | |
23 my $in = shift @ARGV; | |
24 my $genoCols = ''; | |
25 my $handle; | |
26 my $batch; | |
27 my $bioproj; | |
28 my $biosamp; | |
29 my $ref; | |
30 my $pop; | |
31 my $cr = 0; #allow to use alternate reference? | |
32 my $cp = 1; | |
33 my $meta; | |
34 my $offset = 0; #offset for genotype column, gd_snp vs gd_genotype indivs file | |
35 foreach (@ARGV) { | |
36 if (/-geno=([0-9,]+)/) { $genoCols .= "$1:"; } | |
37 elsif (/-geno=(.*)/) { $genoCols .= readGeno($1); } | |
38 elsif (/-off=([0-9])/) { $offset = $1; } | |
39 elsif (/-handle=(.*)/) { $handle = $1; } | |
40 elsif (/-batch=(.*)/) { $batch = $1; } | |
41 elsif (/-bioproj=(.*)/) { $bioproj = $1; } | |
42 elsif (/-biosamp=(.*)/) { $biosamp = $1; } | |
43 elsif (/-ref=(.*)/) { $ref = $1; } | |
44 elsif (/-population=(\S+)/) { $pop = $1; } | |
45 elsif (/-chrCol=(\d+)/) { $cr = $1 - 1; } | |
46 elsif (/-posCol=(\d+)/) { $cp = $1 - 1; } | |
47 elsif (/-metaOut=(.*)/) { $meta = $1; } | |
48 } | |
49 if ($cr < 0 or $cp < 0) { die "ERROR the column numbers should be 1 based.\n"; } | |
50 | |
51 #remove trailing delimiters | |
52 $genoCols =~ s/,:/:/g; | |
53 $genoCols =~ s/[,:]$//; | |
54 | |
55 my @gnc = split(/,|:/, $genoCols); | |
56 | |
57 if ($in =~ /.gz$/) { | |
58 open(FH, "zcat $in |") or die "Couldn't open $in, $!\n"; | |
59 }elsif ($in =~ /.bz2$/) { | |
60 open(FH, "bzcat $in |") or die "Couldn't open $in, $!\n"; | |
61 }else { | |
62 open(FH, $in) or die "Couldn't open $in, $!\n"; | |
63 } | |
64 my @head = prepHeader(); | |
65 if (@head) { | |
66 print join("\n", @head), "\n"; | |
67 #now column headers | |
68 print "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"; | |
69 if (defined $pop) { | |
70 $pop =~ s/,$//; | |
71 my $t = $pop; | |
72 $t =~ s/,/\t/g; | |
73 print "\tFORMAT\t$t"; | |
74 } | |
75 print "\n"; | |
76 } | |
77 while (<FH>) { | |
78 chomp; | |
79 if (/^#/) { next; } | |
80 if (/^\s*$/) { next; } | |
81 my @f = split(/\t/); | |
82 #vcf columns: chrom pos id ref alt qual filter info | |
83 # info must have VRT=[0-9] 1==SNV 2=indel 6=NoVariation 8=MNV ... | |
84 my $vrt = 1; | |
85 if ($f[2] !~ /^[ACTG]$/ or $f[3] !~ /^[ACTG]$/) { | |
86 die "Sorry this can only do SNV's at this time\n"; | |
87 } | |
88 if (scalar @gnc == 1 && !defined $pop) { #single genotype column | |
89 if (!defined $f[4] or $f[4] == -1) { $f[4] = '.'; } | |
90 if ($f[$gnc[0]-1] == 2) { $vrt = 6; } #reference match | |
91 if ($f[$gnc[0]-1] == -1) { next; } #no data, don't use | |
92 print "$f[$cr]\t$f[$cp]\t$f[$cr];$f[$cp]\t$f[2]\t$f[3]\t$f[4]\t.\tVRT=$vrt\n"; | |
93 #TODO? put read counts in comment? | |
94 }elsif ($pop) { #do as population | |
95 my @cols; | |
96 foreach my $gp (split(/:/,$genoCols)) { #foreach population | |
97 my @g = split(/,/, $gp); | |
98 my $totChrom = 2*(scalar @g); | |
99 my $totRef = 0; | |
100 foreach my $i (@g) { if (!defined $f[$i-1] or $f[$i-1] == -1) { $totChrom -= 2; next; } $totRef += $f[$i-1]; } | |
101 if ($totChrom == $totRef) { $vrt = 6; } | |
102 if ($totRef > $totChrom) { die "ERROR likely the wrong column was chosen for genotype\n"; } | |
103 my $altCnt = $totChrom - $totRef; | |
104 push(@cols, "$totChrom:$altCnt"); | |
105 } | |
106 print "$f[$cr]\t$f[$cp]\t$f[$cr];$f[$cp]\t$f[2]\t$f[3]\t$f[4]\t.\tVRT=$vrt\tNA:AC\t", join("\t", @cols), "\n"; | |
107 }else { #leave allele counts off | |
108 my $totChrom = 2*(scalar @gnc); | |
109 my $totRef = 0; | |
110 foreach my $i (@gnc) { if ($f[$i-1] == -1) { $totChrom -= 2; next; } $totRef += $f[$i-1]; } | |
111 if ($totChrom == $totRef) { $vrt = 6; } | |
112 print "$f[$cr]\t$f[$cp]\t$f[$cr];$f[$cp]\t$f[2]\t$f[3]\t$f[4]\t.\tVRT=$vrt\n"; | |
113 } | |
114 } | |
115 close FH or die "Couldn't close $in, $!\n"; | |
116 | |
117 if ($meta) { | |
118 open(FH, ">", $meta) or die "Couldn't open $meta, $!\n"; | |
119 print FH "TYPE: CONT\n", | |
120 "HANDLE: $handle\n", | |
121 "NAME: \n", | |
122 "FAX: \n", | |
123 "TEL: \n", | |
124 "EMAIL: \n", | |
125 "LAB: \n", | |
126 "INST: \n", | |
127 "ADDR: \n", | |
128 "||\n", | |
129 "TYPE: METHOD\n", | |
130 "HANDLE: $handle\n", | |
131 "ID: \n", | |
132 "METHOD_CLASS: Sequence\n", | |
133 "TEMPLATE_TYPE: \n", | |
134 "METHOD:\n", | |
135 "||\n"; | |
136 if ($pop) { | |
137 my @p = split(/,/, $pop); | |
138 foreach my $t (@p) { | |
139 print FH | |
140 "TYPE: POPULATION\n", | |
141 "HANDLE: $handle\n", | |
142 "ID: $t\n", | |
143 "POPULATION: \n", | |
144 "||\n"; | |
145 } | |
146 } | |
147 print FH "TYPE: SNPASSAY\n", | |
148 "HANDLE: $handle\n", | |
149 "BATCH: $batch\n", | |
150 "MOLTYPE: \n", | |
151 "METHOD: \n", | |
152 "ORGANISM: \n", | |
153 "||\n", | |
154 "TYPE: SNPPOPUSE | SNPINDUSE\n", | |
155 "HANDLE: $handle\n", | |
156 "BATCH: \n", | |
157 "METHOD: \n", | |
158 "||\n"; | |
159 | |
160 close FH or die "Couldn't close $meta, $!\n"; | |
161 } | |
162 | |
163 exit 0; | |
164 | |
165 #parse old header and add or create new | |
166 sub prepHeader { | |
167 my @h; | |
168 $h[0] = '##fileformat=VCFv4.1'; | |
169 my ($day, $mo, $yr) = (localtime)[3,4,5]; | |
170 $mo++; | |
171 $yr+=1900; | |
172 $h[1] = '##fileDate=' . "$yr$mo$day"; | |
173 $h[2] = "##handle=$handle"; | |
174 $h[3] = "##batch=$batch"; | |
175 my $i = 4; | |
176 if ($bioproj) { $h[$i] = "##bioproject_id=$bioproj"; $i++; } | |
177 if ($biosamp) { $h[$i] = "##biosample_id=$biosamp"; $i++; } | |
178 $h[$i] = "##reference=$ref"; ##reference=GCF_999999.99 | |
179 #$i++; | |
180 #$h[$i] = '##INFO=<ID=LID, Number=1,Type=string, Description="Unique local variation ID or name for display. The LID provided here combined with the handle must be unique for a particular submitter.">' | |
181 $i++; | |
182 $h[$i] = '##INFO=<ID=VRT,Number=1,Type=Integer,Description="Variation type,1 - SNV: single nucleotide variation,2 - DIV: deletion/insertion variation,3 - HETEROZYGOUS: variable, but undefined at nucleotide level,4 - STR: short tandem repeat (microsatellite) variation, 5 - NAMED: insertion/deletion variation of named repetitive element,6 - NO VARIATON: sequence scanned for variation, but none observed,7 - MIXED: cluster contains submissions from 2 or more allelic classes (not used) ,8 - MNV: multiple nucleotide variation with all eles of common length greater than 1,9 - Exception">'; | |
183 #sometimes have allele freqs? | |
184 if (defined $pop) { | |
185 $i++; | |
186 $h[$i] = "##FORMAT=<ID=NA,Number=1,Type=Integer,Description=\"Number of alleles for the population.\""; | |
187 $i++; | |
188 $h[$i] = '##FORMAT=<ID=AC,Number=.,Type=Integer,Description="Allele count for each alternate allele.">'; | |
189 my @p = split(/,/, $pop); | |
190 foreach my $t (@p) { | |
191 $i++; | |
192 $h[$i] = "##population_id=$t"; | |
193 } | |
194 } | |
195 #PMID? | |
196 ##INFO=<ID=PMID,Number=.,Type=Integer,Description="PubMed ID linked to variation if available."> | |
197 | |
198 return @h; | |
199 } | |
200 ####End | |
201 | |
202 #read genotype columns from a file | |
203 sub readGeno { | |
204 my $list = shift @_; | |
205 my @files = split(/,/, $list); | |
206 my $cols=''; | |
207 foreach my $file (@files) { | |
208 open(FH, $file) or die "Couldn't read $file, $!\n"; | |
209 while (<FH>) { | |
210 chomp; | |
211 my @f = split(/\s+/); | |
212 if ($f[0] =~/\D/) { die "ERROR expect an integer for the column\n"; } | |
213 $f[0] += $offset; | |
214 $cols .= "$f[0],"; | |
215 } | |
216 close FH; | |
217 $cols .= ":"; | |
218 } | |
219 $cols =~ s/,:$//; | |
220 return $cols; | |
221 } | |
222 ####End |