Mercurial > repos > miller-lab > genome_diversity
diff gd_snp2vcf.pl @ 31:a631c2f6d913
Update to Miller Lab devshed revision 3c4110ffacc3
author | Richard Burhans <burhans@bx.psu.edu> |
---|---|
date | Fri, 20 Sep 2013 13:25:27 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gd_snp2vcf.pl Fri Sep 20 13:25:27 2013 -0400 @@ -0,0 +1,222 @@ +#!/usr/bin/perl -w +use strict; + +#convert from gd_snp file to vcf file (with dbSNP fields) + +#gd_snp table format: +#1. chr +#2. position (0 based) +#3. ref allele +#4. second allele +#5. overall quality +#foreach individual (6-9, 10-13, ...) +#a. count of allele in 3 +#b. count of allele in 4 +#c. genotype call (-1, or count of ref allele) +#d. quality of genotype call (quality of non-ref allele from masterVar) + +if (!@ARGV) { + print "usage: gd_snp2vcf.pl file.gd_snp[.gz|.bz2] -geno=8[,12:16,20...] -handle=HANDLE -batch=BATCHNAME -ref=REFERENCEID [-bioproj=XYZ -biosamp=ABC -population=POPID[,POPID2...] -chrCol=9 -posCol=9 ] > snpsForSubmission.vcf\n"; + exit; +} + +my $in = shift @ARGV; +my $genoCols = ''; +my $handle; +my $batch; +my $bioproj; +my $biosamp; +my $ref; +my $pop; +my $cr = 0; #allow to use alternate reference? +my $cp = 1; +my $meta; +my $offset = 0; #offset for genotype column, gd_snp vs gd_genotype indivs file +foreach (@ARGV) { + if (/-geno=([0-9,]+)/) { $genoCols .= "$1:"; } + elsif (/-geno=(.*)/) { $genoCols .= readGeno($1); } + elsif (/-off=([0-9])/) { $offset = $1; } + elsif (/-handle=(.*)/) { $handle = $1; } + elsif (/-batch=(.*)/) { $batch = $1; } + elsif (/-bioproj=(.*)/) { $bioproj = $1; } + elsif (/-biosamp=(.*)/) { $biosamp = $1; } + elsif (/-ref=(.*)/) { $ref = $1; } + elsif (/-population=(\S+)/) { $pop = $1; } + elsif (/-chrCol=(\d+)/) { $cr = $1 - 1; } + elsif (/-posCol=(\d+)/) { $cp = $1 - 1; } + elsif (/-metaOut=(.*)/) { $meta = $1; } +} +if ($cr < 0 or $cp < 0) { die "ERROR the column numbers should be 1 based.\n"; } + +#remove trailing delimiters +$genoCols =~ s/,:/:/g; +$genoCols =~ s/[,:]$//; + +my @gnc = split(/,|:/, $genoCols); + +if ($in =~ /.gz$/) { + open(FH, "zcat $in |") or die "Couldn't open $in, $!\n"; +}elsif ($in =~ /.bz2$/) { + open(FH, "bzcat $in |") or die "Couldn't open $in, $!\n"; +}else { + open(FH, $in) or die "Couldn't open $in, $!\n"; +} +my @head = prepHeader(); +if (@head) { + print join("\n", @head), "\n"; + #now column headers + print "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"; + if (defined $pop) { + $pop =~ s/,$//; + my $t = $pop; + $t =~ s/,/\t/g; + print "\tFORMAT\t$t"; + } + print "\n"; +} +while (<FH>) { + chomp; + if (/^#/) { next; } + if (/^\s*$/) { next; } + my @f = split(/\t/); + #vcf columns: chrom pos id ref alt qual filter info + # info must have VRT=[0-9] 1==SNV 2=indel 6=NoVariation 8=MNV ... + my $vrt = 1; + if ($f[2] !~ /^[ACTG]$/ or $f[3] !~ /^[ACTG]$/) { + die "Sorry this can only do SNV's at this time\n"; + } + if (scalar @gnc == 1 && !defined $pop) { #single genotype column + if (!defined $f[4] or $f[4] == -1) { $f[4] = '.'; } + if ($f[$gnc[0]-1] == 2) { $vrt = 6; } #reference match + if ($f[$gnc[0]-1] == -1) { next; } #no data, don't use + print "$f[$cr]\t$f[$cp]\t$f[$cr];$f[$cp]\t$f[2]\t$f[3]\t$f[4]\t.\tVRT=$vrt\n"; + #TODO? put read counts in comment? + }elsif ($pop) { #do as population + my @cols; + foreach my $gp (split(/:/,$genoCols)) { #foreach population + my @g = split(/,/, $gp); + my $totChrom = 2*(scalar @g); + my $totRef = 0; + foreach my $i (@g) { if (!defined $f[$i-1] or $f[$i-1] == -1) { $totChrom -= 2; next; } $totRef += $f[$i-1]; } + if ($totChrom == $totRef) { $vrt = 6; } + if ($totRef > $totChrom) { die "ERROR likely the wrong column was chosen for genotype\n"; } + my $altCnt = $totChrom - $totRef; + push(@cols, "$totChrom:$altCnt"); + } + print "$f[$cr]\t$f[$cp]\t$f[$cr];$f[$cp]\t$f[2]\t$f[3]\t$f[4]\t.\tVRT=$vrt\tNA:AC\t", join("\t", @cols), "\n"; + }else { #leave allele counts off + my $totChrom = 2*(scalar @gnc); + my $totRef = 0; + foreach my $i (@gnc) { if ($f[$i-1] == -1) { $totChrom -= 2; next; } $totRef += $f[$i-1]; } + if ($totChrom == $totRef) { $vrt = 6; } + print "$f[$cr]\t$f[$cp]\t$f[$cr];$f[$cp]\t$f[2]\t$f[3]\t$f[4]\t.\tVRT=$vrt\n"; + } +} +close FH or die "Couldn't close $in, $!\n"; + +if ($meta) { + open(FH, ">", $meta) or die "Couldn't open $meta, $!\n"; + print FH "TYPE: CONT\n", + "HANDLE: $handle\n", + "NAME: \n", + "FAX: \n", + "TEL: \n", + "EMAIL: \n", + "LAB: \n", + "INST: \n", + "ADDR: \n", + "||\n", + "TYPE: METHOD\n", + "HANDLE: $handle\n", + "ID: \n", + "METHOD_CLASS: Sequence\n", + "TEMPLATE_TYPE: \n", + "METHOD:\n", + "||\n"; + if ($pop) { + my @p = split(/,/, $pop); + foreach my $t (@p) { + print FH + "TYPE: POPULATION\n", + "HANDLE: $handle\n", + "ID: $t\n", + "POPULATION: \n", + "||\n"; + } + } + print FH "TYPE: SNPASSAY\n", + "HANDLE: $handle\n", + "BATCH: $batch\n", + "MOLTYPE: \n", + "METHOD: \n", + "ORGANISM: \n", + "||\n", + "TYPE: SNPPOPUSE | SNPINDUSE\n", + "HANDLE: $handle\n", + "BATCH: \n", + "METHOD: \n", + "||\n"; + + close FH or die "Couldn't close $meta, $!\n"; +} + +exit 0; + +#parse old header and add or create new +sub prepHeader { + my @h; + $h[0] = '##fileformat=VCFv4.1'; + my ($day, $mo, $yr) = (localtime)[3,4,5]; + $mo++; + $yr+=1900; + $h[1] = '##fileDate=' . "$yr$mo$day"; + $h[2] = "##handle=$handle"; + $h[3] = "##batch=$batch"; + my $i = 4; + if ($bioproj) { $h[$i] = "##bioproject_id=$bioproj"; $i++; } + if ($biosamp) { $h[$i] = "##biosample_id=$biosamp"; $i++; } + $h[$i] = "##reference=$ref"; ##reference=GCF_999999.99 + #$i++; + #$h[$i] = '##INFO=<ID=LID, Number=1,Type=string, Description="Unique local variation ID or name for display. The LID provided here combined with the handle must be unique for a particular submitter.">' + $i++; + $h[$i] = '##INFO=<ID=VRT,Number=1,Type=Integer,Description="Variation type,1 - SNV: single nucleotide variation,2 - DIV: deletion/insertion variation,3 - HETEROZYGOUS: variable, but undefined at nucleotide level,4 - STR: short tandem repeat (microsatellite) variation, 5 - NAMED: insertion/deletion variation of named repetitive element,6 - NO VARIATON: sequence scanned for variation, but none observed,7 - MIXED: cluster contains submissions from 2 or more allelic classes (not used) ,8 - MNV: multiple nucleotide variation with all eles of common length greater than 1,9 - Exception">'; + #sometimes have allele freqs? + if (defined $pop) { + $i++; + $h[$i] = "##FORMAT=<ID=NA,Number=1,Type=Integer,Description=\"Number of alleles for the population.\""; + $i++; + $h[$i] = '##FORMAT=<ID=AC,Number=.,Type=Integer,Description="Allele count for each alternate allele.">'; + my @p = split(/,/, $pop); + foreach my $t (@p) { + $i++; + $h[$i] = "##population_id=$t"; + } + } + #PMID? +##INFO=<ID=PMID,Number=.,Type=Integer,Description="PubMed ID linked to variation if available."> + + return @h; +} +####End + +#read genotype columns from a file +sub readGeno { + my $list = shift @_; + my @files = split(/,/, $list); + my $cols=''; + foreach my $file (@files) { + open(FH, $file) or die "Couldn't read $file, $!\n"; + while (<FH>) { + chomp; + my @f = split(/\s+/); + if ($f[0] =~/\D/) { die "ERROR expect an integer for the column\n"; } + $f[0] += $offset; + $cols .= "$f[0],"; + } + close FH; + $cols .= ":"; + } + $cols =~ s/,:$//; + return $cols; +} +####End