# HG changeset patch
# User john-mccallum
# Date 1343709191 14400
# Node ID a0689dc29b7f919534777066fcb8f87732e559a3
# Parent 21053f7f9ed1025ea85ce219073d62a59d4052c9
Updated vcf to gff conversion tool
diff -r 21053f7f9ed1 -r a0689dc29b7f design_primers.xml
--- a/design_primers.xml Thu Jun 14 19:29:26 2012 -0400
+++ b/design_primers.xml Tue Jul 31 00:33:11 2012 -0400
@@ -5,7 +5,7 @@
-
+
diff -r 21053f7f9ed1 -r a0689dc29b7f patman.xml
--- a/patman.xml Thu Jun 14 19:29:26 2012 -0400
+++ b/patman.xml Tue Jul 31 00:33:11 2012 -0400
@@ -1,7 +1,7 @@
search for approximate patterns in DNA libraries
- patman -a -e $edits -g $gaps -D $inputfastaFile -P $inputPatfile | sort | uniq > $patman_outputfile
+ patman -a -e $edits -g $gaps -D $inputfastaFile -P $inputPatfile | sort | uniq > $patman_outputfile
diff -r 21053f7f9ed1 -r a0689dc29b7f vcf2gvf.sh
--- a/vcf2gvf.sh Thu Jun 14 19:29:26 2012 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,46 +0,0 @@
-#!/bin/sh
-##convert vcf to gvf
-##NOTE This is a very simple basic parser for a complex format.
-
-##usage vcf2gvf.sh
-
-#Copyright 2012 John McCallum & Leshi Chen
-#New Zealand Institute for Plant and Food Research
-
-#New Zealand Institute for Plant and Food Research
-#This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program. If not, see .
-
-
-
-inputfile=$1
-outputfile=$2
-
-echo "##gvf-version 1.05" > $outputfile
-
-awk '
-BEGIN {OFS="\t"}
-
-##get feature type
-{if (index($8,"INDEL")== 1) {type="INDEL"} else {type="SNP"} }
-##get feature length
-{if (type=="SNP")
- {feat_length=1}
- else {feat_length=length($4)}
-}
-{end=($2+feat_length)}
-
-!/^#/ { print $1 ,"SAMTOOLS",type,$2,end,$6,".",".","ID="$1":SAMTOOLS:"type":"$2";Variant_seq="$5";Reference_seq="$4";"$8}
-
-END {print ""}
-' "$inputfile" > "$outputfile"
\ No newline at end of file
diff -r 21053f7f9ed1 -r a0689dc29b7f vcf2gvf.xml
--- a/vcf2gvf.xml Thu Jun 14 19:29:26 2012 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,55 +0,0 @@
-
-
- convert vcf to gvf/gff3
- vcf2gvf.sh $inputFile $outputfile
-
-
-
-
-
-
-
-
-This tool provides a simple conversion from vcf to gvf.
-
-Be sure to read the documentation to determine if it meets your requirements.
-
-* vcf documentation at http://samtools.sourceforge.net/samtools.shtml#6
-* GVF/GFF3 at http://www.sequenceontology.org/resources/gvf.html
-
-
-
-**input**
-
-::
-
- PGSC0003DMB000000010 2042429 . C A 44.6 . DP=10;VDB=0.0118;AF1=0.8295;AC1=7;DP4=2,1,3,4;MQ=20;FQ=8.78;PV4=1,5.2e-10,1,1 GT:PL:DP:GQ 0/1:14,0,42:5:23 1/1:27,6,0:2:9 1/1:15,3,0:1:7 1/1:30,6,0:2:9
- PGSC0003DMB000000038 1756646 . G A 3.69 . DP=15;VDB=0.0166;AF1=0.495;AC1=4;DP4=3,7,2,2;MQ=20;FQ=5.6;PV4=0.58,3.8e-09,1,0.31 GT:PL:DP:GQ 0/1:20,3,0:1:6 0/1:9,0,67:7:8 0/0:0,15,82:5:17 0/1:16,3,0:1:5
- PGSC0003DMB000000064 1916664 . T C 8.12 . DP=4;VDB=0.0151;AF1=1;AC1=8;DP4=0,0,0,3;MQ=20;FQ=-29.5 GT:PL:DP:GQ 1/1:14,3,0:1:5 1/1:0,0,0:0:3 1/1:13,3,0:1:5 1/1:15,3,0:1:5
-
-
-**output**
-
-
-::
-
- PGSC0003DMB000000010 samtools SNP 2042429 2042430 44.6 . . ID=PGSC0003DMB000000010:SAMTOOLS:SNP:2042429;Variant_seq=A;Reference_seq=C;DP=10;VDB=0.0118;AF1=0.8295;AC1=7;DP4=2,1,3,4;MQ=20;FQ=8.78;PV4=1,5.2e-10,1,1
- PGSC0003DMB000000038 samtools SNP 1756646 1756647 3.69 . . ID=PGSC0003DMB000000038:SAMTOOLS:SNP:1756646;Variant_seq=A;Reference_seq=G;DP=15;VDB=0.0166;AF1=0.495;AC1=4;DP4=3,7,2,2;MQ=20;FQ=5.6;PV4=0.58,3.8e-09,1,0.31
- PGSC0003DMB000000064 samtools SNP 1916664 1916665 8.12 . . ID=PGSC0003DMB000000064:SAMTOOLS:SNP:1916664;Variant_seq=C;Reference_seq=T;DP=4;VDB=0.0151;AF1=1;AC1=8;DP4=0,0,0,3;MQ=20;FQ=-29.5
-
-
-
------------------------
-
-*If you use this tool please cite:*
-
-A Toolkit For Bulk PCR-Based Marker Design From Next-Generation Sequence Data:
-Application For Development Of A Framework Linkage Map In Bulb Onion (*Allium cepa* L.)
-(2012)
-
-Samantha Baldwin, Roopashree Revanna, Susan Thomson, Meeghan Pither-Joyce, Kathryn Wright,
-Ross Crowhurst, Mark Fiers, Leshi Chen, Richard MacKnight, John A. McCallum
-
-
-
-
diff -r 21053f7f9ed1 -r a0689dc29b7f vcf_gff.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/vcf_gff.py Tue Jul 31 00:33:11 2012 -0400
@@ -0,0 +1,150 @@
+#!/usr/local/bin/python2.6
+"""
+Usage vcf_gff.py
+
+Input vcf file format:
+ CHROM POS ID REF ALT QUAL FILTER INFO FORMAT
+
+Note: Generating vcf from a single merged bam file, using multiple bam files results in multiple FORMAT columns!!!
+
+Output gff format:
+ SEQID SOURCE TYPE START END SCORE STRAND PHASE ATTRIBUTES
+
+"""
+#Copyright 2012 Susan Thomson
+#New Zealand Institute for Plant and Food Research
+#This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see .
+
+
+import sys
+
+in_vcf_file = open(sys.argv[1], 'r')
+out_gff_file = open(sys.argv[2], 'w')
+
+def get_info(attribute_input):
+ """
+ Get the record_type by determining if INDEL is stated in column INFO; get
+ raw read count
+ """
+ INFO = {}
+ rec = attribute_input.split(";")
+ if "INDEL" in rec:
+ record_type = "INDEL"
+ rec = rec[1:]
+ else:
+ record_type = "SNP"
+ for entry in rec:
+ detail = entry.split("=")
+ INFO[detail[0]] = detail[1]
+ if INFO.has_key("DP"):
+ reads = INFO.get("DP")
+ else:
+ reads = "NA"
+ data = (record_type, reads)
+ return data
+
+def get_gen(formatcols, ref):
+ """
+ Get info on heterozyosity or homozygosity (could be useful later),
+ by estimating genotype(GT) calling ref allele=0, variant allele=1
+
+ """
+ formats = []
+ sample_dict = {}
+ genos = ""
+ format_types = formatcols[0].split(":")
+ samples = formatcols[1:]
+ for entry in format_types:
+ formats.append(entry)
+ for sample in samples:
+ var = ""
+ data = sample.split(":")
+ sample_dict = dict(zip(formats, data))
+ if sample_dict.has_key("DP"):
+ reads = sample_dict.get("DP")
+ else:
+ reads = "NA"
+ if sample_dict.has_key("NF"):
+ """
+ polysnp tool output
+ """
+ freq = sample_dict.get("NF")
+ variants = freq.split("|")
+ if int(variants[0]) >= 1:
+ var += "A"
+ if int(variants[1]) >= 1:
+ var += "C"
+ if int(variants[2]) >= 1:
+ var += "G"
+ if int(variants[3]) >= 1:
+ var += "T"
+ if var == "":
+ gen = "NA"
+ elif var == ref:
+ gen = "HOM_ref"
+ elif len(var) >= 2:
+ gen = "HET"
+ else:
+ gen = "HOM_mut"
+ elif sample_dict.has_key("GT"):
+ """
+ mpileup output, recommend ALWAYS have GT, note this only good scoring for diploids too!
+ """
+ genotypes = sample_dict.get("GT")
+ if genotypes == "1/1":
+ gen = "HOM_mut"
+ if genotypes == "0/1":
+ gen = "HET"
+ if genotypes == "0/0":
+ gen = "HOM_ref"
+ else:
+ gen = "NA"
+ geno = ("%s:%s;" % (reads, gen))
+ genos += geno
+ sample_dict = {}
+ return genos
+
+attributes = {}
+"""
+Get relevant info from vcf file and put to proper gff columns
+"""
+
+for line in in_vcf_file:
+ if line.startswith("#") == False:
+ info = line.split()
+ seqid = info[0].strip()
+ source = "SAMTOOLS"
+ start = int(info[1])
+ score = info[5]
+ strand = "."
+ phase = "."
+ reference = info[3].strip()
+ variant = info[4].strip()
+ attr = get_info(info[7])
+ record_type = attr[0]
+ reads = attr[1]
+ if record_type == "INDEL":
+ end = start + len(reference)
+ else:
+ end = start
+ gen = get_gen(info[8:], reference)
+ out_gff_file.write(
+ ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\tID=%s:%s:%d;Variant" +
+ "_seq=%s;Reference_seq=%s;Total_reads=%s:Zygosity=%s\n") %
+ ( seqid, source,record_type, start, end, score, strand, phase,seqid,
+ record_type, start, variant, reference, reads, gen))
+
+out_gff_file.close()
+
+
diff -r 21053f7f9ed1 -r a0689dc29b7f vcf_gff.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/vcf_gff.xml Tue Jul 31 00:33:11 2012 -0400
@@ -0,0 +1,61 @@
+
+
+ Convert vcf to gff
+ vcf_gff.py $inputVcf $outputfile
+
+
+
+
+
+
+
+
+** Convert vcf to gff3**
+
+This tool takes vcf output from Samtools mpileup and converts to gff3 format.
+It converts a single vcf output file containing output from a single bam file or from multiple bam files.
+Read counts and GT scores are used as an indicator of whether a mutation is homozygous or heterozygous and outputs in the INFO section.
+
+**TIP**
+mpileup **must** be run with Genotype Likelihood Computation selected (-g flag)to generate the GT flag in BCF/VCF output.
+This then used to estimate the SNP presence in one or more samples.
+More info is available in the manual pages at: http://samtools.sourceforge.net/mpileup.shtml
+
+
+**Example**
+
+--input vcf
+
+::
+
+ #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT bam
+ PGSC0003DMB000000001 430 . A T 3.41 . DP=4;AF1=0.9904;CI95=0.125,1;DP4=0,0,2,0;MQ=17 PL:DP:GT:GQ 32,6,0:2:1/1:23
+ PGSC0003DMB000000001 445 . G T 3.41 . DP=4;AF1=0.9904;CI95=0.125,1;DP4=0,0,2,0;MQ=17 PL:DP:GT:GQ 32,6,0:2:1/1:23
+ PGSC0003DMB000000001 446 . G A 3.41 . DP=4;AF1=0.9904;CI95=0.125,1;DP4=0,0,2,0;MQ=17 PL:DP:GT:GQ 32,6,0:2:1/1:23
+ PGSC0003DMB000000001 452 . C T 3.41 . DP=4;AF1=0.9904;CI95=0.125,1;DP4=0,0,2,0;MQ=17 PL:DP:GT:GQ 32,6,0:2:1/1:23
+
+--output gff
+
+::
+
+ #CHROM SOURCE TYPE START STOP QUAL STRAND PHASE INFO
+ PGSC0003DMB000000001 SAMTOOLS SNP 430 430 3.41 . . ID=PGSC0003DMB000000001:SNP:430;Variant_seq=T;Reference_seq=A;Total_reads=4:Zygosity=2:HOM_mut
+ PGSC0003DMB000000001 SAMTOOLS SNP 445 445 3.41 . . ID=PGSC0003DMB000000001:SNP:445;Variant_seq=T;Reference_seq=G;Total_reads=4:Zygosity=2:HOM_mut
+ PGSC0003DMB000000001 SAMTOOLS SNP 446 446 3.41 . . ID=PGSC0003DMB000000001:SNP:446;Variant_seq=A;Reference_seq=G;Total_reads=4:Zygosity=2:HOM_mut
+ PGSC0003DMB000000001 SAMTOOLS SNP 452 452 3.41 . . ID=PGSC0003DMB000000001:SNP:452;Variant_seq=T;Reference_seq=C;Total_reads=4:Zygosity=2:HOM_mut
+
+
+
+-----------------------
+
+*If you use this tool please cite:*
+
+A Toolkit For Bulk PCR-Based Marker Design From Next-Generation Sequence Data:
+Application For Development Of A Framework Linkage Map In Bulb Onion (*Allium cepa* L.)
+(2012)
+
+Samantha Baldwin, Roopashree Revanna, Susan Thomson, Meeghan Pither-Joyce, Kathryn Wright,
+Ross Crowhurst, Mark Fiers, Leshi Chen, Richard MacKnight, John A. McCallum
+
+
+
\ No newline at end of file