changeset 1:a0689dc29b7f draft

Updated vcf to gff conversion tool
author john-mccallum
date Tue, 31 Jul 2012 00:33:11 -0400
parents 21053f7f9ed1
children ea2117a7b363
files design_primers.xml patman.xml vcf2gvf.sh vcf2gvf.xml vcf_gff.py vcf_gff.xml
diffstat 6 files changed, 213 insertions(+), 103 deletions(-) [+]
line wrap: on
line diff
--- a/design_primers.xml	Thu Jun 14 19:29:26 2012 -0400
+++ b/design_primers.xml	Tue Jul 31 00:33:11 2012 -0400
@@ -5,7 +5,7 @@
   <inputs>
     <param format="fasta" name="inputfastaFile" type="data" label="Multifasta Source file" />
     <param format="gff3" name="inputSNPfile" type="data" label="annotation file(Gff3)" />
-    <param format="txt" name="inputTargetfile" type="data" optional="false" label="Target file" help="Format:Sequence id:source:type:start, such as 1174806:gsMapper:SNP:292" ></param>
+    <param format="txt" name="inputTargetfile" type="data" optional="false" label="Target file" help="IN FORMAT Sequence id:source:type:start e.g. 1174806:gsMapper:SNP:292" ></param>
 <param name="min_size" size="20" type="text" value="75" label="Minimum Product Size Range" />
      <param name="max_size" size="20" type="text" value="100" label="Maximum Product Size Range" />
   </inputs>
--- a/patman.xml	Thu Jun 14 19:29:26 2012 -0400
+++ b/patman.xml	Tue Jul 31 00:33:11 2012 -0400
@@ -1,7 +1,7 @@
 <?xml version="1.0"?>
 <tool id="patman_1" name="search for patterns in DNA using PatMaN">
   <description>search for approximate patterns in DNA libraries</description>
-  <command>patman -a -e $edits -g $gaps -D $inputfastaFile -P $inputPatfile | sort | uniq > $patman_outputfile  </command>
+  <command>patman  -a -e $edits -g $gaps -D $inputfastaFile -P $inputPatfile | sort | uniq > $patman_outputfile  </command>
   <inputs>
   <param  name="edits" type="integer" label="max N mismatches and or gaps" value="0" size="1" />
   <param name="gaps" type="integer" label="max N gaps" value="0" size="1"/>
--- a/vcf2gvf.sh	Thu Jun 14 19:29:26 2012 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,46 +0,0 @@
-#!/bin/sh
-##convert vcf to gvf
-##NOTE This is a very simple basic parser for a complex format.
-
-##usage vcf2gvf.sh <vcf file> <outputfile>
-
-#Copyright 2012 John McCallum & Leshi Chen
-#New Zealand Institute for Plant and Food Research
-
-#New Zealand Institute for Plant and Food Research
-#This program is free software: you can redistribute it and/or modify
-#     it under the terms of the GNU General Public License as published by
-#    the Free Software Foundation, either version 3 of the License, or
-#    (at your option) any later version.
-#
-#    This program is distributed in the hope that it will be useful,
-#    but WITHOUT ANY WARRANTY; without even the implied warranty of
-#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-#    GNU General Public License for more details.
-#
-#    You should have received a copy of the GNU General Public License
-#    along with this program.  If not, see <http://www.gnu.org/licenses/>.
-
-
-
-inputfile=$1
-outputfile=$2
-
-echo  "##gvf-version 1.05" > $outputfile
-
-awk '
-BEGIN {OFS="\t"}
-
-##get feature type 
-{if (index($8,"INDEL")== 1) {type="INDEL"} else {type="SNP"} }
-##get feature length
-{if (type=="SNP") 
-    {feat_length=1}
-    else {feat_length=length($4)} 
-}
-{end=($2+feat_length)}
-
-!/^#/  { print $1 ,"SAMTOOLS",type,$2,end,$6,".",".","ID="$1":SAMTOOLS:"type":"$2";Variant_seq="$5";Reference_seq="$4";"$8}
-
-END {print ""} 
-' "$inputfile" > "$outputfile"
\ No newline at end of file
--- a/vcf2gvf.xml	Thu Jun 14 19:29:26 2012 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,55 +0,0 @@
-<?xml version="1.0"?>
-<tool id="vcf2gvf_1" name="VCF to GFF3">
-  <description>convert vcf to gvf/gff3</description>
-  <command interpreter="bash">vcf2gvf.sh  $inputFile  $outputfile</command>
-  <inputs>
-    <param format="vcf" name="inputFile" type="data" label="Input vcf File" help="vcf file from mpileup" />
-  </inputs>
-  <outputs>
-<data format="gff3" name="outputfile" /> 
-
-  </outputs>
-<help>
-This tool provides a simple conversion from vcf to gvf.
-
-Be sure to read the documentation to determine if it meets your requirements.
-
-* vcf documentation at http://samtools.sourceforge.net/samtools.shtml#6
-* GVF/GFF3 at http://www.sequenceontology.org/resources/gvf.html
-
-
-
-**input**
-
-::
-
-	PGSC0003DMB000000010	2042429	.	C	A	44.6	.	DP=10;VDB=0.0118;AF1=0.8295;AC1=7;DP4=2,1,3,4;MQ=20;FQ=8.78;PV4=1,5.2e-10,1,1	GT:PL:DP:GQ	0/1:14,0,42:5:23	1/1:27,6,0:2:9	1/1:15,3,0:1:7	1/1:30,6,0:2:9
-	PGSC0003DMB000000038	1756646	.	G	A	3.69	.	DP=15;VDB=0.0166;AF1=0.495;AC1=4;DP4=3,7,2,2;MQ=20;FQ=5.6;PV4=0.58,3.8e-09,1,0.31	GT:PL:DP:GQ	0/1:20,3,0:1:6	0/1:9,0,67:7:8	0/0:0,15,82:5:17	0/1:16,3,0:1:5
-	PGSC0003DMB000000064	1916664	.	T	C	8.12	.	DP=4;VDB=0.0151;AF1=1;AC1=8;DP4=0,0,0,3;MQ=20;FQ=-29.5	GT:PL:DP:GQ	1/1:14,3,0:1:5	1/1:0,0,0:0:3	1/1:13,3,0:1:5	1/1:15,3,0:1:5
-
-
-**output**
-
-
-::
-
-	PGSC0003DMB000000010	samtools	SNP	2042429	2042430	44.6	.	.	ID=PGSC0003DMB000000010:SAMTOOLS:SNP:2042429;Variant_seq=A;Reference_seq=C;DP=10;VDB=0.0118;AF1=0.8295;AC1=7;DP4=2,1,3,4;MQ=20;FQ=8.78;PV4=1,5.2e-10,1,1
-	PGSC0003DMB000000038	samtools	SNP	1756646	1756647	3.69	.	.	ID=PGSC0003DMB000000038:SAMTOOLS:SNP:1756646;Variant_seq=A;Reference_seq=G;DP=15;VDB=0.0166;AF1=0.495;AC1=4;DP4=3,7,2,2;MQ=20;FQ=5.6;PV4=0.58,3.8e-09,1,0.31
-	PGSC0003DMB000000064	samtools	SNP	1916664	1916665	8.12	.	.	ID=PGSC0003DMB000000064:SAMTOOLS:SNP:1916664;Variant_seq=C;Reference_seq=T;DP=4;VDB=0.0151;AF1=1;AC1=8;DP4=0,0,0,3;MQ=20;FQ=-29.5
-
-
-
------------------------
-
-*If you use this tool please cite:*
-
-A Toolkit For Bulk PCR-Based Marker Design From Next-Generation Sequence Data: 
-Application For  Development Of A Framework Linkage Map In Bulb Onion (*Allium cepa* L.)
-(2012)
-
-Samantha Baldwin, Roopashree Revanna, Susan Thomson, Meeghan Pither-Joyce, Kathryn Wright, 
-Ross Crowhurst, Mark Fiers, Leshi Chen, Richard MacKnight, John A. McCallum
-
-
-</help>
-</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/vcf_gff.py	Tue Jul 31 00:33:11 2012 -0400
@@ -0,0 +1,150 @@
+#!/usr/local/bin/python2.6
+"""
+Usage vcf_gff.py <vcf_file input> <gff_file output>
+
+Input vcf file format:
+   CHROM POS ID REF ALT QUAL FILTER INFO FORMAT
+
+Note: Generating vcf from a single merged bam file, using multiple bam files results in multiple FORMAT columns!!!
+
+Output gff format:
+    SEQID SOURCE TYPE START END SCORE STRAND PHASE ATTRIBUTES
+
+"""
+#Copyright 2012 Susan Thomson
+#New Zealand Institute for Plant and Food Research
+#This program is free software: you can redistribute it and/or modify
+#     it under the terms of the GNU General Public License as published by
+#    the Free Software Foundation, either version 3 of the License, or
+#    (at your option) any later version.
+#
+#    This program is distributed in the hope that it will be useful,
+#    but WITHOUT ANY WARRANTY; without even the implied warranty of
+#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#    GNU General Public License for more details.
+#
+#    You should have received a copy of the GNU General Public License
+#    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+
+import sys
+                                                                              
+in_vcf_file = open(sys.argv[1], 'r')
+out_gff_file = open(sys.argv[2], 'w')
+
+def get_info(attribute_input):
+    """
+    Get the record_type by determining if INDEL is stated in column INFO; get
+    raw read count
+    """
+    INFO = {}
+    rec = attribute_input.split(";")
+    if "INDEL" in rec:
+        record_type = "INDEL"
+        rec = rec[1:]
+    else:
+        record_type = "SNP"
+    for entry in rec:
+        detail = entry.split("=")
+        INFO[detail[0]] = detail[1]
+    if INFO.has_key("DP"):
+        reads = INFO.get("DP")
+    else:
+        reads = "NA"
+    data = (record_type, reads)
+    return data
+
+def get_gen(formatcols, ref):
+    """
+    Get info on heterozyosity or homozygosity (could be useful later),
+    by estimating genotype(GT) calling ref allele=0, variant allele=1
+
+    """
+    formats = []
+    sample_dict = {}
+    genos = ""
+    format_types = formatcols[0].split(":")
+    samples = formatcols[1:]
+    for entry in format_types:
+        formats.append(entry)
+    for sample in samples:
+        var = ""
+        data = sample.split(":")
+        sample_dict = dict(zip(formats, data))
+        if sample_dict.has_key("DP"):
+            reads = sample_dict.get("DP")
+        else:
+            reads = "NA"
+        if sample_dict.has_key("NF"):
+            """
+            polysnp tool output
+            """
+            freq = sample_dict.get("NF")
+            variants = freq.split("|")
+            if int(variants[0]) >= 1:
+                var += "A"
+            if int(variants[1]) >= 1:
+                var += "C"
+            if int(variants[2]) >= 1:
+                var += "G"
+            if int(variants[3]) >= 1:
+                var += "T"
+            if var == "":
+                gen = "NA"
+            elif var == ref:
+                gen = "HOM_ref"
+            elif len(var) >= 2:
+                    gen = "HET"
+            else:
+                gen = "HOM_mut"
+        elif sample_dict.has_key("GT"):
+            """
+            mpileup output, recommend ALWAYS have GT, note this only good scoring for diploids too!
+            """
+            genotypes = sample_dict.get("GT")
+            if genotypes == "1/1":
+                gen = "HOM_mut"
+            if genotypes == "0/1":
+                gen = "HET"
+            if genotypes == "0/0":
+                gen = "HOM_ref"
+        else:
+            gen = "NA"
+        geno = ("%s:%s;" % (reads, gen))
+        genos += geno
+        sample_dict = {}
+    return genos
+    
+attributes = {}
+"""
+Get relevant info from vcf file and put to proper gff columns
+"""
+
+for line in in_vcf_file:
+    if line.startswith("#") == False:
+        info = line.split()
+        seqid = info[0].strip()
+        source = "SAMTOOLS"
+        start = int(info[1])
+        score = info[5]
+        strand = "."
+        phase = "."
+        reference = info[3].strip()
+        variant = info[4].strip()
+        attr = get_info(info[7])
+        record_type = attr[0]
+        reads = attr[1]
+        if record_type == "INDEL":
+            end = start + len(reference)
+        else:
+            end = start 
+        gen = get_gen(info[8:], reference)
+        out_gff_file.write(
+            ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\tID=%s:%s:%d;Variant" +
+             "_seq=%s;Reference_seq=%s;Total_reads=%s:Zygosity=%s\n") %
+            ( seqid, source,record_type, start, end, score, strand, phase,seqid, 
+              record_type, start, variant, reference, reads, gen))
+    
+out_gff_file.close()    
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/vcf_gff.xml	Tue Jul 31 00:33:11 2012 -0400
@@ -0,0 +1,61 @@
+<?xml version="1.0"?>
+<tool id="vcf2gff_1" name="Convert vcf to gff">
+  <description>Convert vcf to gff</description>
+  <command interpreter="python">vcf_gff.py $inputVcf  $outputfile</command>
+  <inputs>
+    <param format="vcf" name="inputVcf" type="data" label="vcf file containing SNP data"/>
+  </inputs>
+  <outputs>
+     <data format="gff3" name="outputfile" />
+  </outputs>
+<help>
+
+** Convert vcf to gff3**
+
+This tool takes vcf output from Samtools mpileup and converts to gff3 format.
+It converts a single vcf output file containing output from a single bam file or from multiple bam files.
+Read counts and GT scores are used as an indicator of whether a mutation is homozygous or heterozygous and outputs in the INFO section.
+
+**TIP**
+mpileup **must** be run with Genotype Likelihood Computation selected (-g flag)to generate the GT flag in BCF/VCF output. 
+This then used to estimate the SNP presence in one or more samples.  
+More info is available in the manual pages at: http://samtools.sourceforge.net/mpileup.shtml
+
+
+**Example**
+
+--input vcf
+
+::
+
+	#CHROM    POS     ID      REF     ALT     QUAL    FILTER  INFO    FORMAT  bam
+	PGSC0003DMB000000001    430     .       A       T       3.41    .       DP=4;AF1=0.9904;CI95=0.125,1;DP4=0,0,2,0;MQ=17  PL:DP:GT:GQ     32,6,0:2:1/1:23
+	PGSC0003DMB000000001    445     .       G       T       3.41    .       DP=4;AF1=0.9904;CI95=0.125,1;DP4=0,0,2,0;MQ=17  PL:DP:GT:GQ     32,6,0:2:1/1:23 
+	PGSC0003DMB000000001    446     .       G       A       3.41    .       DP=4;AF1=0.9904;CI95=0.125,1;DP4=0,0,2,0;MQ=17  PL:DP:GT:GQ     32,6,0:2:1/1:23 
+	PGSC0003DMB000000001    452     .       C       T       3.41    .       DP=4;AF1=0.9904;CI95=0.125,1;DP4=0,0,2,0;MQ=17  PL:DP:GT:GQ     32,6,0:2:1/1:23
+
+--output gff
+
+::
+
+	#CHROM    SOURCE    TYPE    START    STOP    QUAL    STRAND   PHASE    INFO
+	PGSC0003DMB000000001    SAMTOOLS        SNP     430     430     3.41    .       .       ID=PGSC0003DMB000000001:SNP:430;Variant_seq=T;Reference_seq=A;Total_reads=4:Zygosity=2:HOM_mut
+	PGSC0003DMB000000001    SAMTOOLS        SNP     445     445     3.41    .       .       ID=PGSC0003DMB000000001:SNP:445;Variant_seq=T;Reference_seq=G;Total_reads=4:Zygosity=2:HOM_mut
+	PGSC0003DMB000000001    SAMTOOLS        SNP     446     446     3.41    .       .       ID=PGSC0003DMB000000001:SNP:446;Variant_seq=A;Reference_seq=G;Total_reads=4:Zygosity=2:HOM_mut
+	PGSC0003DMB000000001    SAMTOOLS        SNP     452     452     3.41    .       .       ID=PGSC0003DMB000000001:SNP:452;Variant_seq=T;Reference_seq=C;Total_reads=4:Zygosity=2:HOM_mut
+
+
+
+-----------------------
+
+*If you use this tool please cite:*
+
+A Toolkit For Bulk PCR-Based Marker Design From Next-Generation Sequence Data: 
+Application For  Development Of A Framework Linkage Map In Bulb Onion (*Allium cepa* L.)
+(2012)
+
+Samantha Baldwin, Roopashree Revanna, Susan Thomson, Meeghan Pither-Joyce, Kathryn Wright, 
+Ross Crowhurst, Mark Fiers, Leshi Chen, Richard MacKnight, John A. McCallum
+
+</help>
+</tool>
\ No newline at end of file