Mercurial > repos > john-mccallum > pcr_markers
changeset 1:a0689dc29b7f draft
Updated vcf to gff conversion tool
author | john-mccallum |
---|---|
date | Tue, 31 Jul 2012 00:33:11 -0400 |
parents | 21053f7f9ed1 |
children | ea2117a7b363 |
files | design_primers.xml patman.xml vcf2gvf.sh vcf2gvf.xml vcf_gff.py vcf_gff.xml |
diffstat | 6 files changed, 213 insertions(+), 103 deletions(-) [+] |
line wrap: on
line diff
--- a/design_primers.xml Thu Jun 14 19:29:26 2012 -0400 +++ b/design_primers.xml Tue Jul 31 00:33:11 2012 -0400 @@ -5,7 +5,7 @@ <inputs> <param format="fasta" name="inputfastaFile" type="data" label="Multifasta Source file" /> <param format="gff3" name="inputSNPfile" type="data" label="annotation file(Gff3)" /> - <param format="txt" name="inputTargetfile" type="data" optional="false" label="Target file" help="Format:Sequence id:source:type:start, such as 1174806:gsMapper:SNP:292" ></param> + <param format="txt" name="inputTargetfile" type="data" optional="false" label="Target file" help="IN FORMAT Sequence id:source:type:start e.g. 1174806:gsMapper:SNP:292" ></param> <param name="min_size" size="20" type="text" value="75" label="Minimum Product Size Range" /> <param name="max_size" size="20" type="text" value="100" label="Maximum Product Size Range" /> </inputs>
--- a/patman.xml Thu Jun 14 19:29:26 2012 -0400 +++ b/patman.xml Tue Jul 31 00:33:11 2012 -0400 @@ -1,7 +1,7 @@ <?xml version="1.0"?> <tool id="patman_1" name="search for patterns in DNA using PatMaN"> <description>search for approximate patterns in DNA libraries</description> - <command>patman -a -e $edits -g $gaps -D $inputfastaFile -P $inputPatfile | sort | uniq > $patman_outputfile </command> + <command>patman -a -e $edits -g $gaps -D $inputfastaFile -P $inputPatfile | sort | uniq > $patman_outputfile </command> <inputs> <param name="edits" type="integer" label="max N mismatches and or gaps" value="0" size="1" /> <param name="gaps" type="integer" label="max N gaps" value="0" size="1"/>
--- a/vcf2gvf.sh Thu Jun 14 19:29:26 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,46 +0,0 @@ -#!/bin/sh -##convert vcf to gvf -##NOTE This is a very simple basic parser for a complex format. - -##usage vcf2gvf.sh <vcf file> <outputfile> - -#Copyright 2012 John McCallum & Leshi Chen -#New Zealand Institute for Plant and Food Research - -#New Zealand Institute for Plant and Food Research -#This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see <http://www.gnu.org/licenses/>. - - - -inputfile=$1 -outputfile=$2 - -echo "##gvf-version 1.05" > $outputfile - -awk ' -BEGIN {OFS="\t"} - -##get feature type -{if (index($8,"INDEL")== 1) {type="INDEL"} else {type="SNP"} } -##get feature length -{if (type=="SNP") - {feat_length=1} - else {feat_length=length($4)} -} -{end=($2+feat_length)} - -!/^#/ { print $1 ,"SAMTOOLS",type,$2,end,$6,".",".","ID="$1":SAMTOOLS:"type":"$2";Variant_seq="$5";Reference_seq="$4";"$8} - -END {print ""} -' "$inputfile" > "$outputfile" \ No newline at end of file
--- a/vcf2gvf.xml Thu Jun 14 19:29:26 2012 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,55 +0,0 @@ -<?xml version="1.0"?> -<tool id="vcf2gvf_1" name="VCF to GFF3"> - <description>convert vcf to gvf/gff3</description> - <command interpreter="bash">vcf2gvf.sh $inputFile $outputfile</command> - <inputs> - <param format="vcf" name="inputFile" type="data" label="Input vcf File" help="vcf file from mpileup" /> - </inputs> - <outputs> -<data format="gff3" name="outputfile" /> - - </outputs> -<help> -This tool provides a simple conversion from vcf to gvf. - -Be sure to read the documentation to determine if it meets your requirements. - -* vcf documentation at http://samtools.sourceforge.net/samtools.shtml#6 -* GVF/GFF3 at http://www.sequenceontology.org/resources/gvf.html - - - -**input** - -:: - - PGSC0003DMB000000010 2042429 . C A 44.6 . DP=10;VDB=0.0118;AF1=0.8295;AC1=7;DP4=2,1,3,4;MQ=20;FQ=8.78;PV4=1,5.2e-10,1,1 GT:PL:DP:GQ 0/1:14,0,42:5:23 1/1:27,6,0:2:9 1/1:15,3,0:1:7 1/1:30,6,0:2:9 - PGSC0003DMB000000038 1756646 . G A 3.69 . DP=15;VDB=0.0166;AF1=0.495;AC1=4;DP4=3,7,2,2;MQ=20;FQ=5.6;PV4=0.58,3.8e-09,1,0.31 GT:PL:DP:GQ 0/1:20,3,0:1:6 0/1:9,0,67:7:8 0/0:0,15,82:5:17 0/1:16,3,0:1:5 - PGSC0003DMB000000064 1916664 . T C 8.12 . DP=4;VDB=0.0151;AF1=1;AC1=8;DP4=0,0,0,3;MQ=20;FQ=-29.5 GT:PL:DP:GQ 1/1:14,3,0:1:5 1/1:0,0,0:0:3 1/1:13,3,0:1:5 1/1:15,3,0:1:5 - - -**output** - - -:: - - PGSC0003DMB000000010 samtools SNP 2042429 2042430 44.6 . . ID=PGSC0003DMB000000010:SAMTOOLS:SNP:2042429;Variant_seq=A;Reference_seq=C;DP=10;VDB=0.0118;AF1=0.8295;AC1=7;DP4=2,1,3,4;MQ=20;FQ=8.78;PV4=1,5.2e-10,1,1 - PGSC0003DMB000000038 samtools SNP 1756646 1756647 3.69 . . ID=PGSC0003DMB000000038:SAMTOOLS:SNP:1756646;Variant_seq=A;Reference_seq=G;DP=15;VDB=0.0166;AF1=0.495;AC1=4;DP4=3,7,2,2;MQ=20;FQ=5.6;PV4=0.58,3.8e-09,1,0.31 - PGSC0003DMB000000064 samtools SNP 1916664 1916665 8.12 . . ID=PGSC0003DMB000000064:SAMTOOLS:SNP:1916664;Variant_seq=C;Reference_seq=T;DP=4;VDB=0.0151;AF1=1;AC1=8;DP4=0,0,0,3;MQ=20;FQ=-29.5 - - - ------------------------ - -*If you use this tool please cite:* - -A Toolkit For Bulk PCR-Based Marker Design From Next-Generation Sequence Data: -Application For Development Of A Framework Linkage Map In Bulb Onion (*Allium cepa* L.) -(2012) - -Samantha Baldwin, Roopashree Revanna, Susan Thomson, Meeghan Pither-Joyce, Kathryn Wright, -Ross Crowhurst, Mark Fiers, Leshi Chen, Richard MacKnight, John A. McCallum - - -</help> -</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/vcf_gff.py Tue Jul 31 00:33:11 2012 -0400 @@ -0,0 +1,150 @@ +#!/usr/local/bin/python2.6 +""" +Usage vcf_gff.py <vcf_file input> <gff_file output> + +Input vcf file format: + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT + +Note: Generating vcf from a single merged bam file, using multiple bam files results in multiple FORMAT columns!!! + +Output gff format: + SEQID SOURCE TYPE START END SCORE STRAND PHASE ATTRIBUTES + +""" +#Copyright 2012 Susan Thomson +#New Zealand Institute for Plant and Food Research +#This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + + +import sys + +in_vcf_file = open(sys.argv[1], 'r') +out_gff_file = open(sys.argv[2], 'w') + +def get_info(attribute_input): + """ + Get the record_type by determining if INDEL is stated in column INFO; get + raw read count + """ + INFO = {} + rec = attribute_input.split(";") + if "INDEL" in rec: + record_type = "INDEL" + rec = rec[1:] + else: + record_type = "SNP" + for entry in rec: + detail = entry.split("=") + INFO[detail[0]] = detail[1] + if INFO.has_key("DP"): + reads = INFO.get("DP") + else: + reads = "NA" + data = (record_type, reads) + return data + +def get_gen(formatcols, ref): + """ + Get info on heterozyosity or homozygosity (could be useful later), + by estimating genotype(GT) calling ref allele=0, variant allele=1 + + """ + formats = [] + sample_dict = {} + genos = "" + format_types = formatcols[0].split(":") + samples = formatcols[1:] + for entry in format_types: + formats.append(entry) + for sample in samples: + var = "" + data = sample.split(":") + sample_dict = dict(zip(formats, data)) + if sample_dict.has_key("DP"): + reads = sample_dict.get("DP") + else: + reads = "NA" + if sample_dict.has_key("NF"): + """ + polysnp tool output + """ + freq = sample_dict.get("NF") + variants = freq.split("|") + if int(variants[0]) >= 1: + var += "A" + if int(variants[1]) >= 1: + var += "C" + if int(variants[2]) >= 1: + var += "G" + if int(variants[3]) >= 1: + var += "T" + if var == "": + gen = "NA" + elif var == ref: + gen = "HOM_ref" + elif len(var) >= 2: + gen = "HET" + else: + gen = "HOM_mut" + elif sample_dict.has_key("GT"): + """ + mpileup output, recommend ALWAYS have GT, note this only good scoring for diploids too! + """ + genotypes = sample_dict.get("GT") + if genotypes == "1/1": + gen = "HOM_mut" + if genotypes == "0/1": + gen = "HET" + if genotypes == "0/0": + gen = "HOM_ref" + else: + gen = "NA" + geno = ("%s:%s;" % (reads, gen)) + genos += geno + sample_dict = {} + return genos + +attributes = {} +""" +Get relevant info from vcf file and put to proper gff columns +""" + +for line in in_vcf_file: + if line.startswith("#") == False: + info = line.split() + seqid = info[0].strip() + source = "SAMTOOLS" + start = int(info[1]) + score = info[5] + strand = "." + phase = "." + reference = info[3].strip() + variant = info[4].strip() + attr = get_info(info[7]) + record_type = attr[0] + reads = attr[1] + if record_type == "INDEL": + end = start + len(reference) + else: + end = start + gen = get_gen(info[8:], reference) + out_gff_file.write( + ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\tID=%s:%s:%d;Variant" + + "_seq=%s;Reference_seq=%s;Total_reads=%s:Zygosity=%s\n") % + ( seqid, source,record_type, start, end, score, strand, phase,seqid, + record_type, start, variant, reference, reads, gen)) + +out_gff_file.close() + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/vcf_gff.xml Tue Jul 31 00:33:11 2012 -0400 @@ -0,0 +1,61 @@ +<?xml version="1.0"?> +<tool id="vcf2gff_1" name="Convert vcf to gff"> + <description>Convert vcf to gff</description> + <command interpreter="python">vcf_gff.py $inputVcf $outputfile</command> + <inputs> + <param format="vcf" name="inputVcf" type="data" label="vcf file containing SNP data"/> + </inputs> + <outputs> + <data format="gff3" name="outputfile" /> + </outputs> +<help> + +** Convert vcf to gff3** + +This tool takes vcf output from Samtools mpileup and converts to gff3 format. +It converts a single vcf output file containing output from a single bam file or from multiple bam files. +Read counts and GT scores are used as an indicator of whether a mutation is homozygous or heterozygous and outputs in the INFO section. + +**TIP** +mpileup **must** be run with Genotype Likelihood Computation selected (-g flag)to generate the GT flag in BCF/VCF output. +This then used to estimate the SNP presence in one or more samples. +More info is available in the manual pages at: http://samtools.sourceforge.net/mpileup.shtml + + +**Example** + +--input vcf + +:: + + #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT bam + PGSC0003DMB000000001 430 . A T 3.41 . DP=4;AF1=0.9904;CI95=0.125,1;DP4=0,0,2,0;MQ=17 PL:DP:GT:GQ 32,6,0:2:1/1:23 + PGSC0003DMB000000001 445 . G T 3.41 . DP=4;AF1=0.9904;CI95=0.125,1;DP4=0,0,2,0;MQ=17 PL:DP:GT:GQ 32,6,0:2:1/1:23 + PGSC0003DMB000000001 446 . G A 3.41 . DP=4;AF1=0.9904;CI95=0.125,1;DP4=0,0,2,0;MQ=17 PL:DP:GT:GQ 32,6,0:2:1/1:23 + PGSC0003DMB000000001 452 . C T 3.41 . DP=4;AF1=0.9904;CI95=0.125,1;DP4=0,0,2,0;MQ=17 PL:DP:GT:GQ 32,6,0:2:1/1:23 + +--output gff + +:: + + #CHROM SOURCE TYPE START STOP QUAL STRAND PHASE INFO + PGSC0003DMB000000001 SAMTOOLS SNP 430 430 3.41 . . ID=PGSC0003DMB000000001:SNP:430;Variant_seq=T;Reference_seq=A;Total_reads=4:Zygosity=2:HOM_mut + PGSC0003DMB000000001 SAMTOOLS SNP 445 445 3.41 . . ID=PGSC0003DMB000000001:SNP:445;Variant_seq=T;Reference_seq=G;Total_reads=4:Zygosity=2:HOM_mut + PGSC0003DMB000000001 SAMTOOLS SNP 446 446 3.41 . . ID=PGSC0003DMB000000001:SNP:446;Variant_seq=A;Reference_seq=G;Total_reads=4:Zygosity=2:HOM_mut + PGSC0003DMB000000001 SAMTOOLS SNP 452 452 3.41 . . ID=PGSC0003DMB000000001:SNP:452;Variant_seq=T;Reference_seq=C;Total_reads=4:Zygosity=2:HOM_mut + + + +----------------------- + +*If you use this tool please cite:* + +A Toolkit For Bulk PCR-Based Marker Design From Next-Generation Sequence Data: +Application For Development Of A Framework Linkage Map In Bulb Onion (*Allium cepa* L.) +(2012) + +Samantha Baldwin, Roopashree Revanna, Susan Thomson, Meeghan Pither-Joyce, Kathryn Wright, +Ross Crowhurst, Mark Fiers, Leshi Chen, Richard MacKnight, John A. McCallum + +</help> +</tool> \ No newline at end of file