Mercurial > repos > john-mccallum > pcr_markers
diff vcf_gff.py @ 1:a0689dc29b7f draft
Updated vcf to gff conversion tool
author | john-mccallum |
---|---|
date | Tue, 31 Jul 2012 00:33:11 -0400 |
parents | |
children | 402c3f0fe807 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/vcf_gff.py Tue Jul 31 00:33:11 2012 -0400 @@ -0,0 +1,150 @@ +#!/usr/local/bin/python2.6 +""" +Usage vcf_gff.py <vcf_file input> <gff_file output> + +Input vcf file format: + CHROM POS ID REF ALT QUAL FILTER INFO FORMAT + +Note: Generating vcf from a single merged bam file, using multiple bam files results in multiple FORMAT columns!!! + +Output gff format: + SEQID SOURCE TYPE START END SCORE STRAND PHASE ATTRIBUTES + +""" +#Copyright 2012 Susan Thomson +#New Zealand Institute for Plant and Food Research +#This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + + +import sys + +in_vcf_file = open(sys.argv[1], 'r') +out_gff_file = open(sys.argv[2], 'w') + +def get_info(attribute_input): + """ + Get the record_type by determining if INDEL is stated in column INFO; get + raw read count + """ + INFO = {} + rec = attribute_input.split(";") + if "INDEL" in rec: + record_type = "INDEL" + rec = rec[1:] + else: + record_type = "SNP" + for entry in rec: + detail = entry.split("=") + INFO[detail[0]] = detail[1] + if INFO.has_key("DP"): + reads = INFO.get("DP") + else: + reads = "NA" + data = (record_type, reads) + return data + +def get_gen(formatcols, ref): + """ + Get info on heterozyosity or homozygosity (could be useful later), + by estimating genotype(GT) calling ref allele=0, variant allele=1 + + """ + formats = [] + sample_dict = {} + genos = "" + format_types = formatcols[0].split(":") + samples = formatcols[1:] + for entry in format_types: + formats.append(entry) + for sample in samples: + var = "" + data = sample.split(":") + sample_dict = dict(zip(formats, data)) + if sample_dict.has_key("DP"): + reads = sample_dict.get("DP") + else: + reads = "NA" + if sample_dict.has_key("NF"): + """ + polysnp tool output + """ + freq = sample_dict.get("NF") + variants = freq.split("|") + if int(variants[0]) >= 1: + var += "A" + if int(variants[1]) >= 1: + var += "C" + if int(variants[2]) >= 1: + var += "G" + if int(variants[3]) >= 1: + var += "T" + if var == "": + gen = "NA" + elif var == ref: + gen = "HOM_ref" + elif len(var) >= 2: + gen = "HET" + else: + gen = "HOM_mut" + elif sample_dict.has_key("GT"): + """ + mpileup output, recommend ALWAYS have GT, note this only good scoring for diploids too! + """ + genotypes = sample_dict.get("GT") + if genotypes == "1/1": + gen = "HOM_mut" + if genotypes == "0/1": + gen = "HET" + if genotypes == "0/0": + gen = "HOM_ref" + else: + gen = "NA" + geno = ("%s:%s;" % (reads, gen)) + genos += geno + sample_dict = {} + return genos + +attributes = {} +""" +Get relevant info from vcf file and put to proper gff columns +""" + +for line in in_vcf_file: + if line.startswith("#") == False: + info = line.split() + seqid = info[0].strip() + source = "SAMTOOLS" + start = int(info[1]) + score = info[5] + strand = "." + phase = "." + reference = info[3].strip() + variant = info[4].strip() + attr = get_info(info[7]) + record_type = attr[0] + reads = attr[1] + if record_type == "INDEL": + end = start + len(reference) + else: + end = start + gen = get_gen(info[8:], reference) + out_gff_file.write( + ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\tID=%s:%s:%d;Variant" + + "_seq=%s;Reference_seq=%s;Total_reads=%s:Zygosity=%s\n") % + ( seqid, source,record_type, start, end, score, strand, phase,seqid, + record_type, start, variant, reference, reads, gen)) + +out_gff_file.close() + +