view vcf_gff.py @ 1:a0689dc29b7f draft

Updated vcf to gff conversion tool
author john-mccallum
date Tue, 31 Jul 2012 00:33:11 -0400
parents
children 402c3f0fe807
line wrap: on
line source

#!/usr/local/bin/python2.6
"""
Usage vcf_gff.py <vcf_file input> <gff_file output>

Input vcf file format:
   CHROM POS ID REF ALT QUAL FILTER INFO FORMAT

Note: Generating vcf from a single merged bam file, using multiple bam files results in multiple FORMAT columns!!!

Output gff format:
    SEQID SOURCE TYPE START END SCORE STRAND PHASE ATTRIBUTES

"""
#Copyright 2012 Susan Thomson
#New Zealand Institute for Plant and Food Research
#This program is free software: you can redistribute it and/or modify
#     it under the terms of the GNU General Public License as published by
#    the Free Software Foundation, either version 3 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program.  If not, see <http://www.gnu.org/licenses/>.


import sys
                                                                              
in_vcf_file = open(sys.argv[1], 'r')
out_gff_file = open(sys.argv[2], 'w')

def get_info(attribute_input):
    """
    Get the record_type by determining if INDEL is stated in column INFO; get
    raw read count
    """
    INFO = {}
    rec = attribute_input.split(";")
    if "INDEL" in rec:
        record_type = "INDEL"
        rec = rec[1:]
    else:
        record_type = "SNP"
    for entry in rec:
        detail = entry.split("=")
        INFO[detail[0]] = detail[1]
    if INFO.has_key("DP"):
        reads = INFO.get("DP")
    else:
        reads = "NA"
    data = (record_type, reads)
    return data

def get_gen(formatcols, ref):
    """
    Get info on heterozyosity or homozygosity (could be useful later),
    by estimating genotype(GT) calling ref allele=0, variant allele=1

    """
    formats = []
    sample_dict = {}
    genos = ""
    format_types = formatcols[0].split(":")
    samples = formatcols[1:]
    for entry in format_types:
        formats.append(entry)
    for sample in samples:
        var = ""
        data = sample.split(":")
        sample_dict = dict(zip(formats, data))
        if sample_dict.has_key("DP"):
            reads = sample_dict.get("DP")
        else:
            reads = "NA"
        if sample_dict.has_key("NF"):
            """
            polysnp tool output
            """
            freq = sample_dict.get("NF")
            variants = freq.split("|")
            if int(variants[0]) >= 1:
                var += "A"
            if int(variants[1]) >= 1:
                var += "C"
            if int(variants[2]) >= 1:
                var += "G"
            if int(variants[3]) >= 1:
                var += "T"
            if var == "":
                gen = "NA"
            elif var == ref:
                gen = "HOM_ref"
            elif len(var) >= 2:
                    gen = "HET"
            else:
                gen = "HOM_mut"
        elif sample_dict.has_key("GT"):
            """
            mpileup output, recommend ALWAYS have GT, note this only good scoring for diploids too!
            """
            genotypes = sample_dict.get("GT")
            if genotypes == "1/1":
                gen = "HOM_mut"
            if genotypes == "0/1":
                gen = "HET"
            if genotypes == "0/0":
                gen = "HOM_ref"
        else:
            gen = "NA"
        geno = ("%s:%s;" % (reads, gen))
        genos += geno
        sample_dict = {}
    return genos
    
attributes = {}
"""
Get relevant info from vcf file and put to proper gff columns
"""

for line in in_vcf_file:
    if line.startswith("#") == False:
        info = line.split()
        seqid = info[0].strip()
        source = "SAMTOOLS"
        start = int(info[1])
        score = info[5]
        strand = "."
        phase = "."
        reference = info[3].strip()
        variant = info[4].strip()
        attr = get_info(info[7])
        record_type = attr[0]
        reads = attr[1]
        if record_type == "INDEL":
            end = start + len(reference)
        else:
            end = start 
        gen = get_gen(info[8:], reference)
        out_gff_file.write(
            ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\tID=%s:%s:%d;Variant" +
             "_seq=%s;Reference_seq=%s;Total_reads=%s:Zygosity=%s\n") %
            ( seqid, source,record_type, start, end, score, strand, phase,seqid, 
              record_type, start, variant, reference, reads, gen))
    
out_gff_file.close()