Mercurial > repos > john-mccallum > pcr_markers
view vcf_gff.py @ 1:a0689dc29b7f draft
Updated vcf to gff conversion tool
author | john-mccallum |
---|---|
date | Tue, 31 Jul 2012 00:33:11 -0400 |
parents | |
children | 402c3f0fe807 |
line wrap: on
line source
#!/usr/local/bin/python2.6 """ Usage vcf_gff.py <vcf_file input> <gff_file output> Input vcf file format: CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Note: Generating vcf from a single merged bam file, using multiple bam files results in multiple FORMAT columns!!! Output gff format: SEQID SOURCE TYPE START END SCORE STRAND PHASE ATTRIBUTES """ #Copyright 2012 Susan Thomson #New Zealand Institute for Plant and Food Research #This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see <http://www.gnu.org/licenses/>. import sys in_vcf_file = open(sys.argv[1], 'r') out_gff_file = open(sys.argv[2], 'w') def get_info(attribute_input): """ Get the record_type by determining if INDEL is stated in column INFO; get raw read count """ INFO = {} rec = attribute_input.split(";") if "INDEL" in rec: record_type = "INDEL" rec = rec[1:] else: record_type = "SNP" for entry in rec: detail = entry.split("=") INFO[detail[0]] = detail[1] if INFO.has_key("DP"): reads = INFO.get("DP") else: reads = "NA" data = (record_type, reads) return data def get_gen(formatcols, ref): """ Get info on heterozyosity or homozygosity (could be useful later), by estimating genotype(GT) calling ref allele=0, variant allele=1 """ formats = [] sample_dict = {} genos = "" format_types = formatcols[0].split(":") samples = formatcols[1:] for entry in format_types: formats.append(entry) for sample in samples: var = "" data = sample.split(":") sample_dict = dict(zip(formats, data)) if sample_dict.has_key("DP"): reads = sample_dict.get("DP") else: reads = "NA" if sample_dict.has_key("NF"): """ polysnp tool output """ freq = sample_dict.get("NF") variants = freq.split("|") if int(variants[0]) >= 1: var += "A" if int(variants[1]) >= 1: var += "C" if int(variants[2]) >= 1: var += "G" if int(variants[3]) >= 1: var += "T" if var == "": gen = "NA" elif var == ref: gen = "HOM_ref" elif len(var) >= 2: gen = "HET" else: gen = "HOM_mut" elif sample_dict.has_key("GT"): """ mpileup output, recommend ALWAYS have GT, note this only good scoring for diploids too! """ genotypes = sample_dict.get("GT") if genotypes == "1/1": gen = "HOM_mut" if genotypes == "0/1": gen = "HET" if genotypes == "0/0": gen = "HOM_ref" else: gen = "NA" geno = ("%s:%s;" % (reads, gen)) genos += geno sample_dict = {} return genos attributes = {} """ Get relevant info from vcf file and put to proper gff columns """ for line in in_vcf_file: if line.startswith("#") == False: info = line.split() seqid = info[0].strip() source = "SAMTOOLS" start = int(info[1]) score = info[5] strand = "." phase = "." reference = info[3].strip() variant = info[4].strip() attr = get_info(info[7]) record_type = attr[0] reads = attr[1] if record_type == "INDEL": end = start + len(reference) else: end = start gen = get_gen(info[8:], reference) out_gff_file.write( ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\tID=%s:%s:%d;Variant" + "_seq=%s;Reference_seq=%s;Total_reads=%s:Zygosity=%s\n") % ( seqid, source,record_type, start, end, score, strand, phase,seqid, record_type, start, variant, reference, reads, gen)) out_gff_file.close()