1
|
1 #!/usr/local/bin/python2.6
|
|
2 """
|
|
3 Usage vcf_gff.py <vcf_file input> <gff_file output>
|
|
4
|
|
5 Input vcf file format:
|
|
6 CHROM POS ID REF ALT QUAL FILTER INFO FORMAT
|
|
7
|
|
8 Note: Generating vcf from a single merged bam file, using multiple bam files results in multiple FORMAT columns!!!
|
|
9
|
|
10 Output gff format:
|
|
11 SEQID SOURCE TYPE START END SCORE STRAND PHASE ATTRIBUTES
|
|
12
|
|
13 """
|
|
14 #Copyright 2012 Susan Thomson
|
|
15 #New Zealand Institute for Plant and Food Research
|
|
16 #This program is free software: you can redistribute it and/or modify
|
|
17 # it under the terms of the GNU General Public License as published by
|
|
18 # the Free Software Foundation, either version 3 of the License, or
|
|
19 # (at your option) any later version.
|
|
20 #
|
|
21 # This program is distributed in the hope that it will be useful,
|
|
22 # but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
23 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
24 # GNU General Public License for more details.
|
|
25 #
|
|
26 # You should have received a copy of the GNU General Public License
|
|
27 # along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
28
|
|
29
|
|
30 import sys
|
|
31
|
|
32 in_vcf_file = open(sys.argv[1], 'r')
|
|
33 out_gff_file = open(sys.argv[2], 'w')
|
|
34
|
|
35 def get_info(attribute_input):
|
|
36 """
|
|
37 Get the record_type by determining if INDEL is stated in column INFO; get
|
|
38 raw read count
|
|
39 """
|
|
40 INFO = {}
|
|
41 rec = attribute_input.split(";")
|
|
42 if "INDEL" in rec:
|
|
43 record_type = "INDEL"
|
|
44 rec = rec[1:]
|
|
45 else:
|
|
46 record_type = "SNP"
|
|
47 for entry in rec:
|
|
48 detail = entry.split("=")
|
|
49 INFO[detail[0]] = detail[1]
|
|
50 if INFO.has_key("DP"):
|
|
51 reads = INFO.get("DP")
|
|
52 else:
|
|
53 reads = "NA"
|
|
54 data = (record_type, reads)
|
|
55 return data
|
|
56
|
|
57 def get_gen(formatcols, ref):
|
|
58 """
|
|
59 Get info on heterozyosity or homozygosity (could be useful later),
|
|
60 by estimating genotype(GT) calling ref allele=0, variant allele=1
|
|
61
|
|
62 """
|
|
63 formats = []
|
|
64 sample_dict = {}
|
|
65 genos = ""
|
|
66 format_types = formatcols[0].split(":")
|
|
67 samples = formatcols[1:]
|
|
68 for entry in format_types:
|
|
69 formats.append(entry)
|
|
70 for sample in samples:
|
|
71 var = ""
|
|
72 data = sample.split(":")
|
|
73 sample_dict = dict(zip(formats, data))
|
|
74 if sample_dict.has_key("DP"):
|
|
75 reads = sample_dict.get("DP")
|
|
76 else:
|
|
77 reads = "NA"
|
|
78 if sample_dict.has_key("NF"):
|
|
79 """
|
|
80 polysnp tool output
|
|
81 """
|
|
82 freq = sample_dict.get("NF")
|
|
83 variants = freq.split("|")
|
|
84 if int(variants[0]) >= 1:
|
|
85 var += "A"
|
|
86 if int(variants[1]) >= 1:
|
|
87 var += "C"
|
|
88 if int(variants[2]) >= 1:
|
|
89 var += "G"
|
|
90 if int(variants[3]) >= 1:
|
|
91 var += "T"
|
|
92 if var == "":
|
|
93 gen = "NA"
|
|
94 elif var == ref:
|
|
95 gen = "HOM_ref"
|
|
96 elif len(var) >= 2:
|
|
97 gen = "HET"
|
|
98 else:
|
|
99 gen = "HOM_mut"
|
|
100 elif sample_dict.has_key("GT"):
|
|
101 """
|
|
102 mpileup output, recommend ALWAYS have GT, note this only good scoring for diploids too!
|
|
103 """
|
|
104 genotypes = sample_dict.get("GT")
|
|
105 if genotypes == "1/1":
|
|
106 gen = "HOM_mut"
|
|
107 if genotypes == "0/1":
|
|
108 gen = "HET"
|
|
109 if genotypes == "0/0":
|
|
110 gen = "HOM_ref"
|
|
111 else:
|
|
112 gen = "NA"
|
|
113 geno = ("%s:%s;" % (reads, gen))
|
|
114 genos += geno
|
|
115 sample_dict = {}
|
|
116 return genos
|
|
117
|
|
118 attributes = {}
|
|
119 """
|
|
120 Get relevant info from vcf file and put to proper gff columns
|
|
121 """
|
|
122
|
|
123 for line in in_vcf_file:
|
|
124 if line.startswith("#") == False:
|
|
125 info = line.split()
|
|
126 seqid = info[0].strip()
|
|
127 source = "SAMTOOLS"
|
|
128 start = int(info[1])
|
|
129 score = info[5]
|
|
130 strand = "."
|
|
131 phase = "."
|
|
132 reference = info[3].strip()
|
|
133 variant = info[4].strip()
|
|
134 attr = get_info(info[7])
|
|
135 record_type = attr[0]
|
|
136 reads = attr[1]
|
|
137 if record_type == "INDEL":
|
|
138 end = start + len(reference)
|
|
139 else:
|
|
140 end = start
|
|
141 gen = get_gen(info[8:], reference)
|
|
142 out_gff_file.write(
|
|
143 ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\tID=%s:%s:%d;Variant" +
|
|
144 "_seq=%s;Reference_seq=%s;Total_reads=%s:Zygosity=%s\n") %
|
|
145 ( seqid, source,record_type, start, end, score, strand, phase,seqid,
|
|
146 record_type, start, variant, reference, reads, gen))
|
|
147
|
|
148 out_gff_file.close()
|
|
149
|
|
150
|