Mercurial > repos > john-mccallum > pcr_markers
annotate vcf_gff.py @ 6:f201e8c6e004 draft default tip
Uploaded
author | ben-warren |
---|---|
date | Mon, 07 Jul 2014 19:28:17 -0400 |
parents | b321e0517be3 |
children |
rev | line source |
---|---|
1 | 1 #!/usr/local/bin/python2.6 |
2 """ | |
3 Usage vcf_gff.py <vcf_file input> <gff_file output> | |
4 | |
5 Input vcf file format: | |
6 CHROM POS ID REF ALT QUAL FILTER INFO FORMAT | |
7 | |
8 Note: Generating vcf from a single merged bam file, using multiple bam files results in multiple FORMAT columns!!! | |
9 | |
10 Output gff format: | |
11 SEQID SOURCE TYPE START END SCORE STRAND PHASE ATTRIBUTES | |
12 | |
13 """ | |
14 #Copyright 2012 Susan Thomson | |
15 #New Zealand Institute for Plant and Food Research | |
16 #This program is free software: you can redistribute it and/or modify | |
17 # it under the terms of the GNU General Public License as published by | |
18 # the Free Software Foundation, either version 3 of the License, or | |
19 # (at your option) any later version. | |
20 # | |
21 # This program is distributed in the hope that it will be useful, | |
22 # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
23 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
24 # GNU General Public License for more details. | |
25 # | |
26 # You should have received a copy of the GNU General Public License | |
27 # along with this program. If not, see <http://www.gnu.org/licenses/>. | |
28 | |
29 | |
30 import sys | |
31 | |
32 in_vcf_file = open(sys.argv[1], 'r') | |
33 out_gff_file = open(sys.argv[2], 'w') | |
34 | |
35 def get_info(attribute_input): | |
36 """ | |
37 Get the record_type by determining if INDEL is stated in column INFO; get | |
38 raw read count | |
39 """ | |
40 INFO = {} | |
41 rec = attribute_input.split(";") | |
42 if "INDEL" in rec: | |
43 record_type = "INDEL" | |
44 rec = rec[1:] | |
45 else: | |
46 record_type = "SNP" | |
47 for entry in rec: | |
48 detail = entry.split("=") | |
5 | 49 if len(detail) < 2: |
50 continue | |
1 | 51 INFO[detail[0]] = detail[1] |
52 if INFO.has_key("DP"): | |
53 reads = INFO.get("DP") | |
54 else: | |
55 reads = "NA" | |
56 data = (record_type, reads) | |
57 return data | |
58 | |
59 def get_gen(formatcols, ref): | |
60 """ | |
61 Get info on heterozyosity or homozygosity (could be useful later), | |
62 by estimating genotype(GT) calling ref allele=0, variant allele=1 | |
63 | |
64 """ | |
65 formats = [] | |
66 sample_dict = {} | |
67 genos = "" | |
68 format_types = formatcols[0].split(":") | |
69 samples = formatcols[1:] | |
70 for entry in format_types: | |
71 formats.append(entry) | |
72 for sample in samples: | |
73 var = "" | |
74 data = sample.split(":") | |
75 sample_dict = dict(zip(formats, data)) | |
76 if sample_dict.has_key("DP"): | |
77 reads = sample_dict.get("DP") | |
78 else: | |
79 reads = "NA" | |
80 if sample_dict.has_key("NF"): | |
81 """ | |
82 polysnp tool output | |
83 """ | |
84 freq = sample_dict.get("NF") | |
85 variants = freq.split("|") | |
86 if int(variants[0]) >= 1: | |
87 var += "A" | |
88 if int(variants[1]) >= 1: | |
89 var += "C" | |
90 if int(variants[2]) >= 1: | |
91 var += "G" | |
92 if int(variants[3]) >= 1: | |
93 var += "T" | |
94 if var == "": | |
95 gen = "NA" | |
96 elif var == ref: | |
97 gen = "HOM_ref" | |
98 elif len(var) >= 2: | |
99 gen = "HET" | |
100 else: | |
101 gen = "HOM_mut" | |
102 elif sample_dict.has_key("GT"): | |
103 """ | |
104 mpileup output, recommend ALWAYS have GT, note this only good scoring for diploids too! | |
105 """ | |
106 genotypes = sample_dict.get("GT") | |
107 if genotypes == "1/1": | |
108 gen = "HOM_mut" | |
109 if genotypes == "0/1": | |
110 gen = "HET" | |
111 if genotypes == "0/0": | |
112 gen = "HOM_ref" | |
5 | 113 try: # set gen to 'NA' if still unset |
114 gen | |
115 except NameError: | |
1 | 116 gen = "NA" |
3
402c3f0fe807
Uploaded revised vcf_gff.py from Github to fix bug
john-mccallum
parents:
1
diff
changeset
|
117 geno = ("%s:%s " % (reads, gen)) |
1 | 118 genos += geno |
119 sample_dict = {} | |
120 return genos | |
121 | |
122 attributes = {} | |
123 """ | |
124 Get relevant info from vcf file and put to proper gff columns | |
125 """ | |
126 | |
3
402c3f0fe807
Uploaded revised vcf_gff.py from Github to fix bug
john-mccallum
parents:
1
diff
changeset
|
127 out_gff_file.write("#gff-version 3\n") |
1 | 128 for line in in_vcf_file: |
129 if line.startswith("#") == False: | |
130 info = line.split() | |
131 seqid = info[0].strip() | |
132 source = "SAMTOOLS" | |
133 start = int(info[1]) | |
134 score = info[5] | |
135 strand = "." | |
136 phase = "." | |
137 reference = info[3].strip() | |
138 variant = info[4].strip() | |
139 attr = get_info(info[7]) | |
140 record_type = attr[0] | |
141 reads = attr[1] | |
142 if record_type == "INDEL": | |
143 end = start + len(reference) | |
144 else: | |
145 end = start | |
146 gen = get_gen(info[8:], reference) | |
147 out_gff_file.write( | |
5 | 148 ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\tID=%s:%s:%s:%d;Variant" + |
3
402c3f0fe807
Uploaded revised vcf_gff.py from Github to fix bug
john-mccallum
parents:
1
diff
changeset
|
149 "_seq=%s;Reference_seq=%s;Total_reads=%s;Zygosity=%s\n") % |
1 | 150 ( seqid, source,record_type, start, end, score, strand, phase,seqid, |
5 | 151 source, record_type, start, variant, reference, reads, gen)) |
152 | |
1 | 153 out_gff_file.close() |
154 | |
155 |