Mercurial > repos > john-mccallum > pcr_markers
annotate vcf_gff.py @ 4:be070a68521e draft
Uploaded workflow from vcf file to CAPS
author | john-mccallum |
---|---|
date | Thu, 18 Oct 2012 19:47:07 -0400 |
parents | 402c3f0fe807 |
children | b321e0517be3 |
rev | line source |
---|---|
1 | 1 #!/usr/local/bin/python2.6 |
2 """ | |
3 Usage vcf_gff.py <vcf_file input> <gff_file output> | |
4 | |
5 Input vcf file format: | |
6 CHROM POS ID REF ALT QUAL FILTER INFO FORMAT | |
7 | |
8 Note: Generating vcf from a single merged bam file, using multiple bam files results in multiple FORMAT columns!!! | |
9 | |
10 Output gff format: | |
11 SEQID SOURCE TYPE START END SCORE STRAND PHASE ATTRIBUTES | |
12 | |
13 """ | |
14 #Copyright 2012 Susan Thomson | |
15 #New Zealand Institute for Plant and Food Research | |
16 #This program is free software: you can redistribute it and/or modify | |
17 # it under the terms of the GNU General Public License as published by | |
18 # the Free Software Foundation, either version 3 of the License, or | |
19 # (at your option) any later version. | |
20 # | |
21 # This program is distributed in the hope that it will be useful, | |
22 # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
23 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
24 # GNU General Public License for more details. | |
25 # | |
26 # You should have received a copy of the GNU General Public License | |
27 # along with this program. If not, see <http://www.gnu.org/licenses/>. | |
28 | |
29 | |
30 import sys | |
31 | |
32 in_vcf_file = open(sys.argv[1], 'r') | |
33 out_gff_file = open(sys.argv[2], 'w') | |
34 | |
35 def get_info(attribute_input): | |
36 """ | |
37 Get the record_type by determining if INDEL is stated in column INFO; get | |
38 raw read count | |
39 """ | |
40 INFO = {} | |
41 rec = attribute_input.split(";") | |
42 if "INDEL" in rec: | |
43 record_type = "INDEL" | |
44 rec = rec[1:] | |
45 else: | |
46 record_type = "SNP" | |
47 for entry in rec: | |
48 detail = entry.split("=") | |
49 INFO[detail[0]] = detail[1] | |
50 if INFO.has_key("DP"): | |
51 reads = INFO.get("DP") | |
52 else: | |
53 reads = "NA" | |
54 data = (record_type, reads) | |
55 return data | |
56 | |
57 def get_gen(formatcols, ref): | |
58 """ | |
59 Get info on heterozyosity or homozygosity (could be useful later), | |
60 by estimating genotype(GT) calling ref allele=0, variant allele=1 | |
61 | |
62 """ | |
63 formats = [] | |
64 sample_dict = {} | |
65 genos = "" | |
66 format_types = formatcols[0].split(":") | |
67 samples = formatcols[1:] | |
68 for entry in format_types: | |
69 formats.append(entry) | |
70 for sample in samples: | |
71 var = "" | |
72 data = sample.split(":") | |
73 sample_dict = dict(zip(formats, data)) | |
74 if sample_dict.has_key("DP"): | |
75 reads = sample_dict.get("DP") | |
76 else: | |
77 reads = "NA" | |
78 if sample_dict.has_key("NF"): | |
79 """ | |
80 polysnp tool output | |
81 """ | |
82 freq = sample_dict.get("NF") | |
83 variants = freq.split("|") | |
84 if int(variants[0]) >= 1: | |
85 var += "A" | |
86 if int(variants[1]) >= 1: | |
87 var += "C" | |
88 if int(variants[2]) >= 1: | |
89 var += "G" | |
90 if int(variants[3]) >= 1: | |
91 var += "T" | |
92 if var == "": | |
93 gen = "NA" | |
94 elif var == ref: | |
95 gen = "HOM_ref" | |
96 elif len(var) >= 2: | |
97 gen = "HET" | |
98 else: | |
99 gen = "HOM_mut" | |
100 elif sample_dict.has_key("GT"): | |
101 """ | |
102 mpileup output, recommend ALWAYS have GT, note this only good scoring for diploids too! | |
103 """ | |
104 genotypes = sample_dict.get("GT") | |
105 if genotypes == "1/1": | |
106 gen = "HOM_mut" | |
107 if genotypes == "0/1": | |
108 gen = "HET" | |
109 if genotypes == "0/0": | |
110 gen = "HOM_ref" | |
111 else: | |
112 gen = "NA" | |
3
402c3f0fe807
Uploaded revised vcf_gff.py from Github to fix bug
john-mccallum
parents:
1
diff
changeset
|
113 geno = ("%s:%s " % (reads, gen)) |
1 | 114 genos += geno |
115 sample_dict = {} | |
116 return genos | |
117 | |
118 attributes = {} | |
119 """ | |
120 Get relevant info from vcf file and put to proper gff columns | |
121 """ | |
122 | |
3
402c3f0fe807
Uploaded revised vcf_gff.py from Github to fix bug
john-mccallum
parents:
1
diff
changeset
|
123 out_gff_file.write("#gff-version 3\n") |
1 | 124 for line in in_vcf_file: |
125 if line.startswith("#") == False: | |
126 info = line.split() | |
127 seqid = info[0].strip() | |
128 source = "SAMTOOLS" | |
129 start = int(info[1]) | |
130 score = info[5] | |
131 strand = "." | |
132 phase = "." | |
133 reference = info[3].strip() | |
134 variant = info[4].strip() | |
135 attr = get_info(info[7]) | |
136 record_type = attr[0] | |
137 reads = attr[1] | |
138 if record_type == "INDEL": | |
139 end = start + len(reference) | |
140 else: | |
141 end = start | |
142 gen = get_gen(info[8:], reference) | |
143 out_gff_file.write( | |
144 ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\tID=%s:%s:%d;Variant" + | |
3
402c3f0fe807
Uploaded revised vcf_gff.py from Github to fix bug
john-mccallum
parents:
1
diff
changeset
|
145 "_seq=%s;Reference_seq=%s;Total_reads=%s;Zygosity=%s\n") % |
1 | 146 ( seqid, source,record_type, start, end, score, strand, phase,seqid, |
147 record_type, start, variant, reference, reads, gen)) | |
148 | |
149 out_gff_file.close() | |
150 | |
151 |