comparison vcf2gvf.py @ 5:b321e0517be3 draft

Uploaded
author ben-warren
date Thu, 22 May 2014 20:30:19 -0400
parents
children
comparison
equal deleted inserted replaced
4:be070a68521e 5:b321e0517be3
1 #!/usr/bin/env python
2 """
3 Usage vcf2gvf.py <vcf_file input> <gff_file output>
4 """
5 import sys,re
6
7 ##simplify the variant descriptor for GVF
8 def get_vartype(type_str):
9 type_str=type_str.upper()
10 for V in ['COMPLEX', 'MNP', 'DEL', 'INDEL','INS','SNP']:
11 if V in type_str:
12 return V
13 break
14
15
16
17 in_vcf_file = open(sys.argv[1], 'r')
18 out_gff_file = open(sys.argv[2], 'w')
19 out_gff_file.write("#gff-version 3\n") ####gvf-version 1.06
20
21 ##only deal with tidy source lines, samtools is special needs
22 source_pat = re.compile('##source=(\S+)', re.IGNORECASE)
23
24
25 while True:
26 line=in_vcf_file.readline()
27 if line[0]!='#':
28 break
29 elif line[2:10]=='samtools':
30 source_type='SAMTOOLS'
31 break
32 else:
33 m=source_pat.match(line)
34 if m:
35 source_type=m.group(1).upper()
36 break
37
38
39 ##now read the data
40 ##samtools only distinguishes SNPs and indels
41 ##This is much complicated in highly heterozygous systems
42 ##see http://www.sequenceontology.org/miso/current_release/term/SO:0001059
43 ##GVF column 3 requires a type from this ontology
44 ###vcf type may be snp, mnp, ins, del, or complex.
45 ##for current purposes map multiple types to most complex state
46 ##ie SNP,SNP -> SNP; SNP,MNP-> MNP; SNP,complex
47 try:
48 for line in in_vcf_file:
49 if line[0] != '#':
50 var=line.split() ##see if var[7] has indel at start. If so add TYPE=INDEL for samtools case
51 info_field=var[7].split(';')
52 if info_field[0]=='INDEL':
53 info_field[0]="TYPE=INDEL"
54 info_dict=dict(X.split('=') for X in info_field)
55 ##if no TYPE key, then add TYPE=SNP for samtools case
56 if not info_dict.has_key('TYPE'):
57 info_dict['TYPE']='SNP';
58 var_type=get_vartype(info_dict['TYPE'])
59 ID=":".join([var[0],source_type,var_type,var[1]])
60 start=int(var[1])
61 length=len(var[3]) ## using reference length in this case
62 attributes=";".join(['ID='+ID,'Reference_seq='+var[3],'Variant_seq='+var[4]])
63 output_line=[var[0], source_type, var_type, str(start), str(start+length-1) ,'.','.','.',attributes,"\n"]
64 out_gff_file.write("\t".join([str(X) for X in output_line]))
65 finally:
66 in_vcf_file.close()
67 out_gff_file.close()
68