comparison tools/vcf_tools/annotate.py @ 0:9071e359b9a3

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:37:19 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:9071e359b9a3
1 #!/usr/bin/python
2
3 import os.path
4 import sys
5 import optparse
6
7 import vcfClass
8 from vcfClass import *
9
10 import tools
11 from tools import *
12
13 if __name__ == "__main__":
14 main()
15
16 # Check that the reference and alternate in the dbsnp vcf file match those
17 # from the input vcf file.
18 def checkRefAlt(vcfRef, vcfAlt, dbsnpRef, dbsnpAlt, ref, position, annotation):
19 text = "WARNING: ref and alt alleles differ between vcf and " + annotation + " " + ref + ":" + str(position) + " vcf: " + \
20 vcfRef + "/" + vcfAlt + ", dbsnp: " + dbsnpRef + "/" + dbsnpAlt
21
22 allelesAgree = True
23 if vcfRef.lower() != dbsnpRef.lower():
24 if vcfRef.lower() != dbsnpAlt.lower():
25 #print >> sys.stderr, text
26 allelesAgree = False
27 else:
28 if vcfAlt.lower() != dbsnpAlt.lower():
29 #print >> sys.stderr, text
30 allelesAgree = False
31
32 return allelesAgree
33
34 # Intersect two vcf files. It is assumed that the two files are
35 # sorted by genomic coordinates and the reference sequences are
36 # in the same order.
37 def annotateVcf(v, d, outputFile, annotation):
38 success1 = v.getRecord()
39 success2 = d.getRecord()
40 currentReferenceSequence = v.referenceSequence
41
42 # Finish when the end of the first file has been reached.
43 while success1:
44
45 # If the end of the dbsnp vcf file is reached, write out the
46 # remaining records from the vcf file.
47 if not success2:
48 outputFile.write(v.record)
49 success1 = v.getRecord()
50
51 if v.referenceSequence == d.referenceSequence and v.referenceSequence == currentReferenceSequence:
52 if v.position == d.position:
53 allelesAgree = checkRefAlt(v.ref, v.alt, d.ref, d.alt, v.referenceSequence, v.position, annotation)
54 if annotation == "dbsnp": v.rsid = d.getDbsnpInfo()
55 elif annotation == "hapmap":
56 if allelesAgree: v.info += ";HM3"
57 else: v.info += ";HM3A"
58 record = v.buildRecord(False)
59 outputFile.write(record)
60
61 success1 = v.getRecord()
62 success2 = d.getRecord()
63 elif d.position > v.position: success1 = v.parseVcf(d.referenceSequence, d.position, True, outputFile)
64 elif v.position > d.position: success2 = d.parseVcf(v.referenceSequence, v.position, False, None)
65 else:
66 if v.referenceSequence == currentReferenceSequence: success1 = v.parseVcf(d.referenceSequence, d.position, True, outputFile)
67 elif d.referenceSequence == currentReferenceSequence: success2 = d.parseVcf(v.referenceSequence, v.position, False, None)
68
69 # If the last record for a reference sequence is the same for both vcf
70 # files, they will both have referenceSequences different from the
71 # current reference sequence. Change the reference sequence to reflect
72 # this and proceed.
73 else:
74 if v.referenceSequence != d.referenceSequence:
75 print >> sys.stderr, "ERROR: Reference sequences for both files are unexpectedly different."
76 print >> sys.stderr, "Check that both files contain records for the following reference sequences:"
77 print >> sys.stderr, "\t", v.referenceSequence, " and ", d.referenceSequence
78 exit(1)
79 currentReferenceSequence = v.referenceSequence
80
81 def main():
82
83 # Parse the command line options
84 usage = "Usage: vcfPytools.py annotate [options]"
85 parser = optparse.OptionParser(usage = usage)
86 parser.add_option("-i", "--in",
87 action="store", type="string",
88 dest="vcfFile", help="input vcf files")
89 parser.add_option("-d", "--dbsnp",
90 action="store", type="string",
91 dest="dbsnpFile", help="input dbsnp vcf file")
92 parser.add_option("-m", "--hapmap",
93 action="store", type="string",
94 dest="hapmapFile", help="input hapmap vcf file")
95 parser.add_option("-o", "--out",
96 action="store", type="string",
97 dest="output", help="output vcf file")
98
99 (options, args) = parser.parse_args()
100
101 # Check that a single vcf file is given.
102 if options.vcfFile == None:
103 parser.print_help()
104 print >> sys.stderr, "\nInput vcf file (--in, -i) is required for dbsnp annotation."
105 exit(1)
106
107 # Check that either a hapmap or a dbsnp vcf file is included.
108 if options.dbsnpFile == None and options.hapmapFile == None:
109 parser.print_help()
110 print >> sys.stderr, "\ndbSNP or hapmap vcf file is required (--dbsnp, -d, --hapmap, -h)."
111 exit(1)
112 elif options.dbsnpFile != None and options.hapmapFile != None:
113 parser.print_help()
114 print >> sys.stderr, "\ndbSNP or hapmap vcf file is required, not both (--dbsnp, -d, --hapmap, -h)."
115 exit(1)
116
117 # Set the output file to stdout if no output file was specified.
118 outputFile, writeOut = setOutput(options.output) # tools.py
119
120 v = vcf() # Define vcf object.
121 d = vcf() # Define dbsnp/hapmap vcf object.
122 if options.dbsnpFile:
123 d.dbsnpVcf = True
124 annotationFile = options.dbsnpFile
125 annotation = "dbsnp"
126 elif options.hapmapFile:
127 d.hapmapVcf = True
128 annotationFile = options.hapmapFile
129 annotation = "hapmap"
130
131 # Open the vcf files.
132 v.openVcf(options.vcfFile)
133 d.openVcf(annotationFile)
134
135 # Read in the header information.
136 v.parseHeader(options.vcfFile, writeOut)
137 d.parseHeader(annotationFile, writeOut)
138
139 # Add an extra line to the vcf header to indicate the file used for
140 # performing dbsnp annotation.
141 taskDescriptor = "##vcfPytools=annotated vcf file with "
142 if options.dbsnpFile: taskDescriptor += "dbSNP file " + options.dbsnpFile
143 elif options.hapmapFile:
144 taskDescriptor += "hapmap file " + options.hapmapFile
145 v.infoHeaderString["HM3"] = "##INFO=<ID=HM3,Number=0,Type=Flag,Description=\"Hapmap3.2 membership determined from file " + \
146 options.hapmapFile + "\">"
147 v.infoHeaderString["HM3A"] = "##INFO=<ID=HM3A,Number=0,Type=Flag,Description=\"Hapmap3.2 membership (with different alleles)" + \
148 ", determined from file " + options.hapmapFile + "\">"
149 writeHeader(outputFile, v, False, taskDescriptor) # tools.py
150
151 # Annotate the vcf file.
152 annotateVcf(v, d, outputFile, annotation)
153
154 # Check that the input files had the same list of reference sequences.
155 # If not, it is possible that there were some problems.
156 checkReferenceSequenceLists(v.referenceSequenceList, d.referenceSequenceList) # tools.py
157
158 # Close the vcf files.
159 v.closeVcf(options.vcfFile)
160 d.closeVcf(annotationFile)
161
162 # End the program.
163 return 0