0
|
1 #!/usr/bin/python
|
|
2
|
|
3 import os.path
|
|
4 import sys
|
|
5 import optparse
|
|
6
|
|
7 import vcfClass
|
|
8 from vcfClass import *
|
|
9
|
|
10 import tools
|
|
11 from tools import *
|
|
12
|
|
13 if __name__ == "__main__":
|
|
14 main()
|
|
15
|
|
16 # Check that the reference and alternate in the dbsnp vcf file match those
|
|
17 # from the input vcf file.
|
|
18 def checkRefAlt(vcfRef, vcfAlt, dbsnpRef, dbsnpAlt, ref, position, annotation):
|
|
19 text = "WARNING: ref and alt alleles differ between vcf and " + annotation + " " + ref + ":" + str(position) + " vcf: " + \
|
|
20 vcfRef + "/" + vcfAlt + ", dbsnp: " + dbsnpRef + "/" + dbsnpAlt
|
|
21
|
|
22 allelesAgree = True
|
|
23 if vcfRef.lower() != dbsnpRef.lower():
|
|
24 if vcfRef.lower() != dbsnpAlt.lower():
|
|
25 #print >> sys.stderr, text
|
|
26 allelesAgree = False
|
|
27 else:
|
|
28 if vcfAlt.lower() != dbsnpAlt.lower():
|
|
29 #print >> sys.stderr, text
|
|
30 allelesAgree = False
|
|
31
|
|
32 return allelesAgree
|
|
33
|
|
34 # Intersect two vcf files. It is assumed that the two files are
|
|
35 # sorted by genomic coordinates and the reference sequences are
|
|
36 # in the same order.
|
|
37 def annotateVcf(v, d, outputFile, annotation):
|
|
38 success1 = v.getRecord()
|
|
39 success2 = d.getRecord()
|
|
40 currentReferenceSequence = v.referenceSequence
|
|
41
|
|
42 # Finish when the end of the first file has been reached.
|
|
43 while success1:
|
|
44
|
|
45 # If the end of the dbsnp vcf file is reached, write out the
|
|
46 # remaining records from the vcf file.
|
|
47 if not success2:
|
|
48 outputFile.write(v.record)
|
|
49 success1 = v.getRecord()
|
|
50
|
|
51 if v.referenceSequence == d.referenceSequence and v.referenceSequence == currentReferenceSequence:
|
|
52 if v.position == d.position:
|
|
53 allelesAgree = checkRefAlt(v.ref, v.alt, d.ref, d.alt, v.referenceSequence, v.position, annotation)
|
|
54 if annotation == "dbsnp": v.rsid = d.getDbsnpInfo()
|
|
55 elif annotation == "hapmap":
|
|
56 if allelesAgree: v.info += ";HM3"
|
|
57 else: v.info += ";HM3A"
|
|
58 record = v.buildRecord(False)
|
|
59 outputFile.write(record)
|
|
60
|
|
61 success1 = v.getRecord()
|
|
62 success2 = d.getRecord()
|
|
63 elif d.position > v.position: success1 = v.parseVcf(d.referenceSequence, d.position, True, outputFile)
|
|
64 elif v.position > d.position: success2 = d.parseVcf(v.referenceSequence, v.position, False, None)
|
|
65 else:
|
|
66 if v.referenceSequence == currentReferenceSequence: success1 = v.parseVcf(d.referenceSequence, d.position, True, outputFile)
|
|
67 elif d.referenceSequence == currentReferenceSequence: success2 = d.parseVcf(v.referenceSequence, v.position, False, None)
|
|
68
|
|
69 # If the last record for a reference sequence is the same for both vcf
|
|
70 # files, they will both have referenceSequences different from the
|
|
71 # current reference sequence. Change the reference sequence to reflect
|
|
72 # this and proceed.
|
|
73 else:
|
|
74 if v.referenceSequence != d.referenceSequence:
|
|
75 print >> sys.stderr, "ERROR: Reference sequences for both files are unexpectedly different."
|
|
76 print >> sys.stderr, "Check that both files contain records for the following reference sequences:"
|
|
77 print >> sys.stderr, "\t", v.referenceSequence, " and ", d.referenceSequence
|
|
78 exit(1)
|
|
79 currentReferenceSequence = v.referenceSequence
|
|
80
|
|
81 def main():
|
|
82
|
|
83 # Parse the command line options
|
|
84 usage = "Usage: vcfPytools.py annotate [options]"
|
|
85 parser = optparse.OptionParser(usage = usage)
|
|
86 parser.add_option("-i", "--in",
|
|
87 action="store", type="string",
|
|
88 dest="vcfFile", help="input vcf files")
|
|
89 parser.add_option("-d", "--dbsnp",
|
|
90 action="store", type="string",
|
|
91 dest="dbsnpFile", help="input dbsnp vcf file")
|
|
92 parser.add_option("-m", "--hapmap",
|
|
93 action="store", type="string",
|
|
94 dest="hapmapFile", help="input hapmap vcf file")
|
|
95 parser.add_option("-o", "--out",
|
|
96 action="store", type="string",
|
|
97 dest="output", help="output vcf file")
|
|
98
|
|
99 (options, args) = parser.parse_args()
|
|
100
|
|
101 # Check that a single vcf file is given.
|
|
102 if options.vcfFile == None:
|
|
103 parser.print_help()
|
|
104 print >> sys.stderr, "\nInput vcf file (--in, -i) is required for dbsnp annotation."
|
|
105 exit(1)
|
|
106
|
|
107 # Check that either a hapmap or a dbsnp vcf file is included.
|
|
108 if options.dbsnpFile == None and options.hapmapFile == None:
|
|
109 parser.print_help()
|
|
110 print >> sys.stderr, "\ndbSNP or hapmap vcf file is required (--dbsnp, -d, --hapmap, -h)."
|
|
111 exit(1)
|
|
112 elif options.dbsnpFile != None and options.hapmapFile != None:
|
|
113 parser.print_help()
|
|
114 print >> sys.stderr, "\ndbSNP or hapmap vcf file is required, not both (--dbsnp, -d, --hapmap, -h)."
|
|
115 exit(1)
|
|
116
|
|
117 # Set the output file to stdout if no output file was specified.
|
|
118 outputFile, writeOut = setOutput(options.output) # tools.py
|
|
119
|
|
120 v = vcf() # Define vcf object.
|
|
121 d = vcf() # Define dbsnp/hapmap vcf object.
|
|
122 if options.dbsnpFile:
|
|
123 d.dbsnpVcf = True
|
|
124 annotationFile = options.dbsnpFile
|
|
125 annotation = "dbsnp"
|
|
126 elif options.hapmapFile:
|
|
127 d.hapmapVcf = True
|
|
128 annotationFile = options.hapmapFile
|
|
129 annotation = "hapmap"
|
|
130
|
|
131 # Open the vcf files.
|
|
132 v.openVcf(options.vcfFile)
|
|
133 d.openVcf(annotationFile)
|
|
134
|
|
135 # Read in the header information.
|
|
136 v.parseHeader(options.vcfFile, writeOut)
|
|
137 d.parseHeader(annotationFile, writeOut)
|
|
138
|
|
139 # Add an extra line to the vcf header to indicate the file used for
|
|
140 # performing dbsnp annotation.
|
|
141 taskDescriptor = "##vcfPytools=annotated vcf file with "
|
|
142 if options.dbsnpFile: taskDescriptor += "dbSNP file " + options.dbsnpFile
|
|
143 elif options.hapmapFile:
|
|
144 taskDescriptor += "hapmap file " + options.hapmapFile
|
|
145 v.infoHeaderString["HM3"] = "##INFO=<ID=HM3,Number=0,Type=Flag,Description=\"Hapmap3.2 membership determined from file " + \
|
|
146 options.hapmapFile + "\">"
|
|
147 v.infoHeaderString["HM3A"] = "##INFO=<ID=HM3A,Number=0,Type=Flag,Description=\"Hapmap3.2 membership (with different alleles)" + \
|
|
148 ", determined from file " + options.hapmapFile + "\">"
|
|
149 writeHeader(outputFile, v, False, taskDescriptor) # tools.py
|
|
150
|
|
151 # Annotate the vcf file.
|
|
152 annotateVcf(v, d, outputFile, annotation)
|
|
153
|
|
154 # Check that the input files had the same list of reference sequences.
|
|
155 # If not, it is possible that there were some problems.
|
|
156 checkReferenceSequenceLists(v.referenceSequenceList, d.referenceSequenceList) # tools.py
|
|
157
|
|
158 # Close the vcf files.
|
|
159 v.closeVcf(options.vcfFile)
|
|
160 d.closeVcf(annotationFile)
|
|
161
|
|
162 # End the program.
|
|
163 return 0
|