comparison tools/vcf_tools/extract.py @ 0:9071e359b9a3

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:37:19 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:9071e359b9a3
1 #!/usr/bin/python
2
3 import os.path
4 import sys
5 import optparse
6
7 import vcfClass
8 from vcfClass import *
9
10 import tools
11 from tools import *
12
13 if __name__ == "__main__":
14 main()
15
16 def main():
17
18 # Parse the command line options
19 usage = "Usage: vcfPytools.py extract [options]"
20 parser = optparse.OptionParser(usage = usage)
21 parser.add_option("-i", "--in",
22 action="store", type="string",
23 dest="vcfFile", help="input vcf file (stdin for piped vcf)")
24 parser.add_option("-o", "--out",
25 action="store", type="string",
26 dest="output", help="output validation file")
27 parser.add_option("-s", "--reference-sequence",
28 action="store", type="string",
29 dest="referenceSequence", help="extract records from this reference sequence")
30 parser.add_option("-r", "--region",
31 action="store", type="string",
32 dest="region", help="extract records from this region")
33 parser.add_option("-q", "--keep-quality",
34 action="append", type="string", nargs=2,
35 dest="keepQuality", help="keep records containing this quality")
36 parser.add_option("-k", "--keep-info",
37 action="append", type="string",
38 dest="infoKeep", help="keep records containing this info field")
39 parser.add_option("-d", "--discard-info",
40 action="append", type="string",
41 dest="infoDiscard", help="discard records containing this info field")
42 parser.add_option("-p", "--pass-filter",
43 action="store_true", default=False,
44 dest="passFilter", help="discard records whose filter field is not PASS")
45
46 (options, args) = parser.parse_args()
47
48 # Check that a vcf file is given.
49 if options.vcfFile == None:
50 parser.print_help()
51 print >> sys.stderr, "\nInput vcf file (--in, -i) is required."
52 exit(1)
53
54 # Check that either a reference sequence or region is specified,
55 # but not both if not dealing with info fields.
56 if not options.infoKeep and not options.infoDiscard and not options.passFilter and not options.keepQuality:
57 if not options.referenceSequence and not options.region:
58 parser.print_help()
59 print >> sys.stderr, "\nA region (--region, -r) or reference sequence (--reference-sequence, -s) must be supplied"
60 print >> sys.stderr, "if not extracting records based on info strings."
61 exit(1)
62 if options.referenceSequence and options.region:
63 parser.print_help()
64 print >> sys.stderr, "\nEither a region (--region, -r) or reference sequence (--reference-sequence, -s) can be supplied, but not both."
65 exit(1)
66
67 # If a region was supplied, check the format.
68 if options.region:
69 if options.region.find(":") == -1 or options.region.find("..") == -1:
70 print >> sys.stderr, "\nIncorrect format for region string. Required: ref:start..end."
71 exit(1)
72 regionList = options.region.split(":",1)
73 referenceSequence = regionList[0]
74 try: start = int(regionList[1].split("..")[0])
75 except ValueError:
76 print >> sys.stderr, "region start coordinate is not an integer"
77 exit(1)
78 try: end = int(regionList[1].split("..")[1])
79 except ValueError:
80 print >> sys.stderr, "region end coordinate is not an integer"
81 exit(1)
82
83 # Ensure that discard-info and keep-info haven't both been defined.
84 if options.infoKeep and options.infoDiscard:
85 print >> sys.stderr, "Cannot specify fields to keep and discard simultaneously."
86 exit(1)
87
88 # If the --keep-quality argument is used, check that a value and a logical
89 # argument are supplied and that the logical argument is valid.
90
91 if options.keepQuality:
92 for value, logic in options.keepQuality:
93 if logic != "eq" and logic != "lt" and logic != "le" and logic != "gt" and logic != "ge":
94 print >> sys.stderr, "Error with --keep-quality (-q) argument. Must take the following form:"
95 print >> sys.stderr, "\npython vcfPytools extract --in <VCF> --keep-quality <value> <logic>"
96 print >> sys.stderr, "\nwhere logic is one of: eq, le, lt, ge or gt"
97 exit(1)
98 try: qualityValue = float(value)
99 except ValueError:
100 print >> sys.stderr, "Error with --keep-quality (-q) argument. Must take the following form:"
101 print >> sys.stderr, "Quality value must be an integer or float value."
102 exit(1)
103 qualityLogic = logic
104
105 # Set the output file to stdout if no output file was specified.
106 outputFile, writeOut = setOutput(options.output)
107
108 v = vcf() # Define vcf object.
109
110 # Set process info to True if info strings need to be parsed.
111 if options.infoKeep or options.infoDiscard: v.processInfo = True
112
113 # Open the file.
114 v.openVcf(options.vcfFile)
115
116 # Read in the header information.
117 v.parseHeader(options.vcfFile, writeOut)
118 taskDescriptor = "##vcfPytools=extract data"
119 writeHeader(outputFile, v, False, taskDescriptor) # tools.py
120
121 # Read through all the entries and write out records in the correct
122 # reference sequence.
123 while v.getRecord():
124 writeRecord = True
125 if options.referenceSequence and v.referenceSequence != options.referenceSequence: writeRecord = False
126 elif options.region:
127 if v.referenceSequence != referenceSequence: writeRecord = False
128 elif v.position < start or v.position > end: writeRecord = False
129
130 # Only consider these fields if the record is contained within the
131 # specified region.
132 if options.infoKeep and writeRecord:
133 for tag in options.infoKeep:
134 if v.infoTags.has_key(tag):
135 writeRecord = True
136 break
137 if not v.infoTags.has_key(tag): writeRecord = False
138 if options.infoDiscard and writeRecord:
139 for tag in options.infoDiscard:
140 if v.infoTags.has_key(tag): writeRecord = False
141 if options.passFilter and v.filters != "PASS" and writeRecord: writeRecord = False
142 if options.keepQuality:
143 if qualityLogic == "eq" and v.quality != qualityValue: writeRecord = False
144 if qualityLogic == "le" and v.quality > qualityValue: writeRecord = False
145 if qualityLogic == "lt" and v.quality >= qualityValue: writeRecord = False
146 if qualityLogic == "ge" and v.quality < qualityValue: writeRecord = False
147 if qualityLogic == "gt" and v.quality <= qualityValue: writeRecord = False
148
149 if writeRecord: outputFile.write(v.record)
150
151 # Close the file.
152 v.closeVcf(options.vcfFile)
153
154 # Terminate the program cleanly.
155 return 0