diff tools/vcf_tools/extract.py @ 0:9071e359b9a3

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:37:19 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/vcf_tools/extract.py	Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,155 @@
+#!/usr/bin/python
+
+import os.path
+import sys
+import optparse
+
+import vcfClass
+from vcfClass import *
+
+import tools
+from tools import *
+
+if __name__ == "__main__":
+  main()
+
+def main():
+
+# Parse the command line options
+  usage = "Usage: vcfPytools.py extract [options]"
+  parser = optparse.OptionParser(usage = usage)
+  parser.add_option("-i", "--in",
+                    action="store", type="string",
+                    dest="vcfFile", help="input vcf file (stdin for piped vcf)")
+  parser.add_option("-o", "--out",
+                    action="store", type="string",
+                    dest="output", help="output validation file")
+  parser.add_option("-s", "--reference-sequence",
+                    action="store", type="string",
+                    dest="referenceSequence", help="extract records from this reference sequence")
+  parser.add_option("-r", "--region",
+                    action="store", type="string",
+                    dest="region", help="extract records from this region")
+  parser.add_option("-q", "--keep-quality",
+                    action="append", type="string", nargs=2,
+                    dest="keepQuality", help="keep records containing this quality")
+  parser.add_option("-k", "--keep-info",
+                    action="append", type="string",
+                    dest="infoKeep", help="keep records containing this info field")
+  parser.add_option("-d", "--discard-info",
+                    action="append", type="string",
+                    dest="infoDiscard", help="discard records containing this info field")
+  parser.add_option("-p", "--pass-filter",
+                    action="store_true", default=False,
+                    dest="passFilter", help="discard records whose filter field is not PASS")
+
+  (options, args) = parser.parse_args()
+
+# Check that a vcf file is given.
+  if options.vcfFile == None:
+    parser.print_help()
+    print >> sys.stderr, "\nInput vcf file (--in, -i) is required."
+    exit(1)
+
+# Check that either a reference sequence or region is specified,
+# but not both if not dealing with info fields.
+  if not options.infoKeep and not options.infoDiscard and not options.passFilter and not options.keepQuality:
+    if not options.referenceSequence and not options.region:
+      parser.print_help()
+      print >> sys.stderr, "\nA region (--region, -r) or reference sequence (--reference-sequence, -s) must be supplied"
+      print >> sys.stderr, "if not extracting records based on info strings."
+      exit(1)
+  if options.referenceSequence and options.region:
+    parser.print_help()
+    print >> sys.stderr, "\nEither a region (--region, -r) or reference sequence (--reference-sequence, -s) can be supplied, but not both."
+    exit(1)
+
+# If a region was supplied, check the format.
+  if options.region:
+    if options.region.find(":") == -1 or options.region.find("..") == -1:
+      print >> sys.stderr, "\nIncorrect format for region string.  Required: ref:start..end."
+      exit(1)
+    regionList = options.region.split(":",1)
+    referenceSequence = regionList[0]
+    try: start = int(regionList[1].split("..")[0])
+    except ValueError:
+      print >> sys.stderr, "region start coordinate is not an integer"
+      exit(1)
+    try: end = int(regionList[1].split("..")[1])
+    except ValueError:
+      print >> sys.stderr, "region end coordinate is not an integer"
+      exit(1)
+
+# Ensure that discard-info and keep-info haven't both been defined.
+  if options.infoKeep and options.infoDiscard:
+    print >> sys.stderr, "Cannot specify fields to keep and discard simultaneously."
+    exit(1)
+
+# If the --keep-quality argument is used, check that a value and a logical
+# argument are supplied and that the logical argument is valid.
+
+  if options.keepQuality:
+    for value, logic in options.keepQuality:
+      if logic != "eq" and logic != "lt" and logic != "le" and logic != "gt" and logic != "ge":
+        print >> sys.stderr, "Error with --keep-quality (-q) argument.  Must take the following form:"
+        print >> sys.stderr, "\npython vcfPytools extract --in <VCF> --keep-quality <value> <logic>"
+        print >> sys.stderr, "\nwhere logic is one of: eq, le, lt, ge or gt"
+        exit(1)
+    try: qualityValue = float(value)
+    except ValueError:
+      print >> sys.stderr, "Error with --keep-quality (-q) argument.  Must take the following form:"
+      print >> sys.stderr, "Quality value must be an integer or float value."
+      exit(1)
+    qualityLogic = logic
+
+# Set the output file to stdout if no output file was specified.
+  outputFile, writeOut = setOutput(options.output)
+
+  v = vcf() # Define vcf object.
+
+# Set process info to True if info strings need to be parsed.
+  if options.infoKeep or options.infoDiscard: v.processInfo = True
+
+# Open the file.
+  v.openVcf(options.vcfFile)
+
+# Read in the header information.
+  v.parseHeader(options.vcfFile, writeOut)
+  taskDescriptor = "##vcfPytools=extract data"
+  writeHeader(outputFile, v, False, taskDescriptor) # tools.py
+
+# Read through all the entries and write out records in the correct
+# reference sequence.
+  while v.getRecord():
+    writeRecord = True
+    if options.referenceSequence and v.referenceSequence != options.referenceSequence: writeRecord = False
+    elif options.region:
+      if v.referenceSequence != referenceSequence: writeRecord = False
+      elif v.position < start or v.position > end: writeRecord = False
+
+# Only consider these fields if the record is contained within the
+# specified region.
+    if options.infoKeep and writeRecord:
+      for tag in options.infoKeep:
+        if v.infoTags.has_key(tag):
+          writeRecord = True
+          break
+        if not v.infoTags.has_key(tag): writeRecord = False
+    if options.infoDiscard and writeRecord:
+      for tag in options.infoDiscard:
+        if v.infoTags.has_key(tag): writeRecord = False
+    if options.passFilter and v.filters != "PASS" and writeRecord: writeRecord = False
+    if options.keepQuality:
+      if qualityLogic == "eq" and v.quality != qualityValue: writeRecord = False
+      if qualityLogic == "le" and v.quality > qualityValue: writeRecord = False
+      if qualityLogic == "lt" and v.quality >= qualityValue: writeRecord = False
+      if qualityLogic == "ge" and v.quality < qualityValue: writeRecord = False
+      if qualityLogic == "gt" and v.quality <= qualityValue: writeRecord = False
+
+    if writeRecord: outputFile.write(v.record)
+
+# Close the file.
+  v.closeVcf(options.vcfFile)
+
+# Terminate the program cleanly.
+  return 0