Mercurial > repos > devteam > vcf_extract

diff tools.py @ 0:76ad0b7865b9 draft default tip
Imported from capsule None
author: devteam
date: Mon, 27 Jan 2014 09:26:27 -0500
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools.py	Mon Jan 27 09:26:27 2014 -0500
@@ -0,0 +1,188 @@
+#!/usr/bin/python
+
+import os.path
+import sys
+import vcfPytools
+from vcfPytools import __version__
+
+# Determine whether to output to a file or stdout.
+def setOutput(output):
+  if output == None:
+    outputFile = sys.stdout
+    writeOut = False
+  else:
+    output = os.path.abspath(output)
+    outputFile = open(output, 'w')
+    writeOut = True
+
+  return outputFile, writeOut
+
+# Determine which file has priority for writing out records.
+def setVcfPriority(priorityFile, vcfFiles):
+  if priorityFile == None: priority = 0
+  elif priorityFile == vcfFiles[0]: priority = 1
+  elif priorityFile == vcfFiles[1]: priority = 2
+  elif priorityFile.lower() == "merge": priority = 3
+  else:
+    print >> sys.stderr, "vcf file give priority must be one of the two input vcf files or merge."
+    exit(1)
+
+  return priority
+
+# If the union or intersection of two vcf files is being performed
+# and the output vcf file is to contain the information from both
+# files, the headers need to be merged to ensure that all info and
+# format entries have an explanation.
+def mergeHeaders(v1, v2, v3):
+
+# If either file does not have a header, terminate the program.
+# In order to merge the headers, the different fields must be
+# checked to ensure the files are compatible.
+  if not v1.hasHeader or not v2.hasHeader:
+    print >> sys.stderr, "Both vcf files must have a header in order to merge data sets."
+    exit(1)
+
+  v3.infoHeaderTags = v1.infoHeaderTags.copy()
+  v3.formatHeaderTags = v1.formatHeaderTags.copy()
+  v3.numberDataSets = v1.numberDataSets
+  v3.includedDataSets = v1.includedDataSets.copy()
+  v3.headerText = v1.headerText
+  v3.headerTitles = v1.headerTitles
+  v3.infoHeaderString = v1.infoHeaderString.copy()
+  v3.formatHeaderString = v1.formatHeaderString.copy()
+
+# Merge the info field descriptions.
+  for tag in v2.infoHeaderTags:
+    if v1.infoHeaderTags.has_key(tag):
+      if v1.infoHeaderTags[tag][0] != v2.infoHeaderTags[tag][0] or \
+         v1.infoHeaderTags[tag][1] != v2.infoHeaderTags[tag][1]:
+        print v1.infoHeaderTags[tag][0]
+        print v1.infoHeaderTags[tag][1]
+        print v1.infoHeaderTags[tag][2]
+        print >> sys.stderr, "Input vcf files have different definitions for " + tag + " field."
+        exit(1)
+    else: v3.infoHeaderTags[tag] = v2.infoHeaderTags[tag]
+
+# Merge the format field descriptions.
+  for tag in v2.formatHeaderTags:
+    if v1.formatHeaderTags.has_key(tag):
+      if v1.formatHeaderTags[tag][0] != v2.formatHeaderTags[tag][0] or \
+         v1.formatHeaderTags[tag][1] != v2.formatHeaderTags[tag][1]:
+        print >> sys.stderr, "Input vcf files have different definitions for " + tag + " field."
+        exit(1)
+    else: v3.formatHeaderTags[tag] = v2.formatHeaderTags[tag]
+
+# Now check to see if the vcf files contain information from multiple
+# records themselves and create an ordered list in which the data
+# will appear in the file.  For instance, of the first file has
+# already got two sets of data and is being intersected with a file
+# with one set of data, the order of data in the new vcf file will be
+# the two sets from the first file followed by the second, e.g.
+# AB=3/2/4, where the 3 and 2 are from the first file and the 4 is the
+# value of AC from the second vcf.  The header will have a ##FILE for
+# each of the three files, so the origin if the data can be recovered.
+  if v1.numberDataSets == 0:
+    v3.includedDataSets[v3.numberDataSets + 1] = v1.filename
+    v3.numberDataSets += 1
+  if v2.numberDataSets == 0:
+    v3.includedDataSets[v3.numberDataSets + 1] = v2.filename
+    v3.numberDataSets += 1
+  else:
+    for i in range(1, v2.numberDataSets + 1):
+      v3.includedDataSets[v3.numberDataSets + 1] = v2.includedDataSets[i]
+      v3.numberDataSets += 1
+
+# If either of the input files contain multiple data sets (e.g. multiple
+# vcf files have undergone intersection or union calculations and all
+# information has been retained) and the priority isn't set to 'merge',
+# terminate the program.  This is to ensure that the origin of the data
+# doesn't get confused.
+def checkDataSets(v1, v2):
+  if v1.numberDataSets + v2.numberDataSets != 0:
+    print >> sys.stderr, "\nERROR:"
+    print >> sys.stderr, "input vcf file(s) contain data sets from multiple vcf files."
+    print >> sys.stderr, "Further intersection or union operations must include --priority-file merge"
+    print >> sys.stderr, "Other tools may be incompatible with this format."
+    exit(1)
+
+# Write the header to file.
+def writeHeader (outputFile, v, removeGenotypes, taskDescriptor):
+  if not v.hasHeader: 
+    v.headerText = "##fileformat=VCFv4.0\n##source=vcfPytools " + __version__ + "\n"
+    v.headerTitles = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n"
+  outputFile.write(v.headerText) if v.headerText != "" else None
+  print >> outputFile, taskDescriptor
+  for tag in v.infoHeaderString: print >> outputFile, v.infoHeaderString[tag]
+  for tag in v.formatHeaderString: print >> outputFile, v.formatHeaderString[tag]
+
+# Write out a list of files indicating which data set belongs to which file.
+  if v.numberDataSets != 0:
+    for i in range(1, v.numberDataSets + 1):
+      print >> outputFile, "##FILE=<ID=" + str(i) + ",\"" + v.includedDataSets[i] + "\">"
+
+  if removeGenotypes:
+    line = v.headerTitles.rstrip("\n").split("\t")
+    newHeaderTitles = line[0]
+    for i in range(1,8):
+      newHeaderTitles = newHeaderTitles + "\t" + line[i]
+    newHeaderTitles = newHeaderTitles + "\n"
+    outputFile.write( newHeaderTitles )
+  else:
+    outputFile.write( v.headerTitles )
+
+# Check that the two reference sequence lists are identical.
+# If there are a different number or order, the results may
+# not be as expected.
+def checkReferenceSequenceLists(list1, list2):
+  errorMessage = False
+  if len(list1) != len(list2):
+    print >> sys.stderr, "WARNING: Input files contain a different number of reference sequences."
+    errorMessage = True
+  elif list1 != list2:
+    print >> sys.stderr, "WARNING: Input files contain different or differently ordered reference sequences."
+    errorMessage = True
+  if errorMessage:
+    print >> sys.stderr, "Results may not be as expected."
+    print >> sys.stderr, "Ensure that input files have the same reference sequences in the same order."
+    print >> sys.stderr, "Reference sequence lists observed were:\n\t", list1, "\n\t", list2
+
+# Write out a vcf record to file.  The record written depends on the
+# value of 'priority' and could therefore be the record from either
+# of the vcf files, or a combination of them.
+
+def writeVcfRecord(priority, v1, v2, outputFile):
+  if priority == 0:
+    if v1.quality >= v2.quality: outputFile.write(v1.record)
+    else: outputFile.write(v2.record)
+  elif priority == 1: outputFile.write(v1.record)
+  elif priority == 2: outputFile.write(v2.record)
+  elif priority == 3:
+
+# Define the missing entry values (depends on the number of data sets
+# in the file).
+    info = ""
+    missingEntry1 = missingEntry2 = "."
+    for i in range(1, v1.numberDataSets): missingEntry1 += "/."
+    for i in range(1, v2.numberDataSets): missingEntry2 += "/."
+    secondList = v2.infoTags.copy()
+
+# Build up the info field.
+    for tag in v1.infoTags:
+      if secondList.has_key(tag):
+        if v1.infoHeaderTags[tag][1].lower() != "flag": info += tag + "=" + v1.infoTags[tag] + "/" + v2.infoTags[tag] + ";"
+        del secondList[tag]
+      else: 
+        if v1.infoHeaderTags[tag][1].lower() != "flag": info += tag + "=" + v1.infoTags[tag] + "/" + missingEntry2 + ";"
+
+# Now include the info tags that are not populated in the first vcf file.
+    for tag in secondList:
+      if v2.infoHeaderTags[tag][1].lower() != "flag": info += tag + "=" + missingEntry1 + "/" + v2.infoTags[tag] + ";"
+
+# Build the complete record.
+    info = info.rstrip(";")
+    record = v1.referenceSequence + "\t" + str(v1.position) + "\t" + v1.rsid + "\t" + v1.ref + "\t" + \
+             v1.alt + "/" + v2.alt + "\t" + v1.quality + "/" + v2.quality + "\t.\t" + info
+    print >> outputFile, record
+  else:
+    print >> sys.sterr, "Unknown file priority."
+    exit(1)
author	devteam
date	Mon, 27 Jan 2014 09:26:27 -0500
parents
children