view filter_bed_on_splice_junctions.py @ 0:915e5c5e02e6 draft default tip

Uploaded
author jjohnson
date Wed, 05 Feb 2014 08:18:49 -0500
parents
children
line wrap: on
line source

#!/usr/bin/env python
"""
#
#------------------------------------------------------------------------------
#                         University of Minnesota
#         Copyright 2013, Regents of the University of Minnesota
#------------------------------------------------------------------------------
# Author:
#
#  James E Johnson
#
#------------------------------------------------------------------------------
"""

"""
Takes 2 bed files as input:  
  1.  tophat splice junctions.bed (without options:  --GTF and --no-novel-juncs) 
  2.  tophat splice junctions.bed with options:  --GTF ref.gtf --no-novel-juncs 

Find the splice junctions from the bed file that aren't in the second bed file, and output those lines
bed file format - http://genome.ucsc.edu/FAQ/FAQformat.html#format1
"""
import sys,re,os.path
import tempfile
import optparse
from optparse import OptionParser
import logging

class BedEntry( object ):
  def __init__(self, line):
    self.line = line
    (chrom,chromStart,chromEnd,name,score,strand,thickStart,thickEnd,itemRgb,blockCount,blockSizes,blockStarts) = line.split('\t')[0:12]
    self.chrom = chrom
    self.chromStart = int(chromStart)
    self.chromEnd = int(chromEnd)
    self.name = name
    self.score = int(score)
    self.strand = strand
    self.thickStart = int(thickStart)
    self.thickEnd = int(thickEnd)
    self.itemRgb = itemRgb
    self.blockCount = int(blockCount)
    self.blockSizes = [int(x) for x in blockSizes.split(',')]
    self.blockStarts = [int(x) for x in blockStarts.split(',')]
  def get_splice_junctions(self): 
    splice_juncs = []
    for i in range(self.blockCount  - 1):
      splice_junc = "%s:%d_%d" % (self.chrom, self.chromStart + self.blockSizes[i], self.chromStart + self.blockStarts[i+1])
      splice_juncs.append(splice_junc)
    return splice_juncs
  def get_line(self, start_offset = 0, end_offset = 0):
    if start_offset or end_offset:
      s_offset = start_offset if start_offset else 0
      e_offset = end_offset if end_offset else 0
      if s_offset > self.chromStart:
        s_offset = self.chromStart
      chrStart = self.chromStart - s_offset
      chrEnd = self.chromEnd + e_offset
      blkSizes = self.blockSizes
      blkSizes[0] += s_offset
      blkSizes[-1] += e_offset
      blkStarts = self.blockStarts
      for i in range(1,self.blockCount):
        blkStarts[i] += s_offset
      items = [str(x) for x in [self.chrom,chrStart,chrEnd,self.name,self.score,self.strand,self.thickStart,self.thickEnd,self.itemRgb,self.blockCount,','.join([str(x) for x in blkSizes]),','.join([str(x) for x in blkStarts])]]
      return '\t'.join(items) + '\n'
    return self.line

def __main__():
  #Parse Command Line
  parser = optparse.OptionParser()
  parser.add_option( '-i', '--input', dest='input', help='tophat non-guided junctions.bed file' )
  parser.add_option( '-g', '--guided_junctions', dest='reference_bed', help='tophat gtf-guided junctions.bed file' )
  # parser.add_option( '-r', '--reference_gtf', dest='reference_gtf', default=None, help='The Ensembl genome reference GTF' )
  parser.add_option( '-o', '--output', dest='output', help='The output non-guided junctions.bed with gtf-guided entries removed')
  parser.add_option( '-n', '--novel_junctions', dest='novel_bed', help='The output with the exons extened' )
  parser.add_option( '-l', '--leading_bp', dest='leading_bp', type='int', default=None, help='leading number of base pairs to output' )
  parser.add_option( '-t', '--trailing_bp', dest='trailing_bp', type='int', default=None, help='trailing number of base pairs to output' )
  parser.add_option( '-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stdout'  )
  (options, args) = parser.parse_args()
  # Input files
  if options.input != None:
    try:
      inputPath = os.path.abspath(options.input)
      inputFile = open(inputPath, 'r')
    except Exception, e:
      print >> sys.stderr, "failed: %s" % e
      exit(2)
  else:
    inputFile = sys.stdin
  # Output files
  outFile = None
  novelFile = None
  if options.novel_bed == None and options.output == None:
    #write to stdout
    outFile = sys.stdout
  else:
    if options.output != None:
      try:
        outPath = os.path.abspath(options.output)
        outFile = open(outPath, 'w')
      except Exception, e:
        print >> sys.stderr, "failed: %s" % e
        exit(3)
    if options.novel_bed != None:
      try:
        novelPath = os.path.abspath(options.novel_bed)
        novelFile = open(novelPath, 'w')
      except Exception, e:
        print >> sys.stderr, "failed: %s" % e
        exit(3)
  if options.leading_bp and options.leading_bp < 0:
        print >> sys.stderr, "failed: leading_bp must be positive"
        exit(5)
  if options.trailing_bp and options.trailing_bp < 0:
        print >> sys.stderr, "failed: trailing_bp must be positive"
        exit(5)
  # Scan bed file with known splice junctions
  known_splice_junctions = set()
  if options.reference_bed:
    try:
      inFile = open(options.reference_bed)
      for i, line in enumerate( inFile ):
        if line.startswith('track'):
          continue
        entry = BedEntry(line)
        for splice_junc in entry.get_splice_junctions():
          known_splice_junctions.add(splice_junc)
    except Exception, e:
      print >> sys.stderr, "failed: Error reading %s - %s" % (options.reference_bed,e)
    finally:
      if inFile:
        inFile.close()
  # Scan bed file that may have novel splice junctions
  try:
    for i, line in enumerate( inputFile ):
      if line.startswith('track'):
        if outFile:
          outFile.write(line)
        if novelFile:
          novelFile.write(line)
        continue
      novel_junctions = 0
      entry = BedEntry(line)
      for splice_junc in entry.get_splice_junctions():
        if splice_junc not in known_splice_junctions:
          novel_junctions += 1
          if novelFile:
            # output a line with offsets from the splice junction
            novelFile.write(entry.get_line(start_offset = options.leading_bp, end_offset = options.trailing_bp))
      if outFile:
        if novel_junctions > 0:
          if not novelFile and (options.leading_bp or options.trailing_bp):
            outFile.write(entry.get_line(start_offset = options.leading_bp, end_offset = options.trailing_bp))
          else:
            outFile.write(line)
  except Exception, e:
    print >> sys.stderr, "failed: Error reading %s - %s" % (options.input if options.input else 'stdin',e)

if __name__ == "__main__" : __main__()