Mercurial > repos > jjohnson > filter_bed_on_splice_junctions
view filter_bed_on_splice_junctions.py @ 0:915e5c5e02e6 draft default tip
Uploaded
author | jjohnson |
---|---|
date | Wed, 05 Feb 2014 08:18:49 -0500 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/env python """ # #------------------------------------------------------------------------------ # University of Minnesota # Copyright 2013, Regents of the University of Minnesota #------------------------------------------------------------------------------ # Author: # # James E Johnson # #------------------------------------------------------------------------------ """ """ Takes 2 bed files as input: 1. tophat splice junctions.bed (without options: --GTF and --no-novel-juncs) 2. tophat splice junctions.bed with options: --GTF ref.gtf --no-novel-juncs Find the splice junctions from the bed file that aren't in the second bed file, and output those lines bed file format - http://genome.ucsc.edu/FAQ/FAQformat.html#format1 """ import sys,re,os.path import tempfile import optparse from optparse import OptionParser import logging class BedEntry( object ): def __init__(self, line): self.line = line (chrom,chromStart,chromEnd,name,score,strand,thickStart,thickEnd,itemRgb,blockCount,blockSizes,blockStarts) = line.split('\t')[0:12] self.chrom = chrom self.chromStart = int(chromStart) self.chromEnd = int(chromEnd) self.name = name self.score = int(score) self.strand = strand self.thickStart = int(thickStart) self.thickEnd = int(thickEnd) self.itemRgb = itemRgb self.blockCount = int(blockCount) self.blockSizes = [int(x) for x in blockSizes.split(',')] self.blockStarts = [int(x) for x in blockStarts.split(',')] def get_splice_junctions(self): splice_juncs = [] for i in range(self.blockCount - 1): splice_junc = "%s:%d_%d" % (self.chrom, self.chromStart + self.blockSizes[i], self.chromStart + self.blockStarts[i+1]) splice_juncs.append(splice_junc) return splice_juncs def get_line(self, start_offset = 0, end_offset = 0): if start_offset or end_offset: s_offset = start_offset if start_offset else 0 e_offset = end_offset if end_offset else 0 if s_offset > self.chromStart: s_offset = self.chromStart chrStart = self.chromStart - s_offset chrEnd = self.chromEnd + e_offset blkSizes = self.blockSizes blkSizes[0] += s_offset blkSizes[-1] += e_offset blkStarts = self.blockStarts for i in range(1,self.blockCount): blkStarts[i] += s_offset items = [str(x) for x in [self.chrom,chrStart,chrEnd,self.name,self.score,self.strand,self.thickStart,self.thickEnd,self.itemRgb,self.blockCount,','.join([str(x) for x in blkSizes]),','.join([str(x) for x in blkStarts])]] return '\t'.join(items) + '\n' return self.line def __main__(): #Parse Command Line parser = optparse.OptionParser() parser.add_option( '-i', '--input', dest='input', help='tophat non-guided junctions.bed file' ) parser.add_option( '-g', '--guided_junctions', dest='reference_bed', help='tophat gtf-guided junctions.bed file' ) # parser.add_option( '-r', '--reference_gtf', dest='reference_gtf', default=None, help='The Ensembl genome reference GTF' ) parser.add_option( '-o', '--output', dest='output', help='The output non-guided junctions.bed with gtf-guided entries removed') parser.add_option( '-n', '--novel_junctions', dest='novel_bed', help='The output with the exons extened' ) parser.add_option( '-l', '--leading_bp', dest='leading_bp', type='int', default=None, help='leading number of base pairs to output' ) parser.add_option( '-t', '--trailing_bp', dest='trailing_bp', type='int', default=None, help='trailing number of base pairs to output' ) parser.add_option( '-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stdout' ) (options, args) = parser.parse_args() # Input files if options.input != None: try: inputPath = os.path.abspath(options.input) inputFile = open(inputPath, 'r') except Exception, e: print >> sys.stderr, "failed: %s" % e exit(2) else: inputFile = sys.stdin # Output files outFile = None novelFile = None if options.novel_bed == None and options.output == None: #write to stdout outFile = sys.stdout else: if options.output != None: try: outPath = os.path.abspath(options.output) outFile = open(outPath, 'w') except Exception, e: print >> sys.stderr, "failed: %s" % e exit(3) if options.novel_bed != None: try: novelPath = os.path.abspath(options.novel_bed) novelFile = open(novelPath, 'w') except Exception, e: print >> sys.stderr, "failed: %s" % e exit(3) if options.leading_bp and options.leading_bp < 0: print >> sys.stderr, "failed: leading_bp must be positive" exit(5) if options.trailing_bp and options.trailing_bp < 0: print >> sys.stderr, "failed: trailing_bp must be positive" exit(5) # Scan bed file with known splice junctions known_splice_junctions = set() if options.reference_bed: try: inFile = open(options.reference_bed) for i, line in enumerate( inFile ): if line.startswith('track'): continue entry = BedEntry(line) for splice_junc in entry.get_splice_junctions(): known_splice_junctions.add(splice_junc) except Exception, e: print >> sys.stderr, "failed: Error reading %s - %s" % (options.reference_bed,e) finally: if inFile: inFile.close() # Scan bed file that may have novel splice junctions try: for i, line in enumerate( inputFile ): if line.startswith('track'): if outFile: outFile.write(line) if novelFile: novelFile.write(line) continue novel_junctions = 0 entry = BedEntry(line) for splice_junc in entry.get_splice_junctions(): if splice_junc not in known_splice_junctions: novel_junctions += 1 if novelFile: # output a line with offsets from the splice junction novelFile.write(entry.get_line(start_offset = options.leading_bp, end_offset = options.trailing_bp)) if outFile: if novel_junctions > 0: if not novelFile and (options.leading_bp or options.trailing_bp): outFile.write(entry.get_line(start_offset = options.leading_bp, end_offset = options.trailing_bp)) else: outFile.write(line) except Exception, e: print >> sys.stderr, "failed: Error reading %s - %s" % (options.input if options.input else 'stdin',e) if __name__ == "__main__" : __main__()