# HG changeset patch # User jjohnson # Date 1391606329 18000 # Node ID 915e5c5e02e6805021df19fcf0cf8bd0a7baaf2b Uploaded diff -r 000000000000 -r 915e5c5e02e6 filter_bed_on_splice_junctions.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filter_bed_on_splice_junctions.py Wed Feb 05 08:18:49 2014 -0500 @@ -0,0 +1,161 @@ +#!/usr/bin/env python +""" +# +#------------------------------------------------------------------------------ +# University of Minnesota +# Copyright 2013, Regents of the University of Minnesota +#------------------------------------------------------------------------------ +# Author: +# +# James E Johnson +# +#------------------------------------------------------------------------------ +""" + +""" +Takes 2 bed files as input: + 1. tophat splice junctions.bed (without options: --GTF and --no-novel-juncs) + 2. tophat splice junctions.bed with options: --GTF ref.gtf --no-novel-juncs + +Find the splice junctions from the bed file that aren't in the second bed file, and output those lines +bed file format - http://genome.ucsc.edu/FAQ/FAQformat.html#format1 +""" +import sys,re,os.path +import tempfile +import optparse +from optparse import OptionParser +import logging + +class BedEntry( object ): + def __init__(self, line): + self.line = line + (chrom,chromStart,chromEnd,name,score,strand,thickStart,thickEnd,itemRgb,blockCount,blockSizes,blockStarts) = line.split('\t')[0:12] + self.chrom = chrom + self.chromStart = int(chromStart) + self.chromEnd = int(chromEnd) + self.name = name + self.score = int(score) + self.strand = strand + self.thickStart = int(thickStart) + self.thickEnd = int(thickEnd) + self.itemRgb = itemRgb + self.blockCount = int(blockCount) + self.blockSizes = [int(x) for x in blockSizes.split(',')] + self.blockStarts = [int(x) for x in blockStarts.split(',')] + def get_splice_junctions(self): + splice_juncs = [] + for i in range(self.blockCount - 1): + splice_junc = "%s:%d_%d" % (self.chrom, self.chromStart + self.blockSizes[i], self.chromStart + self.blockStarts[i+1]) + splice_juncs.append(splice_junc) + return splice_juncs + def get_line(self, start_offset = 0, end_offset = 0): + if start_offset or end_offset: + s_offset = start_offset if start_offset else 0 + e_offset = end_offset if end_offset else 0 + if s_offset > self.chromStart: + s_offset = self.chromStart + chrStart = self.chromStart - s_offset + chrEnd = self.chromEnd + e_offset + blkSizes = self.blockSizes + blkSizes[0] += s_offset + blkSizes[-1] += e_offset + blkStarts = self.blockStarts + for i in range(1,self.blockCount): + blkStarts[i] += s_offset + items = [str(x) for x in [self.chrom,chrStart,chrEnd,self.name,self.score,self.strand,self.thickStart,self.thickEnd,self.itemRgb,self.blockCount,','.join([str(x) for x in blkSizes]),','.join([str(x) for x in blkStarts])]] + return '\t'.join(items) + '\n' + return self.line + +def __main__(): + #Parse Command Line + parser = optparse.OptionParser() + parser.add_option( '-i', '--input', dest='input', help='tophat non-guided junctions.bed file' ) + parser.add_option( '-g', '--guided_junctions', dest='reference_bed', help='tophat gtf-guided junctions.bed file' ) + # parser.add_option( '-r', '--reference_gtf', dest='reference_gtf', default=None, help='The Ensembl genome reference GTF' ) + parser.add_option( '-o', '--output', dest='output', help='The output non-guided junctions.bed with gtf-guided entries removed') + parser.add_option( '-n', '--novel_junctions', dest='novel_bed', help='The output with the exons extened' ) + parser.add_option( '-l', '--leading_bp', dest='leading_bp', type='int', default=None, help='leading number of base pairs to output' ) + parser.add_option( '-t', '--trailing_bp', dest='trailing_bp', type='int', default=None, help='trailing number of base pairs to output' ) + parser.add_option( '-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stdout' ) + (options, args) = parser.parse_args() + # Input files + if options.input != None: + try: + inputPath = os.path.abspath(options.input) + inputFile = open(inputPath, 'r') + except Exception, e: + print >> sys.stderr, "failed: %s" % e + exit(2) + else: + inputFile = sys.stdin + # Output files + outFile = None + novelFile = None + if options.novel_bed == None and options.output == None: + #write to stdout + outFile = sys.stdout + else: + if options.output != None: + try: + outPath = os.path.abspath(options.output) + outFile = open(outPath, 'w') + except Exception, e: + print >> sys.stderr, "failed: %s" % e + exit(3) + if options.novel_bed != None: + try: + novelPath = os.path.abspath(options.novel_bed) + novelFile = open(novelPath, 'w') + except Exception, e: + print >> sys.stderr, "failed: %s" % e + exit(3) + if options.leading_bp and options.leading_bp < 0: + print >> sys.stderr, "failed: leading_bp must be positive" + exit(5) + if options.trailing_bp and options.trailing_bp < 0: + print >> sys.stderr, "failed: trailing_bp must be positive" + exit(5) + # Scan bed file with known splice junctions + known_splice_junctions = set() + if options.reference_bed: + try: + inFile = open(options.reference_bed) + for i, line in enumerate( inFile ): + if line.startswith('track'): + continue + entry = BedEntry(line) + for splice_junc in entry.get_splice_junctions(): + known_splice_junctions.add(splice_junc) + except Exception, e: + print >> sys.stderr, "failed: Error reading %s - %s" % (options.reference_bed,e) + finally: + if inFile: + inFile.close() + # Scan bed file that may have novel splice junctions + try: + for i, line in enumerate( inputFile ): + if line.startswith('track'): + if outFile: + outFile.write(line) + if novelFile: + novelFile.write(line) + continue + novel_junctions = 0 + entry = BedEntry(line) + for splice_junc in entry.get_splice_junctions(): + if splice_junc not in known_splice_junctions: + novel_junctions += 1 + if novelFile: + # output a line with offsets from the splice junction + novelFile.write(entry.get_line(start_offset = options.leading_bp, end_offset = options.trailing_bp)) + if outFile: + if novel_junctions > 0: + if not novelFile and (options.leading_bp or options.trailing_bp): + outFile.write(entry.get_line(start_offset = options.leading_bp, end_offset = options.trailing_bp)) + else: + outFile.write(line) + except Exception, e: + print >> sys.stderr, "failed: Error reading %s - %s" % (options.input if options.input else 'stdin',e) + +if __name__ == "__main__" : __main__() + diff -r 000000000000 -r 915e5c5e02e6 filter_bed_on_splice_junctions.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/filter_bed_on_splice_junctions.xml Wed Feb 05 08:18:49 2014 -0500 @@ -0,0 +1,61 @@ + + + that are not in a reference bed file + filter_bed_on_splice_junctions.py --input "$input_bed" + --guided_junctions "$guided_junctions" + #if $leading_bp: + --leading_bp $leading_bp + #end if + #if $trailing_bp: + --trailing_bp $trailing_bp + #end if + --novel_junctions "$novel_junctions" + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**Filter BED on splice junctions** + +Filter out lines of a BED file that have junctions that are in in the reference bed file of known junctions. +The start position of the exon preceding the junction and the end position of the exon after the junction can be extended. +This is to compensate for alignments that may not include enough of the exons surrounding the junctions. + +A typical application would be to run tophat twice, +once with the --GTF and --no-novel-juncs options for well known splice junctions, +then a second time without those options to also include novel splice junctions. + +This application would filter out the well splice known junctions +from the run intended to find all splice junctions including novel ones. + + + diff -r 000000000000 -r 915e5c5e02e6 test-data/extended_novel_splice_junctions.bed --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/extended_novel_splice_junctions.bed Wed Feb 05 08:18:49 2014 -0500 @@ -0,0 +1,4 @@ +chr6_dbb_hap3 876056 878085 JUNC00000037 1 - 876066 878075 255,0,0 2 66,55 0,1974 +chr6_dbb_hap3 878246 882141 JUNC00000042 3 - 878256 882131 255,0,0 2 106,84 0,3811 +chr6_dbb_hap3 879761 882145 JUNC00000043 57 - 879771 882135 255,0,0 2 106,88 0,2296 +chr6_dbb_hap3 898749 902619 JUNC00000053 1 - 898759 902609 255,0,0 2 32,89 0,3781 diff -r 000000000000 -r 915e5c5e02e6 test-data/filtered_novel_splice_junctions.bed --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/filtered_novel_splice_junctions.bed Wed Feb 05 08:18:49 2014 -0500 @@ -0,0 +1,4 @@ +chr6_dbb_hap3 876066 878075 JUNC00000037 1 - 876066 878075 255,0,0 2 56,45 0,1964 +chr6_dbb_hap3 878256 882131 JUNC00000042 3 - 878256 882131 255,0,0 2 96,74 0,3801 +chr6_dbb_hap3 879771 882135 JUNC00000043 57 - 879771 882135 255,0,0 2 96,78 0,2286 +chr6_dbb_hap3 898759 902609 JUNC00000053 1 - 898759 902609 255,0,0 2 22,79 0,3771 diff -r 000000000000 -r 915e5c5e02e6 test-data/gtf_splice_junctions.bed --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/gtf_splice_junctions.bed Wed Feb 05 08:18:49 2014 -0500 @@ -0,0 +1,20 @@ +chr6_dbb_hap3 874766 875722 JUNC00000001 36 - 874766 875722 255,0,0 2 84,95 0,861 +chr6_dbb_hap3 875681 876089 JUNC00000002 56 - 875681 876089 255,0,0 2 90,96 0,312 +chr6_dbb_hap3 876025 876797 JUNC00000003 82 - 876025 876797 255,0,0 2 97,95 0,677 +chr6_dbb_hap3 876735 877618 JUNC00000004 85 - 876735 877618 255,0,0 2 95,94 0,789 +chr6_dbb_hap3 877523 878127 JUNC00000005 83 - 877523 878127 255,0,0 2 95,97 0,507 +chr6_dbb_hap3 878042 878332 JUNC00000006 58 - 878042 878332 255,0,0 2 96,97 0,193 +chr6_dbb_hap3 878260 879827 JUNC00000007 33 - 878260 879827 255,0,0 2 92,93 0,1474 +chr6_dbb_hap3 882054 883748 JUNC00000008 22 - 882054 883748 255,0,0 2 81,64 0,1630 +chr6_dbb_hap3 883682 884465 JUNC00000009 45 - 883682 884465 255,0,0 2 66,89 0,694 +chr6_dbb_hap3 884526 892332 JUNC00000010 82 - 884526 892332 255,0,0 2 93,97 0,7709 +chr6_dbb_hap3 892335 892952 JUNC00000011 77 - 892335 892952 255,0,0 2 92,66 0,551 +chr6_dbb_hap3 892885 893335 JUNC00000012 107 - 892885 893335 255,0,0 2 67,97 0,353 +chr6_dbb_hap3 893244 894535 JUNC00000013 62 - 893244 894535 255,0,0 2 96,96 0,1195 +chr6_dbb_hap3 894513 895119 JUNC00000014 56 - 894513 895119 255,0,0 2 97,95 0,511 +chr6_dbb_hap3 895064 898717 JUNC00000015 42 - 895064 898717 255,0,0 2 95,97 0,3556 +chr6_dbb_hap3 898708 900279 JUNC00000016 4 - 898708 900279 255,0,0 2 73,58 0,1513 +chr6_dbb_hap3 900221 901683 JUNC00000017 4 - 900221 901683 255,0,0 2 21,91 0,1371 +chr6_dbb_hap3 901688 902616 JUNC00000018 21 - 901688 902616 255,0,0 2 90,86 0,842 +chr6_dbb_hap3 902646 903512 JUNC00000019 7 - 902646 903512 255,0,0 2 88,77 0,789 +chr6_dbb_hap3 992108 992398 JUNC00000020 4 + 992108 992398 255,0,0 2 78,82 0,208 diff -r 000000000000 -r 915e5c5e02e6 test-data/novel_splice_junctions.bed --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/novel_splice_junctions.bed Wed Feb 05 08:18:49 2014 -0500 @@ -0,0 +1,20 @@ +chr6_dbb_hap3 874766 875722 JUNC00000034 36 - 874766 875722 255,0,0 2 84,95 0,861 +chr6_dbb_hap3 875681 876089 JUNC00000035 56 - 875681 876089 255,0,0 2 90,96 0,312 +chr6_dbb_hap3 876025 876797 JUNC00000036 82 - 876025 876797 255,0,0 2 97,95 0,677 +chr6_dbb_hap3 876066 878075 JUNC00000037 1 - 876066 878075 255,0,0 2 56,45 0,1964 +chr6_dbb_hap3 876735 877618 JUNC00000038 85 - 876735 877618 255,0,0 2 95,94 0,789 +chr6_dbb_hap3 877523 878127 JUNC00000039 83 - 877523 878127 255,0,0 2 95,97 0,507 +chr6_dbb_hap3 878042 878332 JUNC00000040 58 - 878042 878332 255,0,0 2 96,97 0,193 +chr6_dbb_hap3 878260 879827 JUNC00000041 33 - 878260 879827 255,0,0 2 92,93 0,1474 +chr6_dbb_hap3 878256 882131 JUNC00000042 3 - 878256 882131 255,0,0 2 96,74 0,3801 +chr6_dbb_hap3 879771 882135 JUNC00000043 57 - 879771 882135 255,0,0 2 96,78 0,2286 +chr6_dbb_hap3 882057 883748 JUNC00000044 32 - 882057 883748 255,0,0 2 78,64 0,1627 +chr6_dbb_hap3 883682 884465 JUNC00000045 45 - 883682 884465 255,0,0 2 66,89 0,694 +chr6_dbb_hap3 884526 892332 JUNC00000046 82 - 884526 892332 255,0,0 2 93,97 0,7709 +chr6_dbb_hap3 892335 892952 JUNC00000047 77 - 892335 892952 255,0,0 2 92,66 0,551 +chr6_dbb_hap3 892885 893335 JUNC00000048 107 - 892885 893335 255,0,0 2 67,97 0,353 +chr6_dbb_hap3 893244 894535 JUNC00000049 62 - 893244 894535 255,0,0 2 96,96 0,1195 +chr6_dbb_hap3 894513 895119 JUNC00000050 56 - 894513 895119 255,0,0 2 97,95 0,511 +chr6_dbb_hap3 895064 898717 JUNC00000051 42 - 895064 898717 255,0,0 2 95,97 0,3556 +chr6_dbb_hap3 898708 900279 JUNC00000052 4 - 898708 900279 255,0,0 2 73,58 0,1513 +chr6_dbb_hap3 898759 902609 JUNC00000053 1 - 898759 902609 255,0,0 2 22,79 0,3771