Repository 'filter_bed_on_splice_junctions'
hg clone https://toolshed.g2.bx.psu.edu/repos/jjohnson/filter_bed_on_splice_junctions

Changeset 0:915e5c5e02e6 (2014-02-05)
Commit message:
Uploaded
added:
filter_bed_on_splice_junctions.py
filter_bed_on_splice_junctions.xml
test-data/extended_novel_splice_junctions.bed
test-data/filtered_novel_splice_junctions.bed
test-data/gtf_splice_junctions.bed
test-data/novel_splice_junctions.bed
b
diff -r 000000000000 -r 915e5c5e02e6 filter_bed_on_splice_junctions.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/filter_bed_on_splice_junctions.py Wed Feb 05 08:18:49 2014 -0500
[
@@ -0,0 +1,161 @@
+#!/usr/bin/env python
+"""
+#
+#------------------------------------------------------------------------------
+#                         University of Minnesota
+#         Copyright 2013, Regents of the University of Minnesota
+#------------------------------------------------------------------------------
+# Author:
+#
+#  James E Johnson
+#
+#------------------------------------------------------------------------------
+"""
+
+"""
+Takes 2 bed files as input:  
+  1.  tophat splice junctions.bed (without options:  --GTF and --no-novel-juncs) 
+  2.  tophat splice junctions.bed with options:  --GTF ref.gtf --no-novel-juncs 
+
+Find the splice junctions from the bed file that aren't in the second bed file, and output those lines
+bed file format - http://genome.ucsc.edu/FAQ/FAQformat.html#format1
+"""
+import sys,re,os.path
+import tempfile
+import optparse
+from optparse import OptionParser
+import logging
+
+class BedEntry( object ):
+  def __init__(self, line):
+    self.line = line
+    (chrom,chromStart,chromEnd,name,score,strand,thickStart,thickEnd,itemRgb,blockCount,blockSizes,blockStarts) = line.split('\t')[0:12]
+    self.chrom = chrom
+    self.chromStart = int(chromStart)
+    self.chromEnd = int(chromEnd)
+    self.name = name
+    self.score = int(score)
+    self.strand = strand
+    self.thickStart = int(thickStart)
+    self.thickEnd = int(thickEnd)
+    self.itemRgb = itemRgb
+    self.blockCount = int(blockCount)
+    self.blockSizes = [int(x) for x in blockSizes.split(',')]
+    self.blockStarts = [int(x) for x in blockStarts.split(',')]
+  def get_splice_junctions(self): 
+    splice_juncs = []
+    for i in range(self.blockCount  - 1):
+      splice_junc = "%s:%d_%d" % (self.chrom, self.chromStart + self.blockSizes[i], self.chromStart + self.blockStarts[i+1])
+      splice_juncs.append(splice_junc)
+    return splice_juncs
+  def get_line(self, start_offset = 0, end_offset = 0):
+    if start_offset or end_offset:
+      s_offset = start_offset if start_offset else 0
+      e_offset = end_offset if end_offset else 0
+      if s_offset > self.chromStart:
+        s_offset = self.chromStart
+      chrStart = self.chromStart - s_offset
+      chrEnd = self.chromEnd + e_offset
+      blkSizes = self.blockSizes
+      blkSizes[0] += s_offset
+      blkSizes[-1] += e_offset
+      blkStarts = self.blockStarts
+      for i in range(1,self.blockCount):
+        blkStarts[i] += s_offset
+      items = [str(x) for x in [self.chrom,chrStart,chrEnd,self.name,self.score,self.strand,self.thickStart,self.thickEnd,self.itemRgb,self.blockCount,','.join([str(x) for x in blkSizes]),','.join([str(x) for x in blkStarts])]]
+      return '\t'.join(items) + '\n'
+    return self.line
+
+def __main__():
+  #Parse Command Line
+  parser = optparse.OptionParser()
+  parser.add_option( '-i', '--input', dest='input', help='tophat non-guided junctions.bed file' )
+  parser.add_option( '-g', '--guided_junctions', dest='reference_bed', help='tophat gtf-guided junctions.bed file' )
+  # parser.add_option( '-r', '--reference_gtf', dest='reference_gtf', default=None, help='The Ensembl genome reference GTF' )
+  parser.add_option( '-o', '--output', dest='output', help='The output non-guided junctions.bed with gtf-guided entries removed')
+  parser.add_option( '-n', '--novel_junctions', dest='novel_bed', help='The output with the exons extened' )
+  parser.add_option( '-l', '--leading_bp', dest='leading_bp', type='int', default=None, help='leading number of base pairs to output' )
+  parser.add_option( '-t', '--trailing_bp', dest='trailing_bp', type='int', default=None, help='trailing number of base pairs to output' )
+  parser.add_option( '-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stdout'  )
+  (options, args) = parser.parse_args()
+  # Input files
+  if options.input != None:
+    try:
+      inputPath = os.path.abspath(options.input)
+      inputFile = open(inputPath, 'r')
+    except Exception, e:
+      print >> sys.stderr, "failed: %s" % e
+      exit(2)
+  else:
+    inputFile = sys.stdin
+  # Output files
+  outFile = None
+  novelFile = None
+  if options.novel_bed == None and options.output == None:
+    #write to stdout
+    outFile = sys.stdout
+  else:
+    if options.output != None:
+      try:
+        outPath = os.path.abspath(options.output)
+        outFile = open(outPath, 'w')
+      except Exception, e:
+        print >> sys.stderr, "failed: %s" % e
+        exit(3)
+    if options.novel_bed != None:
+      try:
+        novelPath = os.path.abspath(options.novel_bed)
+        novelFile = open(novelPath, 'w')
+      except Exception, e:
+        print >> sys.stderr, "failed: %s" % e
+        exit(3)
+  if options.leading_bp and options.leading_bp < 0:
+        print >> sys.stderr, "failed: leading_bp must be positive"
+        exit(5)
+  if options.trailing_bp and options.trailing_bp < 0:
+        print >> sys.stderr, "failed: trailing_bp must be positive"
+        exit(5)
+  # Scan bed file with known splice junctions
+  known_splice_junctions = set()
+  if options.reference_bed:
+    try:
+      inFile = open(options.reference_bed)
+      for i, line in enumerate( inFile ):
+        if line.startswith('track'):
+          continue
+        entry = BedEntry(line)
+        for splice_junc in entry.get_splice_junctions():
+          known_splice_junctions.add(splice_junc)
+    except Exception, e:
+      print >> sys.stderr, "failed: Error reading %s - %s" % (options.reference_bed,e)
+    finally:
+      if inFile:
+        inFile.close()
+  # Scan bed file that may have novel splice junctions
+  try:
+    for i, line in enumerate( inputFile ):
+      if line.startswith('track'):
+        if outFile:
+          outFile.write(line)
+        if novelFile:
+          novelFile.write(line)
+        continue
+      novel_junctions = 0
+      entry = BedEntry(line)
+      for splice_junc in entry.get_splice_junctions():
+        if splice_junc not in known_splice_junctions:
+          novel_junctions += 1
+          if novelFile:
+            # output a line with offsets from the splice junction
+            novelFile.write(entry.get_line(start_offset = options.leading_bp, end_offset = options.trailing_bp))
+      if outFile:
+        if novel_junctions > 0:
+          if not novelFile and (options.leading_bp or options.trailing_bp):
+            outFile.write(entry.get_line(start_offset = options.leading_bp, end_offset = options.trailing_bp))
+          else:
+            outFile.write(line)
+  except Exception, e:
+    print >> sys.stderr, "failed: Error reading %s - %s" % (options.input if options.input else 'stdin',e)
+
+if __name__ == "__main__" : __main__()
+
b
diff -r 000000000000 -r 915e5c5e02e6 filter_bed_on_splice_junctions.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/filter_bed_on_splice_junctions.xml Wed Feb 05 08:18:49 2014 -0500
b
@@ -0,0 +1,61 @@
+<?xml version="1.0"?>
+<tool id="filter_bed_on_splice_junctions" name="Filter BED on splice junctions" version="0.0.1">
+  <description>that are not in a reference bed file</description>
+  <command interpreter="python">filter_bed_on_splice_junctions.py  --input "$input_bed" 
+  --guided_junctions "$guided_junctions" 
+  #if $leading_bp:
+    --leading_bp $leading_bp
+  #end if
+  #if $trailing_bp:
+    --trailing_bp $trailing_bp
+  #end if
+  --novel_junctions "$novel_junctions"
+  </command>
+  <inputs>
+    <param name="input_bed" type="data" format="bed" label="BED file" 
+           help="e.g. tophat junctions.bed run without GTF option or no-novel-junctions"/> 
+    <param name="guided_junctions" type="data" format="bed" optional="true" label="reference bed file" 
+           help="e.g. tophat junctions.bed run with GTF option and no-novel-junctions"/> 
+    <param name="leading_bp" type="integer" value="" min="0" optional="true" label="Extend the start position" 
+           help="The number of base pairs to extend the start of the exon before the junction position"/>
+    <param name="trailing_bp" type="integer" value="" min="0" optional="true" label="Extend the end position" 
+           help="The number of base pairs to extend the end of the exon after the junction position"/>
+  </inputs>
+  <stdio>
+    <exit_code range="1:" level="fatal" description="Error" />
+  </stdio>
+  <outputs>
+    <data name="novel_junctions" metadata_source="input_bed" format="bed"/>
+  </outputs>
+  <tests>
+    <test>
+      <param name="input_bed" value="novel_splice_junctions.bed" ftype="bed" dbkey="hg19"/>
+      <param name="guided_junctions" value="gtf_splice_junctions.bed" ftype="bed" dbkey="hg19"/>
+      <param name="leading_bp" value="0"/>
+      <param name="trailing_bp" value="0"/>
+      <output name="novel_junctions" file="filtered_novel_splice_junctions.bed"/>
+    </test>
+    <test>
+      <param name="input_bed" value="novel_splice_junctions.bed" ftype="bed" dbkey="hg19"/>
+      <param name="guided_junctions" value="gtf_splice_junctions.bed" ftype="bed" dbkey="hg19"/>
+      <param name="leading_bp" value="10"/>
+      <param name="trailing_bp" value="10"/>
+      <output name="novel_junctions" file="extended_novel_splice_junctions.bed"/>
+    </test>
+  </tests>
+  <help>
+**Filter BED on splice junctions**
+
+Filter out lines of a BED file that have junctions that are in in the reference bed file of known junctions. 
+The start position of the exon preceding the junction and the end position of the exon after the junction can be extended.  
+This is to compensate for alignments that may not include enough of the exons surrounding the junctions.  
+
+A typical application would be to run tophat twice, 
+once with the --GTF and --no-novel-juncs options for well known splice junctions,
+then a second time without those options to also include novel splice junctions.  
+
+This application would filter out the well splice known junctions 
+from the run intended to find all splice junctions including novel ones.
+
+  </help>
+</tool>
b
diff -r 000000000000 -r 915e5c5e02e6 test-data/extended_novel_splice_junctions.bed
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/extended_novel_splice_junctions.bed Wed Feb 05 08:18:49 2014 -0500
b
@@ -0,0 +1,4 @@
+chr6_dbb_hap3 876056 878085 JUNC00000037 1 - 876066 878075 255,0,0 2 66,55 0,1974
+chr6_dbb_hap3 878246 882141 JUNC00000042 3 - 878256 882131 255,0,0 2 106,84 0,3811
+chr6_dbb_hap3 879761 882145 JUNC00000043 57 - 879771 882135 255,0,0 2 106,88 0,2296
+chr6_dbb_hap3 898749 902619 JUNC00000053 1 - 898759 902609 255,0,0 2 32,89 0,3781
b
diff -r 000000000000 -r 915e5c5e02e6 test-data/filtered_novel_splice_junctions.bed
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/filtered_novel_splice_junctions.bed Wed Feb 05 08:18:49 2014 -0500
b
@@ -0,0 +1,4 @@
+chr6_dbb_hap3 876066 878075 JUNC00000037 1 - 876066 878075 255,0,0 2 56,45 0,1964
+chr6_dbb_hap3 878256 882131 JUNC00000042 3 - 878256 882131 255,0,0 2 96,74 0,3801
+chr6_dbb_hap3 879771 882135 JUNC00000043 57 - 879771 882135 255,0,0 2 96,78 0,2286
+chr6_dbb_hap3 898759 902609 JUNC00000053 1 - 898759 902609 255,0,0 2 22,79 0,3771
b
diff -r 000000000000 -r 915e5c5e02e6 test-data/gtf_splice_junctions.bed
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/gtf_splice_junctions.bed Wed Feb 05 08:18:49 2014 -0500
b
@@ -0,0 +1,20 @@
+chr6_dbb_hap3 874766 875722 JUNC00000001 36 - 874766 875722 255,0,0 2 84,95 0,861
+chr6_dbb_hap3 875681 876089 JUNC00000002 56 - 875681 876089 255,0,0 2 90,96 0,312
+chr6_dbb_hap3 876025 876797 JUNC00000003 82 - 876025 876797 255,0,0 2 97,95 0,677
+chr6_dbb_hap3 876735 877618 JUNC00000004 85 - 876735 877618 255,0,0 2 95,94 0,789
+chr6_dbb_hap3 877523 878127 JUNC00000005 83 - 877523 878127 255,0,0 2 95,97 0,507
+chr6_dbb_hap3 878042 878332 JUNC00000006 58 - 878042 878332 255,0,0 2 96,97 0,193
+chr6_dbb_hap3 878260 879827 JUNC00000007 33 - 878260 879827 255,0,0 2 92,93 0,1474
+chr6_dbb_hap3 882054 883748 JUNC00000008 22 - 882054 883748 255,0,0 2 81,64 0,1630
+chr6_dbb_hap3 883682 884465 JUNC00000009 45 - 883682 884465 255,0,0 2 66,89 0,694
+chr6_dbb_hap3 884526 892332 JUNC00000010 82 - 884526 892332 255,0,0 2 93,97 0,7709
+chr6_dbb_hap3 892335 892952 JUNC00000011 77 - 892335 892952 255,0,0 2 92,66 0,551
+chr6_dbb_hap3 892885 893335 JUNC00000012 107 - 892885 893335 255,0,0 2 67,97 0,353
+chr6_dbb_hap3 893244 894535 JUNC00000013 62 - 893244 894535 255,0,0 2 96,96 0,1195
+chr6_dbb_hap3 894513 895119 JUNC00000014 56 - 894513 895119 255,0,0 2 97,95 0,511
+chr6_dbb_hap3 895064 898717 JUNC00000015 42 - 895064 898717 255,0,0 2 95,97 0,3556
+chr6_dbb_hap3 898708 900279 JUNC00000016 4 - 898708 900279 255,0,0 2 73,58 0,1513
+chr6_dbb_hap3 900221 901683 JUNC00000017 4 - 900221 901683 255,0,0 2 21,91 0,1371
+chr6_dbb_hap3 901688 902616 JUNC00000018 21 - 901688 902616 255,0,0 2 90,86 0,842
+chr6_dbb_hap3 902646 903512 JUNC00000019 7 - 902646 903512 255,0,0 2 88,77 0,789
+chr6_dbb_hap3 992108 992398 JUNC00000020 4 + 992108 992398 255,0,0 2 78,82 0,208
b
diff -r 000000000000 -r 915e5c5e02e6 test-data/novel_splice_junctions.bed
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/novel_splice_junctions.bed Wed Feb 05 08:18:49 2014 -0500
b
@@ -0,0 +1,20 @@
+chr6_dbb_hap3 874766 875722 JUNC00000034 36 - 874766 875722 255,0,0 2 84,95 0,861
+chr6_dbb_hap3 875681 876089 JUNC00000035 56 - 875681 876089 255,0,0 2 90,96 0,312
+chr6_dbb_hap3 876025 876797 JUNC00000036 82 - 876025 876797 255,0,0 2 97,95 0,677
+chr6_dbb_hap3 876066 878075 JUNC00000037 1 - 876066 878075 255,0,0 2 56,45 0,1964
+chr6_dbb_hap3 876735 877618 JUNC00000038 85 - 876735 877618 255,0,0 2 95,94 0,789
+chr6_dbb_hap3 877523 878127 JUNC00000039 83 - 877523 878127 255,0,0 2 95,97 0,507
+chr6_dbb_hap3 878042 878332 JUNC00000040 58 - 878042 878332 255,0,0 2 96,97 0,193
+chr6_dbb_hap3 878260 879827 JUNC00000041 33 - 878260 879827 255,0,0 2 92,93 0,1474
+chr6_dbb_hap3 878256 882131 JUNC00000042 3 - 878256 882131 255,0,0 2 96,74 0,3801
+chr6_dbb_hap3 879771 882135 JUNC00000043 57 - 879771 882135 255,0,0 2 96,78 0,2286
+chr6_dbb_hap3 882057 883748 JUNC00000044 32 - 882057 883748 255,0,0 2 78,64 0,1627
+chr6_dbb_hap3 883682 884465 JUNC00000045 45 - 883682 884465 255,0,0 2 66,89 0,694
+chr6_dbb_hap3 884526 892332 JUNC00000046 82 - 884526 892332 255,0,0 2 93,97 0,7709
+chr6_dbb_hap3 892335 892952 JUNC00000047 77 - 892335 892952 255,0,0 2 92,66 0,551
+chr6_dbb_hap3 892885 893335 JUNC00000048 107 - 892885 893335 255,0,0 2 67,97 0,353
+chr6_dbb_hap3 893244 894535 JUNC00000049 62 - 893244 894535 255,0,0 2 96,96 0,1195
+chr6_dbb_hap3 894513 895119 JUNC00000050 56 - 894513 895119 255,0,0 2 97,95 0,511
+chr6_dbb_hap3 895064 898717 JUNC00000051 42 - 895064 898717 255,0,0 2 95,97 0,3556
+chr6_dbb_hap3 898708 900279 JUNC00000052 4 - 898708 900279 255,0,0 2 73,58 0,1513
+chr6_dbb_hap3 898759 902609 JUNC00000053 1 - 898759 902609 255,0,0 2 22,79 0,3771