changeset 0:915e5c5e02e6 draft default tip

Uploaded
author jjohnson
date Wed, 05 Feb 2014 08:18:49 -0500
parents
children
files filter_bed_on_splice_junctions.py filter_bed_on_splice_junctions.xml test-data/extended_novel_splice_junctions.bed test-data/filtered_novel_splice_junctions.bed test-data/gtf_splice_junctions.bed test-data/novel_splice_junctions.bed
diffstat 6 files changed, 270 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/filter_bed_on_splice_junctions.py	Wed Feb 05 08:18:49 2014 -0500
@@ -0,0 +1,161 @@
+#!/usr/bin/env python
+"""
+#
+#------------------------------------------------------------------------------
+#                         University of Minnesota
+#         Copyright 2013, Regents of the University of Minnesota
+#------------------------------------------------------------------------------
+# Author:
+#
+#  James E Johnson
+#
+#------------------------------------------------------------------------------
+"""
+
+"""
+Takes 2 bed files as input:  
+  1.  tophat splice junctions.bed (without options:  --GTF and --no-novel-juncs) 
+  2.  tophat splice junctions.bed with options:  --GTF ref.gtf --no-novel-juncs 
+
+Find the splice junctions from the bed file that aren't in the second bed file, and output those lines
+bed file format - http://genome.ucsc.edu/FAQ/FAQformat.html#format1
+"""
+import sys,re,os.path
+import tempfile
+import optparse
+from optparse import OptionParser
+import logging
+
+class BedEntry( object ):
+  def __init__(self, line):
+    self.line = line
+    (chrom,chromStart,chromEnd,name,score,strand,thickStart,thickEnd,itemRgb,blockCount,blockSizes,blockStarts) = line.split('\t')[0:12]
+    self.chrom = chrom
+    self.chromStart = int(chromStart)
+    self.chromEnd = int(chromEnd)
+    self.name = name
+    self.score = int(score)
+    self.strand = strand
+    self.thickStart = int(thickStart)
+    self.thickEnd = int(thickEnd)
+    self.itemRgb = itemRgb
+    self.blockCount = int(blockCount)
+    self.blockSizes = [int(x) for x in blockSizes.split(',')]
+    self.blockStarts = [int(x) for x in blockStarts.split(',')]
+  def get_splice_junctions(self): 
+    splice_juncs = []
+    for i in range(self.blockCount  - 1):
+      splice_junc = "%s:%d_%d" % (self.chrom, self.chromStart + self.blockSizes[i], self.chromStart + self.blockStarts[i+1])
+      splice_juncs.append(splice_junc)
+    return splice_juncs
+  def get_line(self, start_offset = 0, end_offset = 0):
+    if start_offset or end_offset:
+      s_offset = start_offset if start_offset else 0
+      e_offset = end_offset if end_offset else 0
+      if s_offset > self.chromStart:
+        s_offset = self.chromStart
+      chrStart = self.chromStart - s_offset
+      chrEnd = self.chromEnd + e_offset
+      blkSizes = self.blockSizes
+      blkSizes[0] += s_offset
+      blkSizes[-1] += e_offset
+      blkStarts = self.blockStarts
+      for i in range(1,self.blockCount):
+        blkStarts[i] += s_offset
+      items = [str(x) for x in [self.chrom,chrStart,chrEnd,self.name,self.score,self.strand,self.thickStart,self.thickEnd,self.itemRgb,self.blockCount,','.join([str(x) for x in blkSizes]),','.join([str(x) for x in blkStarts])]]
+      return '\t'.join(items) + '\n'
+    return self.line
+
+def __main__():
+  #Parse Command Line
+  parser = optparse.OptionParser()
+  parser.add_option( '-i', '--input', dest='input', help='tophat non-guided junctions.bed file' )
+  parser.add_option( '-g', '--guided_junctions', dest='reference_bed', help='tophat gtf-guided junctions.bed file' )
+  # parser.add_option( '-r', '--reference_gtf', dest='reference_gtf', default=None, help='The Ensembl genome reference GTF' )
+  parser.add_option( '-o', '--output', dest='output', help='The output non-guided junctions.bed with gtf-guided entries removed')
+  parser.add_option( '-n', '--novel_junctions', dest='novel_bed', help='The output with the exons extened' )
+  parser.add_option( '-l', '--leading_bp', dest='leading_bp', type='int', default=None, help='leading number of base pairs to output' )
+  parser.add_option( '-t', '--trailing_bp', dest='trailing_bp', type='int', default=None, help='trailing number of base pairs to output' )
+  parser.add_option( '-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stdout'  )
+  (options, args) = parser.parse_args()
+  # Input files
+  if options.input != None:
+    try:
+      inputPath = os.path.abspath(options.input)
+      inputFile = open(inputPath, 'r')
+    except Exception, e:
+      print >> sys.stderr, "failed: %s" % e
+      exit(2)
+  else:
+    inputFile = sys.stdin
+  # Output files
+  outFile = None
+  novelFile = None
+  if options.novel_bed == None and options.output == None:
+    #write to stdout
+    outFile = sys.stdout
+  else:
+    if options.output != None:
+      try:
+        outPath = os.path.abspath(options.output)
+        outFile = open(outPath, 'w')
+      except Exception, e:
+        print >> sys.stderr, "failed: %s" % e
+        exit(3)
+    if options.novel_bed != None:
+      try:
+        novelPath = os.path.abspath(options.novel_bed)
+        novelFile = open(novelPath, 'w')
+      except Exception, e:
+        print >> sys.stderr, "failed: %s" % e
+        exit(3)
+  if options.leading_bp and options.leading_bp < 0:
+        print >> sys.stderr, "failed: leading_bp must be positive"
+        exit(5)
+  if options.trailing_bp and options.trailing_bp < 0:
+        print >> sys.stderr, "failed: trailing_bp must be positive"
+        exit(5)
+  # Scan bed file with known splice junctions
+  known_splice_junctions = set()
+  if options.reference_bed:
+    try:
+      inFile = open(options.reference_bed)
+      for i, line in enumerate( inFile ):
+        if line.startswith('track'):
+          continue
+        entry = BedEntry(line)
+        for splice_junc in entry.get_splice_junctions():
+          known_splice_junctions.add(splice_junc)
+    except Exception, e:
+      print >> sys.stderr, "failed: Error reading %s - %s" % (options.reference_bed,e)
+    finally:
+      if inFile:
+        inFile.close()
+  # Scan bed file that may have novel splice junctions
+  try:
+    for i, line in enumerate( inputFile ):
+      if line.startswith('track'):
+        if outFile:
+          outFile.write(line)
+        if novelFile:
+          novelFile.write(line)
+        continue
+      novel_junctions = 0
+      entry = BedEntry(line)
+      for splice_junc in entry.get_splice_junctions():
+        if splice_junc not in known_splice_junctions:
+          novel_junctions += 1
+          if novelFile:
+            # output a line with offsets from the splice junction
+            novelFile.write(entry.get_line(start_offset = options.leading_bp, end_offset = options.trailing_bp))
+      if outFile:
+        if novel_junctions > 0:
+          if not novelFile and (options.leading_bp or options.trailing_bp):
+            outFile.write(entry.get_line(start_offset = options.leading_bp, end_offset = options.trailing_bp))
+          else:
+            outFile.write(line)
+  except Exception, e:
+    print >> sys.stderr, "failed: Error reading %s - %s" % (options.input if options.input else 'stdin',e)
+
+if __name__ == "__main__" : __main__()
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/filter_bed_on_splice_junctions.xml	Wed Feb 05 08:18:49 2014 -0500
@@ -0,0 +1,61 @@
+<?xml version="1.0"?>
+<tool id="filter_bed_on_splice_junctions" name="Filter BED on splice junctions" version="0.0.1">
+  <description>that are not in a reference bed file</description>
+  <command interpreter="python">filter_bed_on_splice_junctions.py  --input "$input_bed" 
+  --guided_junctions "$guided_junctions" 
+  #if $leading_bp:
+    --leading_bp $leading_bp
+  #end if
+  #if $trailing_bp:
+    --trailing_bp $trailing_bp
+  #end if
+  --novel_junctions "$novel_junctions"
+  </command>
+  <inputs>
+    <param name="input_bed" type="data" format="bed" label="BED file" 
+           help="e.g. tophat junctions.bed run without GTF option or no-novel-junctions"/> 
+    <param name="guided_junctions" type="data" format="bed" optional="true" label="reference bed file" 
+           help="e.g. tophat junctions.bed run with GTF option and no-novel-junctions"/> 
+    <param name="leading_bp" type="integer" value="" min="0" optional="true" label="Extend the start position" 
+           help="The number of base pairs to extend the start of the exon before the junction position"/>
+    <param name="trailing_bp" type="integer" value="" min="0" optional="true" label="Extend the end position" 
+           help="The number of base pairs to extend the end of the exon after the junction position"/>
+  </inputs>
+  <stdio>
+    <exit_code range="1:" level="fatal" description="Error" />
+  </stdio>
+  <outputs>
+    <data name="novel_junctions" metadata_source="input_bed" format="bed"/>
+  </outputs>
+  <tests>
+    <test>
+      <param name="input_bed" value="novel_splice_junctions.bed" ftype="bed" dbkey="hg19"/>
+      <param name="guided_junctions" value="gtf_splice_junctions.bed" ftype="bed" dbkey="hg19"/>
+      <param name="leading_bp" value="0"/>
+      <param name="trailing_bp" value="0"/>
+      <output name="novel_junctions" file="filtered_novel_splice_junctions.bed"/>
+    </test>
+    <test>
+      <param name="input_bed" value="novel_splice_junctions.bed" ftype="bed" dbkey="hg19"/>
+      <param name="guided_junctions" value="gtf_splice_junctions.bed" ftype="bed" dbkey="hg19"/>
+      <param name="leading_bp" value="10"/>
+      <param name="trailing_bp" value="10"/>
+      <output name="novel_junctions" file="extended_novel_splice_junctions.bed"/>
+    </test>
+  </tests>
+  <help>
+**Filter BED on splice junctions**
+
+Filter out lines of a BED file that have junctions that are in in the reference bed file of known junctions. 
+The start position of the exon preceding the junction and the end position of the exon after the junction can be extended.  
+This is to compensate for alignments that may not include enough of the exons surrounding the junctions.  
+
+A typical application would be to run tophat twice, 
+once with the --GTF and --no-novel-juncs options for well known splice junctions,
+then a second time without those options to also include novel splice junctions.  
+
+This application would filter out the well splice known junctions 
+from the run intended to find all splice junctions including novel ones.
+
+  </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/extended_novel_splice_junctions.bed	Wed Feb 05 08:18:49 2014 -0500
@@ -0,0 +1,4 @@
+chr6_dbb_hap3	876056	878085	JUNC00000037	1	-	876066	878075	255,0,0	2	66,55	0,1974
+chr6_dbb_hap3	878246	882141	JUNC00000042	3	-	878256	882131	255,0,0	2	106,84	0,3811
+chr6_dbb_hap3	879761	882145	JUNC00000043	57	-	879771	882135	255,0,0	2	106,88	0,2296
+chr6_dbb_hap3	898749	902619	JUNC00000053	1	-	898759	902609	255,0,0	2	32,89	0,3781
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/filtered_novel_splice_junctions.bed	Wed Feb 05 08:18:49 2014 -0500
@@ -0,0 +1,4 @@
+chr6_dbb_hap3	876066	878075	JUNC00000037	1	-	876066	878075	255,0,0	2	56,45	0,1964
+chr6_dbb_hap3	878256	882131	JUNC00000042	3	-	878256	882131	255,0,0	2	96,74	0,3801
+chr6_dbb_hap3	879771	882135	JUNC00000043	57	-	879771	882135	255,0,0	2	96,78	0,2286
+chr6_dbb_hap3	898759	902609	JUNC00000053	1	-	898759	902609	255,0,0	2	22,79	0,3771
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/gtf_splice_junctions.bed	Wed Feb 05 08:18:49 2014 -0500
@@ -0,0 +1,20 @@
+chr6_dbb_hap3	874766	875722	JUNC00000001	36	-	874766	875722	255,0,0	2	84,95	0,861
+chr6_dbb_hap3	875681	876089	JUNC00000002	56	-	875681	876089	255,0,0	2	90,96	0,312
+chr6_dbb_hap3	876025	876797	JUNC00000003	82	-	876025	876797	255,0,0	2	97,95	0,677
+chr6_dbb_hap3	876735	877618	JUNC00000004	85	-	876735	877618	255,0,0	2	95,94	0,789
+chr6_dbb_hap3	877523	878127	JUNC00000005	83	-	877523	878127	255,0,0	2	95,97	0,507
+chr6_dbb_hap3	878042	878332	JUNC00000006	58	-	878042	878332	255,0,0	2	96,97	0,193
+chr6_dbb_hap3	878260	879827	JUNC00000007	33	-	878260	879827	255,0,0	2	92,93	0,1474
+chr6_dbb_hap3	882054	883748	JUNC00000008	22	-	882054	883748	255,0,0	2	81,64	0,1630
+chr6_dbb_hap3	883682	884465	JUNC00000009	45	-	883682	884465	255,0,0	2	66,89	0,694
+chr6_dbb_hap3	884526	892332	JUNC00000010	82	-	884526	892332	255,0,0	2	93,97	0,7709
+chr6_dbb_hap3	892335	892952	JUNC00000011	77	-	892335	892952	255,0,0	2	92,66	0,551
+chr6_dbb_hap3	892885	893335	JUNC00000012	107	-	892885	893335	255,0,0	2	67,97	0,353
+chr6_dbb_hap3	893244	894535	JUNC00000013	62	-	893244	894535	255,0,0	2	96,96	0,1195
+chr6_dbb_hap3	894513	895119	JUNC00000014	56	-	894513	895119	255,0,0	2	97,95	0,511
+chr6_dbb_hap3	895064	898717	JUNC00000015	42	-	895064	898717	255,0,0	2	95,97	0,3556
+chr6_dbb_hap3	898708	900279	JUNC00000016	4	-	898708	900279	255,0,0	2	73,58	0,1513
+chr6_dbb_hap3	900221	901683	JUNC00000017	4	-	900221	901683	255,0,0	2	21,91	0,1371
+chr6_dbb_hap3	901688	902616	JUNC00000018	21	-	901688	902616	255,0,0	2	90,86	0,842
+chr6_dbb_hap3	902646	903512	JUNC00000019	7	-	902646	903512	255,0,0	2	88,77	0,789
+chr6_dbb_hap3	992108	992398	JUNC00000020	4	+	992108	992398	255,0,0	2	78,82	0,208
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/novel_splice_junctions.bed	Wed Feb 05 08:18:49 2014 -0500
@@ -0,0 +1,20 @@
+chr6_dbb_hap3	874766	875722	JUNC00000034	36	-	874766	875722	255,0,0	2	84,95	0,861
+chr6_dbb_hap3	875681	876089	JUNC00000035	56	-	875681	876089	255,0,0	2	90,96	0,312
+chr6_dbb_hap3	876025	876797	JUNC00000036	82	-	876025	876797	255,0,0	2	97,95	0,677
+chr6_dbb_hap3	876066	878075	JUNC00000037	1	-	876066	878075	255,0,0	2	56,45	0,1964
+chr6_dbb_hap3	876735	877618	JUNC00000038	85	-	876735	877618	255,0,0	2	95,94	0,789
+chr6_dbb_hap3	877523	878127	JUNC00000039	83	-	877523	878127	255,0,0	2	95,97	0,507
+chr6_dbb_hap3	878042	878332	JUNC00000040	58	-	878042	878332	255,0,0	2	96,97	0,193
+chr6_dbb_hap3	878260	879827	JUNC00000041	33	-	878260	879827	255,0,0	2	92,93	0,1474
+chr6_dbb_hap3	878256	882131	JUNC00000042	3	-	878256	882131	255,0,0	2	96,74	0,3801
+chr6_dbb_hap3	879771	882135	JUNC00000043	57	-	879771	882135	255,0,0	2	96,78	0,2286
+chr6_dbb_hap3	882057	883748	JUNC00000044	32	-	882057	883748	255,0,0	2	78,64	0,1627
+chr6_dbb_hap3	883682	884465	JUNC00000045	45	-	883682	884465	255,0,0	2	66,89	0,694
+chr6_dbb_hap3	884526	892332	JUNC00000046	82	-	884526	892332	255,0,0	2	93,97	0,7709
+chr6_dbb_hap3	892335	892952	JUNC00000047	77	-	892335	892952	255,0,0	2	92,66	0,551
+chr6_dbb_hap3	892885	893335	JUNC00000048	107	-	892885	893335	255,0,0	2	67,97	0,353
+chr6_dbb_hap3	893244	894535	JUNC00000049	62	-	893244	894535	255,0,0	2	96,96	0,1195
+chr6_dbb_hap3	894513	895119	JUNC00000050	56	-	894513	895119	255,0,0	2	97,95	0,511
+chr6_dbb_hap3	895064	898717	JUNC00000051	42	-	895064	898717	255,0,0	2	95,97	0,3556
+chr6_dbb_hap3	898708	900279	JUNC00000052	4	-	898708	900279	255,0,0	2	73,58	0,1513
+chr6_dbb_hap3	898759	902609	JUNC00000053	1	-	898759	902609	255,0,0	2	22,79	0,3771