diff featureCounter.py @ 0:ac6218e2b686 draft default tip

Imported from capsule None
author devteam
date Tue, 01 Apr 2014 10:51:34 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/featureCounter.py	Tue Apr 01 10:51:34 2014 -0400
@@ -0,0 +1,146 @@
+#!/usr/bin/env python
+#Guruprasad Ananda
+"""
+Calculate count and coverage of one query on another, and append the Coverage and counts to
+the last four columns as bases covered, percent coverage, number of completely present features, number of partially present/overlapping features.
+
+usage: %prog bed_file_1 bed_file_2 out_file
+    -1, --cols1=N,N,N,N: Columns for chr, start, end, strand in first file
+    -2, --cols2=N,N,N,N: Columns for chr, start, end, strand in second file
+"""
+import sys, fileinput
+from bx.intervals.io import *
+from bx.cookbook import doc_optparse
+from bx.intervals.operations import quicksect
+from galaxy.tools.util.galaxyops import *
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def stop_err(msg):
+    sys.stderr.write(msg)
+    sys.exit()
+
+def counter(node, start, end):
+    global full, partial
+    if node.start <= start and node.maxend > start:
+        if node.end >= end or (node.start == start and end > node.end > start):
+            full += 1
+        elif end > node.end > start:
+            partial += 1
+        if node.left and node.left.maxend > start:
+            counter(node.left, start, end)
+        if node.right:
+            counter(node.right, start, end)
+    elif start < node.start < end:
+        if node.end <= end:
+            full += 1
+        else:
+            partial += 1
+        if node.left and node.left.maxend > start:
+            counter(node.left, start, end)
+        if node.right:
+            counter(node.right, start, end)
+    else:
+        if node.left:
+            counter(node.left, start, end)
+
+def count_coverage( readers, comments=True ):
+    primary = readers[0]
+    secondary = readers[1]
+    secondary_copy = readers[2]
+    
+    rightTree = quicksect.IntervalTree()
+    for item in secondary:
+        if type( item ) is GenomicInterval:
+            rightTree.insert( item, secondary.linenum, item.fields )
+    
+    bitsets = secondary_copy.binned_bitsets()
+    
+    global full, partial
+    
+    for interval in primary:
+        if type( interval ) is Header:
+            yield interval
+        if type( interval ) is Comment and comments:
+            yield interval
+        elif type( interval ) == GenomicInterval:
+            chrom = interval.chrom
+            start = int(interval.start)
+            end = int(interval.end)
+            full = 0
+            partial = 0
+            if chrom not in bitsets:
+                bases_covered = 0
+                percent = 0.0
+                full = 0
+                partial = 0
+            else:
+                bases_covered = bitsets[ chrom ].count_range( start, end-start )
+                if (end - start) == 0:
+                    percent = 0
+                else:
+                    percent = float(bases_covered) / float(end - start)
+                if bases_covered:
+                    root = rightTree.chroms[chrom]    #root node for the chrom tree
+                    counter(root, start, end)
+            interval.fields.append(str(bases_covered))
+            interval.fields.append(str(percent))
+            interval.fields.append(str(full))
+            interval.fields.append(str(partial))
+            yield interval
+
+
+def main():
+    options, args = doc_optparse.parse( __doc__ )
+    
+    try:
+        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 )
+        chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg( options.cols2 )
+        in1_fname, in2_fname, out_fname = args
+    except:
+        stop_err( "Data issue: click the pencil icon in the history item to correct the metadata attributes." )
+    
+    g1 = NiceReaderWrapper( fileinput.FileInput( in1_fname ),
+                            chrom_col=chr_col_1,
+                            start_col=start_col_1,
+                            end_col=end_col_1,
+                            strand_col=strand_col_1,
+                            fix_strand=True )
+    g2 = NiceReaderWrapper( fileinput.FileInput( in2_fname ),
+                            chrom_col=chr_col_2,
+                            start_col=start_col_2,
+                            end_col=end_col_2,
+                            strand_col=strand_col_2,
+                            fix_strand=True )
+    g2_copy = NiceReaderWrapper( fileinput.FileInput( in2_fname ),
+                                 chrom_col=chr_col_2,
+                                 start_col=start_col_2,
+                                 end_col=end_col_2,
+                                 strand_col=strand_col_2,
+                                 fix_strand=True )
+    
+
+    out_file = open( out_fname, "w" )
+
+    try:
+        for line in count_coverage([g1, g2, g2_copy]):
+            if type( line ) is GenomicInterval:
+                out_file.write( "%s\n" % "\t".join( line.fields ) )
+            else:
+                out_file.write( "%s\n" % line )
+    except ParseError, exc:
+        out_file.close()
+        fail( str( exc ) )
+
+    out_file.close()
+
+    if g1.skipped > 0:
+        print skipped( g1, filedesc=" of 1st dataset" )
+    if g2.skipped > 0:
+        print skipped( g2, filedesc=" of 2nd dataset" )
+    elif g2_copy.skipped > 0:
+        print skipped( g2_copy, filedesc=" of 2nd dataset" )
+
+
+if __name__ == "__main__":
+    main()