diff tools/filters/gtf_to_bedgraph_converter.py @ 0:9071e359b9a3

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:37:19 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/gtf_to_bedgraph_converter.py	Fri Mar 09 19:37:19 2012 -0500
@@ -0,0 +1,80 @@
+#!/usr/bin/env python
+import os, sys, tempfile
+
+assert sys.version_info[:2] >= ( 2, 4 ) # Refuse to run on interpreters older than Python 2.4.
+
+def __main__():
+    # Read parms.
+    input_name = sys.argv[1]
+    output_name = sys.argv[2]
+    attribute_name = sys.argv[3]
+    
+    # Create temp files.
+    tmp_name1 = tempfile.NamedTemporaryFile().name
+    tmp_name2 = tempfile.NamedTemporaryFile().name
+    
+    # Do conversion.
+    skipped_lines = 0
+    first_skipped_line = 0
+    out = open( tmp_name1, 'w' )
+    
+    # Write track data to temporary file.
+    i = 0
+    for i, line in enumerate( file( input_name ) ):
+        line = line.rstrip( '\r\n' )
+        
+        if line and not line.startswith( '#' ):
+            try:
+                elems = line.split( '\t' )
+                start = str( int( elems[3] ) - 1 ) # GTF coordinates are 1-based, BedGraph are 0-based.
+                strand = elems[6]
+                if strand not in ['+', '-']:
+                    strand = '+'
+                attributes_list = elems[8].split(";")
+                attributes = {}
+                for name_value_pair in attributes_list:
+                    pair = name_value_pair.strip().split(" ")
+                    name = pair[0].strip()
+                    if name == '':
+                        continue
+                    # Need to strip double quote from values
+                    value = pair[1].strip(" \"")
+                    attributes[name] = value
+                value = attributes[ attribute_name ]
+                # GTF format: chrom source, name, chromStart, chromEnd, score, strand, frame, attributes.
+                # BedGraph format: chrom, chromStart, chromEnd, value
+                out.write( "%s\t%s\t%s\t%s\n" %( elems[0], start, elems[4], value ) )
+            except:
+                skipped_lines += 1
+                if not first_skipped_line:
+                    first_skipped_line = i + 1
+        else:
+            skipped_lines += 1
+            if not first_skipped_line:
+                first_skipped_line = i + 1
+    out.close()
+    
+    # Sort tmp file by chromosome name and chromosome start to create ordered track data.
+    cmd = "sort -k1,1 -k2,2n < %s > %s" % ( tmp_name1, tmp_name2 )
+    try:
+        os.system(cmd)
+        os.remove(tmp_name1)
+    except Exception, ex:
+        sys.stderr.write( "%s\n" % ex )
+        sys.exit(1)
+        
+    # Create bedgraph file by combining track definition with ordered track data.
+    cmd = "echo 'track type=bedGraph' | cat - %s > %s " % ( tmp_name2, output_name )
+    try:
+        os.system(cmd)
+        os.remove(tmp_name2)
+    except Exception, ex:
+        sys.stderr.write( "%s\n" % ex )
+        sys.exit(1)
+    
+    info_msg = "%i lines converted to BEDGraph.  " % ( i + 1 - skipped_lines )
+    if skipped_lines > 0:
+        info_msg += "Skipped %d blank/comment/invalid lines starting with line #%d." %( skipped_lines, first_skipped_line )
+    print info_msg
+
+if __name__ == "__main__": __main__() # Run the conversion only when executed as a script, not on import.