Mercurial > repos > xuebing > sharplabtool
comparison tools/filters/gtf_to_bedgraph_converter.py @ 0:9071e359b9a3
Uploaded
author | xuebing |
---|---|
date | Fri, 09 Mar 2012 19:37:19 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:9071e359b9a3 |
---|---|
#!/usr/bin/env python
"""
Convert a GTF file into a sorted BedGraph track.

Usage: gtf_to_bedgraph_converter.py <input_gtf> <output_bedgraph> <attribute_name>

For every GTF feature line the value of <attribute_name>, looked up in the
attributes column, is written as the BedGraph value. Blank lines, comment
lines and lines that cannot be converted are skipped and counted. The track
data is ordered with the external ``sort`` program and written after a
``track type=bedGraph`` header line.
"""
import os
import subprocess
import sys
import tempfile

assert sys.version_info[:2] >= ( 2, 4 )

# NOTE: the code below deliberately sticks to constructs that are valid on the
# declared Python 2.4 floor *and* on Python 3 (try/finally instead of "with",
# sys.exc_info() instead of "except ... as").


def _parse_attributes(attributes_field):
    """
    Return the name/value pairs of a GTF attributes column as a dict.

    Pairs are separated by ';'; within a pair the name is separated from the
    value by whitespace, and surrounding double quotes are stripped from the
    value. Empty pairs (e.g. after the trailing ';') are ignored.

    Raises IndexError if a pair has a name but no value.
    """
    attributes = {}
    for name_value_pair in attributes_field.split(';'):
        # Split on the first run of whitespace only, so repeated separators
        # and quoted values that contain spaces are kept intact.
        pair = name_value_pair.strip().split(None, 1)
        if not pair:
            continue
        # Need to strip double quotes from values.
        attributes[pair[0]] = pair[1].strip(' "')
    return attributes


def _gtf_line_to_bedgraph(line, attribute_name):
    """
    Convert one tab-separated GTF feature line into a BedGraph line.

    GTF format: chrom, source, name, chromStart, chromEnd, score, strand,
    frame, attributes.
    BedGraph format: chrom, chromStart, chromEnd, value.

    Raises IndexError (too few columns, attribute without value), ValueError
    (non-integer start) or KeyError (attribute_name not present).
    """
    elems = line.split('\t')
    # GTF coordinates are 1-based, BedGraph are 0-based.
    start = int(elems[3]) - 1
    value = _parse_attributes(elems[8])[attribute_name]
    return "%s\t%d\t%s\t%s\n" % (elems[0], start, elems[4], value)


def _write_track_data(lines, out, attribute_name):
    """
    Write the BedGraph line for every convertible GTF line in `lines` to `out`.

    Returns a tuple (total_lines, skipped_lines, first_skipped_line) where
    first_skipped_line is the 1-based number of the first blank, comment or
    invalid line, or 0 if nothing was skipped.
    """
    total_lines = 0
    skipped_lines = 0
    first_skipped_line = 0
    for line in lines:
        total_lines += 1
        line = line.rstrip('\r\n')
        record = None
        if line and not line.startswith('#'):
            try:
                record = _gtf_line_to_bedgraph(line, attribute_name)
            except (IndexError, KeyError, ValueError):
                # Malformed feature line: record stays None and the line is
                # counted as skipped below. I/O errors are NOT caught here.
                record = None
        if record is None:
            skipped_lines += 1
            if not first_skipped_line:
                first_skipped_line = total_lines
        else:
            out.write(record)
    return total_lines, skipped_lines, first_skipped_line


def _write_sorted_track(track_data_name, output_name):
    """
    Create `output_name` from the track definition line followed by the
    contents of `track_data_name`, sorted by chromosome name and start.

    Returns the exit status of the ``sort`` process (0 on success).
    """
    out = open(output_name, 'w')
    try:
        out.write('track type=bedGraph\n')
        # The child process writes to the same file descriptor, so the header
        # has to leave Python's buffer before ``sort`` starts.
        out.flush()
        # Argument list, no shell: file names are never interpreted.
        return subprocess.call(['sort', '-k1,1', '-k2,2n', track_data_name],
                               stdout=out)
    finally:
        out.close()


def __main__():
    """
    Command-line entry point.

    Reads the input path, output path and attribute name from sys.argv[1:4],
    writes the BedGraph file and prints a one-line conversion summary.
    Exits with status 1 if the sorted track cannot be produced.
    """
    # Read parms.
    input_name = sys.argv[1]
    output_name = sys.argv[2]
    attribute_name = sys.argv[3]

    # mkstemp creates the file atomically and keeps it until we remove it.
    tmp_fd, tmp_name = tempfile.mkstemp()
    try:
        # Write unsorted track data to the temporary file.
        tmp_out = os.fdopen(tmp_fd, 'w')
        try:
            gtf = open(input_name)
            try:
                total_lines, skipped_lines, first_skipped_line = \
                    _write_track_data(gtf, tmp_out, attribute_name)
            finally:
                gtf.close()
        finally:
            tmp_out.close()

        # Create bedgraph file by combining track definition with ordered
        # track data.
        try:
            status = _write_sorted_track(tmp_name, output_name)
        except (OSError, IOError):
            sys.stderr.write("%s\n" % sys.exc_info()[1])
            sys.exit(1)
        if status != 0:
            sys.stderr.write("sort exited with status %d\n" % status)
            sys.exit(1)
    finally:
        # Runs on success, on error and on sys.exit().
        os.remove(tmp_name)

    info_msg = "%i lines converted to BEDGraph. " % (total_lines - skipped_lines)
    if skipped_lines > 0:
        info_msg += "Skipped %d blank/comment/invalid lines starting with line #%d." % (skipped_lines, first_skipped_line)
    print(info_msg)


if __name__ == "__main__":
    __main__()