annotate tools/extract/phastOdds/get_scores_galaxy.py @ 0:9071e359b9a3

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:37:19 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1 #!/usr/bin/env python
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
2
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
3 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
4 usage: %prog data_file.h5 region_mapping.bed in_file out_file chrom_col start_col end_col [options]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
5 -p, --perCol: standardize to lod per column
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
6 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
7
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
8 from __future__ import division
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
9
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
10 import sys
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
11 from galaxy import eggs
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
12 from numpy import *
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
13 from tables import *
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
14
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
15 import pkg_resources; pkg_resources.require( "bx-python" )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
16 from bx.cookbook import doc_optparse
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
17
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
18 from bx import intervals
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
19
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
20 # ignore wanrnings about NumArray flavor
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
21 from warnings import filterwarnings
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
22 from tables.exceptions import FlavorWarning
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
23 filterwarnings("ignore", category=FlavorWarning)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
24
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
25 assert sys.version_info[:2] >= ( 2, 4 )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
26
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
27 def stop_err( msg ):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
28 sys.stderr.write(msg)
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
29 sys.exit()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
30
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
31 def main():
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
32 # Parse command line
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
33 options, args = doc_optparse.parse( __doc__ )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
34 try:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
35 h5_fname = args[0]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
36 mapping_fname = args[1]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
37 in_fname = args[2]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
38 out_fname = args[3]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
39 chrom_col, start_col, end_col = map( lambda x: int( x ) - 1, args[4:7] )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
40 per_col = bool( options.perCol )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
41 except Exception, e:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
42 doc_optparse.exception()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
43
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
44 if h5_fname == 'None.h5':
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
45 stop_err( 'Invalid genome build, this tool currently only works with data from build hg17. Click the pencil icon in your history item to correct the build if appropriate.' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
46
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
47 # Open the h5 file
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
48 h5 = openFile( h5_fname, mode = "r" )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
49 # Load intervals and names for the subregions
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
50 intersecters = {}
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
51 for i, line in enumerate( file( mapping_fname ) ):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
52 line = line.rstrip( '\r\n' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
53 if line and not line.startswith( '#' ):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
54 chr, start, end, name = line.split()[0:4]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
55 if not intersecters.has_key( chr ):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
56 intersecters[ chr ] = intervals.Intersecter()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
57 intersecters[ chr ].add_interval( intervals.Interval( int( start ), int( end ), name ) )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
58
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
59 # Find the subregion containing each input interval
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
60 skipped_lines = 0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
61 first_invalid_line = 0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
62 invalid_line = ''
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
63 out_file = open( out_fname, "w" )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
64 warnings = []
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
65 warning = ''
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
66 for i, line in enumerate( file( in_fname ) ):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
67 line = line.rstrip( '\r\n' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
68 if line.startswith( '#' ):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
69 if i == 0:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
70 out_file.write( "%s\tscore\n" % line )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
71 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
72 out_file.write( "%s\n" % line )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
73 fields = line.split( "\t" )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
74 try:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
75 chr = fields[ chrom_col ]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
76 start = int( fields[ start_col ] )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
77 end = int( fields[ end_col ] )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
78 except:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
79 warning = "Invalid value for chrom, start or end column."
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
80 warnings.append( warning )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
81 skipped_lines += 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
82 if not invalid_line:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
83 first_invalid_line = i + 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
84 invalid_line = line
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
85 continue
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
86 # Find matching interval
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
87 try:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
88 matches = intersecters[ chr ].find( start, end )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
89 except:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
90 warning = "'%s' is not a valid chrom value for the region. " %chr
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
91 warnings.append( warning )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
92 skipped_lines += 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
93 if not invalid_line:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
94 first_invalid_line = i + 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
95 invalid_line = line
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
96 continue
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
97 if not len( matches ) == 1:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
98 warning = "Interval must match exactly one target region. "
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
99 warnings.append( warning )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
100 skipped_lines += 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
101 if not invalid_line:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
102 first_invalid_line = i + 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
103 invalid_line = line
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
104 continue
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
105 region = matches[0]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
106 if not ( start >= region.start and end <= region.end ):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
107 warning = "Interval must fall entirely within region. "
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
108 warnings.append( warning )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
109 skipped_lines += 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
110 if not invalid_line:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
111 first_invalid_line = i + 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
112 invalid_line = line
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
113 continue
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
114 region_name = region.value
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
115 rel_start = start - region.start
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
116 rel_end = end - region.start
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
117 if not rel_start < rel_end:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
118 warning = "Region %s is empty, relative start:%d, relative end:%d. " % ( region_name, rel_start, rel_end )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
119 warnings.append( warning )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
120 skipped_lines += 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
121 if not invalid_line:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
122 first_invalid_line = i + 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
123 invalid_line = line
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
124 continue
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
125 s = h5.getNode( h5.root, "scores_" + region_name )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
126 c = h5.getNode( h5.root, "counts_" + region_name )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
127 score = s[rel_end-1]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
128 count = c[rel_end-1]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
129 if rel_start > 0:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
130 score -= s[rel_start-1]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
131 count -= c[rel_start-1]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
132 if per_col:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
133 score /= count
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
134 fields.append( str( score ) )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
135 out_file.write( "%s\n" % "\t".join( fields ) )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
136 # Close the file handle
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
137 h5.close()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
138 out_file.close()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
139
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
140 if warnings:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
141 warn_msg = "PhastOdds scores are only available for ENCODE regions. %d warnings, 1st is: " % len( warnings )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
142 warn_msg += warnings[0]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
143 print warn_msg
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
144 if skipped_lines:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
145 print 'Skipped %d invalid lines, 1st is #%d, "%s"' % ( skipped_lines, first_invalid_line, invalid_line )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
146
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
147 if __name__ == "__main__": main()