devteam / annotation_profiler: scripts/build_profile_indexes.py @ 0:3b33da018e74 (draft, default, tip)

Imported from capsule None
| | |
|---|---|
| author | devteam |
| date | Mon, 19 May 2014 12:33:42 -0400 |
| parents | |
| children | |
comparison: -1:000000000000 → 0:3b33da018e74
#!/usr/bin/env python
#Dan Blankenberg

VERSION = '1.0.0' # version of this script

from optparse import OptionParser
import os, gzip, struct, time
from ftplib import FTP #do we want a different method than using FTP to determine Chrom Names, eg use local copy

#import md5 from hashlib; if python2.4 or less, use old md5
try:
    from hashlib import md5
except ImportError:
    from md5 import new as md5

#import BitSet from bx-python, try using eggs and package resources, fall back to any local installation
try:
    from galaxy import eggs
    import pkg_resources
    pkg_resources.require( "bx-python" )
except: pass #Maybe there is a local installation available
from bx.bitset import BitSet

#Define constants
STRUCT_FMT = '<I'
STRUCT_SIZE = struct.calcsize( STRUCT_FMT )
DEFAULT_BITSET_SIZE = 300000000
CHUNK_SIZE = 1024

#Headers used to parse .sql files to determine column indexes for chromosome name, start and end
alias_spec = {
    'chromCol' : [ 'chrom' , 'CHROMOSOME' , 'CHROM', 'Chromosome Name', 'tName' ],
    'startCol' : [ 'start' , 'START', 'chromStart', 'txStart', 'Start Position (bp)', 'tStart', 'genoStart' ],
    'endCol' : [ 'end' , 'END' , 'STOP', 'chromEnd', 'txEnd', 'End Position (bp)', 'tEnd', 'genoEnd' ],
}

#Headers used to parse trackDb.txt.gz
#TODO: these should be parsed directly from trackDb.sql
trackDb_headers = ["tableName", "shortLabel", "type", "longLabel", "visibility", "priority", "colorR", "colorG", "colorB", "altColorR", "altColorG", "altColorB", "useScore", "private", "restrictCount", "restrictList", "url", "html", "grp", "canPack", "settings"]

def get_columns( filename ):
    input_sql = open( filename ).read()
    input_sql = input_sql.split( 'CREATE TABLE ' )[1].split( ';' )[0]
    input_sql = input_sql.split( ' (', 1 )
    table_name = input_sql[0].strip().strip( '`' )
    input_sql = [ split.strip().split( ' ' )[0].strip().strip( '`' ) for split in input_sql[1].rsplit( ')', 1 )[0].strip().split( '\n' ) ]
    print input_sql
    chrom_col = None
    start_col = None
    end_col = None
    for col_name in alias_spec['chromCol']:
        for i, header_name in enumerate( input_sql ):
            if col_name == header_name:
                chrom_col = i
                break
        if chrom_col is not None:
            break

    for col_name in alias_spec['startCol']:
        for i, header_name in enumerate( input_sql ):
            if col_name == header_name:
                start_col = i
                break
        if start_col is not None:
            break

    for col_name in alias_spec['endCol']:
        for i, header_name in enumerate( input_sql ):
            if col_name == header_name:
                end_col = i
                break
        if end_col is not None:
            break

    return table_name, chrom_col, start_col, end_col

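#Illustrative example (the table and column types here are made up): given a UCSC-style .sql dump containing
#  CREATE TABLE `example_table` (
#    `bin` smallint(5),
#    `chrom` varchar(255),
#    `txStart` int(10),
#    `txEnd` int(10)
#  );
#get_columns() would return ( 'example_table', 1, 2, 3 ): the table name plus the zero-based
#indexes of the chromosome, start and end columns, matched against alias_spec.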

def create_grouping_xml( input_dir, output_dir, dbkey ):
    output_filename = os.path.join( output_dir, '%s_tables.xml' % dbkey )
    def load_groups( file_name = 'grp.txt.gz' ):
        groups = {}
        for line in gzip.open( os.path.join( input_dir, file_name ) ):
            fields = line.split( '\t' )
            groups[fields[0]] = { 'desc': fields[1], 'priority': fields[2] }
        return groups
    f = gzip.open( os.path.join( input_dir, 'trackDb.txt.gz' ) )
    out = open( output_filename, 'wb' )
    tables = {}
    cur_buf = ''
    while True:
        line = f.readline()
        if not line: break
        #remove new lines
        line = line.rstrip( '\n\r' )
        line = line.replace( '\\\t', ' ' ) #replace escaped tabs with space
        cur_buf += "%s\n" % line.rstrip( '\\' )
        if line.endswith( '\\' ):
            continue #line is wrapped, next line
        #all fields should be loaded now...
        fields = cur_buf.split( '\t' )
        cur_buf = '' #reset buffer
        assert len( fields ) == len( trackDb_headers ), 'Failed Parsing trackDb.txt.gz; fields: %s' % fields
        table_name = fields[ 0 ]
        tables[ table_name ] = {}
        for field_name, field_value in zip( trackDb_headers, fields ):
            tables[ table_name ][ field_name ] = field_value
        #split settings fields into dict
        fields = fields[-1].split( '\n' )
        tables[ table_name ][ 'settings' ] = {}
        for field in fields:
            setting_fields = field.split( ' ', 1 )
            setting_name = setting_value = setting_fields[ 0 ]
            if len( setting_fields ) > 1:
                setting_value = setting_fields[ 1 ]
            if setting_name or setting_value:
                tables[ table_name ][ 'settings' ][ setting_name ] = setting_value
    #Load Groups
    groups = load_groups()
    in_groups = {}
    for table_name, values in tables.iteritems():
        if os.path.exists( os.path.join( output_dir, table_name ) ):
            group = values['grp']
            if group not in in_groups:
                in_groups[group]={}
            #***NAME CHANGE***, 'subTrack' no longer exists as a setting...use 'parent' instead
            #subTrack = values.get('settings', {} ).get( 'subTrack', table_name )
            subTrack = values.get('settings', {} ).get( 'parent', table_name ).split( ' ' )[0] #need to split, because could be e.g. 'trackgroup on'
            if subTrack not in in_groups[group]:
                in_groups[group][subTrack]=[]
            in_groups[group][subTrack].append( table_name )

    assigned_tables = []
    out.write( """<filter type="data_meta" data_ref="input1" meta_key="dbkey" value="%s">\n""" % ( dbkey ) )
    out.write( " <options>\n" )
    for group, subTracks in sorted( in_groups.iteritems() ):
        out.write( """ <option name="%s" value="group-%s">\n""" % ( groups[group]['desc'], group ) )
        for sub_name, sub_tracks in subTracks.iteritems():
            if len( sub_tracks ) > 1:
                out.write( """ <option name="%s" value="subtracks-%s">\n""" % ( sub_name, sub_name ) )
                sub_tracks.sort()
                for track in sub_tracks:
                    track_label = track
                    if "$" not in tables[track]['shortLabel']:
                        track_label = tables[track]['shortLabel']
                    out.write( """ <option name="%s" value="%s"/>\n""" % ( track_label, track ) )
                    assigned_tables.append( track )
                out.write( " </option>\n" )
            else:
                track = sub_tracks[0]
                track_label = track
                if "$" not in tables[track]['shortLabel']:
                    track_label = tables[track]['shortLabel']
                out.write( """ <option name="%s" value="%s"/>\n""" % ( track_label, track ) )
                assigned_tables.append( track )
        out.write( " </option>\n" )
    unassigned_tables = list( sorted( [ table_dir for table_dir in os.listdir( output_dir ) if table_dir not in assigned_tables and os.path.isdir( os.path.join( output_dir, table_dir ) ) ] ) )
    if unassigned_tables:
        out.write( """ <option name="Uncategorized Tables" value="group-trackDbUnassigned">\n""" )
        for table_name in unassigned_tables:
            out.write( """ <option name="%s" value="%s"/>\n""" % ( table_name, table_name ) )
        out.write( " </option>\n" )
    out.write( " </options>\n" )
    out.write( """</filter>\n""" )
    out.close()

def write_database_dump_info( input_dir, output_dir, dbkey, chrom_lengths, default_bitset_size ):
    #generate hash for profiled table directories
    #sort directories off output root (files in output root not hashed, including the profiler_info.txt file)
    #sort files in each directory and hash file contents
    profiled_hash = md5()
    for table_dir in sorted( [ table_dir for table_dir in os.listdir( output_dir ) if os.path.isdir( os.path.join( output_dir, table_dir ) ) ] ):
        for filename in sorted( os.listdir( os.path.join( output_dir, table_dir ) ) ):
            f = open( os.path.join( output_dir, table_dir, filename ), 'rb' )
            while True:
                hash_chunk = f.read( CHUNK_SIZE )
                if not hash_chunk:
                    break
                profiled_hash.update( hash_chunk )
    profiled_hash = profiled_hash.hexdigest()

    #generate hash for input dir
    #sort directories off input root
    #sort files in each directory and hash file contents
    database_hash = md5()
    for dirpath, dirnames, filenames in sorted( os.walk( input_dir ) ):
        for filename in sorted( filenames ):
            f = open( os.path.join( dirpath, filename ), 'rb' ) #dirpath from os.walk already includes input_dir
            while True:
                hash_chunk = f.read( CHUNK_SIZE )
                if not hash_chunk:
                    break
                database_hash.update( hash_chunk )
    database_hash = database_hash.hexdigest()

    #write out info file
    out = open( os.path.join( output_dir, 'profiler_info.txt' ), 'wb' )
    out.write( 'dbkey\t%s\n' % ( dbkey ) )
    out.write( 'chromosomes\t%s\n' % ( ','.join( [ '%s=%s' % ( chrom_name, chrom_len ) for chrom_name, chrom_len in chrom_lengths.iteritems() ] ) ) )
    out.write( 'bitset_size\t%s\n' % ( default_bitset_size ) )
    for line in open( os.path.join( input_dir, 'trackDb.sql' ) ):
        line = line.strip()
        if line.startswith( '-- Dump completed on ' ):
            line = line[ len( '-- Dump completed on ' ): ]
            out.write( 'dump_time\t%s\n' % ( line ) )
            break
    out.write( 'dump_hash\t%s\n' % ( database_hash ) )
    out.write( 'profiler_time\t%s\n' % ( time.time() ) )
    out.write( 'profiler_hash\t%s\n' % ( profiled_hash ) )
    out.write( 'profiler_version\t%s\n' % ( VERSION ) )
    out.write( 'profiler_struct_format\t%s\n' % ( STRUCT_FMT ) )
    out.write( 'profiler_struct_size\t%s\n' % ( STRUCT_SIZE ) )
    out.close()

def __main__():
    usage = "usage: %prog options"
    parser = OptionParser( usage=usage )
    parser.add_option( '-d', '--dbkey', dest='dbkey', default='hg18', help='dbkey to process' )
    parser.add_option( '-i', '--input_dir', dest='input_dir', default=os.path.join( 'golden_path','%s', 'database' ), help='Input Directory' )
    parser.add_option( '-o', '--output_dir', dest='output_dir', default=os.path.join( 'profiled_annotations','%s' ), help='Output Directory' )
    parser.add_option( '-c', '--chromosomes', dest='chromosomes', default='', help='Comma separated list of: ChromName1[=length],ChromName2[=length],...' )
    parser.add_option( '-b', '--bitset_size', dest='bitset_size', default=DEFAULT_BITSET_SIZE, type='int', help='Default BitSet size; overridden by sizes specified in chromInfo.txt.gz or by --chromosomes' )
    parser.add_option( '-f', '--ftp_site', dest='ftp_site', default='hgdownload.cse.ucsc.edu', help='FTP site; used for chromosome info when chromInfo.txt.gz method fails' )
    parser.add_option( '-p', '--ftp_path', dest='ftp_path', default='/goldenPath/%s/chromosomes/', help='FTP Path; used for chromosome info when chromInfo.txt.gz method fails' )

    ( options, args ) = parser.parse_args()

    input_dir = options.input_dir
    if '%' in input_dir:
        input_dir = input_dir % options.dbkey
    assert os.path.exists( input_dir ), 'Input directory does not exist'
    output_dir = options.output_dir
    if '%' in output_dir:
        output_dir = output_dir % options.dbkey
    assert not os.path.exists( output_dir ), 'Output directory already exists'
    os.makedirs( output_dir )
    ftp_path = options.ftp_path
    if '%' in ftp_path:
        ftp_path = ftp_path % options.dbkey

    #Get chromosome names and lengths
    chrom_lengths = {}
    if options.chromosomes:
        for chrom in options.chromosomes.split( ',' ):
            fields = chrom.split( '=' )
            chrom = fields[0]
            if len( fields ) > 1:
                chrom_len = int( fields[1] )
            else:
                chrom_len = options.bitset_size
            chrom_lengths[ chrom ] = chrom_len
        chroms = chrom_lengths.keys()
        print 'Chrom info taken from command line option.'
    else:
        try:
            for line in gzip.open( os.path.join( input_dir, 'chromInfo.txt.gz' ) ):
                fields = line.strip().split( '\t' )
                chrom_lengths[ fields[0] ] = int( fields[ 1 ] )
            chroms = chrom_lengths.keys()
            print 'Chrom info taken from chromInfo.txt.gz.'
        except Exception, e:
            print 'Error loading chrom info from chromInfo.txt.gz, trying FTP method.'
            chrom_lengths = {} #zero out chrom_lengths
            chroms = []
            ftp = FTP( options.ftp_site )
            ftp.login()
            for name in ftp.nlst( ftp_path ):
                if name.endswith( '.fa.gz' ):
                    chroms.append( name.split( '/' )[-1][ :-len( '.fa.gz' ) ] )
            ftp.close()
            for chrom in chroms:
                chrom_lengths[ chrom ] = options.bitset_size
    #sort chroms by length of name, descending; necessary for when table names start with chrom name
    chroms = list( reversed( [ chrom for chrom_len, chrom in sorted( [ ( len( chrom ), chrom ) for chrom in chroms ] ) ] ) )

    #parse tables from local files
    #loop through directory contents, if file ends in '.sql', process table
    for filename in os.listdir( input_dir ):
        if filename.endswith ( '.sql' ):
            base_filename = filename[ 0:-len( '.sql' ) ]
            table_out_dir = os.path.join( output_dir, base_filename )
            #some tables are chromosome specific, lets strip off the chrom name
            for chrom in chroms:
                if base_filename.startswith( "%s_" % chrom ):
                    #found chromosome
                    table_out_dir = os.path.join( output_dir, base_filename[len( "%s_" % chrom ):] )
                    break
            #create table dir
            if not os.path.exists( table_out_dir ):
                os.mkdir( table_out_dir ) #table dir may already exist in the case of single chrom tables
                print "Created table dir (%s)." % table_out_dir
            else:
                print "Table dir (%s) already exists." % table_out_dir
            #find column assignments
            table_name, chrom_col, start_col, end_col = get_columns( "%s.sql" % os.path.join( input_dir, base_filename ) )
            if chrom_col is None or start_col is None or end_col is None:
                print "Table %s (%s) does not appear to have a chromosome, a start, or a stop." % ( table_name, "%s.sql" % os.path.join( input_dir, base_filename ) )
                if not os.listdir( table_out_dir ):
                    print "Removing empty table (%s) directory (%s)." % ( table_name, table_out_dir )
                    os.rmdir( table_out_dir )
                continue
            #build bitsets from table
            bitset_dict = {}
            for line in gzip.open( '%s.txt.gz' % os.path.join( input_dir, base_filename ) ):
                fields = line.strip().split( '\t' )
                chrom = fields[ chrom_col ]
                start = int( fields[ start_col ] )
                end = int( fields[ end_col ] )
                if chrom not in bitset_dict:
                    bitset_dict[ chrom ] = BitSet( chrom_lengths.get( chrom, options.bitset_size ) )
                bitset_dict[ chrom ].set_range( start, end - start )
            #write bitsets as profiled annotations
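            #each chromosome's covered regions go to a binary '<chrom>.covered' file of packed
            #STRUCT_FMT ('<I') start,end pairs, plus '<chrom>.total_regions' and '<chrom>.total_coverage' text files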
            for chrom_name, chrom_bits in bitset_dict.iteritems():
                out = open( os.path.join( table_out_dir, '%s.covered' % chrom_name ), 'wb' )
                end = 0
                total_regions = 0
                total_coverage = 0
                max_size = chrom_lengths.get( chrom_name, options.bitset_size )
                while True:
                    start = chrom_bits.next_set( end )
                    if start >= max_size:
                        break
                    end = chrom_bits.next_clear( start )
                    out.write( struct.pack( STRUCT_FMT, start ) )
                    out.write( struct.pack( STRUCT_FMT, end ) )
                    total_regions += 1
                    total_coverage += end - start
                    if end >= max_size:
                        break
                out.close()
                open( os.path.join( table_out_dir, '%s.total_regions' % chrom_name ), 'wb' ).write( str( total_regions ) )
                open( os.path.join( table_out_dir, '%s.total_coverage' % chrom_name ), 'wb' ).write( str( total_coverage ) )

    #create xml
    create_grouping_xml( input_dir, output_dir, options.dbkey )
    #create database dump info file, for database version control
    write_database_dump_info( input_dir, output_dir, options.dbkey, chrom_lengths, options.bitset_size )

if __name__ == "__main__": __main__()
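
The script might be invoked along the lines of `python build_profile_indexes.py --dbkey hg19 --input_dir /path/to/golden_path/hg19/database --output_dir /path/to/profiled_annotations/hg19` (the dbkey and paths are placeholders). Each `<chrom>.covered` file it writes is a flat sequence of little-endian unsigned 32-bit integers (`STRUCT_FMT = '<I'`) stored as consecutive start/end pairs. Below is a minimal reader sketch assuming only that layout; the `read_covered` helper is illustrative and not part of the tool itself:

```python
#Illustrative reader for the '<chrom>.covered' files written by the profiler.
import struct
import sys

STRUCT_FMT = '<I'
STRUCT_SIZE = struct.calcsize( STRUCT_FMT )

def read_covered( path ):
    #yield ( start, end ) covered intervals from one profiled '.covered' file
    f = open( path, 'rb' )
    while True:
        pair = f.read( STRUCT_SIZE * 2 )
        if len( pair ) < STRUCT_SIZE * 2:
            break
        start = struct.unpack( STRUCT_FMT, pair[ :STRUCT_SIZE ] )[0]
        end = struct.unpack( STRUCT_FMT, pair[ STRUCT_SIZE: ] )[0]
        yield start, end
    f.close()

if __name__ == "__main__":
    for start, end in read_covered( sys.argv[1] ):
        print "%i\t%i" % ( start, end )
```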
