comparison lib/galaxy/datatypes/gmap.py @ 6:3be0e0a858fe

refactor and update README
author Jim Johnson <jj@umn.edu>
date Tue, 08 Nov 2011 13:22:34 -0600
parents
children 93911bac43da
comparison
equal deleted inserted replaced
5:f4b4c1712e39 6:3be0e0a858fe
1 """
2 GMAP indexes
3 """
4 import logging
5 import os,os.path,re
6 import data
7 from data import Text
8 from galaxy import util
9 from metadata import MetadataElement
10
11 log = logging.getLogger(__name__)
12
13 class GmapDB( Text ):
14 """
15 A GMAP DB for indexes
16 """
17 MetadataElement( name="db_name", desc="The db name for this index set", default='unknown', set_in_upload=True, readonly=True )
18 MetadataElement( name="basesize", default="12", desc="The basesize for offsetscomp", visible=True, readonly=True )
19 MetadataElement( name="kmers", default=[''], desc="The kmer sizes for indexes", visible=True, no_value=[''], readonly=True )
20 MetadataElement( name="map_dir", desc="The maps directory", default='unknown', set_in_upload=True, readonly=True )
21 MetadataElement( name="maps", default=[''], desc="The names of maps stored for this gmap gmapdb", visible=True, no_value=[''], readonly=True )
22 MetadataElement( name="snps", default=[''], desc="The names of SNP indexes stored for this gmapdb", visible=True, no_value=[''], readonly=True )
23 MetadataElement( name="cmet", default=False, desc="Has a cmet index", visible=True, readonly=True )
24 MetadataElement( name="atoi", default=False, desc="Has a atoi index", visible=True, readonly=True )
25
26 file_ext = 'gmapdb'
27 is_binary = True
28 composite_type = 'auto_primary_file'
29 allow_datatype_change = False
30
31 def generate_primary_file( self, dataset = None ):
32 """
33 This is called only at upload to write the html file
34 cannot rename the datasets here - they come with the default unfortunately
35 """
36 return '<html><head></head><body>AutoGenerated Primary File for Composite Dataset</body></html>'
37
38 def regenerate_primary_file(self,dataset):
39 """
40 cannot do this until we are setting metadata
41 """
42 bn = dataset.metadata.db_name
43 log.info( "GmapDB regenerate_primary_file %s" % (bn))
44 rval = ['<html><head><title>GMAPDB %s</title></head><p/><H3>GMAPDB %s</H3><p/>cmet %s<br>atoi %s<H4>Maps:</H4><ul>' % (bn,bn,dataset.metadata.cmet,dataset.metadata.atoi)]
45 for i,name in enumerate(dataset.metadata.maps):
46 rval.append( '<li>%s' % name)
47 rval.append( '</ul></html>' )
48 f = file(dataset.file_name,'w')
49 f.write("\n".join( rval ))
50 f.write('\n')
51 f.close()
52
53 def set_peek( self, dataset, is_multi_byte=False ):
54 log.info( "GmapDB set_peek %s" % (dataset))
55 if not dataset.dataset.purged:
56 dataset.peek = "GMAPDB index %s\n cmet %s\n atoi %s\n maps %s" % ( dataset.metadata.db_name,dataset.metadata.cmet,dataset.metadata.atoi,dataset.metadata.maps )
57 dataset.blurb = "GMAPDB %s" % ( dataset.metadata.db_name )
58 else:
59 dataset.peek = 'file does not exist'
60 dataset.blurb = 'file purged from disk'
61 def display_peek( self, dataset ):
62 try:
63 return dataset.peek
64 except:
65 return "GMAP index file"
66
67 def sniff( self, filename ):
68 return False
69 def set_meta( self, dataset, overwrite = True, **kwd ):
70 """
71 Expecting:
72 extra_files_path/<db_name>/db_name>.ref<basesize><kmer>3<index>
73 extra_files_path/db_name/db_name.ref1[2345]1[2345]3offsetscomp
74 extra_files_path/db_name/db_name.ref1[2345]1[2345]3positions
75 extra_files_path/db_name/db_name.ref1[2345]1[2345]3gammaptrs
76 index maps:
77 extra_files_path/db_name/db_name.maps/*.iit
78 """
79 log.info( "GmapDB set_meta %s %s" % (dataset,dataset.extra_files_path))
80 pat = '(.*)\.((ref)|(met)[atgc][atgc]|(a2i)[atgc][atgc])((\d\d)(\d\d))?3positions(\.(.+))?'
81 efp = dataset.extra_files_path
82 flist = os.listdir(efp)
83 for i,fname in enumerate(flist):
84 log.info( "GmapDB set_meta %s %s" % (i,fname))
85 fpath = os.path.join(efp,fname)
86 if os.path.isdir(fpath):
87 ilist = os.listdir(fpath)
88 kmers = {'':'default'} # HACK '' empty key added so user has default choice when selecting kmer from metadata
89 for j,iname in enumerate(ilist):
90 log.info( "GmapDB set_meta file %s %s" % (j,iname))
91 ipath = os.path.join(fpath,iname)
92 if os.path.isdir(ipath): # find maps
93 dataset.metadata.map_dir = iname
94 for mapfile in os.listdir(ipath):
95 mapname = mapfile.replace('.iit','')
96 log.info( "GmapDB set_meta map %s %s" % (mapname,mapfile))
97 dataset.metadata.maps.append(mapname)
98 else:
99 m = re.match(pat,iname)
100 if m:
101 log.info( "GmapDB set_meta m %s %s " % (iname, m))
102 assert len(m.groups()) == 10
103 dataset.metadata.db_name = fname
104 if m.groups()[2] == 'ref':
105 if m.groups()[-1] != None:
106 dataset.metadata.snps.append(m.groups()[-1])
107 else:
108 if m.groups()[-3] != None:
109 k = int(m.groups()[-3])
110 kmers[k] = k
111 if m.groups()[-4] != None:
112 dataset.metadata.basesize = int( m.groups()[-4])
113 elif m.groups()[3] == 'met':
114 dataset.metadata.cmet = True
115 elif m.groups()[4] == 'a2i':
116 dataset.metadata.atoi = True
117 dataset.metadata.kmers = kmers.keys()
118
119 class GmapSnpIndex( Text ):
120 """
121 A GMAP SNP index created by snpindex
122 """
123 MetadataElement( name="db_name", desc="The db name for this index set", default='unknown', set_in_upload=True, readonly=True )
124 MetadataElement( name="snps_name", default='snps', desc="The name of SNP index", visible=True, no_value='', readonly=True )
125
126 file_ext = 'gmapsnpindex'
127 is_binary = True
128 composite_type = 'auto_primary_file'
129 allow_datatype_change = False
130
131 def generate_primary_file( self, dataset = None ):
132 """
133 This is called only at upload to write the html file
134 cannot rename the datasets here - they come with the default unfortunately
135 """
136 return '<html><head></head><body>AutoGenerated Primary File for Composite Dataset</body></html>'
137
138 def regenerate_primary_file(self,dataset):
139 """
140 cannot do this until we are setting metadata
141 """
142 bn = dataset.metadata.db_name
143 log.info( "GmapDB regenerate_primary_file %s" % (bn))
144 rval = ['<html><head><title>GMAPDB %s</title></head><p/><H3>GMAPDB %s</H3><p/>cmet %s<br>atoi %s<H4>Maps:</H4><ul>' % (bn,bn,dataset.metadata.cmet,dataset.metadata.atoi)]
145 for i,name in enumerate(dataset.metadata.maps):
146 rval.append( '<li>%s' % name)
147 rval.append( '</ul></html>' )
148 f = file(dataset.file_name,'w')
149 f.write("\n".join( rval ))
150 f.write('\n')
151 f.close()
152 def set_peek( self, dataset, is_multi_byte=False ):
153 log.info( "GmapSnpIndex set_peek %s" % (dataset))
154 if not dataset.dataset.purged:
155 dataset.peek = "GMAP SNPindex %s on %s\n" % ( dataset.metadata.snps_name,dataset.metadata.db_name)
156 dataset.blurb = "GMAP SNPindex %s on %s\n" % ( dataset.metadata.snps_name,dataset.metadata.db_name)
157 else:
158 dataset.peek = 'file does not exist'
159 dataset.blurb = 'file purged from disk'
160 def display_peek( self, dataset ):
161 try:
162 return dataset.peek
163 except:
164 return "GMAP SNP index"
165
166 def sniff( self, filename ):
167 return False
168 def set_meta( self, dataset, overwrite = True, **kwd ):
169 """
170 Expecting:
171 extra_files_path/snp_name.iit
172 extra_files_path/db_name/db_name.ref1[2345]1[2345]3offsetscomp.snp_name
173 extra_files_path/db_name/db_name.ref1[2345]1[2345]3positions.snp_name
174 extra_files_path/db_name/db_name.ref1[2345]1[2345]3gammaptrs.snp_name
175 """
176 log.info( "GmapSnpIndex set_meta %s %s" % (dataset,dataset.extra_files_path))
177 pat = '(.*)\.(ref((\d\d)(\d\d))?3positions)\.(.+)?'
178 efp = dataset.extra_files_path
179 flist = os.listdir(efp)
180 for i,fname in enumerate(flist):
181 m = re.match(pat,fname)
182 if m:
183 assert len(m.groups()) == 6
184 dataset.metadata.db_name = m.groups()[0]
185 dataset.metadata.snps_name = m.groups()[-1]
186
187
188
189
190 class IntervalIndexTree( Text ):
191 """
192 A GMAP Interval Index Tree Map
193 created by iit_store
194 (/path/to/map)/(mapname).iit
195 """
196 file_ext = 'iit'
197 is_binary = True
198
199 class SpliceSitesIntervalIndexTree( IntervalIndexTree ):
200 """
201 A GMAP Interval Index Tree Map
202 created by iit_store
203 """
204 file_ext = 'splicesites.iit'
205
206 class IntronsIntervalIndexTree( IntervalIndexTree ):
207 """
208 A GMAP Interval Index Tree Map
209 created by iit_store
210 """
211 file_ext = 'introns.iit'
212
213 class SNPsIntervalIndexTree( IntervalIndexTree ):
214 """
215 A GMAP Interval Index Tree Map
216 created by iit_store
217 """
218 file_ext = 'snps.iit'
219
220 class TallyIntervalIndexTree( IntervalIndexTree ):
221 """
222 A GMAP Interval Index Tree Map
223 created by iit_store
224 """
225 file_ext = 'tally.iit'
226
227 class IntervalAnnotation( Text ):
228 """
229 Class describing a GMAP Interval format:
230 >label coords optional_tag
231 optional_annotation (which may be zero, one, or multiple lines)
232 The coords should be of the form:
233 chr:position
234 chr:startposition..endposition
235 """
236 file_ext = 'gmap_annotation'
237 """Add metadata elements"""
238 MetadataElement( name="annotations", default=0, desc="Number of interval annotations", readonly=True, optional=True, visible=False, no_value=0 )
239
240 def set_meta( self, dataset, **kwd ):
241 """
242 Set the number of annotations and the number of data lines in dataset.
243 """
244 data_lines = 0
245 annotations = 0
246 for line in file( dataset.file_name ):
247 line = line.strip()
248 if line and line.startswith( '>' ):
249 annotations += 1
250 data_lines +=1
251 else:
252 data_lines += 1
253 dataset.metadata.data_lines = data_lines
254 dataset.metadata.annotations = annotations
255 def set_peek( self, dataset, is_multi_byte=False ):
256 if not dataset.dataset.purged:
257 dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
258 if dataset.metadata.annotations:
259 dataset.blurb = "%s annotations" % util.commaify( str( dataset.metadata.annotations ) )
260 else:
261 dataset.blurb = data.nice_size( dataset.get_size() )
262 else:
263 dataset.peek = 'file does not exist'
264 dataset.blurb = 'file purged from disk'
265
266 def sniff( self, filename ):
267 """
268 Determines whether the file is a gmap annotation file
269 Format:
270 >label coords optional_tag
271 optional_annotation (which may be zero, one, or multiple lines)
272 For example, the label may be an EST accession, with the coords
273 representing its genomic position. Labels may be duplicated if
274 necessary.
275 The coords should be of the form
276 chr:position
277 chr:startposition..endposition
278 The term "chr:position" is equivalent to "chr:position..position". If
279 you want to indicate that the interval is on the minus strand or
280 reverse direction, then <endposition> may be less than <startposition>.
281 """
282 try:
283 pat = '>(\S+)\s((\S+):(\d+)(\.\.(\d+))?(\s.(.+))?$' #>label chr:position[..endposition][ optional_tag]
284 fh = open( filename )
285 count = 0
286 while True and count < 10:
287 line = fh.readline()
288 if not line:
289 break #EOF
290 line = line.strip()
291 if line: #first non-empty line
292 if line.startswith( '>' ):
293 count += 1
294 if re.match(pat,line) == None: # Failed to match
295 return False
296 finally:
297 fh.close()
298 return False
299
300 class SpliceSiteAnnotation(IntervalAnnotation):
301 file_ext = 'gmap_splicesites'
302 """
303 Example:
304 >NM_004448.ERBB2.exon1 17:35110090..35110091 donor 6678
305 >NM_004448.ERBB2.exon2 17:35116768..35116769 acceptor 6678
306 >NM_004448.ERBB2.exon2 17:35116920..35116921 donor 1179
307 >NM_004448.ERBB2.exon3 17:35118099..35118100 acceptor 1179
308 >NM_004449.ERG.exon1 21:38955452..38955451 donor 783
309 >NM_004449.ERG.exon2 21:38878740..38878739 acceptor 783
310 >NM_004449.ERG.exon2 21:38878638..38878637 donor 360
311 >NM_004449.ERG.exon3 21:38869542..38869541 acceptor 360
312 Each line must start with a ">" character, then be followed by an
313 identifier, which may have duplicates and can have any format, with
314 the gene name or exon number shown here only as a suggestion. Then
315 there should be the chromosomal coordinates which straddle the
316 exon-intron boundary, so one coordinate is on the exon and one is on
317 the intron. (Coordinates are all 1-based, so the first character of a
318 chromosome is number 1.) Finally, there should be the splice type:
319 "donor" or "acceptor". You may optionally store the intron distance
320 at the end. GSNAP can use this intron distance, if it is longer than
321 its value for --localsplicedist, to look for long introns at that
322 splice site. The same splice site may have different intron distances
323 in the database; GSNAP will use the longest intron distance reported
324 in searching for long introns.
325 """
326 def sniff( self, filename ): # TODO
327 """
328 Determines whether the file is a gmap splice site annotation file
329 """
330 try:
331 pat = '>(\S+\.intron\d+)\s((\S+):(\d+)\.\.(\d+))\s(donor|acceptor)(\s(\d+))?$' #>label chr:position..position donor|acceptor[ intron_dist]
332 fh = open( filename )
333 count = 0
334 while True and count < 10:
335 line = fh.readline()
336 if not line:
337 break #EOF
338 line = line.strip()
339 if line: #first non-empty line
340 count += 1
341 if re.match(pat,line) == None: # Failed to match
342 return False
343 finally:
344 fh.close()
345 return False
346
347 class IntronAnnotation(IntervalAnnotation):
348 file_ext = 'gmap_introns'
349 """
350 Example:
351 >NM_004448.ERBB2.intron1 17:35110090..35116769
352 >NM_004448.ERBB2.intron2 17:35116920..35118100
353 >NM_004449.ERG.intron1 21:38955452..38878739
354 >NM_004449.ERG.intron2 21:38878638..38869541
355 The coordinates are 1-based, and specify the exon coordinates
356 surrounding the intron, with the first coordinate being from the donor
357 exon and the second one being from the acceptor exon.
358 """
359 def sniff( self, filename ): # TODO
360 """
361 Determines whether the file is a gmap Intron annotation file
362 """
363 try:
364 pat = '>(\S+\.intron\d+)\s((\S+):(\d+)\.\.(\d+)(\s(.)+)?$' #>label chr:position
365 fh = open( filename )
366 count = 0
367 while True and count < 10:
368 line = fh.readline()
369 if not line:
370 break #EOF
371 line = line.strip()
372 if line: #first non-empty line
373 count += 1
374 if re.match(pat,line) == None: # Failed to match
375 return False
376 finally:
377 fh.close()
378 return False
379
380 class SNPAnnotation(IntervalAnnotation):
381 file_ext = 'gmap_snps'
382 """
383 Example:
384 >rs62211261 21:14379270 CG
385 >rs62211262 21:14379281 AT
386 >rs62211263 21:14379298 WN
387 Each line must start with a ">" character, then be followed by an
388 identifier (which may have duplicates). Then there should be the
389 chromosomal coordinate of the SNP. (Coordinates are all 1-based, so
390 the first character of a chromosome is number 1.) Finally, there
391 should be the two possible alleles. (Previous versions required that
392 these be in alphabetical order: "AC", "AG", "AT", "CG", "CT", or "GT",
393 but that is no longer a requirement.) These alleles must correspond
394 to the possible nucleotides on the plus strand of the genome. If the
395 one of these two letters does not match the allele in the reference
396 sequence, that SNP will be ignored in subsequent processing as a
397 probable error.
398
399 GSNAP also supports the idea of a wildcard SNP. A wildcard SNP allows
400 all nucleotides to match at that position, not just a given reference
401 and alternate allele. It is essentially as if an "N" were recorded at
402 that genomic location, although the index files still keep track of
403 the reference allele. To indicate that a position has a wildcard SNP,
404 you can indicate the genotype as "WN", where "W" is the reference
405 allele. Another indication of a wildcard SNP is to provide two
406 separate lines at that position with the genotypes "WX" and "WY",
407 where "W" is the reference allele and "X" and "Y" are two different
408 alternate alleles.
409 """
410 def sniff( self, filename ):
411 """
412 Determines whether the file is a gmap SNP annotation file
413 """
414 try:
415 pat = '>(\S+)\s((\S+):(\d+)\s([TACGW][TACGN])$' #>label chr:position ATCG
416 fh = open( filename )
417 count = 0
418 while True and count < 10:
419 line = fh.readline()
420 if not line:
421 break #EOF
422 line = line.strip()
423 if line: #first non-empty line
424 count += 1
425 if re.match(pat,line) == None: # Failed to match
426 return False
427 finally:
428 fh.close()
429 return False
430
431
432 class TallyAnnotation(IntervalAnnotation):
433 file_ext = 'gsnap_tally'
434 """
435 Output produced by gsnap_tally
436 Example:
437 >144 chr20:57268791..57268935
438 G0
439 A1(1@7|1Q-3)
440 A2(1@36,1@1|1Q2,1Q-8)
441 C2 0.889,0.912,0.889,0.889,0.933,0.912,0.912,0.889,0.889,0.889 -2.66,-2.89,-2.66,-2.66,-3.16,-2.89,-2.89,-2.66,-2.66,-2.66
442 C1 T1 0.888,0.9,0.888,0.9,0.913,0.9,0.911,0.888,0.9,0.913 -2.66,-2.78,-2.66,-2.78,-2.91,-2.78,-2.89,-2.66,-2.78,-2.91
443 """
444 def sniff( self, filename ): # TODO
445 """
446 Determines whether the file is a gmap splice site annotation file
447 """
448 try:
449 pat = '^>(\d+)\s((\S+):(\d+)\.\.(\d+))$' #>total chr:position..position
450 pat2 = '^[GATCN]\d.*$' #BaseCountDeatails
451 fh = open( filename )
452 count = 0
453 while True and count < 10:
454 line = fh.readline()
455 if not line:
456 break #EOF
457 line = line.strip()
458 if line: #first non-empty line
459 count += 1
460 if re.match(pat,line) == None and re.match(pat2,line) == None: # Failed to match
461 return False
462 finally:
463 fh.close()
464 return False
465
466 class GsnapResult( Text ):
467 """
468 The default output format for gsnap. Can be used as input for gsnap_tally.
469 """
470 file_ext = 'gsnap'
471
472