6
|
1 """
|
|
2 GMAP indexes
|
|
3 """
|
|
4 import logging
|
|
5 import os,os.path,re
|
|
6 import data
|
|
7 from data import Text
|
|
8 from galaxy import util
|
|
9 from metadata import MetadataElement
|
|
10
|
|
11 log = logging.getLogger(__name__)
|
|
12
|
|
class GmapDB( Text ):
    """
    A GMAP DB for indexes.

    Composite datatype: the primary file is a generated HTML summary and the
    actual index files live under extra_files_path (see set_meta for layout).
    """
    MetadataElement( name="db_name", desc="The db name for this index set", default='unknown', set_in_upload=True, readonly=True )
    MetadataElement( name="basesize", default="12", desc="The basesize for offsetscomp", visible=True, readonly=True )
    MetadataElement( name="kmers", default=[''], desc="The kmer sizes for indexes", visible=True, no_value=[''], readonly=True )
    MetadataElement( name="map_dir", desc="The maps directory", default='unknown', set_in_upload=True, readonly=True )
    MetadataElement( name="maps", default=[''], desc="The names of maps stored for this gmap gmapdb", visible=True, no_value=[''], readonly=True )
    MetadataElement( name="snps", default=[''], desc="The names of SNP indexes stored for this gmapdb", visible=True, no_value=[''], readonly=True )
    MetadataElement( name="cmet", default=False, desc="Has a cmet index", visible=True, readonly=True )
    MetadataElement( name="atoi", default=False, desc="Has a atoi index", visible=True, readonly=True )

    file_ext = 'gmapdb'
    is_binary = True
    composite_type = 'auto_primary_file'
    allow_datatype_change = False

    def generate_primary_file( self, dataset = None ):
        """
        This is called only at upload to write the html file
        cannot rename the datasets here - they come with the default unfortunately
        """
        return '<html><head></head><body>AutoGenerated Primary File for Composite Dataset</body></html>'

    def regenerate_primary_file(self,dataset):
        """
        Rewrite the primary (HTML) file from metadata.
        cannot do this until we are setting metadata
        """
        bn = dataset.metadata.db_name
        log.info( "GmapDB regenerate_primary_file %s" % (bn))
        rval = ['<html><head><title>GMAPDB %s</title></head><p/><H3>GMAPDB %s</H3><p/>cmet %s<br>atoi %s<H4>Maps:</H4><ul>' % (bn,bn,dataset.metadata.cmet,dataset.metadata.atoi)]
        for name in dataset.metadata.maps:
            rval.append( '<li>%s' % name)
        rval.append( '</ul></html>' )
        # open() instead of the removed py2-only file(); with-block guarantees close
        with open(dataset.file_name, 'w') as f:
            f.write("\n".join( rval ))
            f.write('\n')

    def set_peek( self, dataset, is_multi_byte=False ):
        """Summarize the DB name and available indexes/maps in the peek."""
        log.info( "GmapDB set_peek %s" % (dataset))
        if not dataset.dataset.purged:
            dataset.peek = "GMAPDB index %s\n cmet %s\n atoi %s\n maps %s" % ( dataset.metadata.db_name,dataset.metadata.cmet,dataset.metadata.atoi,dataset.metadata.maps )
            dataset.blurb = "GMAPDB %s" % ( dataset.metadata.db_name )
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def display_peek( self, dataset ):
        try:
            return dataset.peek
        except:
            return "GMAP index file"

    def sniff( self, filename ):
        # composite binary datatype: cannot be detected from a single file
        return False

    def set_meta( self, dataset, overwrite = True, **kwd ):
        """
        Expecting:
        extra_files_path/<db_name>/db_name>.ref<basesize><kmer>3<index>
        extra_files_path/db_name/db_name.ref1[2345]1[2345]3offsetscomp
        extra_files_path/db_name/db_name.ref1[2345]1[2345]3positions
        extra_files_path/db_name/db_name.ref1[2345]1[2345]3gammaptrs
        index maps:
        extra_files_path/db_name/db_name.maps/*.iit
        """
        log.info( "GmapDB set_meta %s %s" % (dataset,dataset.extra_files_path))
        # groups: 1=db_name, 3='ref', 4='met', 5='a2i', 7=basesize, 8=kmer, 10=snp-index suffix
        pat = r'(.*)\.((ref)|(met)[atgc][atgc]|(a2i)[atgc][atgc])((\d\d)(\d\d))?3positions(\.(.+))?'
        efp = dataset.extra_files_path
        flist = os.listdir(efp)
        for i,fname in enumerate(flist):
            log.info( "GmapDB set_meta %s %s" % (i,fname))
            fpath = os.path.join(efp,fname)
            if os.path.isdir(fpath):
                ilist = os.listdir(fpath)
                kmers = {'':'default'} # HACK '' empty key added so user has default choice when selecting kmer from metadata
                for j,iname in enumerate(ilist):
                    log.info( "GmapDB set_meta file %s %s" % (j,iname))
                    ipath = os.path.join(fpath,iname)
                    if os.path.isdir(ipath): # find maps
                        dataset.metadata.map_dir = iname
                        for mapfile in os.listdir(ipath):
                            mapname = mapfile.replace('.iit','')
                            log.info( "GmapDB set_meta map %s %s" % (mapname,mapfile))
                            dataset.metadata.maps.append(mapname)
                    else:
                        m = re.match(pat,iname)
                        if m:
                            log.info( "GmapDB set_meta m %s %s " % (iname, m))
                            assert len(m.groups()) == 10
                            dataset.metadata.db_name = fname
                            if m.groups()[2] == 'ref':
                                if m.groups()[-1] != None:
                                    # a .<snp_name> suffix marks a SNP index
                                    dataset.metadata.snps.append(m.groups()[-1])
                                else:
                                    if m.groups()[-3] != None:
                                        k = int(m.groups()[-3])
                                        kmers[k] = k
                                    if m.groups()[-4] != None:
                                        dataset.metadata.basesize = int( m.groups()[-4])
                            elif m.groups()[3] == 'met':
                                dataset.metadata.cmet = True
                            elif m.groups()[4] == 'a2i':
                                dataset.metadata.atoi = True
                # list() so this also works under py3, where keys() is a view
                dataset.metadata.kmers = list(kmers.keys())
|
|
118
|
|
class GmapSnpIndex( Text ):
    """
    A GMAP SNP index created by snpindex.

    Composite datatype: the primary file is a generated HTML summary and the
    index files live under extra_files_path (see set_meta for layout).
    """
    MetadataElement( name="db_name", desc="The db name for this index set", default='unknown', set_in_upload=True, readonly=True )
    MetadataElement( name="snps_name", default='snps', desc="The name of SNP index", visible=True, no_value='', readonly=True )

    file_ext = 'gmapsnpindex'
    is_binary = True
    composite_type = 'auto_primary_file'
    allow_datatype_change = False

    def generate_primary_file( self, dataset = None ):
        """
        This is called only at upload to write the html file
        cannot rename the datasets here - they come with the default unfortunately
        """
        return '<html><head></head><body>AutoGenerated Primary File for Composite Dataset</body></html>'

    def regenerate_primary_file(self,dataset):
        """
        Rewrite the primary (HTML) file from metadata.
        cannot do this until we are setting metadata
        """
        # NOTE: previously copy-pasted from GmapDB and referenced
        # dataset.metadata.cmet/atoi/maps, which this class does not declare;
        # report the metadata this datatype actually has.
        bn = dataset.metadata.db_name
        sn = dataset.metadata.snps_name
        log.info( "GmapSnpIndex regenerate_primary_file %s" % (bn))
        rval = ['<html><head><title>GMAP SNP index %s</title></head><p/><H3>GMAP SNP index %s on %s</H3></html>' % (sn,sn,bn)]
        # open() instead of the removed py2-only file(); with-block guarantees close
        with open(dataset.file_name, 'w') as f:
            f.write("\n".join( rval ))
            f.write('\n')

    def set_peek( self, dataset, is_multi_byte=False ):
        """Summarize the SNP index name and its parent DB in the peek."""
        log.info( "GmapSnpIndex set_peek %s" % (dataset))
        if not dataset.dataset.purged:
            dataset.peek = "GMAP SNPindex %s on %s\n" % ( dataset.metadata.snps_name,dataset.metadata.db_name)
            dataset.blurb = "GMAP SNPindex %s on %s\n" % ( dataset.metadata.snps_name,dataset.metadata.db_name)
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def display_peek( self, dataset ):
        try:
            return dataset.peek
        except:
            return "GMAP SNP index"

    def sniff( self, filename ):
        # composite binary datatype: cannot be detected from a single file
        return False

    def set_meta( self, dataset, overwrite = True, **kwd ):
        """
        Expecting:
        extra_files_path/snp_name.iit
        extra_files_path/db_name/db_name.ref1[2345]1[2345]3offsetscomp.snp_name
        extra_files_path/db_name/db_name.ref1[2345]1[2345]3positions.snp_name
        extra_files_path/db_name/db_name.ref1[2345]1[2345]3gammaptrs.snp_name
        """
        log.info( "GmapSnpIndex set_meta %s %s" % (dataset,dataset.extra_files_path))
        # groups: 1=db_name, 6=snp index name suffix
        pat = r'(.*)\.(ref((\d\d)(\d\d))?3positions)\.(.+)?'
        efp = dataset.extra_files_path
        flist = os.listdir(efp)
        for i,fname in enumerate(flist):
            m = re.match(pat,fname)
            if m:
                assert len(m.groups()) == 6
                dataset.metadata.db_name = m.groups()[0]
                dataset.metadata.snps_name = m.groups()[-1]
|
|
186
|
|
187
|
|
188
|
|
189
|
|
class IntervalIndexTree( Text ):
    """
    A GMAP Interval Index Tree Map produced by iit_store,
    stored on disk as (/path/to/map)/(mapname).iit
    """
    file_ext = 'iit'
    is_binary = True
|
|
198
|
|
class SpliceSitesIntervalIndexTree( IntervalIndexTree ):
    """
    Splice-site variant of the GMAP Interval Index Tree Map
    produced by iit_store.
    """
    file_ext = 'splicesites.iit'
|
|
205
|
|
class IntronsIntervalIndexTree( IntervalIndexTree ):
    """
    Intron variant of the GMAP Interval Index Tree Map
    produced by iit_store.
    """
    file_ext = 'introns.iit'
|
|
212
|
|
class SNPsIntervalIndexTree( IntervalIndexTree ):
    """
    SNP variant of the GMAP Interval Index Tree Map
    produced by iit_store.
    """
    file_ext = 'snps.iit'
|
|
219
|
|
class TallyIntervalIndexTree( IntervalIndexTree ):
    """
    Tally variant of the GMAP Interval Index Tree Map
    produced by iit_store.
    """
    file_ext = 'tally.iit'
|
|
226
|
|
class IntervalAnnotation( Text ):
    """
    Class describing a GMAP Interval format:
    >label coords optional_tag
    optional_annotation (which may be zero, one, or multiple lines)
    The coords should be of the form:
        chr:position
        chr:startposition..endposition
    """
    file_ext = 'gmap_annotation'
    # Metadata elements
    MetadataElement( name="annotations", default=0, desc="Number of interval annotations", readonly=True, optional=True, visible=False, no_value=0 )

    def set_meta( self, dataset, **kwd ):
        """
        Set the number of annotations and the number of data lines in dataset.
        """
        data_lines = 0
        annotations = 0
        # open() instead of the removed py2-only file()
        for line in open( dataset.file_name ):
            # every line counts as a data line; '>' headers also count as annotations
            data_lines += 1
            if line.strip().startswith( '>' ):
                annotations += 1
        dataset.metadata.data_lines = data_lines
        dataset.metadata.annotations = annotations

    def set_peek( self, dataset, is_multi_byte=False ):
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
            if dataset.metadata.annotations:
                dataset.blurb = "%s annotations" % util.commaify( str( dataset.metadata.annotations ) )
            else:
                dataset.blurb = data.nice_size( dataset.get_size() )
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff( self, filename ):
        """
        Determines whether the file is a gmap annotation file
        Format:
        >label coords optional_tag
        optional_annotation (which may be zero, one, or multiple lines)
        For example, the label may be an EST accession, with the coords
        representing its genomic position.  Labels may be duplicated if
        necessary.
        The coords should be of the form
            chr:position
            chr:startposition..endposition
        The term "chr:position" is equivalent to "chr:position..position".  If
        you want to indicate that the interval is on the minus strand or
        reverse direction, then <endposition> may be less than <startposition>.
        """
        #>label chr:position[..endposition][ optional_tag]
        # raw string; the original pattern had an unbalanced '(' which made
        # re.match raise instead of matching
        pat = r'>(\S+)\s((\S+):(\d+)(\.\.(\d+))?)(\s.(.+))?$'
        fh = open( filename )  # opened before try so fh is bound for finally
        try:
            count = 0
            while count < 10:
                line = fh.readline()
                if not line:
                    break #EOF
                line = line.strip()
                if line: #first non-empty line
                    if line.startswith( '>' ):
                        count += 1
                        if re.match(pat,line) is None: # Failed to match
                            return False
        finally:
            fh.close()
        # True only when at least one header line matched (was always False)
        return count > 0
|
|
299
|
|
class SpliceSiteAnnotation(IntervalAnnotation):
    """
    Example:
        >NM_004448.ERBB2.exon1 17:35110090..35110091 donor 6678
        >NM_004448.ERBB2.exon2 17:35116768..35116769 acceptor 6678
        >NM_004448.ERBB2.exon2 17:35116920..35116921 donor 1179
        >NM_004448.ERBB2.exon3 17:35118099..35118100 acceptor 1179
        >NM_004449.ERG.exon1 21:38955452..38955451 donor 783
        >NM_004449.ERG.exon2 21:38878740..38878739 acceptor 783
        >NM_004449.ERG.exon2 21:38878638..38878637 donor 360
        >NM_004449.ERG.exon3 21:38869542..38869541 acceptor 360
    Each line must start with a ">" character, then be followed by an
    identifier, which may have duplicates and can have any format, with
    the gene name or exon number shown here only as a suggestion.  Then
    there should be the chromosomal coordinates which straddle the
    exon-intron boundary, so one coordinate is on the exon and one is on
    the intron.  (Coordinates are all 1-based, so the first character of a
    chromosome is number 1.)  Finally, there should be the splice type:
    "donor" or "acceptor".  You may optionally store the intron distance
    at the end.  GSNAP can use this intron distance, if it is longer than
    its value for --localsplicedist, to look for long introns at that
    splice site.  The same splice site may have different intron distances
    in the database; GSNAP will use the longest intron distance reported
    in searching for long introns.
    """
    file_ext = 'gmap_splicesites'

    def sniff( self, filename ):
        """
        Determines whether the file is a gmap splice site annotation file
        """
        #>label chr:position..position donor|acceptor[ intron_dist]
        # label is free-form (docstring examples use '.exonN'); the original
        # pattern required '.intron<digits>', copy-pasted from the intron sniffer
        pat = r'>(\S+)\s((\S+):(\d+)\.\.(\d+))\s(donor|acceptor)(\s(\d+))?$'
        fh = open( filename )  # opened before try so fh is bound for finally
        try:
            count = 0
            while count < 10:
                line = fh.readline()
                if not line:
                    break #EOF
                line = line.strip()
                if line: #first non-empty line
                    count += 1
                    if re.match(pat,line) is None: # Failed to match
                        return False
        finally:
            fh.close()
        # True only when at least one line matched (was always False)
        return count > 0
|
|
346
|
|
class IntronAnnotation(IntervalAnnotation):
    """
    Example:
        >NM_004448.ERBB2.intron1 17:35110090..35116769
        >NM_004448.ERBB2.intron2 17:35116920..35118100
        >NM_004449.ERG.intron1 21:38955452..38878739
        >NM_004449.ERG.intron2 21:38878638..38869541
    The coordinates are 1-based, and specify the exon coordinates
    surrounding the intron, with the first coordinate being from the donor
    exon and the second one being from the acceptor exon.
    """
    file_ext = 'gmap_introns'

    def sniff( self, filename ):
        """
        Determines whether the file is a gmap Intron annotation file
        """
        #>label chr:position..position[ optional_tag]
        # raw string; the original pattern had an unbalanced '(' which made
        # re.match raise instead of matching
        pat = r'>(\S+\.intron\d+)\s((\S+):(\d+)\.\.(\d+))(\s(.+))?$'
        fh = open( filename )  # opened before try so fh is bound for finally
        try:
            count = 0
            while count < 10:
                line = fh.readline()
                if not line:
                    break #EOF
                line = line.strip()
                if line: #first non-empty line
                    count += 1
                    if re.match(pat,line) is None: # Failed to match
                        return False
        finally:
            fh.close()
        # True only when at least one line matched (was always False)
        return count > 0
|
|
379
|
|
class SNPAnnotation(IntervalAnnotation):
    """
    Example:
        >rs62211261 21:14379270 CG
        >rs62211262 21:14379281 AT
        >rs62211263 21:14379298 WN
    Each line must start with a ">" character, then be followed by an
    identifier (which may have duplicates).  Then there should be the
    chromosomal coordinate of the SNP.  (Coordinates are all 1-based, so
    the first character of a chromosome is number 1.)  Finally, there
    should be the two possible alleles.  (Previous versions required that
    these be in alphabetical order: "AC", "AG", "AT", "CG", "CT", or "GT",
    but that is no longer a requirement.)  These alleles must correspond
    to the possible nucleotides on the plus strand of the genome.  If the
    one of these two letters does not match the allele in the reference
    sequence, that SNP will be ignored in subsequent processing as a
    probable error.

    GSNAP also supports the idea of a wildcard SNP.  A wildcard SNP allows
    all nucleotides to match at that position, not just a given reference
    and alternate allele.  It is essentially as if an "N" were recorded at
    that genomic location, although the index files still keep track of
    the reference allele.  To indicate that a position has a wildcard SNP,
    you can indicate the genotype as "WN", where "W" is the reference
    allele.  Another indication of a wildcard SNP is to provide two
    separate lines at that position with the genotypes "WX" and "WY",
    where "W" is the reference allele and "X" and "Y" are two different
    alternate alleles.
    """
    file_ext = 'gmap_snps'

    def sniff( self, filename ):
        """
        Determines whether the file is a gmap SNP annotation file
        """
        #>label chr:position alleles
        # raw string; the original pattern had an unbalanced '(' which made
        # re.match raise instead of matching
        pat = r'>(\S+)\s((\S+):(\d+))\s([TACGW][TACGN])$'
        fh = open( filename )  # opened before try so fh is bound for finally
        try:
            count = 0
            while count < 10:
                line = fh.readline()
                if not line:
                    break #EOF
                line = line.strip()
                if line: #first non-empty line
                    count += 1
                    if re.match(pat,line) is None: # Failed to match
                        return False
        finally:
            fh.close()
        # True only when at least one line matched (was always False)
        return count > 0
|
|
430
|
|
431
|
|
class TallyAnnotation(IntervalAnnotation):
    """
    Output produced by gsnap_tally
    Example:
        >144 chr20:57268791..57268935
        G0
        A1(1@7|1Q-3)
        A2(1@36,1@1|1Q2,1Q-8)
        C2 0.889,0.912,0.889,0.889,0.933,0.912,0.912,0.889,0.889,0.889 -2.66,-2.89,-2.66,-2.66,-3.16,-2.89,-2.89,-2.66,-2.66,-2.66
        C1 T1 0.888,0.9,0.888,0.9,0.913,0.9,0.911,0.888,0.9,0.913 -2.66,-2.78,-2.66,-2.78,-2.91,-2.78,-2.89,-2.66,-2.78,-2.91
    """
    file_ext = 'gsnap_tally'

    def sniff( self, filename ):
        """
        Determines whether the file is a gsnap tally file
        """
        pat = r'^>(\d+)\s((\S+):(\d+)\.\.(\d+))$' #>total chr:position..position
        pat2 = r'^[GATCN]\d.*$' #BaseCountDetails
        fh = open( filename )  # opened before try so fh is bound for finally
        try:
            count = 0
            while count < 10:
                line = fh.readline()
                if not line:
                    break #EOF
                line = line.strip()
                if line: #first non-empty line
                    count += 1
                    # each non-empty line is either a header or a base-count line
                    if re.match(pat,line) is None and re.match(pat2,line) is None: # Failed to match
                        return False
        finally:
            fh.close()
        # True only when at least one line matched (was always False)
        return count > 0
|
|
465
|
|
class GsnapResult( Text ):
    """
    Default gsnap output; suitable as input to gsnap_tally.
    """
    file_ext = 'gsnap'
|
|
471
|
|
472
|