Mercurial > repos > jjohnson > gmap
comparison lib/galaxy/datatypes/gmap.py @ 11:6adc485b6dc0 draft default tip
Uploaded
author | jjohnson |
---|---|
date | Tue, 31 Jul 2012 08:19:46 -0400 |
parents | 93911bac43da |
children |
comparison
equal
deleted
inserted
replaced
10:93911bac43da | 11:6adc485b6dc0 |
---|---|
1 """ | |
2 GMAP indexes | |
3 """ | |
4 import logging | |
5 import os,os.path,re | |
6 import galaxy.datatypes.data | |
7 from galaxy.datatypes.data import Text | |
8 from galaxy import util | |
9 from galaxy.datatypes.metadata import MetadataElement | |
10 | |
11 log = logging.getLogger(__name__) | |
12 | |
13 class GmapDB( Text ): | |
14 """ | |
15 A GMAP DB for indexes | |
16 """ | |
17 MetadataElement( name="db_name", desc="The db name for this index set", default='unknown', set_in_upload=True, readonly=True ) | |
18 MetadataElement( name="basesize", default="12", desc="The basesize for offsetscomp", visible=True, readonly=True ) | |
19 MetadataElement( name="kmers", default=[''], desc="The kmer sizes for indexes", visible=True, no_value=[''], readonly=True ) | |
20 MetadataElement( name="map_dir", desc="The maps directory", default='unknown', set_in_upload=True, readonly=True ) | |
21 MetadataElement( name="maps", default=[''], desc="The names of maps stored for this gmap gmapdb", visible=True, no_value=[''], readonly=True ) | |
22 MetadataElement( name="snps", default=[''], desc="The names of SNP indexes stored for this gmapdb", visible=True, no_value=[''], readonly=True ) | |
23 MetadataElement( name="cmet", default=False, desc="Has a cmet index", visible=True, readonly=True ) | |
24 MetadataElement( name="atoi", default=False, desc="Has a atoi index", visible=True, readonly=True ) | |
25 | |
26 file_ext = 'gmapdb' | |
27 is_binary = True | |
28 composite_type = 'auto_primary_file' | |
29 allow_datatype_change = False | |
30 | |
31 def generate_primary_file( self, dataset = None ): | |
32 """ | |
33 This is called only at upload to write the html file | |
34 cannot rename the datasets here - they come with the default unfortunately | |
35 """ | |
36 return '<html><head></head><body>AutoGenerated Primary File for Composite Dataset</body></html>' | |
37 | |
38 def regenerate_primary_file(self,dataset): | |
39 """ | |
40 cannot do this until we are setting metadata | |
41 """ | |
42 bn = dataset.metadata.db_name | |
43 log.info( "GmapDB regenerate_primary_file %s" % (bn)) | |
44 rval = ['<html><head><title>GMAPDB %s</title></head><p/><H3>GMAPDB %s</H3><p/>cmet %s<br>atoi %s<H4>Maps:</H4><ul>' % (bn,bn,dataset.metadata.cmet,dataset.metadata.atoi)] | |
45 for i,name in enumerate(dataset.metadata.maps): | |
46 rval.append( '<li>%s' % name) | |
47 rval.append( '</ul></html>' ) | |
48 f = file(dataset.file_name,'w') | |
49 f.write("\n".join( rval )) | |
50 f.write('\n') | |
51 f.close() | |
52 | |
53 def set_peek( self, dataset, is_multi_byte=False ): | |
54 log.info( "GmapDB set_peek %s" % (dataset)) | |
55 if not dataset.dataset.purged: | |
56 dataset.peek = "GMAPDB index %s\n cmet %s\n atoi %s\n maps %s" % ( dataset.metadata.db_name,dataset.metadata.cmet,dataset.metadata.atoi,dataset.metadata.maps ) | |
57 dataset.blurb = "GMAPDB %s" % ( dataset.metadata.db_name ) | |
58 else: | |
59 dataset.peek = 'file does not exist' | |
60 dataset.blurb = 'file purged from disk' | |
61 def display_peek( self, dataset ): | |
62 try: | |
63 return dataset.peek | |
64 except: | |
65 return "GMAP index file" | |
66 | |
67 def sniff( self, filename ): | |
68 return False | |
69 def set_meta( self, dataset, overwrite = True, **kwd ): | |
70 """ | |
71 Expecting: | |
72 extra_files_path/<db_name>/db_name>.ref<basesize><kmer>3<index> | |
73 extra_files_path/db_name/db_name.ref1[2345]1[2345]3offsetscomp | |
74 extra_files_path/db_name/db_name.ref1[2345]1[2345]3positions | |
75 extra_files_path/db_name/db_name.ref1[2345]1[2345]3gammaptrs | |
76 index maps: | |
77 extra_files_path/db_name/db_name.maps/*.iit | |
78 """ | |
79 log.info( "GmapDB set_meta %s %s" % (dataset,dataset.extra_files_path)) | |
80 pat = '(.*)\.((ref)|(met)[atgc][atgc]|(a2i)[atgc][atgc])((\d\d)(\d\d))?3positions(\.(.+))?' | |
81 efp = dataset.extra_files_path | |
82 flist = os.listdir(efp) | |
83 for i,fname in enumerate(flist): | |
84 log.info( "GmapDB set_meta %s %s" % (i,fname)) | |
85 fpath = os.path.join(efp,fname) | |
86 if os.path.isdir(fpath): | |
87 ilist = os.listdir(fpath) | |
88 kmers = {'':'default'} # HACK '' empty key added so user has default choice when selecting kmer from metadata | |
89 for j,iname in enumerate(ilist): | |
90 log.info( "GmapDB set_meta file %s %s" % (j,iname)) | |
91 ipath = os.path.join(fpath,iname) | |
92 if os.path.isdir(ipath): # find maps | |
93 dataset.metadata.map_dir = iname | |
94 for mapfile in os.listdir(ipath): | |
95 mapname = mapfile.replace('.iit','') | |
96 log.info( "GmapDB set_meta map %s %s" % (mapname,mapfile)) | |
97 dataset.metadata.maps.append(mapname) | |
98 else: | |
99 m = re.match(pat,iname) | |
100 if m: | |
101 log.info( "GmapDB set_meta m %s %s " % (iname, m)) | |
102 assert len(m.groups()) == 10 | |
103 dataset.metadata.db_name = fname | |
104 if m.groups()[2] == 'ref': | |
105 if m.groups()[-1] != None: | |
106 dataset.metadata.snps.append(m.groups()[-1]) | |
107 else: | |
108 if m.groups()[-3] != None: | |
109 k = int(m.groups()[-3]) | |
110 kmers[k] = k | |
111 if m.groups()[-4] != None: | |
112 dataset.metadata.basesize = int( m.groups()[-4]) | |
113 elif m.groups()[3] == 'met': | |
114 dataset.metadata.cmet = True | |
115 elif m.groups()[4] == 'a2i': | |
116 dataset.metadata.atoi = True | |
117 dataset.metadata.kmers = kmers.keys() | |
118 | |
119 class GmapSnpIndex( Text ): | |
120 """ | |
121 A GMAP SNP index created by snpindex | |
122 """ | |
123 MetadataElement( name="db_name", desc="The db name for this index set", default='unknown', set_in_upload=True, readonly=True ) | |
124 MetadataElement( name="snps_name", default='snps', desc="The name of SNP index", visible=True, no_value='', readonly=True ) | |
125 | |
126 file_ext = 'gmapsnpindex' | |
127 is_binary = True | |
128 composite_type = 'auto_primary_file' | |
129 allow_datatype_change = False | |
130 | |
131 def generate_primary_file( self, dataset = None ): | |
132 """ | |
133 This is called only at upload to write the html file | |
134 cannot rename the datasets here - they come with the default unfortunately | |
135 """ | |
136 return '<html><head></head><body>AutoGenerated Primary File for Composite Dataset</body></html>' | |
137 | |
138 def regenerate_primary_file(self,dataset): | |
139 """ | |
140 cannot do this until we are setting metadata | |
141 """ | |
142 bn = dataset.metadata.db_name | |
143 log.info( "GmapDB regenerate_primary_file %s" % (bn)) | |
144 rval = ['<html><head><title>GMAPDB %s</title></head><p/><H3>GMAPDB %s</H3><p/>cmet %s<br>atoi %s<H4>Maps:</H4><ul>' % (bn,bn,dataset.metadata.cmet,dataset.metadata.atoi)] | |
145 for i,name in enumerate(dataset.metadata.maps): | |
146 rval.append( '<li>%s' % name) | |
147 rval.append( '</ul></html>' ) | |
148 f = file(dataset.file_name,'w') | |
149 f.write("\n".join( rval )) | |
150 f.write('\n') | |
151 f.close() | |
152 def set_peek( self, dataset, is_multi_byte=False ): | |
153 log.info( "GmapSnpIndex set_peek %s" % (dataset)) | |
154 if not dataset.dataset.purged: | |
155 dataset.peek = "GMAP SNPindex %s on %s\n" % ( dataset.metadata.snps_name,dataset.metadata.db_name) | |
156 dataset.blurb = "GMAP SNPindex %s on %s\n" % ( dataset.metadata.snps_name,dataset.metadata.db_name) | |
157 else: | |
158 dataset.peek = 'file does not exist' | |
159 dataset.blurb = 'file purged from disk' | |
160 def display_peek( self, dataset ): | |
161 try: | |
162 return dataset.peek | |
163 except: | |
164 return "GMAP SNP index" | |
165 | |
166 def sniff( self, filename ): | |
167 return False | |
168 def set_meta( self, dataset, overwrite = True, **kwd ): | |
169 """ | |
170 Expecting: | |
171 extra_files_path/snp_name.iit | |
172 extra_files_path/db_name/db_name.ref1[2345]1[2345]3offsetscomp.snp_name | |
173 extra_files_path/db_name/db_name.ref1[2345]1[2345]3positions.snp_name | |
174 extra_files_path/db_name/db_name.ref1[2345]1[2345]3gammaptrs.snp_name | |
175 """ | |
176 log.info( "GmapSnpIndex set_meta %s %s" % (dataset,dataset.extra_files_path)) | |
177 pat = '(.*)\.(ref((\d\d)(\d\d))?3positions)\.(.+)?' | |
178 efp = dataset.extra_files_path | |
179 flist = os.listdir(efp) | |
180 for i,fname in enumerate(flist): | |
181 m = re.match(pat,fname) | |
182 if m: | |
183 assert len(m.groups()) == 6 | |
184 dataset.metadata.db_name = m.groups()[0] | |
185 dataset.metadata.snps_name = m.groups()[-1] | |
186 | |
187 | |
188 | |
189 | |
190 class IntervalIndexTree( Text ): | |
191 """ | |
192 A GMAP Interval Index Tree Map | |
193 created by iit_store | |
194 (/path/to/map)/(mapname).iit | |
195 """ | |
196 file_ext = 'iit' | |
197 is_binary = True | |
198 | |
199 class SpliceSitesIntervalIndexTree( IntervalIndexTree ): | |
200 """ | |
201 A GMAP Interval Index Tree Map | |
202 created by iit_store | |
203 """ | |
204 file_ext = 'splicesites.iit' | |
205 | |
206 class IntronsIntervalIndexTree( IntervalIndexTree ): | |
207 """ | |
208 A GMAP Interval Index Tree Map | |
209 created by iit_store | |
210 """ | |
211 file_ext = 'introns.iit' | |
212 | |
213 class SNPsIntervalIndexTree( IntervalIndexTree ): | |
214 """ | |
215 A GMAP Interval Index Tree Map | |
216 created by iit_store | |
217 """ | |
218 file_ext = 'snps.iit' | |
219 | |
220 class TallyIntervalIndexTree( IntervalIndexTree ): | |
221 """ | |
222 A GMAP Interval Index Tree Map | |
223 created by iit_store | |
224 """ | |
225 file_ext = 'tally.iit' | |
226 | |
227 class IntervalAnnotation( Text ): | |
228 """ | |
229 Class describing a GMAP Interval format: | |
230 >label coords optional_tag | |
231 optional_annotation (which may be zero, one, or multiple lines) | |
232 The coords should be of the form: | |
233 chr:position | |
234 chr:startposition..endposition | |
235 """ | |
236 file_ext = 'gmap_annotation' | |
237 """Add metadata elements""" | |
238 MetadataElement( name="annotations", default=0, desc="Number of interval annotations", readonly=True, optional=True, visible=False, no_value=0 ) | |
239 | |
240 def set_meta( self, dataset, **kwd ): | |
241 """ | |
242 Set the number of annotations and the number of data lines in dataset. | |
243 """ | |
244 data_lines = 0 | |
245 annotations = 0 | |
246 for line in file( dataset.file_name ): | |
247 line = line.strip() | |
248 if line and line.startswith( '>' ): | |
249 annotations += 1 | |
250 data_lines +=1 | |
251 else: | |
252 data_lines += 1 | |
253 dataset.metadata.data_lines = data_lines | |
254 dataset.metadata.annotations = annotations | |
255 def set_peek( self, dataset, is_multi_byte=False ): | |
256 if not dataset.dataset.purged: | |
257 dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte ) | |
258 if dataset.metadata.annotations: | |
259 dataset.blurb = "%s annotations" % util.commaify( str( dataset.metadata.annotations ) ) | |
260 else: | |
261 dataset.blurb = data.nice_size( dataset.get_size() ) | |
262 else: | |
263 dataset.peek = 'file does not exist' | |
264 dataset.blurb = 'file purged from disk' | |
265 | |
266 def sniff( self, filename ): | |
267 """ | |
268 Determines whether the file is a gmap annotation file | |
269 Format: | |
270 >label coords optional_tag | |
271 optional_annotation (which may be zero, one, or multiple lines) | |
272 For example, the label may be an EST accession, with the coords | |
273 representing its genomic position. Labels may be duplicated if | |
274 necessary. | |
275 The coords should be of the form | |
276 chr:position | |
277 chr:startposition..endposition | |
278 The term "chr:position" is equivalent to "chr:position..position". If | |
279 you want to indicate that the interval is on the minus strand or | |
280 reverse direction, then <endposition> may be less than <startposition>. | |
281 """ | |
282 try: | |
283 pat = '>(\S+)\s((\S+):(\d+)(\.\.(\d+))?(\s.(.+))?$' #>label chr:position[..endposition][ optional_tag] | |
284 fh = open( filename ) | |
285 count = 0 | |
286 while True and count < 10: | |
287 line = fh.readline() | |
288 if not line: | |
289 break #EOF | |
290 line = line.strip() | |
291 if line: #first non-empty line | |
292 if line.startswith( '>' ): | |
293 count += 1 | |
294 if re.match(pat,line) == None: # Failed to match | |
295 return False | |
296 finally: | |
297 fh.close() | |
298 return False | |
299 | |
300 class SpliceSiteAnnotation(IntervalAnnotation): | |
301 file_ext = 'gmap_splicesites' | |
302 """ | |
303 Example: | |
304 >NM_004448.ERBB2.exon1 17:35110090..35110091 donor 6678 | |
305 >NM_004448.ERBB2.exon2 17:35116768..35116769 acceptor 6678 | |
306 >NM_004448.ERBB2.exon2 17:35116920..35116921 donor 1179 | |
307 >NM_004448.ERBB2.exon3 17:35118099..35118100 acceptor 1179 | |
308 >NM_004449.ERG.exon1 21:38955452..38955451 donor 783 | |
309 >NM_004449.ERG.exon2 21:38878740..38878739 acceptor 783 | |
310 >NM_004449.ERG.exon2 21:38878638..38878637 donor 360 | |
311 >NM_004449.ERG.exon3 21:38869542..38869541 acceptor 360 | |
312 Each line must start with a ">" character, then be followed by an | |
313 identifier, which may have duplicates and can have any format, with | |
314 the gene name or exon number shown here only as a suggestion. Then | |
315 there should be the chromosomal coordinates which straddle the | |
316 exon-intron boundary, so one coordinate is on the exon and one is on | |
317 the intron. (Coordinates are all 1-based, so the first character of a | |
318 chromosome is number 1.) Finally, there should be the splice type: | |
319 "donor" or "acceptor". You may optionally store the intron distance | |
320 at the end. GSNAP can use this intron distance, if it is longer than | |
321 its value for --localsplicedist, to look for long introns at that | |
322 splice site. The same splice site may have different intron distances | |
323 in the database; GSNAP will use the longest intron distance reported | |
324 in searching for long introns. | |
325 """ | |
326 def sniff( self, filename ): # TODO | |
327 """ | |
328 Determines whether the file is a gmap splice site annotation file | |
329 """ | |
330 try: | |
331 pat = '>(\S+\.intron\d+)\s((\S+):(\d+)\.\.(\d+))\s(donor|acceptor)(\s(\d+))?$' #>label chr:position..position donor|acceptor[ intron_dist] | |
332 fh = open( filename ) | |
333 count = 0 | |
334 while True and count < 10: | |
335 line = fh.readline() | |
336 if not line: | |
337 break #EOF | |
338 line = line.strip() | |
339 if line: #first non-empty line | |
340 count += 1 | |
341 if re.match(pat,line) == None: # Failed to match | |
342 return False | |
343 finally: | |
344 fh.close() | |
345 return False | |
346 | |
347 class IntronAnnotation(IntervalAnnotation): | |
348 file_ext = 'gmap_introns' | |
349 """ | |
350 Example: | |
351 >NM_004448.ERBB2.intron1 17:35110090..35116769 | |
352 >NM_004448.ERBB2.intron2 17:35116920..35118100 | |
353 >NM_004449.ERG.intron1 21:38955452..38878739 | |
354 >NM_004449.ERG.intron2 21:38878638..38869541 | |
355 The coordinates are 1-based, and specify the exon coordinates | |
356 surrounding the intron, with the first coordinate being from the donor | |
357 exon and the second one being from the acceptor exon. | |
358 """ | |
359 def sniff( self, filename ): # TODO | |
360 """ | |
361 Determines whether the file is a gmap Intron annotation file | |
362 """ | |
363 try: | |
364 pat = '>(\S+\.intron\d+)\s((\S+):(\d+)\.\.(\d+)(\s(.)+)?$' #>label chr:position | |
365 fh = open( filename ) | |
366 count = 0 | |
367 while True and count < 10: | |
368 line = fh.readline() | |
369 if not line: | |
370 break #EOF | |
371 line = line.strip() | |
372 if line: #first non-empty line | |
373 count += 1 | |
374 if re.match(pat,line) == None: # Failed to match | |
375 return False | |
376 finally: | |
377 fh.close() | |
378 return False | |
379 | |
380 class SNPAnnotation(IntervalAnnotation): | |
381 file_ext = 'gmap_snps' | |
382 """ | |
383 Example: | |
384 >rs62211261 21:14379270 CG | |
385 >rs62211262 21:14379281 AT | |
386 >rs62211263 21:14379298 WN | |
387 Each line must start with a ">" character, then be followed by an | |
388 identifier (which may have duplicates). Then there should be the | |
389 chromosomal coordinate of the SNP. (Coordinates are all 1-based, so | |
390 the first character of a chromosome is number 1.) Finally, there | |
391 should be the two possible alleles. (Previous versions required that | |
392 these be in alphabetical order: "AC", "AG", "AT", "CG", "CT", or "GT", | |
393 but that is no longer a requirement.) These alleles must correspond | |
394 to the possible nucleotides on the plus strand of the genome. If the | |
395 one of these two letters does not match the allele in the reference | |
396 sequence, that SNP will be ignored in subsequent processing as a | |
397 probable error. | |
398 | |
399 GSNAP also supports the idea of a wildcard SNP. A wildcard SNP allows | |
400 all nucleotides to match at that position, not just a given reference | |
401 and alternate allele. It is essentially as if an "N" were recorded at | |
402 that genomic location, although the index files still keep track of | |
403 the reference allele. To indicate that a position has a wildcard SNP, | |
404 you can indicate the genotype as "WN", where "W" is the reference | |
405 allele. Another indication of a wildcard SNP is to provide two | |
406 separate lines at that position with the genotypes "WX" and "WY", | |
407 where "W" is the reference allele and "X" and "Y" are two different | |
408 alternate alleles. | |
409 """ | |
410 def sniff( self, filename ): | |
411 """ | |
412 Determines whether the file is a gmap SNP annotation file | |
413 """ | |
414 try: | |
415 pat = '>(\S+)\s((\S+):(\d+)\s([TACGW][TACGN])$' #>label chr:position ATCG | |
416 fh = open( filename ) | |
417 count = 0 | |
418 while True and count < 10: | |
419 line = fh.readline() | |
420 if not line: | |
421 break #EOF | |
422 line = line.strip() | |
423 if line: #first non-empty line | |
424 count += 1 | |
425 if re.match(pat,line) == None: # Failed to match | |
426 return False | |
427 finally: | |
428 fh.close() | |
429 return False | |
430 | |
431 | |
432 class TallyAnnotation(IntervalAnnotation): | |
433 file_ext = 'gsnap_tally' | |
434 """ | |
435 Output produced by gsnap_tally | |
436 Example: | |
437 >144 chr20:57268791..57268935 | |
438 G0 | |
439 A1(1@7|1Q-3) | |
440 A2(1@36,1@1|1Q2,1Q-8) | |
441 C2 0.889,0.912,0.889,0.889,0.933,0.912,0.912,0.889,0.889,0.889 -2.66,-2.89,-2.66,-2.66,-3.16,-2.89,-2.89,-2.66,-2.66,-2.66 | |
442 C1 T1 0.888,0.9,0.888,0.9,0.913,0.9,0.911,0.888,0.9,0.913 -2.66,-2.78,-2.66,-2.78,-2.91,-2.78,-2.89,-2.66,-2.78,-2.91 | |
443 """ | |
444 def sniff( self, filename ): # TODO | |
445 """ | |
446 Determines whether the file is a gmap splice site annotation file | |
447 """ | |
448 try: | |
449 pat = '^>(\d+)\s((\S+):(\d+)\.\.(\d+))$' #>total chr:position..position | |
450 pat2 = '^[GATCN]\d.*$' #BaseCountDeatails | |
451 fh = open( filename ) | |
452 count = 0 | |
453 while True and count < 10: | |
454 line = fh.readline() | |
455 if not line: | |
456 break #EOF | |
457 line = line.strip() | |
458 if line: #first non-empty line | |
459 count += 1 | |
460 if re.match(pat,line) == None and re.match(pat2,line) == None: # Failed to match | |
461 return False | |
462 finally: | |
463 fh.close() | |
464 return False | |
465 | |
466 class GsnapResult( Text ): | |
467 """ | |
468 The default output format for gsnap. Can be used as input for gsnap_tally. | |
469 """ | |
470 file_ext = 'gsnap' | |
471 | |
472 |