Mercurial > repos > jjohnson > mothur_toolsuite
comparison mothur/lib/galaxy/datatypes/metagenomics.py @ 0:3202a38e44d9
Migrated tool version 1.15.1 from old tool shed archive to new tool shed repository
author | jjohnson |
---|---|
date | Tue, 07 Jun 2011 17:32:23 -0400 |
parents | |
children | fcc0778f6987 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:3202a38e44d9 |
---|---|
1 """ | |
2 metagenomics datatypes | |
3 James E Johnson - University of Minnesota | |
4 for Mothur | |
5 """ | |
6 | |
7 import data | |
8 import logging, os, sys, time, tempfile, shutil, string, glob, re | |
9 import galaxy.model | |
10 from galaxy.datatypes import metadata | |
11 from galaxy.datatypes import tabular | |
12 from galaxy.datatypes import sequence | |
13 from galaxy.datatypes.metadata import MetadataElement | |
14 from galaxy.datatypes.tabular import Tabular | |
15 from galaxy.datatypes.sequence import Fasta | |
16 from galaxy import util | |
17 from galaxy.datatypes.images import Html | |
18 from sniff import * | |
19 | |
# Module-level logger shared by all datatype classes in this module.
log = logging.getLogger(__name__)
21 | |
22 | |
23 ## Mothur Classes | |
24 | |
class Otu( data.Text ):
    """Mothur OTU (operational taxonomic unit) datatype."""
    file_ext = 'otu'

    def sniff( self, filename ):
        """
        Determines whether the file is in otu (operational taxonomic unit) format.

        Each data line is: label<TAB>count<TAB>name(1..count), i.e. the number
        of trailing columns must equal the integer in column 2.  Returns True
        after 5 valid data lines, or at EOF/first blank line when at least one
        valid line was seen.
        """
        # BUG FIX: guard fh so the finally clause cannot raise NameError when
        # open() itself fails (the original referenced fh unconditionally).
        fh = None
        try:
            fh = open( filename )
            count = 0
            while True:
                line = fh.readline().strip()
                if not line:
                    break  # EOF or blank line ends the scan
                if line[0] != '@':
                    linePieces = line.split('\t')
                    if len(linePieces) < 2:
                        return False
                    try:
                        check = int(linePieces[1])
                    except ValueError:
                        return False
                    # label + count + <count> member names
                    if check + 2 != len(linePieces):
                        return False
                    count += 1
                    if count == 5:
                        return True
            if 0 < count < 5:
                return True
        except Exception:
            # Unreadable file: fall through and report "not this format".
            pass
        finally:
            if fh:
                fh.close()
        return False
62 | |
class OtuList( Otu ):
    # Same layout and sniffer as Otu; only the extension differs.
    file_ext = 'list'
65 | |
class Sabund( Otu ):
    """Mothur species-abundance (sabund) datatype."""
    file_ext = 'sabund'

    def sniff( self, filename ):
        """
        Determines whether the file is in sabund format:
        label<TAB>count[<TAB>value(1..count)]
        where every value column must parse as an integer.
        """
        # BUG FIX: guard fh so finally cannot raise NameError if open() fails.
        fh = None
        try:
            fh = open( filename )
            count = 0
            while True:
                line = fh.readline().strip()
                if not line:
                    break  # EOF or blank line
                if line[0] != '@':
                    linePieces = line.split('\t')
                    if len(linePieces) < 2:
                        return False
                    try:
                        check = int(linePieces[1])
                        if check + 2 != len(linePieces):
                            return False
                        for piece in linePieces[2:]:
                            int(piece)  # each abundance value must be an int
                    except ValueError:
                        return False
                    count += 1
                    if count >= 5:
                        return True
            if 0 < count < 5:
                return True
        except Exception:
            pass
        finally:
            if fh:
                fh.close()
        return False
106 | |
class Rabund( Sabund ):
    # Ranked-abundance variant; same column layout and sniffer as Sabund.
    file_ext = 'rabund'
109 | |
110 | |
class SharedRabund( Rabund ):
    """Mothur shared ranked-abundance datatype."""
    file_ext = 'shared'

    def sniff( self, filename ):
        """
        Determines whether the file is in shared (OTU) format:
        label<TAB>group<TAB>count[<TAB>value(1..count)]
        where every value column must parse as an integer.
        """
        # BUG FIX: guard fh so finally cannot raise NameError if open() fails.
        fh = None
        try:
            fh = open( filename )
            count = 0
            while True:
                line = fh.readline().strip()
                if not line:
                    break  # EOF or blank line
                if line[0] != '@':
                    linePieces = line.split('\t')
                    if len(linePieces) < 3:
                        return False
                    try:
                        check = int(linePieces[2])
                        if check + 3 != len(linePieces):
                            return False
                        for piece in linePieces[3:]:
                            int(piece)  # each abundance value must be an int
                    except ValueError:
                        return False
                    count += 1
                    if count >= 5:
                        return True
            if 0 < count < 5:
                return True
        except Exception:
            pass
        finally:
            if fh:
                fh.close()
        return False
151 | |
class RelAbund( Rabund ):
    """Mothur relative-abundance datatype."""
    file_ext = 'relabund'

    def sniff( self, filename ):
        """
        Determines whether the file is in relative abundance format:
        label<TAB>group<TAB>count[<TAB>value(1..count)]
        where every value column must parse as a float.
        """
        # BUG FIX: guard fh so finally cannot raise NameError if open() fails.
        fh = None
        try:
            fh = open( filename )
            count = 0
            while True:
                line = fh.readline().strip()
                if not line:
                    break  # EOF or blank line
                if line[0] != '@':
                    linePieces = line.split('\t')
                    if len(linePieces) < 3:
                        return False
                    try:
                        check = int(linePieces[2])
                        if check + 3 != len(linePieces):
                            return False
                        for piece in linePieces[3:]:
                            float(piece)  # relative abundances are floats
                    except ValueError:
                        return False
                    count += 1
                    if count >= 5:
                        return True
            if 0 < count < 5:
                return True
        except Exception:
            pass
        finally:
            if fh:
                fh.close()
        return False
192 | |
class SecondaryStructureMap(Tabular):
    """Mothur secondary-structure map: one integer per line pointing at the
    paired row (0 for unpaired)."""
    file_ext = 'map'

    def __init__(self, **kwd):
        """Initialize secondary structure map datatype"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['Map']

    def sniff( self, filename ):
        """
        Determines whether the file is a secondary structure map format:
        a single integer column where pairings are symmetric, i.e. if
        structMap[10] == 380 then structMap[380] == 10 (0 means unpaired).
        """
        # BUG FIX: guard fh so finally cannot raise NameError if open() fails.
        fh = None
        try:
            fh = open( filename )
            line_num = 0
            rowidxmap = {}
            while True:
                line = fh.readline()
                line_num += 1
                line = line.strip()
                if not line:
                    break  # EOF or blank line
                try:
                    pointer = int(line)
                except ValueError:
                    return False
                if pointer > 0:
                    if pointer > line_num:
                        # Remember the forward reference so its symmetry can
                        # be checked when we reach that row.
                        rowidxmap[line_num] = pointer
                    # BUG FIX: the original used bitwise '&', which binds
                    # tighter than '<' and silently corrupted this symmetry
                    # test; logical 'and' is intended.  .get() also avoids a
                    # KeyError on back-references that were never recorded.
                    elif pointer < line_num and rowidxmap.get(pointer) != line_num:
                        return False
            # BUG FIX: the original tested an undefined 'count' variable here,
            # which always raised NameError (hidden by the bare except), so
            # this sniffer could never return True.  Accept when at least one
            # data line was read.
            return line_num > 1
        except Exception:
            pass
        finally:
            if fh:
                fh.close()
        return False
234 | |
class SequenceAlignment( Fasta ):
    """Mothur aligned fasta: every sequence line has the same length."""
    file_ext = 'align'

    def __init__(self, **kwd):
        """Initialize SequenceAlignment datatype"""
        Fasta.__init__( self, **kwd )

    def sniff( self, filename ):
        """
        Determines whether the file is in Mothur align fasta format:
        fasta records in which every sequence line has the same length.
        """
        # BUG FIX: guard fh so finally cannot raise NameError if open() fails.
        fh = None
        try:
            fh = open( filename )
            # BUG FIX: the original bound 'len = -1', shadowing the builtin,
            # so the later len(line) call raised TypeError and this sniffer
            # always returned False.
            seq_len = -1
            while True:
                line = fh.readline()
                if not line:
                    break  # EOF
                line = line.strip()
                if line:
                    if line.startswith( '>' ):
                        # A header must be followed by a non-empty,
                        # non-header sequence line.
                        line = fh.readline().strip()
                        if line == '' or line.startswith( '>' ):
                            break
                        if seq_len < 0:
                            seq_len = len(line)
                        elif seq_len != len(line):
                            return False
                    else:
                        break  # non-empty line that is not a fasta header
            if seq_len > 0:
                return True
        except Exception:
            pass
        finally:
            if fh:
                fh.close()
        return False
274 | |
class AlignCheck( Tabular ):
    """Tabular report produced by Mothur's align.check command."""
    file_ext = 'align.check'

    def __init__(self, **kwd):
        """Initialize AlignCheck datatype"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['name','pound','dash','plus','equal','loop','tilde','total']
        self.column_types = ['str','int','int','int','int','int','int','int']
        self.comment_lines = 1

    def set_meta( self, dataset, overwrite = True, **kwd ):
        """Set metadata directly: one header comment line, the rest are data."""
        total_lines = 0
        if dataset.has_data():
            dataset_fh = open( dataset.file_name )
            for _ in dataset_fh:
                total_lines += 1
            dataset_fh.close()
        dataset.metadata.comment_lines = 1
        # Everything after the single header line counts as data.
        dataset.metadata.data_lines = total_lines - 1 if total_lines > 0 else 0
        dataset.metadata.column_names = self.column_names
        dataset.metadata.column_types = self.column_types
298 | |
class AlignReport(Tabular):
    """
    Tabular report from Mothur's align.seqs command, e.g.:
    QueryName QueryLength TemplateName ... LongestInsert SimBtwnQuery&Template
    AY457915  501         82283        ... 0             97.6
    """
    file_ext = 'align.report'

    def __init__(self, **kwd):
        """Initialize AlignReport datatype"""
        Tabular.__init__( self, **kwd )
        self.column_names = [
            'QueryName', 'QueryLength', 'TemplateName', 'TemplateLength',
            'SearchMethod', 'SearchScore', 'AlignmentMethod', 'QueryStart',
            'QueryEnd', 'TemplateStart', 'TemplateEnd',
            'PairwiseAlignmentLength', 'GapsInQuery', 'GapsInTemplate',
            'LongestInsert', 'SimBtwnQuery&Template',
        ]
312 | |
class BellerophonChimera( Tabular ):
    """Chimera detection report from Mothur's chimera.bellerophon command."""
    file_ext = 'bellerophon.chimera'

    def __init__(self, **kwd):
        """Initialize BellerophonChimera datatype"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['Name','Score','Left','Right']
319 | |
class SecondaryStructureMatch(Tabular):
    """
    Tabular secondary-structure match report, e.g.:
        name    pound   dash    plus    equal   loop    tilde   total
        9_1_12  42      68      8       28      275     420     872
        9_1_14  36      68      6       26      266     422     851
    """
    def __init__(self, **kwd):
        """Initialize SecondaryStructureMatch datatype"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['name','pound','dash','plus','equal','loop','tilde','total']
333 | |
class DistanceMatrix(data.Text):
    """Base class for Mothur distance-matrix datatypes."""
    file_ext = 'dist'

    # Metadata: the number of sequences described by the matrix.
    MetadataElement( name="sequence_count", default=0, desc="Number of sequences", readonly=False, optional=True, no_value=0 )
338 | |
339 | |
class LowerTriangleDistanceMatrix(DistanceMatrix):
    """Phylip lower-triangle distance matrix."""
    file_ext = 'lower.dist'

    def __init__(self, **kwd):
        """Initialize lower-triangle distance matrix datatype"""
        DistanceMatrix.__init__( self, **kwd )

    def sniff( self, filename ):
        """
        Determines whether the file is a lower-triangle distance matrix
        (phylip) format.
        The first line has the number of sequences in the matrix.
        The remaining lines have the sequence name followed by a list of
        distances from all preceding sequences, e.g.:
        5
        U68589
        U68590 0.3371
        U68591 0.3609 0.3782
        U68592 0.4155 0.3197 0.4148
        U68593 0.2872 0.1690 0.3361 0.2842

        BUG FIX: the original sniffer required every line to have exactly
        three tab-separated columns, which rejects the documented format
        above (even its own docstring example fails at the header line) and
        instead matched the pairwise (pair.dist) layout.
        """
        # Guard fh so finally cannot raise NameError if open() fails.
        fh = None
        try:
            fh = open( filename )
            # Header line: the sequence count.
            seq_cnt = int(fh.readline().strip())
            row = 0
            while True:
                line = fh.readline().strip()
                if not line:
                    break  # EOF or blank line
                pieces = line.split()
                # Data row i (1-based) holds a name plus i-1 distances.
                if len(pieces) != row + 1:
                    return False
                try:
                    for piece in pieces[1:]:
                        float(piece)
                except ValueError:
                    return False
                row += 1
                if row == 5:
                    return True
            if 0 < row <= seq_cnt:
                return True
        except Exception:
            # Includes a non-integer header line (ValueError from int()).
            pass
        finally:
            if fh:
                fh.close()
        return False
386 | |
class SquareDistanceMatrix(DistanceMatrix,Tabular):
    """Square (phylip) distance matrix."""
    file_ext = 'square.dist'
    sequence_count = -1

    def __init__(self, **kwd):
        """Initialize square distance matrix datatype"""
        Tabular.__init__( self, **kwd )

    def init_meta( self, dataset, copy_from=None ):
        data.Text.init_meta( self, dataset, copy_from=copy_from )

    def set_meta( self, dataset, overwrite = True, skip = None, **kwd ):
        # NOTE(review): this assigns 'sequences' although the metadata element
        # declared on DistanceMatrix is 'sequence_count' -- confirm intent.
        dataset.metadata.sequences = 0

    def sniff( self, filename ):
        """
        Determines whether the file is a square distance matrix
        (Column-formatted distance matrix) format.
        The first line has the number of sequences in the matrix.
        Each following line has the sequence name in the first column plus
        one distance column per sequence, in matrix row order, e.g.:
        3
        U68589 0.0000 0.3371 0.3610
        U68590 0.3371 0.0000 0.3783
        U68591 0.3610 0.3783 0.0000
        """
        # Guard fh so finally cannot raise NameError if open() fails.
        fh = None
        try:
            fh = open( filename )
            count = 0
            sequence_count = int(fh.readline().strip())
            # BUG FIX: the original computed 'seq_cnt + 1' with an undefined
            # name 'seq_cnt', so every call raised NameError (hidden by the
            # bare except) and this sniffer always returned False.
            col_cnt = sequence_count + 1
            while True:
                line = fh.readline().strip()
                if not line:
                    break  # EOF or blank line
                if line[0] != '@':
                    linePieces = line.split('\t')
                    if len(linePieces) != col_cnt:
                        return False
                    try:
                        for i in range(1, col_cnt):
                            float(linePieces[i])
                    except ValueError:
                        return False
                    count += 1
                    if count == 5:
                        return True
            if 0 < count < 5:
                return True
        except Exception:
            pass
        finally:
            if fh:
                fh.close()
        return False
443 | |
class PairwiseDistanceMatrix(DistanceMatrix,Tabular):
    """Column-formatted (pairwise) distance matrix."""
    file_ext = 'pair.dist'

    def __init__(self, **kwd):
        """Initialize pairwise distance matrix datatype"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['Sequence','Sequence','Distance']
        self.column_types = ['str','str','float']
        self.comment_lines = 1

    def sniff( self, filename ):
        """
        Determines whether the file is a pairwise distance matrix
        (Column-formatted distance matrix) format: the first two columns
        hold sequence names and the third the distance between them.
        """
        # BUG FIX: guard fh so finally cannot raise NameError if open() fails.
        fh = None
        try:
            fh = open( filename )
            count = 0
            while True:
                line = fh.readline().strip()
                if not line:
                    break  # EOF or blank line
                if line[0] != '@':
                    linePieces = line.split('\t')
                    if len(linePieces) != 3:
                        return False
                    try:
                        float(linePieces[2])
                    except ValueError:
                        return False
                    count += 1
                    if count == 5:
                        return True
            if 0 < count < 5:
                return True
        except Exception:
            pass
        finally:
            if fh:
                fh.close()
        return False
486 | |
class Alignment(Tabular):
    # NOTE(review): 'align' is also claimed by SequenceAlignment above, and
    # these column names duplicate AlignCheck's -- confirm which class is
    # actually intended to own this extension.
    file_ext = 'align'
    def __init__(self, **kwd):
        """Initialize Alignment datatype"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['name','pound','dash','plus','equal','loop','tilde','total']
493 | |
class AlignCheck(Tabular):
    # NOTE(review): this duplicates (and, being defined later, silently
    # replaces) the richer AlignCheck class defined earlier in this module,
    # discarding its column_types, comment_lines and set_meta logic --
    # confirm whether this second definition should be removed.
    file_ext = 'align.check'
    def __init__(self, **kwd):
        """Initialize AlignCheck datatype"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['name','pound','dash','plus','equal','loop','tilde','total']
500 | |
class Names(Tabular):
    """Mothur names file: representative sequence mapped to its members."""
    file_ext = 'names'

    def __init__(self, **kwd):
        """Two columns: a representative sequence (col 1) and the sequences it represents (col 2)."""
        Tabular.__init__( self, **kwd )
        self.column_names = ['name','representatives']
507 | |
class Summary(Tabular):
    file_ext = 'summary'
    def __init__(self, **kwd):
        """Per-sequence summary table: name, start/end positions, base counts, ambiguities, homopolymer length."""
        # NOTE(review): the original docstring here was copy-pasted from the
        # Names datatype and described the wrong file format.
        Tabular.__init__( self, **kwd )
        self.column_names = ['seqname','start','end','nbases','ambigs','polymer']
514 | |
class Group(Tabular):
    file_ext = 'groups'
    def __init__(self, **kwd):
        """Group file: maps each sequence name (col 1) to its group/sample (col 2)."""
        # NOTE(review): the original docstring here was copy-pasted from the
        # Names datatype and described the wrong file format.
        Tabular.__init__( self, **kwd )
        self.column_names = ['name','group']
521 | |
class AccNos(Tabular):
    """Mothur accnos file: a single column of sequence names."""
    file_ext = 'accnos'

    def __init__(self, **kwd):
        """A list of names"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['name']
528 | |
class Oligos( data.Text ):
    """Mothur oligos file describing primers and barcodes."""
    file_ext = 'oligos'

    def sniff( self, filename ):
        """
        Determines whether the file is a Mothur oligos file.
        Non-comment lines are either:
          forward|reverse<TAB>sequence
          barcode<TAB>sequence<TAB>group
        """
        # BUG FIX: guard fh so finally cannot raise NameError if open() fails.
        fh = None
        try:
            fh = open( filename )
            count = 0
            while True:
                line = fh.readline().strip()
                if not line:
                    break  # EOF or blank line
                if line[0] != '#':
                    linePieces = line.split('\t')
                    if len(linePieces) == 2 and re.match('forward|reverse',linePieces[0]):
                        count += 1
                    elif len(linePieces) == 3 and re.match('barcode',linePieces[0]):
                        count += 1
                    else:
                        return False
                if count > 20:
                    return True  # enough evidence; stop scanning early
            if count > 0:
                return True
        except Exception:
            pass
        finally:
            if fh:
                fh.close()
        return False
564 | |
class Frequency(Tabular):
    """Position/frequency table used for chimera analysis."""
    file_ext = 'freq'

    def __init__(self, **kwd):
        """Two columns: alignment position and base frequency."""
        Tabular.__init__( self, **kwd )
        self.column_names = ['position','frequency']
        self.column_types = ['int','float']

    def sniff( self, filename ):
        """
        Determines whether the file is a frequency tabular format for
        chimera analysis, e.g.:
        #1.14.0
        0    0.000
        1    0.000
        ...
        155  0.975
        """
        # BUG FIX: guard fh so finally cannot raise NameError if open() fails.
        fh = None
        try:
            fh = open( filename )
            count = 0
            while True:
                line = fh.readline().strip()
                if not line:
                    break  # EOF or blank line
                if line[0] != '#':
                    linePieces = line.split('\t')
                    try:
                        int(linePieces[0])
                        float(linePieces[1])
                    except (ValueError, IndexError):
                        # Narrowed from a bare except: only a malformed or
                        # too-short row should reject the file here.
                        return False
                    count += 1
                if count > 20:
                    return True  # enough evidence; stop scanning early
            if count > 0:
                return True
        except Exception:
            pass
        finally:
            if fh:
                fh.close()
        return False
609 | |
class Quantile(Tabular):
    """Quantiles for chimera analysis."""
    file_ext = 'quan'
    # BUG FIX: the desc strings of these two metadata elements were swapped
    # in the original ('filtered' claimed to use a mask and vice versa).
    MetadataElement( name="filtered", default=False, no_value=False, optional=True , desc="Quantiles calculated using a frequency filter", readonly=True)
    MetadataElement( name="masked", default=False, no_value=False, optional=True , desc="Quantiles calculated using a mask", readonly=True)

    def __init__(self, **kwd):
        """Quantiles for chimera analysis"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['num','ten','twentyfive','fifty','seventyfive','ninetyfive','ninetynine']
        self.column_types = ['int','float','float','float','float','float','float']

    def set_meta( self, dataset, overwrite = True, skip = None, **kwd ):
        # NOTE(review): only logs; no metadata is actually set here.
        log.info( "Mothur Quantile set_meta %s" % kwd)

    def sniff( self, filename ):
        """
        Determines whether the file is a quantiles tabular format for
        chimera analysis: an integer followed by six float columns, e.g.:
        1 0 0 0 0 0 0
        2 0.309198 0.309198 0.37161 0.37161 0.37161 0.37161
        3 0.510982 0.563213 0.693529 0.858939 1.07442 1.20608
        """
        # BUG FIX: guard fh so finally cannot raise NameError if open() fails.
        fh = None
        try:
            fh = open( filename )
            count = 0
            while True:
                line = fh.readline().strip()
                if not line:
                    break  # EOF or blank line
                if line[0] != '#':
                    linePieces = line.split('\t')
                    if len(linePieces) < 7:
                        return False
                    try:
                        int(linePieces[0])
                        for piece in linePieces[1:7]:
                            float(piece)
                    except ValueError:
                        return False
                    count += 1
                if count > 10:
                    return True  # enough evidence; stop scanning early
            if count > 0:
                return True
        except Exception:
            pass
        finally:
            if fh:
                fh.close()
        return False
661 | |
class FilteredQuantile(Quantile):
    # Quantiles computed after applying a frequency filter.
    file_ext = 'filtered.quan'
    def __init__(self, **kwd):
        """Quantiles for chimera analysis"""
        Quantile.__init__( self, **kwd )
        # NOTE(review): sets an instance attribute, not the 'filtered'
        # metadata element declared on Quantile -- confirm intent.
        self.filtered = True
668 | |
class MaskedQuantile(Quantile):
    # Quantiles computed using a mask (no frequency filter).
    file_ext = 'masked.quan'
    def __init__(self, **kwd):
        """Quantiles for chimera analysis"""
        Quantile.__init__( self, **kwd )
        # NOTE(review): sets instance attributes, not the metadata elements
        # declared on Quantile -- confirm intent.
        self.masked = True
        self.filtered = False
676 | |
class FilteredMaskedQuantile(Quantile):
    # Quantiles computed using both a frequency filter and a mask.
    file_ext = 'filtered.masked.quan'
    def __init__(self, **kwd):
        """Quantiles for chimera analysis"""
        Quantile.__init__( self, **kwd )
        # NOTE(review): sets instance attributes, not the metadata elements
        # declared on Quantile -- confirm intent.
        self.masked = True
        self.filtered = True
684 | |
class LaneMask(data.Text):
    """Lane mask filter: a single line consisting of zeros and ones."""
    file_ext = 'filter'

    def sniff( self, filename ):
        """
        Determines whether the file is a lane mask filter: 1 line consisting
        of zeros and ones.
        """
        # BUG FIX: the original matched the undefined name 'line' instead of
        # the read buffer (NameError on every call) and its finally clause
        # called the undefined 'close(fh)', so this sniffer always raised
        # instead of returning.  fh is also guarded so finally cannot raise
        # when open() itself fails.
        fh = None
        try:
            fh = open( filename )
            saw_data = False
            while True:
                buff = fh.read(1000)
                if not buff:
                    break  # EOF
                # strip() so the single line's trailing newline does not fail
                # the 0/1 test; any interior newline (a second line) still
                # rejects the file.
                if not re.match('^[01]+$', buff.strip()):
                    return False
                saw_data = True
            return saw_data
        except Exception:
            pass
        finally:
            if fh:
                fh.close()
        return False
707 | |
class SequenceTaxonomy(Tabular):
    """Per-sequence taxonomy assignment."""
    file_ext = 'taxonomy'

    def __init__(self, **kwd):
        """Two columns: sequence name and its taxonomy string."""
        Tabular.__init__( self, **kwd )
        self.column_names = ['name','taxonomy']
714 | |
class ConsensusTaxonomy(Tabular):
    """Consensus taxonomy per OTU."""
    file_ext = 'cons.taxonomy'

    def __init__(self, **kwd):
        """Three columns: OTU label, member count, and consensus taxonomy."""
        Tabular.__init__( self, **kwd )
        self.column_names = ['OTU','count','taxonomy']
721 | |
class TaxonomySummary(Tabular):
    """Summary of taxon classifications."""
    file_ext = 'tax.summary'

    def __init__(self, **kwd):
        """A Summary of taxon classification"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['taxlevel','rankID','taxon','daughterlevels','total']
728 | |
class Phylip(data.Text):
    """Phylip multiple-sequence alignment (interleaved or sequential)."""
    file_ext = 'phy'

    def sniff( self, filename ):
        """
        Determines whether the file is in Phylip format (Interleaved or
        Sequential).
        The first line of the input file contains the number of species and
        the number of characters, in free format, separated by blanks (not
        by commas).  The information for each species follows, starting with
        a ten-character species name (which can include punctuation marks
        and blanks), and continuing with the characters for that species.
        http://evolution.genetics.washington.edu/phylip/doc/main.html#inputfiles

        Only the header line is validated; the original TODO to validate the
        species data lines is preserved below.
        """
        # BUG FIX: the original finally clause called the undefined
        # 'close(fh)', so every failed sniff raised NameError instead of
        # returning False; fh is also guarded so finally cannot raise when
        # open() itself fails.
        fh = None
        try:
            fh = open( filename )
            # Header: species count and sequence length, blank-separated.
            linePieces = fh.readline().strip().split()
            count = int(linePieces[0])
            seq_len = int(linePieces[1])
            # TODO: check the data lines -- each record starts with a
            # 10-character name followed by sequence characters (spaces
            # allowed within the sequence).
            return True
        except Exception:
            # Missing/short header or non-integer fields: not phylip.
            pass
        finally:
            if fh:
                fh.close()
        return False
782 | |
783 | |
784 ## Qiime Classes | |
785 | |
class MetadataMapping(Tabular):
    """
    Qiime metadata mapping file:
    http://qiime.sourceforge.net/documentation/file_formats.html#mapping-file-overview
    Information about the samples necessary to perform the data analysis.
    """
    MetadataElement( name="column_names", default=[], desc="Column Names", readonly=False, visible=True, no_value=[] )
    file_ext = 'mapping'

    def __init__(self, **kwd):
        """
        Columns typically include:
        #SampleID, BarcodeSequence, LinkerPrimerSequence, ..., Description
        """
        Tabular.__init__( self, **kwd )

    def sniff( self, filename ):
        """
        Determines whether the file is a qiime mapping file.
        Just checking for an appropriate header line for now, could be improved.
        """
        # BUG FIX: the original read from the undefined name 'dataset_fh'
        # (NameError on every call); even if fixed, it would loop forever at
        # EOF because readline() returning '' never broke the loop; and its
        # finally clause called the undefined 'close(fh)'.
        fh = None
        try:
            pat = '#SampleID(\t[a-zA-Z][a-zA-Z0-9_]*)*\tDescription'
            fh = open( filename )
            while True:
                line = fh.readline()
                if not line:
                    break  # EOF
                if re.match(pat, line):
                    return True
        except Exception:
            pass
        finally:
            if fh:
                fh.close()
        return False

    def set_column_names(self, dataset):
        # Pull column names from the '#SampleID...' header line, if present.
        if dataset.has_data():
            dataset_fh = open( dataset.file_name )
            line = dataset_fh.readline()
            if line.startswith('#SampleID'):
                dataset.metadata.column_names = line.strip().split('\t')
            dataset_fh.close()

    def set_meta( self, dataset, overwrite = True, skip = None, max_data_lines = None, **kwd ):
        Tabular.set_meta(self, dataset, overwrite, skip, max_data_lines)
        self.set_column_names(dataset)
827 | |
# Run this module's doctests when executed directly.
if __name__ == '__main__':
    import doctest
    import sys

    doctest.testmod(sys.modules[__name__])
831 |