comparison lib/galaxy/datatypes/metagenomics.py @ 0:e5c3175506b7 default tip

Initial tool configs for qiime, most need work.
author Jim Johnson <jj@umn.edu>
date Sun, 17 Jul 2011 10:30:11 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:e5c3175506b7
1 """
2 metagenomics datatypes
3 James E Johnson - University of Minnesota
4 for Mothur
5 """
6
7 import data
8 import logging, os, sys, time, tempfile, shutil, string, glob, re
9 import galaxy.model
10 from galaxy.datatypes import metadata
11 from galaxy.datatypes import tabular
12 from galaxy.datatypes import sequence
13 from galaxy.datatypes.metadata import MetadataElement
14 from galaxy.datatypes.tabular import Tabular
15 from galaxy.datatypes.sequence import Fasta
16 from galaxy import util
17 from galaxy.datatypes.images import Html
18 from sniff import *
19
20 log = logging.getLogger(__name__)
21
22
23 ## Mothur Classes
24
class Otu( Tabular ):
    """Mothur OTU (operational taxonomic unit) file."""
    file_ext = 'otu'

    def sniff( self, filename ):
        """
        Determines whether the file is a otu (operational taxonomic unit) format.
        Each data line: label<TAB>count[<TAB>value(1..count)]
        """
        fh = None
        try:
            fh = open( filename )
            count = 0
            while True:
                line = fh.readline()
                line = line.strip()
                if not line:
                    break  # EOF (or blank line)
                if line[0] != '@':
                    linePieces = line.split('\t')
                    if len(linePieces) < 2:
                        return False
                    try:
                        # column 2 holds the number of values that follow it
                        check = int(linePieces[1])
                        if check + 2 != len(linePieces):
                            return False
                    except ValueError:
                        return False
                    count += 1
                    if count == 5:
                        return True
            if 0 < count < 5:
                return True
        except Exception:
            # unreadable file or unexpected content: not an otu file
            pass
        finally:
            # guard: fh is None when open() itself failed (the original
            # called fh.close() unconditionally and raised NameError)
            if fh:
                fh.close()
        return False
62
class OtuList( Otu ):
    """Mothur list file; validated with the same layout checks as Otu."""
    file_ext = 'list'
65
class Sabund( Otu ):
    """Mothur species-abundance file."""
    file_ext = 'sabund'

    def sniff( self, filename ):
        """
        Determines whether the file is a otu (operational taxonomic unit) format:
        label<TAB>count[<TAB>value(1..n)]
        """
        fh = None
        try:
            fh = open( filename )
            count = 0
            while True:
                line = fh.readline()
                line = line.strip()
                if not line:
                    break  # EOF (or blank line)
                if line[0] != '@':
                    linePieces = line.split('\t')
                    if len(linePieces) < 2:
                        return False
                    try:
                        # column 2 holds the number of integer abundance values that follow
                        check = int(linePieces[1])
                        if check + 2 != len(linePieces):
                            return False
                        for i in range( 2, len(linePieces) ):
                            ival = int( linePieces[i] )
                    except ValueError:
                        return False
                    count += 1
                    if count >= 5:
                        return True
            if 0 < count < 5:
                return True
        except Exception:
            pass
        finally:
            # guard: fh is None when open() failed (original raised NameError here)
            if fh:
                fh.close()
        return False
107
class Rabund( Sabund ):
    """Mothur rank-abundance file; validated with the same checks as Sabund."""
    file_ext = 'rabund'
110
class GroupAbund( Otu ):
    """Mothur group-abundance (shared-style) file."""
    file_ext = 'grpabund'

    def init_meta( self, dataset, copy_from=None ):
        Otu.init_meta( self, dataset, copy_from=copy_from )

    def set_meta( self, dataset, overwrite=True, skip=1, max_data_lines=100000, **kwd ):
        """Set metadata, detecting whether the file starts with a header line."""
        if dataset.has_data():
            fh = None
            try:
                fh = open( dataset.file_name )
                linePieces = fh.readline().strip().split('\t')
                # Mothur >= 1.20 writes a 'label<TAB>Group...' heading line
                if len(linePieces) >= 2 and linePieces[0] == 'label' and linePieces[1] == 'Group':
                    skip = 1
                else:
                    skip = 0
            finally:
                # guard: fh is None when open() failed (original raised NameError here)
                if fh:
                    fh.close()
        Otu.set_meta( self, dataset, overwrite, skip, max_data_lines, **kwd )

    def sniff( self, filename, vals_are_int=False ):
        """
        Determines whether the file is a otu (operational taxonomic unit) Shared format:
        label<TAB>group<TAB>count[<TAB>value(1..n)]
        The first line is column headings as of Mothur v 1.20.

        :param vals_are_int: when True require integer abundance values, otherwise floats
        """
        log.info( "sniff GroupAbund vals_are_int %s" % vals_are_int )
        fh = None
        try:
            fh = open( filename )
            count = 0
            while True:
                line = fh.readline()
                line = line.strip()
                if not line:
                    break  # EOF (or blank line)
                if line[0] != '@':
                    linePieces = line.split('\t')
                    if len(linePieces) < 3:
                        return False
                    # skip value checks on the optional first heading line
                    if count > 0 or linePieces[0] != 'label':
                        try:
                            check = int(linePieces[2])
                            if check + 3 != len(linePieces):
                                return False
                            for i in range( 3, len(linePieces) ):
                                if vals_are_int:
                                    ival = int( linePieces[i] )
                                else:
                                    fval = float( linePieces[i] )
                        except ValueError:
                            return False
                    count += 1
                    if count >= 5:
                        return True
            if 0 < count < 5:
                return True
        except Exception:
            pass
        finally:
            if fh:
                fh.close()
        return False
172 return False
173
class SharedRabund( GroupAbund ):
    """Mothur shared file: per-group OTU abundances with integer values."""
    file_ext = 'shared'

    def sniff( self, filename ):
        """
        Determines whether the file is a otu (operational taxonomic unit) Shared format:
        label<TAB>group<TAB>count[<TAB>value(1..n)]
        The first line is column headings as of Mothur v 1.20.
        """
        result = GroupAbund.sniff( self, filename, True )
        log.info( "is SharedRabund %s" % result )
        return result
188
189
class RelAbund( GroupAbund ):
    """Mothur relative-abundance file: per-group OTU abundances with float values."""
    file_ext = 'relabund'

    def sniff( self, filename ):
        """
        Determines whether the file is a otu (operational taxonomic unit) Relative
        Abundance format: label<TAB>group<TAB>count[<TAB>value(1..n)]
        The first line is column headings as of Mothur v 1.20.
        """
        result = GroupAbund.sniff( self, filename, False )
        log.info( "is RelAbund %s" % result )
        return result
203
class SecondaryStructureMap(Tabular):
    file_ext = 'map'

    def __init__(self, **kwd):
        """Initialize secondary structure map datatype"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['Map']

    def sniff( self, filename ):
        """
        Determines whether the file is a secondary structure map format:
        a single column with an integer value which indicates the row that this
        row maps to. Check that if structMap[10] = 380 then structMap[380] = 10.
        """
        fh = None
        try:
            fh = open( filename )
            line_num = 0
            count = 0
            rowidxmap = {}
            while True:
                line = fh.readline()
                line_num += 1
                line = line.strip()
                if not line:
                    break  # EOF (or blank line)
                try:
                    pointer = int(line)
                except ValueError:
                    return False
                if pointer > 0:
                    if pointer > line_num:
                        # remember the forward reference so the paired row can be validated
                        rowidxmap[line_num] = pointer
                    # original used '&' which bound tighter than '<' and also
                    # indexed rowidxmap unguardedly; use 'and' plus .get()
                    elif pointer < line_num and rowidxmap.get(pointer) != line_num:
                        return False
                count += 1
            # original tested an undefined 'count' variable, so valid files
            # always fell through to False; count valid data lines instead
            if count > 0:
                return True
        except Exception:
            pass
        finally:
            # guard: fh is None when open() failed
            if fh:
                fh.close()
        return False
245
class SequenceAlignment( Fasta ):
    """Mothur aligned fasta: all sequence lines have the same length."""
    file_ext = 'align'

    def __init__(self, **kwd):
        Fasta.__init__( self, **kwd )

    def sniff( self, filename ):
        """
        Determines whether the file is in Mothur align fasta format.
        Each sequence line must be the same length.
        """
        fh = None
        try:
            fh = open( filename )
            # original bound 'len = -1', shadowing the builtin so the later
            # len(line) call raised TypeError and sniff always returned False
            seq_len = -1
            while True:
                line = fh.readline()
                if not line:
                    break  # EOF
                line = line.strip()
                if line:  # first non-empty line
                    if line.startswith( '>' ):
                        # the next line must be non-empty sequence data, not another header
                        line = fh.readline().strip()
                        if line == '' or line.startswith( '>' ):
                            break
                        if seq_len < 0:
                            seq_len = len( line )
                        elif seq_len != len( line ):
                            return False
                    else:
                        break  # non-empty line that is not a fasta header
            if seq_len > 0:
                return True
        except Exception:
            pass
        finally:
            if fh:
                fh.close()
        return False
285
class AlignCheck( Tabular ):
    # NOTE(review): a second, simpler 'class AlignCheck' appears later in this
    # module and shadows this definition at import time — confirm which is intended.
    file_ext = 'align.check'

    def __init__(self, **kwd):
        """Initialize AlignCheck datatype"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['name','pound','dash','plus','equal','loop','tilde','total']
        self.column_types = ['str','int','int','int','int','int','int','int']
        self.comment_lines = 1

    def set_meta( self, dataset, overwrite=True, **kwd ):
        """Count data lines (excluding the single header line) and record column metadata."""
        data_lines = 0
        if dataset.has_data():
            dataset_fh = open( dataset.file_name )
            try:
                # iterate directly over the file; close even if reading fails
                for line in dataset_fh:
                    data_lines += 1
            finally:
                dataset_fh.close()
        dataset.metadata.comment_lines = 1
        dataset.metadata.data_lines = data_lines - 1 if data_lines > 0 else 0
        dataset.metadata.column_names = self.column_names
        dataset.metadata.column_types = self.column_types
309
class AlignReport(Tabular):
    """
    Alignment report table, e.g.:
    QueryName QueryLength TemplateName ... LongestInsert SimBtwnQuery&Template
    AY457915  501         82283        ... 0             97.6
    """
    file_ext = 'align.report'

    def __init__(self, **kwd):
        """Initialize AlignReport datatype"""
        Tabular.__init__( self, **kwd )
        self.column_names = [
            'QueryName', 'QueryLength', 'TemplateName', 'TemplateLength',
            'SearchMethod', 'SearchScore', 'AlignmentMethod',
            'QueryStart', 'QueryEnd', 'TemplateStart', 'TemplateEnd',
            'PairwiseAlignmentLength', 'GapsInQuery', 'GapsInTemplate',
            'LongestInsert', 'SimBtwnQuery&Template',
        ]
323
class BellerophonChimera( Tabular ):
    file_ext = 'bellerophon.chimera'

    def __init__(self, **kwd):
        """Initialize BellerophonChimera datatype"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['Name', 'Score', 'Left', 'Right']
330
class SecondaryStructureMatch(Tabular):
    """
    Per-sequence secondary structure match counts, e.g.:
    name    pound dash plus equal loop tilde total
    9_1_12  42    68   8    28    275  420   872
    9_1_14  36    68   6    26    266  422   851
    """
    def __init__(self, **kwd):
        """Initialize SecondaryStructureMatch datatype"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['name', 'pound', 'dash', 'plus', 'equal', 'loop', 'tilde', 'total']
344
class DistanceMatrix(data.Text):
    """Base datatype for Mothur distance matrix files."""
    file_ext = 'dist'

    # metadata: number of sequences represented in the matrix
    MetadataElement( name="sequence_count", default=0, desc="Number of sequences", readonly=False, optional=True, no_value=0 )
349
350
class LowerTriangleDistanceMatrix(DistanceMatrix):
    file_ext = 'lower.dist'

    def __init__(self, **kwd):
        """Initialize lower-triangle distance matrix datatype"""
        DistanceMatrix.__init__( self, **kwd )

    def sniff( self, filename ):
        """
        Determines whether the file is a lower-triangle distance matrix (phylip) format.
        The first line has the number of sequences in the matrix.
        The remaining lines have the sequence name followed by a list of distances
        from all preceding sequences, e.g.:
        5
        U68589
        U68590 0.3371
        U68591 0.3609 0.3782
        U68592 0.4155 0.3197 0.4148
        U68593 0.2872 0.1690 0.3361 0.2842
        """
        # NOTE(review): this check requires exactly 3 tab-separated columns per line,
        # which does not match the variable-width example above — confirm intent.
        fh = None
        try:
            fh = open( filename )
            count = 0
            while True:
                line = fh.readline()
                line = line.strip()
                if not line:
                    break  # EOF (or blank line)
                if line[0] != '@':
                    linePieces = line.split('\t')
                    if len(linePieces) != 3:
                        return False
                    try:
                        check = float(linePieces[2])
                    except ValueError:
                        return False
                    count += 1
                    if count == 5:
                        return True
            if 0 < count < 5:
                return True
        except Exception:
            pass
        finally:
            # guard: fh is None when open() failed (original raised NameError here)
            if fh:
                fh.close()
        return False
397
class SquareDistanceMatrix(DistanceMatrix, Tabular):
    file_ext = 'square.dist'
    sequence_count = -1

    def __init__(self, **kwd):
        """Initialize square distance matrix datatype"""
        Tabular.__init__( self, **kwd )

    def init_meta( self, dataset, copy_from=None ):
        data.Text.init_meta( self, dataset, copy_from=copy_from )

    def set_meta( self, dataset, overwrite=True, skip=None, **kwd ):
        # NOTE(review): sets 'sequences' although the declared MetadataElement is
        # 'sequence_count' — confirm which name downstream code expects.
        dataset.metadata.sequences = 0

    def sniff( self, filename ):
        """
        Determines whether the file is a square distance matrix
        (column-formatted distance matrix) format.
        The first line has the number of sequences in the matrix.
        The following lines have the sequence name in the first column plus a
        column for the distance to each sequence, in row order, e.g.:
        3
        U68589 0.0000 0.3371 0.3610
        U68590 0.3371 0.0000 0.3783
        U68591 0.3610 0.3783 0.0000
        """
        fh = None
        try:
            fh = open( filename )
            count = 0
            # first line holds the sequence count
            line = fh.readline().strip()
            seq_cnt = int( line )
            # each data row: name column + one distance column per sequence
            # (original assigned 'sequence_count' but read 'seq_cnt' -> NameError)
            col_cnt = seq_cnt + 1
            while True:
                line = fh.readline()
                line = line.strip()
                if not line:
                    break  # EOF (or blank line)
                if line[0] != '@':
                    linePieces = line.split('\t')
                    if len(linePieces) != col_cnt:
                        return False
                    try:
                        for i in range(1, col_cnt):
                            check = float(linePieces[i])
                    except ValueError:
                        return False
                    count += 1
                    if count == 5:
                        return True
            if 0 < count < 5:
                return True
        except Exception:
            pass
        finally:
            # guard: fh is None when open() failed
            if fh:
                fh.close()
        return False
454
class PairwiseDistanceMatrix(DistanceMatrix, Tabular):
    file_ext = 'pair.dist'

    def __init__(self, **kwd):
        """Initialize pairwise distance matrix datatype"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['Sequence', 'Sequence', 'Distance']
        self.column_types = ['str', 'str', 'float']
        self.comment_lines = 1

    def sniff( self, filename ):
        """
        Determines whether the file is a pairwise distance matrix
        (column-formatted distance matrix) format.
        The first and second columns have the sequence names and the third
        column is the distance between those sequences.
        """
        fh = None
        try:
            fh = open( filename )
            count = 0
            while True:
                line = fh.readline()
                line = line.strip()
                if not line:
                    break  # EOF (or blank line)
                if line[0] != '@':
                    linePieces = line.split('\t')
                    if len(linePieces) != 3:
                        return False
                    try:
                        check = float(linePieces[2])
                    except ValueError:
                        return False
                    count += 1
                    if count == 5:
                        return True
            if 0 < count < 5:
                return True
        except Exception:
            pass
        finally:
            # guard: fh is None when open() failed (original raised NameError here)
            if fh:
                fh.close()
        return False
497
class AlignCheck(Tabular):
    # NOTE(review): this redefines the earlier AlignCheck class in this module
    # (which also set column_types, comment_lines and a set_meta override);
    # this later definition wins at import time — confirm which is intended.
    file_ext = 'align.check'
    def __init__(self, **kwd):
        """Initialize secondary structure map datatype"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['name','pound','dash','plus','equal','loop','tilde','total']
        self.columns = 8
505
class Names(Tabular):
    file_ext = 'names'

    def __init__(self, **kwd):
        """Name file: a representative sequence (col 1) and the comma-separated sequences it represents (col 2)."""
        Tabular.__init__( self, **kwd )
        self.columns = 2
        self.column_names = ['name', 'representatives']
513
class Summary(Tabular):
    file_ext = 'summary'

    def __init__(self, **kwd):
        """Summarizes the quality of sequences in an unaligned or aligned fasta-formatted sequence file."""
        Tabular.__init__( self, **kwd )
        self.columns = 6
        self.column_names = ['seqname', 'start', 'end', 'nbases', 'ambigs', 'polymer']
521
class Group(Tabular):
    file_ext = 'groups'

    def __init__(self, **kwd):
        """Group file: a sequence name (col 1) and the group it belongs to (col 2)."""
        Tabular.__init__( self, **kwd )
        self.columns = 2
        self.column_names = ['name', 'group']
529
class Design(Tabular):
    file_ext = 'design'

    def __init__(self, **kwd):
        """Design file: a group (col 1) and a grouping (col 2), providing a way to merge groups."""
        Tabular.__init__( self, **kwd )
        self.columns = 2
        self.column_names = ['group', 'grouping']
537
class AccNos(Tabular):
    file_ext = 'accnos'

    def __init__(self, **kwd):
        """A single-column list of names."""
        Tabular.__init__( self, **kwd )
        self.columns = 1
        self.column_names = ['name']
545
class Oligos( data.Text ):
    file_ext = 'oligos'

    def sniff( self, filename ):
        """
        Determines whether the file is a Mothur oligos file. Data lines are either
        forward|reverse<TAB>sequence  or  barcode<TAB>sequence<TAB>name
        '#' lines are comments.
        """
        fh = None
        try:
            fh = open( filename )
            count = 0
            while True:
                line = fh.readline()
                line = line.strip()
                if not line:
                    break  # EOF (or blank line)
                if line[0] != '#':
                    linePieces = line.split('\t')
                    if len(linePieces) == 2 and re.match('forward|reverse', linePieces[0]):
                        count += 1
                    elif len(linePieces) == 3 and re.match('barcode', linePieces[0]):
                        count += 1
                    else:
                        return False
                    if count > 20:
                        return True
            if count > 0:
                return True
        except Exception:
            pass
        finally:
            # guard: fh is None when open() failed (original raised NameError here)
            if fh:
                fh.close()
        return False
581
class Frequency(Tabular):
    file_ext = 'freq'

    def __init__(self, **kwd):
        """Position/frequency table used for chimera analysis."""
        Tabular.__init__( self, **kwd )
        self.column_names = ['position', 'frequency']
        self.column_types = ['int', 'float']

    def sniff( self, filename ):
        """
        Determines whether the file is a frequency tabular format for chimera analysis:
        #1.14.0
        0 0.000
        1 0.000
        ...
        155 0.975
        """
        fh = None
        try:
            fh = open( filename )
            count = 0
            while True:
                line = fh.readline()
                line = line.strip()
                if not line:
                    break  # EOF (or blank line)
                if line[0] != '#':
                    try:
                        linePieces = line.split('\t')
                        i = int(linePieces[0])
                        f = float(linePieces[1])
                    except Exception:
                        # short line or non-numeric values: not a freq file
                        return False
                    count += 1
                    if count > 20:
                        return True
            if count > 0:
                return True
        except Exception:
            pass
        finally:
            # guard: fh is None when open() failed (original raised NameError here)
            if fh:
                fh.close()
        return False
626
class Quantile(Tabular):
    file_ext = 'quan'
    # NOTE(review): the desc texts for 'filtered' and 'masked' were swapped in the
    # original declarations; each flag is matched to its own description here.
    MetadataElement( name="filtered", default=False, no_value=False, optional=True, desc="Quantiles calculated using a frequency filter", readonly=True )
    MetadataElement( name="masked", default=False, no_value=False, optional=True, desc="Quantiles calculated using a mask", readonly=True )

    def __init__(self, **kwd):
        """Quantiles for chimera analysis"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['num', 'ten', 'twentyfive', 'fifty', 'seventyfive', 'ninetyfive', 'ninetynine']
        self.column_types = ['int', 'float', 'float', 'float', 'float', 'float', 'float']

    def set_meta( self, dataset, overwrite=True, skip=None, **kwd ):
        # intentionally only logs; metadata is left untouched — confirm
        log.info( "Mothur Quantile set_meta %s" % kwd )

    def sniff( self, filename ):
        """
        Determines whether the file is a quantiles tabular format for chimera analysis:
        1 0 0 0 0 0 0
        2 0.309198 0.309198 0.37161 0.37161 0.37161 0.37161
        3 0.510982 0.563213 0.693529 0.858939 1.07442 1.20608
        ...
        """
        fh = None
        try:
            fh = open( filename )
            count = 0
            while True:
                line = fh.readline()
                line = line.strip()
                if not line:
                    break  # EOF (or blank line)
                if line[0] != '#':
                    try:
                        linePieces = line.split('\t')
                        i = int(linePieces[0])
                        # columns 2-7 must all be floats
                        for idx in range(1, 7):
                            f = float(linePieces[idx])
                    except Exception:
                        return False
                    count += 1
                    if count > 10:
                        return True
            if count > 0:
                return True
        except Exception:
            pass
        finally:
            # guard: fh is None when open() failed (original raised NameError here)
            if fh:
                fh.close()
        return False
678
class FilteredQuantile(Quantile):
    file_ext = 'filtered.quan'

    def __init__(self, **kwd):
        """Quantiles for chimera analysis, with a frequency filter applied"""
        Quantile.__init__( self, **kwd )
        self.filtered = True
685
class MaskedQuantile(Quantile):
    file_ext = 'masked.quan'

    def __init__(self, **kwd):
        """Quantiles for chimera analysis, with a mask applied (no filter)"""
        Quantile.__init__( self, **kwd )
        self.filtered = False
        self.masked = True
693
class FilteredMaskedQuantile(Quantile):
    file_ext = 'filtered.masked.quan'

    def __init__(self, **kwd):
        """Quantiles for chimera analysis, with both a frequency filter and a mask applied"""
        Quantile.__init__( self, **kwd )
        self.filtered = True
        self.masked = True
701
class LaneMask(data.Text):
    file_ext = 'filter'

    def sniff( self, filename ):
        """
        Determines whether the file is a lane mask filter:
        one line consisting only of zeros and ones.
        """
        fh = None
        try:
            fh = open( filename )
            while True:
                buff = fh.read(1000)
                if not buff:
                    break  # EOF
                # original matched the undefined name 'line' instead of 'buff',
                # so sniff always raised and returned False
                if not re.match('^[01]+$', buff):
                    return False
            return True
        except Exception:
            pass
        finally:
            # original called the undefined close(fh); also guard against a
            # failed open() leaving fh unset
            if fh:
                fh.close()
        return False
724
class SequenceTaxonomy(Tabular):
    """
    A table with 2 columns:
    - SequenceName
    - Taxonomy (semicolon-separated taxonomy in descending order)
    Example:
    X56533.1 Eukaryota;Alveolata;Ciliophora;Intramacronucleata;Oligohymenophorea;Hymenostomatida;Tetrahymenina;Glaucomidae;Glaucoma;
    X97975.1 Eukaryota;Parabasalidea;Trichomonada;Trichomonadida;unclassified_Trichomonadida;
    AF052717.1 Eukaryota;Parabasalidea;
    """
    file_ext = 'seq.taxonomy'

    def __init__(self, **kwd):
        Tabular.__init__( self, **kwd )
        self.column_names = ['name', 'taxonomy']

    def sniff( self, filename ):
        """
        Determines whether the file is a SequenceTaxonomy
        """
        fh = None
        try:
            # one or more taxon levels: name, optional (count), terminated by ';'
            pat = r'^([^ \t\n\r\f\v;]+([(]\d+[)])?[;])+$'
            fh = open( filename )
            count = 0
            while True:
                line = fh.readline()
                if not line:
                    break  # EOF
                line = line.strip()
                if line:
                    fields = line.split('\t')
                    if len(fields) != 2:
                        return False
                    if not re.match(pat, fields[1]):
                        return False
                    count += 1
                    if count > 10:
                        break
            if count > 0:
                return True
        except Exception:
            pass
        finally:
            # guard: fh is None when open() failed (original raised NameError here)
            if fh:
                fh.close()
        return False
769
class RDPSequenceTaxonomy(SequenceTaxonomy):
    """
    A table with 2 columns:
    - SequenceName
    - Taxonomy (semicolon-separated taxonomy in descending order, RDP requires exactly 6 levels deep)
    Example:
    AB001518.1 Bacteria;Bacteroidetes;Sphingobacteria;Sphingobacteriales;unclassified_Sphingobacteriales;
    AB001724.1 Bacteria;Cyanobacteria;Cyanobacteria;Family_II;GpIIa;
    AB001774.1 Bacteria;Chlamydiae;Chlamydiae;Chlamydiales;Chlamydiaceae;Chlamydophila;
    """
    file_ext = 'rdp.taxonomy'

    def sniff( self, filename ):
        """
        Determines whether the file is an RDP-style SequenceTaxonomy
        """
        fh = None
        try:
            # exactly 6 taxon levels, each terminated by ';'
            pat = r'^([^ \t\n\r\f\v;]+([(]\d+[)])?[;]){6}$'
            fh = open( filename )
            count = 0
            while True:
                line = fh.readline()
                if not line:
                    break  # EOF
                line = line.strip()
                if line:
                    fields = line.split('\t')
                    if len(fields) != 2:
                        return False
                    if not re.match(pat, fields[1]):
                        return False
                    count += 1
                    if count > 10:
                        break
            if count > 0:
                return True
        except Exception:
            pass
        finally:
            # guard: fh is None when open() failed (original raised NameError here)
            if fh:
                fh.close()
        return False
810
class ConsensusTaxonomy(Tabular):
    file_ext = 'cons.taxonomy'

    def __init__(self, **kwd):
        """Consensus taxonomy per OTU: OTU id, sequence count and taxonomy string."""
        Tabular.__init__( self, **kwd )
        self.column_names = ['OTU', 'count', 'taxonomy']
817
class TaxonomySummary(Tabular):
    file_ext = 'tax.summary'

    def __init__(self, **kwd):
        """A summary of taxon classification."""
        Tabular.__init__( self, **kwd )
        self.column_names = ['taxlevel', 'rankID', 'taxon', 'daughterlevels', 'total']
824
class Phylip(data.Text):
    file_ext = 'phy'

    def sniff( self, filename ):
        """
        Determines whether the file is in Phylip format (Interleaved or Sequential).
        The first line of the input file contains the number of species and the
        number of characters, in free format, separated by blanks (not by
        commas). The information for each species follows, starting with a
        ten-character species name (which can include punctuation marks and blanks),
        and continuing with the characters for that species.
        http://evolution.genetics.washington.edu/phylip/doc/main.html#inputfiles
        Interleaved Example:
            6   39
            Archaeopt CGATGCTTAC CGCCGATGCT
            HesperorniCGTTACTCGT TGTCGTTACT
            BaluchitheTAATGTTAAT TGTTAATGTT
            B. virginiTAATGTTCGT TGTTAATGTT
            BrontosaurCAAAACCCAT CATCAAAACC
            B.subtilisGGCAGCCAAT CACGGCAGCC

            TACCGCCGAT GCTTACCGC
            CGTTGTCGTT ACTCGTTGT
            AATTGTTAAT GTTAATTGT
            CGTTGTTAAT GTTCGTTGT
            CATCATCAAA ACCCATCAT
            AATCACGGCA GCCAATCAC
        """
        fh = None
        try:
            fh = open( filename )
            # counts line: number of species and sequence length
            line = fh.readline().strip()
            linePieces = line.split()
            count = int( linePieces[0] )
            seq_len = int( linePieces[1] )
            # TODO check data lines:
            #   - name is the first 10 characters
            #   - remainder is the sequence; nucleic base or amino acid
            #     1-char designators (spaces allowed)
            return True
        except Exception:
            pass
        finally:
            # original called the undefined close(fh); also guard a failed open()
            if fh:
                fh.close()
        return False
878
879
class Axes(Tabular):
    file_ext = 'axes'

    def __init__(self, **kwd):
        """Initialize axes datatype"""
        Tabular.__init__( self, **kwd )

    def sniff( self, filename ):
        """
        Determines whether the file is an axes format.
        The first line may have column headings.
        The following lines have the name in the first column plus float columns for each axis.
        ==> 98_sq_phylip_amazon.fn.unique.pca.axes <==
        group   axis1   axis2
        forest  0.000000        0.145743
        pasture 0.145743        0.000000

        ==> 98_sq_phylip_amazon.nmds.axes <==
                axis1   axis2
        U68589  0.262608        -0.077498
        U68590  0.027118        0.195197
        U68591  0.329854        0.014395
        """
        fh = None
        try:
            fh = open( filename )
            count = 0
            # deliberately discard the first line: it may be a heading row with
            # fewer columns than the data rows (nmds example above)
            line = fh.readline()
            line = line.strip()
            col_cnt = None
            while True:
                line = fh.readline()
                line = line.strip()
                if not line:
                    break  # EOF (or blank line)
                fields = line.split('\t')
                if col_cnt is None:
                    # column count is taken from the first remaining line
                    col_cnt = len(fields)
                else:
                    if len(fields) != col_cnt:
                        return False
                    try:
                        for i in range(1, col_cnt):
                            check = float(fields[i])
                    except ValueError:
                        return False
                    count += 1
                    if count > 10:
                        return True
            if count > 0:
                return True
        except Exception:
            pass
        finally:
            # guard: fh is None when open() failed (original raised NameError here)
            if fh:
                fh.close()
        return False
935
936 ## Qiime Classes
937
class QiimeMetadataMapping(Tabular):
    MetadataElement( name="column_names", default=[], desc="Column Names", readonly=False, visible=True, no_value=[] )
    file_ext = 'qiimemapping'

    def __init__(self, **kwd):
        """
        http://qiime.sourceforge.net/documentation/file_formats.html#mapping-file-overview
        Information about the samples necessary to perform the data analysis.
        """
        Tabular.__init__( self, **kwd )

    def sniff( self, filename ):
        """
        Determines whether the file is a qiime mapping file.
        Just checking for an appropriate header line for now, could be improved.
        """
        fh = None
        try:
            pat = r'#SampleID(\t[a-zA-Z][a-zA-Z0-9_]*)*\tDescription'
            fh = open( filename )
            while True:
                # original read from the undefined name 'dataset_fh' and never
                # broke at EOF (infinite loop); read from fh and stop on ''
                line = fh.readline()
                if not line:
                    break  # EOF
                if re.match(pat, line):
                    return True
        except Exception:
            pass
        finally:
            # original called the undefined close(fh); guard a failed open()
            if fh:
                fh.close()
        return False

    def set_column_names(self, dataset):
        """Record the header names from the '#SampleID' line as column metadata."""
        if dataset.has_data():
            dataset_fh = open( dataset.file_name )
            try:
                line = dataset_fh.readline()
                if line.startswith('#SampleID'):
                    dataset.metadata.column_names = line.strip().split('\t')
            finally:
                dataset_fh.close()

    def set_meta( self, dataset, overwrite=True, skip=None, max_data_lines=None, **kwd ):
        Tabular.set_meta(self, dataset, overwrite, skip, max_data_lines)
        self.set_column_names(dataset)
979
class QiimeOTU(Tabular):
    """
    Associates OTUs with sequence IDs: OTU id in column 1, followed by the
    tab-separated IDs of the member sequences.
    Example:
    0	FLP3FBN01C2MYD	FLP3FBN01B2ALM
    1	FLP3FBN01DF6NE	FLP3FBN01CKW1J	FLP3FBN01CHVM4
    2	FLP3FBN01AXQ2Z
    """
    file_ext = 'qiimeotu'
989
class QiimeOTUTable(Tabular):
    """
    #Full OTU Counts
    #OTU ID	PC.354	PC.355	PC.356	Consensus Lineage
    0	0	1	0	Root;Bacteria;Firmicutes;"Clostridia";Clostridiales
    1	1	3	1	Root;Bacteria
    2	0	2	2	Root;Bacteria;Bacteroidetes
    """
    MetadataElement( name="column_names", default=[], desc="Column Names", readonly=False, visible=True, no_value=[] )
    file_ext = 'qiimeotutable'

    def init_meta( self, dataset, copy_from=None ):
        tabular.Tabular.init_meta( self, dataset, copy_from=copy_from )

    def set_meta( self, dataset, overwrite=True, skip=None, **kwd ):
        self.set_column_names(dataset)

    def set_column_names(self, dataset):
        """Read the '#OTU ID' heading (second line) into column_names metadata."""
        if dataset.has_data():
            dataset_fh = open( dataset.file_name )
            try:
                # first line is the '#Full OTU Counts' banner; headings are on line 2
                line = dataset_fh.readline()
                line = dataset_fh.readline()
                if line.startswith('#OTU ID'):
                    dataset.metadata.column_names = line.strip().split('\t')
            finally:
                # close even when reading raises (original leaked on error)
                dataset_fh.close()
            dataset.metadata.comment_lines = 2
1013
class QiimeDistanceMatrix(Tabular):
    """
    	PC.354	PC.355	PC.356
    PC.354	0.0	3.177	1.955
    PC.355	3.177	0.0	3.444
    PC.356	1.955	3.444	0.0
    """
    file_ext = 'qiimedistmat'

    def init_meta( self, dataset, copy_from=None ):
        tabular.Tabular.init_meta( self, dataset, copy_from=copy_from )

    def set_meta( self, dataset, overwrite=True, skip=None, **kwd ):
        self.set_column_names(dataset)

    def set_column_names(self, dataset):
        """The first line of the matrix holds the sample names."""
        if dataset.has_data():
            dataset_fh = open( dataset.file_name )
            try:
                line = dataset_fh.readline()
                dataset.metadata.column_names = line.strip().split('\t')
            finally:
                # close even when reading raises (original leaked on error)
                dataset_fh.close()
            dataset.metadata.comment_lines = 1
1034
class QiimePCA(Tabular):
    """
    Principal Coordinate Analysis Data
    The principal coordinate (PC) axes (columns) for each sample (rows).
    Pairs of PCs can then be graphed to view the relationships between samples.
    The bottom of the output file contains the eigenvalues and % variation
    explained for each PC.
    Example:
    pc vector number	1	2	3
    PC.354	-0.309063936588	0.0398252112257	0.0744672231759
    PC.355	-0.106593922619	0.141125998277	0.0780204374172
    PC.356	-0.219869362955	0.00917241121781	0.0357281314115


    eigvals	0.480220500471	0.163567082874	0.125594470811
    % variation explained	51.6955484555	17.6079322939
    """
    file_ext = 'qiimepca'
1052
class QiimeParams(Tabular):
    """
    Qiime workflow parameters file: 'script:option value' lines, '#' comments.
    Example:
    ###pick_otus_through_otu_table.py parameters###

    # OTU picker parameters
    pick_otus:otu_picking_method	uclust
    pick_otus:clustering_algorithm	furthest

    # Representative set picker parameters
    pick_rep_set:rep_set_picking_method	first
    pick_rep_set:sort_by	otu
    """
    file_ext = 'qiimeparams'
1066
class QiimePrefs(data.Text):
    """
    A text file containing coloring preferences to be used by
    make_distance_histograms.py, make_2d_plots.py and make_3d_plots.py.
    Example:
    {
    'background_color':'black',

    'sample_coloring':
        {
            'Treatment':
            {
                'column':'Treatment',
                'colors':(('red',(0,100,100)),('blue',(240,100,100)))
            },
            'DOB':
            {
                'column':'DOB',
                'colors':(('red',(0,100,100)),('blue',(240,100,100)))
            }
        },
    'MONTE_CARLO_GROUP_DISTANCES':
        {
            'Treatment': 10,
            'DOB': 10
        }
    }
    """
    file_ext = 'qiimeprefs'
1095
class QiimeTaxaSummary(Tabular):
    """
    Taxon	PC.354	PC.355	PC.356
    Root;Bacteria;Actinobacteria	0.0	0.177	0.955
    Root;Bacteria;Firmicutes	0.177	0.0	0.444
    Root;Bacteria;Proteobacteria	0.955	0.444	0.0
    """
    MetadataElement( name="column_names", default=[], desc="Column Names", readonly=False, visible=True, no_value=[] )
    file_ext = 'qiimetaxsummary'

    def set_column_names(self, dataset):
        """Record the heading names from the 'Taxon' line as column metadata."""
        if dataset.has_data():
            dataset_fh = open( dataset.file_name )
            try:
                line = dataset_fh.readline()
                if line.startswith('Taxon'):
                    dataset.metadata.column_names = line.strip().split('\t')
            finally:
                # close even when reading raises (original leaked on error)
                dataset_fh.close()

    def set_meta( self, dataset, overwrite=True, skip=None, max_data_lines=None, **kwd ):
        Tabular.set_meta(self, dataset, overwrite, skip, max_data_lines)
        self.set_column_names(dataset)
1117
if __name__ == '__main__':
    # run this module's doctests when executed directly
    import doctest
    import sys
    doctest.testmod( sys.modules[__name__] )
1121