Mercurial > repos > jjohnson > qiime
comparison lib/galaxy/datatypes/metagenomics.py @ 0:e5c3175506b7 default tip
Initial tool configs for qiime, most need work.
author | Jim Johnson <jj@umn.edu> |
---|---|
date | Sun, 17 Jul 2011 10:30:11 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:e5c3175506b7 |
---|---|
1 """ | |
2 metagenomics datatypes | |
3 James E Johnson - University of Minnesota | |
4 for Mothur | |
5 """ | |
6 | |
7 import data | |
8 import logging, os, sys, time, tempfile, shutil, string, glob, re | |
9 import galaxy.model | |
10 from galaxy.datatypes import metadata | |
11 from galaxy.datatypes import tabular | |
12 from galaxy.datatypes import sequence | |
13 from galaxy.datatypes.metadata import MetadataElement | |
14 from galaxy.datatypes.tabular import Tabular | |
15 from galaxy.datatypes.sequence import Fasta | |
16 from galaxy import util | |
17 from galaxy.datatypes.images import Html | |
18 from sniff import * | |
19 | |
20 log = logging.getLogger(__name__) | |
21 | |
22 | |
23 ## Mothur Classes | |
24 | |
class Otu( Tabular ):
    file_ext = 'otu'

    def sniff( self, filename ):
        """
        Determine whether the file is in otu (operational taxonomic unit) format:
        label<TAB>count[<TAB>member(1..count)] — the field after the label is an
        integer giving how many member fields follow it.

        Returns True if at least one well-formed data line is found.
        """
        # NOTE: the original opened the file inside try and closed it in
        # finally; if open() failed, 'fh' was unbound and the finally clause
        # raised NameError.  'with' fixes both the leak and that crash.
        try:
            with open( filename ) as fh:
                count = 0
                for line in fh:
                    line = line.strip()
                    if not line:
                        break  # blank line (or EOF) ends the scan
                    if line[0] == '@':
                        continue  # skip header-style lines
                    pieces = line.split('\t')
                    if len(pieces) < 2:
                        return False
                    try:
                        # pieces[1] is the member count; label + count + members
                        if int(pieces[1]) + 2 != len(pieces):
                            return False
                    except ValueError:
                        return False
                    count += 1
                    if count == 5:
                        return True  # five good lines is convincing enough
                return count > 0
        except Exception:
            # sniffers are fed arbitrary files and must never raise
            pass
        return False
62 | |
class OtuList( Otu ):
    # OTU list flavour of the Otu datatype; only the file extension differs.
    file_ext = 'list'
65 | |
class Sabund( Otu ):
    file_ext = 'sabund'

    def sniff( self, filename ):
        """
        Determine whether the file is in sabund (species abundance) format:
        label<TAB>count[<TAB>value(1..count)] — every value must be an integer.

        Returns True if at least one well-formed data line is found.
        """
        # 'with' replaces the original try/finally that raised NameError on
        # an unopenable file ('fh' unbound in finally).
        try:
            with open( filename ) as fh:
                count = 0
                for line in fh:
                    line = line.strip()
                    if not line:
                        break  # blank line (or EOF) ends the scan
                    if line[0] == '@':
                        continue
                    pieces = line.split('\t')
                    if len(pieces) < 2:
                        return False
                    try:
                        # pieces[1] is the value count; label + count + values
                        if int(pieces[1]) + 2 != len(pieces):
                            return False
                        for val in pieces[2:]:
                            int(val)  # every abundance value must be integral
                    except ValueError:
                        return False
                    count += 1
                    if count >= 5:
                        return True
                return count > 0
        except Exception:
            # sniffers are fed arbitrary files and must never raise
            pass
        return False
107 | |
class Rabund( Sabund ):
    # Ranked-abundance flavour of Sabund; only the file extension differs.
    file_ext = 'rabund'
110 | |
class GroupAbund( Otu ):
    file_ext = 'grpabund'

    def init_meta( self, dataset, copy_from=None ):
        Otu.init_meta( self, dataset, copy_from=copy_from )

    def set_meta( self, dataset, overwrite = True, skip=1, max_data_lines = 100000, **kwd ):
        """Skip the first line only when it is a 'label<TAB>Group' heading
        (written by Mothur >= 1.20); otherwise treat every line as data."""
        if dataset.has_data():
            try:
                # 'with' closes the handle even on error; the original's
                # try/finally raised NameError when open() failed, and a
                # one-field first line raised an uncaught IndexError.
                with open( dataset.file_name ) as fh:
                    pieces = fh.readline().strip().split('\t')
                if len(pieces) >= 2 and pieces[0] == 'label' and pieces[1] == 'Group':
                    skip = 1
                else:
                    skip = 0
            except Exception:
                skip = 0  # unreadable header: assume no heading line
        Otu.set_meta( self, dataset, overwrite, skip, max_data_lines, **kwd)

    def sniff( self, filename, vals_are_int=False):
        """
        Determine whether the file is in otu Shared format:
        label<TAB>group<TAB>count[<TAB>value(1..n)]
        The first line is column headings as of Mothur v 1.20.

        :param vals_are_int: require integer values (shared) instead of floats
        """
        log.info( "sniff GroupAbund vals_are_int %s" % vals_are_int)
        try:
            with open( filename ) as fh:
                count = 0
                for line in fh:
                    line = line.strip()
                    if not line:
                        break  # blank line (or EOF) ends the scan
                    if line[0] == '@':
                        continue
                    pieces = line.split('\t')
                    if len(pieces) < 3:
                        return False
                    # skip validation for a leading 'label...' heading line
                    if count > 0 or pieces[0] != 'label':
                        try:
                            # pieces[2] is the value count
                            if int(pieces[2]) + 3 != len(pieces):
                                return False
                            for val in pieces[3:]:
                                if vals_are_int:
                                    int(val)
                                else:
                                    float(val)
                        except ValueError:
                            return False
                    count += 1
                    if count >= 5:
                        return True
                return count > 0
        except Exception:
            # sniffers are fed arbitrary files and must never raise
            pass
        return False
173 | |
class SharedRabund( GroupAbund ):
    file_ext = 'shared'

    def sniff( self, filename ):
        """
        Determine whether the file is in otu Shared format:
        label<TAB>group<TAB>count[<TAB>value(1..n)]
        The first line is column headings as of Mothur v 1.20.
        Delegates to GroupAbund.sniff with integer-valued columns required.
        """
        result = GroupAbund.sniff(self, filename, True)
        log.info( "is SharedRabund %s" % result)
        return result
188 | |
189 | |
class RelAbund( GroupAbund ):
    file_ext = 'relabund'

    def sniff( self, filename ):
        """
        Determine whether the file is in otu Relative Abundance format:
        label<TAB>group<TAB>count[<TAB>value(1..n)]
        The first line is column headings as of Mothur v 1.20.
        Delegates to GroupAbund.sniff with float-valued columns allowed.
        """
        result = GroupAbund.sniff(self, filename, False)
        log.info( "is RelAbund %s" % result)
        return result
203 | |
class SecondaryStructureMap(Tabular):
    file_ext = 'map'

    def __init__(self, **kwd):
        """Initialize secondary structure map datatype"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['Map']

    def sniff( self, filename ):
        """
        Determine whether the file is a secondary structure map:
        a single column of integers where a non-zero value on line i points
        at line j, and line j must point back at line i
        (structMap[10] == 380 implies structMap[380] == 10).
        """
        try:
            with open( filename ) as fh:
                line_num = 0
                forward = {}  # line -> forward pointer, checked when reached
                for line in fh:
                    line_num += 1
                    line = line.strip()
                    if not line:
                        break  # blank line (or EOF) ends the scan
                    try:
                        pointer = int(line)
                    except ValueError:
                        return False
                    if pointer > line_num:
                        forward[line_num] = pointer
                    elif 0 < pointer < line_num:
                        # back-reference must close a recorded forward pair.
                        # (The original used '&', which binds tighter than
                        # '<' and broke this test, and indexed the dict
                        # directly, risking KeyError.)
                        if forward.get(pointer) != line_num:
                            return False
                # the original checked an undefined 'count' here, which
                # always raised and made the sniffer return False
                return line_num > 0
        except Exception:
            # sniffers are fed arbitrary files and must never raise
            pass
        return False
245 | |
class SequenceAlignment( Fasta ):
    file_ext = 'align'

    def __init__(self, **kwd):
        """Initialize SequenceAlignment datatype"""
        Fasta.__init__( self, **kwd )

    def sniff( self, filename ):
        """
        Determine whether the file is in Mothur align fasta format:
        fasta where every sequence line has the same length.

        (The original bound 'len = -1', shadowing the builtin, so the later
        'len(line)' call raised TypeError and the sniffer always failed.)
        """
        try:
            with open( filename ) as fh:
                seq_len = -1
                while True:
                    line = fh.readline()
                    if not line:
                        break  # EOF
                    line = line.strip()
                    if not line:
                        continue  # skip blank lines
                    if not line.startswith( '>' ):
                        break  # non-empty line that is not a fasta header
                    # the line after a header must be a non-empty sequence
                    seq = fh.readline().strip()
                    if seq == '' or seq.startswith( '>' ):
                        break
                    if seq_len < 0:
                        seq_len = len(seq)
                    elif seq_len != len(seq):
                        return False  # alignment rows must be equal length
                return seq_len > 0
        except Exception:
            # sniffers are fed arbitrary files and must never raise
            pass
        return False
285 | |
class AlignCheck( Tabular ):
    file_ext = 'align.check'

    def __init__(self, **kwd):
        """Initialize AlignCheck datatype"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['name','pound','dash','plus','equal','loop','tilde','total']
        self.column_types = ['str','int','int','int','int','int','int','int']
        self.comment_lines = 1

    def set_meta( self, dataset, overwrite = True, **kwd ):
        """Count data lines (first line is a header) and record the fixed
        column names/types on the dataset metadata."""
        data_lines = 0
        if dataset.has_data():
            # 'with' guarantees the handle is closed even if reading fails
            with open( dataset.file_name ) as dataset_fh:
                for _ in dataset_fh:
                    data_lines += 1
        dataset.metadata.comment_lines = 1
        # subtract the header line from the total
        dataset.metadata.data_lines = data_lines - 1 if data_lines > 0 else 0
        dataset.metadata.column_names = self.column_names
        dataset.metadata.column_types = self.column_types
309 | |
class AlignReport(Tabular):
    """
    Mothur align.report table, e.g.:
    QueryName QueryLength TemplateName TemplateLength SearchMethod SearchScore AlignmentMethod QueryStart QueryEnd TemplateStart TemplateEnd PairwiseAlignmentLength GapsInQuery GapsInTemplate LongestInsert SimBtwnQuery&Template
    AY457915 501 82283 1525 kmer 89.07 needleman 5 501 1 499 499 2 0 0 97.6
    """
    file_ext = 'align.report'

    def __init__(self, **kwd):
        """Initialize AlignReport datatype"""
        Tabular.__init__( self, **kwd )
        self.column_names = [
            'QueryName', 'QueryLength', 'TemplateName', 'TemplateLength',
            'SearchMethod', 'SearchScore', 'AlignmentMethod', 'QueryStart',
            'QueryEnd', 'TemplateStart', 'TemplateEnd',
            'PairwiseAlignmentLength', 'GapsInQuery', 'GapsInTemplate',
            'LongestInsert', 'SimBtwnQuery&Template',
        ]
323 | |
class BellerophonChimera( Tabular ):
    file_ext = 'bellerophon.chimera'

    def __init__(self, **kwd):
        """Initialize BellerophonChimera datatype"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['Name', 'Score', 'Left', 'Right']
330 | |
class SecondaryStructureMatch(Tabular):
    """
    Tabular secondary-structure match report, e.g.:
    name pound dash plus equal loop tilde total
    9_1_12 42 68 8 28 275 420 872
    9_1_14 36 68 6 26 266 422 851
    9_1_15 44 68 8 28 276 418 873
    9_1_16 34 72 6 30 267 430 860
    9_1_18 46 80 2 36 261
    """
    def __init__(self, **kwd):
        """Initialize SecondaryStructureMatch datatype"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['name', 'pound', 'dash', 'plus',
                             'equal', 'loop', 'tilde', 'total']
344 | |
class DistanceMatrix(data.Text):
    file_ext = 'dist'
    # Metadata: how many sequences the matrix covers
    MetadataElement( name="sequence_count", default=0, desc="Number of sequences", readonly=False, optional=True, no_value=0 )
349 | |
350 | |
class LowerTriangleDistanceMatrix(DistanceMatrix):
    file_ext = 'lower.dist'

    def __init__(self, **kwd):
        """Initialize lower-triangle distance matrix datatype"""
        DistanceMatrix.__init__( self, **kwd )

    def sniff( self, filename ):
        """
        Determine whether the file is a lower-triangle distance matrix
        (phylip) format.  The first line has the number of sequences; each
        following line has a sequence name followed by the distances to all
        preceding sequences, so row k carries k fields:
        5
        U68589
        U68590 0.3371
        U68591 0.3609 0.3782
        U68592 0.4155 0.3197 0.4148
        U68593 0.2872 0.1690 0.3361 0.2842
        """
        # The original required exactly 3 tab fields on every line, which
        # rejects the triangular layout it documents; validate the actual
        # row shape instead.
        try:
            with open( filename ) as fh:
                # first line must be the sequence count
                seq_cnt = int(fh.readline().strip())
                row = 0
                for line in fh:
                    line = line.strip()
                    if not line:
                        break  # blank line (or EOF) ends the scan
                    fields = line.split('\t')
                    # row k (0-based) = name + k distances
                    if len(fields) != row + 1:
                        return False
                    try:
                        for val in fields[1:]:
                            float(val)
                    except ValueError:
                        return False
                    row += 1
                    if row == 5:
                        return True
                return 0 < row <= seq_cnt
        except Exception:
            # sniffers are fed arbitrary files and must never raise
            pass
        return False
397 | |
class SquareDistanceMatrix(DistanceMatrix,Tabular):
    file_ext = 'square.dist'
    sequence_count = -1

    def __init__(self, **kwd):
        """Initialize square distance matrix datatype"""
        Tabular.__init__( self, **kwd )

    def init_meta( self, dataset, copy_from=None ):
        data.Text.init_meta( self, dataset, copy_from=copy_from )

    def set_meta( self, dataset, overwrite = True, skip = None, **kwd ):
        # NOTE(review): this sets 'sequences', but the declared metadata
        # element on DistanceMatrix is 'sequence_count' — confirm intent
        dataset.metadata.sequences = 0

    def sniff( self, filename ):
        """
        Determine whether the file is a square distance matrix
        (column-formatted) format.  The first line has the number of
        sequences N; each following line has a name plus N distances:
        3
        U68589 0.0000 0.3371 0.3610
        U68590 0.3371 0.0000 0.3783
        U68590 0.3371 0.0000 0.3783
        """
        try:
            with open( filename ) as fh:
                sequence_count = int(fh.readline().strip())
                # name column + one distance per sequence.
                # (The original read an undefined 'seq_cnt' here; the
                # NameError was swallowed and the sniffer always failed.)
                col_cnt = sequence_count + 1
                count = 0
                for line in fh:
                    line = line.strip()
                    if not line:
                        break  # blank line (or EOF) ends the scan
                    if line[0] == '@':
                        continue
                    pieces = line.split('\t')
                    if len(pieces) != col_cnt:
                        return False
                    try:
                        for val in pieces[1:]:
                            float(val)
                    except ValueError:
                        return False
                    count += 1
                    if count == 5:
                        return True
                return count > 0
        except Exception:
            # sniffers are fed arbitrary files and must never raise
            pass
        return False
454 | |
class PairwiseDistanceMatrix(DistanceMatrix,Tabular):
    file_ext = 'pair.dist'

    def __init__(self, **kwd):
        """Initialize pairwise distance matrix datatype"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['Sequence','Sequence','Distance']
        self.column_types = ['str','str','float']
        self.comment_lines = 1

    def sniff( self, filename ):
        """
        Determine whether the file is a pairwise (column-formatted) distance
        matrix: columns 1 and 2 are sequence names and column 3 is the float
        distance between them.
        """
        # 'with' replaces the original try/finally that raised NameError
        # when the file could not be opened.
        try:
            with open( filename ) as fh:
                count = 0
                for line in fh:
                    line = line.strip()
                    if not line:
                        break  # blank line (or EOF) ends the scan
                    if line[0] == '@':
                        continue
                    pieces = line.split('\t')
                    if len(pieces) != 3:
                        return False
                    try:
                        float(pieces[2])
                    except ValueError:
                        return False
                    count += 1
                    if count == 5:
                        return True
                return count > 0
        except Exception:
            # sniffers are fed arbitrary files and must never raise
            pass
        return False
497 | |
# NOTE(review): this re-declares AlignCheck with the same file_ext and
# SHADOWS the earlier definition (which also set column_types, comment_lines
# and a set_meta override). Only this later, simpler class takes effect —
# confirm which definition was intended and remove the other.
class AlignCheck(Tabular):
    file_ext = 'align.check'
    def __init__(self, **kwd):
        """Initialize AlignCheck datatype: 8-column align.check table."""
        Tabular.__init__( self, **kwd )
        self.column_names = ['name','pound','dash','plus','equal','loop','tilde','total']
        self.columns = 8
505 | |
class Names(Tabular):
    file_ext = 'names'

    def __init__(self, **kwd):
        """Name file relating a representative sequence (col 1) to the
        comma-separated sequences it represents (col 2)."""
        Tabular.__init__( self, **kwd )
        self.columns = 2
        self.column_names = ['name', 'representatives']
513 | |
class Summary(Tabular):
    file_ext = 'summary'

    def __init__(self, **kwd):
        """Summary of sequence quality for an (un)aligned fasta file."""
        Tabular.__init__( self, **kwd )
        self.columns = 6
        self.column_names = ['seqname', 'start', 'end',
                             'nbases', 'ambigs', 'polymer']
521 | |
class Group(Tabular):
    file_ext = 'groups'

    def __init__(self, **kwd):
        """Group file relating a sequence name (col 1) to its group (col 2)."""
        Tabular.__init__( self, **kwd )
        self.columns = 2
        self.column_names = ['name', 'group']
529 | |
class Design(Tabular):
    file_ext = 'design'

    def __init__(self, **kwd):
        """Design file relating a group (col 1) to a grouping (col 2),
        providing a way to merge groups."""
        Tabular.__init__( self, **kwd )
        self.columns = 2
        self.column_names = ['group', 'grouping']
537 | |
class AccNos(Tabular):
    file_ext = 'accnos'

    def __init__(self, **kwd):
        """Single-column list of sequence names."""
        Tabular.__init__( self, **kwd )
        self.columns = 1
        self.column_names = ['name']
545 | |
class Oligos( data.Text ):
    file_ext = 'oligos'

    def sniff( self, filename ):
        """
        Determine whether the file is a Mothur oligos file.  Data lines are
        either 'forward|reverse<TAB>sequence' (2 fields) or
        'barcode<TAB>sequence<TAB>sample' (3 fields); '#' starts a comment.
        """
        # 'with' replaces the original try/finally that raised NameError
        # when the file could not be opened.
        try:
            with open( filename ) as fh:
                count = 0
                for line in fh:
                    line = line.strip()
                    if not line:
                        break  # blank line (or EOF) ends the scan
                    if line[0] == '#':
                        continue  # comment line
                    pieces = line.split('\t')
                    if len(pieces) == 2 and re.match('forward|reverse', pieces[0]):
                        count += 1
                    elif len(pieces) == 3 and re.match('barcode', pieces[0]):
                        count += 1
                    else:
                        return False
                    if count > 20:
                        return True  # enough evidence, stop early
                return count > 0
        except Exception:
            # sniffers are fed arbitrary files and must never raise
            pass
        return False
581 | |
class Frequency(Tabular):
    file_ext = 'freq'

    def __init__(self, **kwd):
        """Frequency table used for chimera analysis."""
        Tabular.__init__( self, **kwd )
        self.column_names = ['position','frequency']
        self.column_types = ['int','float']

    def sniff( self, filename ):
        """
        Determine whether the file is a frequency table for chimera
        analysis: '<int><TAB><float>' per line, '#' starts a comment:
        #1.14.0
        0 0.000
        1 0.000
        ...
        155 0.975
        """
        try:
            with open( filename ) as fh:
                count = 0
                for line in fh:
                    line = line.strip()
                    if not line:
                        break  # blank line (or EOF) ends the scan
                    if line[0] == '#':
                        continue  # version/comment line
                    pieces = line.split('\t')
                    try:
                        int(pieces[0])
                        float(pieces[1])
                    except (ValueError, IndexError):
                        return False
                    count += 1
                    if count > 20:
                        return True  # enough evidence, stop early
                return count > 0
        except Exception:
            # sniffers are fed arbitrary files and must never raise
            pass
        return False
626 | |
class Quantile(Tabular):
    file_ext = 'quan'
    # Metadata flags recording how the quantiles were produced
    MetadataElement( name="filtered", default=False, no_value=False, optional=True , desc="Quantiles calculated using a mask", readonly=True)
    MetadataElement( name="masked", default=False, no_value=False, optional=True , desc="Quantiles calculated using a frequency filter", readonly=True)

    def __init__(self, **kwd):
        """Quantiles for chimera analysis"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['num','ten','twentyfive','fifty','seventyfive','ninetyfive','ninetynine']
        self.column_types = ['int','float','float','float','float','float','float']

    def set_meta( self, dataset, overwrite = True, skip = None, **kwd ):
        log.info( "Mothur Quantile set_meta %s" % kwd)

    def sniff( self, filename ):
        """
        Determine whether the file is a quantiles table for chimera
        analysis: one int column followed by six float columns:
        1 0 0 0 0 0 0
        2 0.309198 0.309198 0.37161 0.37161 0.37161 0.37161
        3 0.510982 0.563213 0.693529 0.858939 1.07442 1.20608
        ...
        """
        try:
            with open( filename ) as fh:
                count = 0
                for line in fh:
                    line = line.strip()
                    if not line:
                        break  # blank line (or EOF) ends the scan
                    if line[0] == '#':
                        continue
                    pieces = line.split('\t')
                    if len(pieces) < 7:
                        return False
                    try:
                        int(pieces[0])
                        # six quantile columns, all floats
                        for val in pieces[1:7]:
                            float(val)
                    except ValueError:
                        return False
                    count += 1
                    if count > 10:
                        return True  # enough evidence, stop early
                return count > 0
        except Exception:
            # sniffers are fed arbitrary files and must never raise
            pass
        return False
678 | |
class FilteredQuantile(Quantile):
    file_ext = 'filtered.quan'

    def __init__(self, **kwd):
        """Quantiles for chimera analysis, calculated on filtered sequences."""
        Quantile.__init__( self, **kwd )
        self.filtered = True
685 | |
class MaskedQuantile(Quantile):
    file_ext = 'masked.quan'

    def __init__(self, **kwd):
        """Quantiles for chimera analysis, calculated with a mask applied."""
        Quantile.__init__( self, **kwd )
        self.filtered = False
        self.masked = True
693 | |
class FilteredMaskedQuantile(Quantile):
    file_ext = 'filtered.masked.quan'

    def __init__(self, **kwd):
        """Quantiles for chimera analysis, both filtered and masked."""
        Quantile.__init__( self, **kwd )
        self.filtered = True
        self.masked = True
701 | |
class LaneMask(data.Text):
    file_ext = 'filter'

    def sniff( self, filename ):
        """
        Determine whether the file is a lane mask filter: a single line
        consisting only of zeros and ones.

        (The original matched an undefined name 'line' instead of the chunk,
        called the nonexistent 'close(fh)', and returned True after the
        first 1000 bytes without scanning the rest of the file.)
        """
        try:
            with open( filename ) as fh:
                seen = False
                while True:
                    buff = fh.read(1000)
                    if not buff:
                        break  # EOF
                    # strip() drops a trailing newline at EOF; an embedded
                    # newline survives and correctly fails the match below
                    chunk = buff.strip()
                    if chunk:
                        if not re.match('^[01]+$', chunk):
                            return False
                        seen = True
                return seen
        except Exception:
            # sniffers are fed arbitrary files and must never raise
            pass
        return False
724 | |
class SequenceTaxonomy(Tabular):
    file_ext = 'seq.taxonomy'
    """
    A table with 2 columns:
    - SequenceName
    - Taxonomy (semicolon-separated taxonomy in descending order)
    Example:
    X56533.1 Eukaryota;Alveolata;Ciliophora;Intramacronucleata;Oligohymenophorea;Hymenostomatida;Tetrahymenina;Glaucomidae;Glaucoma;
    X97975.1 Eukaryota;Parabasalidea;Trichomonada;Trichomonadida;unclassified_Trichomonadida;
    AF052717.1 Eukaryota;Parabasalidea;
    """
    def __init__(self, **kwd):
        Tabular.__init__( self, **kwd )
        self.column_names = ['name','taxonomy']

    def sniff( self, filename ):
        """
        Determine whether the file is a SequenceTaxonomy: two tab-separated
        fields where field 2 is one or more ';'-terminated taxonomy levels,
        each optionally followed by a (bootstrap) value.
        """
        # 'with' replaces the original try/finally that raised NameError
        # when the file could not be opened.
        try:
            pat = r'^([^ \t\n\r\f\v;]+([(]\d+[)])?[;])+$'
            with open( filename ) as fh:
                count = 0
                for line in fh:
                    line = line.strip()
                    if not line:
                        continue  # skip blank lines
                    fields = line.split('\t')
                    if len(fields) != 2:
                        return False
                    if not re.match(pat, fields[1]):
                        return False
                    count += 1
                    if count > 10:
                        break  # enough evidence
                return count > 0
        except Exception:
            # sniffers are fed arbitrary files and must never raise
            pass
        return False
769 | |
class RDPSequenceTaxonomy(SequenceTaxonomy):
    file_ext = 'rdp.taxonomy'
    """
    A table with 2 columns:
    - SequenceName
    - Taxonomy (semicolon-separated taxonomy in descending order, RDP requires exactly 6 levels deep)
    Example:
    AB001518.1 Bacteria;Bacteroidetes;Sphingobacteria;Sphingobacteriales;unclassified_Sphingobacteriales;
    AB001724.1 Bacteria;Cyanobacteria;Cyanobacteria;Family_II;GpIIa;
    AB001774.1 Bacteria;Chlamydiae;Chlamydiae;Chlamydiales;Chlamydiaceae;Chlamydophila;
    """
    def sniff( self, filename ):
        """
        Determine whether the file is an RDP SequenceTaxonomy: like
        SequenceTaxonomy, but with exactly six taxonomy levels.
        """
        try:
            # exactly 6 ';'-terminated levels, each optionally (bootstrap)ed
            pat = r'^([^ \t\n\r\f\v;]+([(]\d+[)])?[;]){6}$'
            with open( filename ) as fh:
                count = 0
                for line in fh:
                    line = line.strip()
                    if not line:
                        continue  # skip blank lines
                    fields = line.split('\t')
                    if len(fields) != 2:
                        return False
                    if not re.match(pat, fields[1]):
                        return False
                    count += 1
                    if count > 10:
                        break  # enough evidence
                return count > 0
        except Exception:
            # sniffers are fed arbitrary files and must never raise
            pass
        return False
810 | |
class ConsensusTaxonomy(Tabular):
    file_ext = 'cons.taxonomy'

    def __init__(self, **kwd):
        """Consensus taxonomy per OTU."""
        Tabular.__init__( self, **kwd )
        self.column_names = ['OTU', 'count', 'taxonomy']
817 | |
class TaxonomySummary(Tabular):
    file_ext = 'tax.summary'

    def __init__(self, **kwd):
        """Summary of taxon classification."""
        Tabular.__init__( self, **kwd )
        self.column_names = ['taxlevel', 'rankID', 'taxon',
                             'daughterlevels', 'total']
824 | |
class Phylip(data.Text):
    file_ext = 'phy'

    def sniff( self, filename ):
        """
        Determine whether the file is in Phylip format (Interleaved or
        Sequential).  The first line contains the number of species and the
        number of characters, in free format, separated by blanks.  Each
        species record starts with a ten-character name followed by its
        characters.
        http://evolution.genetics.washington.edu/phylip/doc/main.html#inputfiles
        Interleaved Example:
            6   39
        Archaeopt CGATGCTTAC CGCCGATGCT
        HesperorniCGTTACTCGT TGTCGTTACT
        BaluchitheTAATGTTAAT TGTTAATGTT
        B. virginiTAATGTTCGT TGTTAATGTT
        BrontosaurCAAAACCCAT CATCAAAACC
        B.subtilisGGCAGCCAAT CACGGCAGCC

        TACCGCCGAT GCTTACCGC
        CGTTGTCGTT ACTCGTTGT
        AATTGTTAAT GTTAATTGT
        CGTTGTTAAT GTTCGTTGT
        CATCATCAAA ACCCATCAT
        AATCACGGCA GCCAATCAC
        """
        # The original's finally clause called the nonexistent 'close(fh)',
        # so every call raised NameError; 'with' fixes that and the leak.
        try:
            with open( filename ) as fh:
                pieces = fh.readline().strip().split()
                # counts line: '<species> <characters>'
                int(pieces[0])
                int(pieces[1])
                # TODO: also validate the data lines — ten-character name,
                # then sequence characters (spaces allowed)
                return True
        except Exception:
            # unreadable or not a counts line: not phylip
            pass
        return False
878 | |
879 | |
class Axes(Tabular):
    file_ext = 'axes'

    def __init__(self, **kwd):
        """Initialize axes datatype"""
        Tabular.__init__( self, **kwd )

    def sniff( self, filename ):
        """
        Determine whether the file is an axes format.  The first line may
        have column headings; following lines have a name column plus float
        columns for each axis:
        ==> 98_sq_phylip_amazon.fn.unique.pca.axes <==
        group   axis1   axis2
        forest  0.000000        0.145743
        pasture 0.145743        0.000000

        ==> 98_sq_phylip_amazon.nmds.axes <==
                axis1   axis2
        U68589  0.262608        -0.077498
        U68590  0.027118        0.195197
        U68591  0.329854        0.014395
        """
        # 'with' replaces the original try/finally that raised NameError
        # when the file could not be opened.
        try:
            with open( filename ) as fh:
                fh.readline()  # first line discarded (may be headings)
                col_cnt = None
                count = 0
                for line in fh:
                    line = line.strip()
                    if not line:
                        break  # blank line (or EOF) ends the scan
                    fields = line.split('\t')
                    if col_cnt is None:
                        # first scanned line fixes the column count; its
                        # values may be headings, so they are not validated
                        col_cnt = len(fields)
                    else:
                        if len(fields) != col_cnt:
                            return False
                        try:
                            for val in fields[1:col_cnt]:
                                float(val)
                        except ValueError:
                            return False
                    count += 1
                    if count > 10:
                        return True  # enough evidence, stop early
                return count > 0
        except Exception:
            # sniffers are fed arbitrary files and must never raise
            pass
        return False
935 | |
936 ## Qiime Classes | |
937 | |
class QiimeMetadataMapping(Tabular):
    MetadataElement( name="column_names", default=[], desc="Column Names", readonly=False, visible=True, no_value=[] )
    file_ext = 'qiimemapping'

    def __init__(self, **kwd):
        """
        http://qiime.sourceforge.net/documentation/file_formats.html#mapping-file-overview
        Information about the samples necessary to perform the data analysis.
        # self.column_names = ['#SampleID','BarcodeSequence','LinkerPrimerSequence','Description']
        """
        Tabular.__init__( self, **kwd )

    def sniff( self, filename ):
        """
        Determine whether the file is a qiime mapping file.  Just checks for
        an appropriate '#SampleID ... Description' header line for now.
        """
        # The original read from an undefined 'dataset_fh' (NameError
        # swallowed by the bare except, so it always returned False), would
        # have looped forever at EOF, and 'close(fh)' was a NameError too.
        try:
            pat = '#SampleID(\t[a-zA-Z][a-zA-Z0-9_]*)*\tDescription'
            with open( filename ) as fh:
                for line in fh:
                    if re.match(pat, line):
                        return True
        except Exception:
            # sniffers are fed arbitrary files and must never raise
            pass
        return False

    def set_column_names(self, dataset):
        """Record the header fields as column names when a '#SampleID'
        heading line is present."""
        if dataset.has_data():
            with open( dataset.file_name ) as dataset_fh:
                line = dataset_fh.readline()
            if line.startswith('#SampleID'):
                dataset.metadata.column_names = line.strip().split('\t')

    def set_meta( self, dataset, overwrite = True, skip = None, max_data_lines = None, **kwd ):
        Tabular.set_meta(self, dataset, overwrite, skip, max_data_lines)
        self.set_column_names(dataset)
979 | |
class QiimeOTU(Tabular):
    """
    Associates OTUs with sequence IDs, e.g.:
    0	FLP3FBN01C2MYD FLP3FBN01B2ALM
    1	FLP3FBN01DF6NE FLP3FBN01CKW1J FLP3FBN01CHVM4
    2	FLP3FBN01AXQ2Z
    """
    file_ext = 'qiimeotu'
989 | |
class QiimeOTUTable(Tabular):
    """
    Qiime OTU table, e.g.:
    #Full OTU Counts
    #OTU ID PC.354  PC.355  PC.356  Consensus Lineage
    0       0       1       0       Root;Bacteria;Firmicutes;"Clostridia";Clostridiales
    1       1       3       1       Root;Bacteria
    2       0       2       2       Root;Bacteria;Bacteroidetes
    """
    MetadataElement( name="column_names", default=[], desc="Column Names", readonly=False, visible=True, no_value=[] )
    file_ext = 'qiimeotutable'

    def init_meta( self, dataset, copy_from=None ):
        tabular.Tabular.init_meta( self, dataset, copy_from=copy_from )

    def set_meta( self, dataset, overwrite = True, skip = None, **kwd ):
        self.set_column_names(dataset)

    def set_column_names(self, dataset):
        """Take column names from the second line ('#OTU ID ...'); the two
        leading '#' lines are comments."""
        if dataset.has_data():
            # 'with' closes the handle even if reading fails
            with open( dataset.file_name ) as dataset_fh:
                dataset_fh.readline()  # '#Full OTU Counts'
                line = dataset_fh.readline()
            if line.startswith('#OTU ID'):
                dataset.metadata.column_names = line.strip().split('\t')
            dataset.metadata.comment_lines = 2
1013 | |
class QiimeDistanceMatrix(Tabular):
    """
    Square distance matrix whose first line names the samples.
    Example:
    	PC.354	PC.355	PC.356
    PC.354	0.0	3.177	1.955
    PC.355	3.177	0.0	3.444
    PC.356	1.955	3.444	0.0
    """
    file_ext = 'qiimedistmat'

    def init_meta(self, dataset, copy_from=None):
        # Defer to the base tabular initialisation.
        tabular.Tabular.init_meta(self, dataset, copy_from=copy_from)

    def set_meta(self, dataset, overwrite=True, skip=None, **kwd):
        # Column names come from the matrix header row only;
        # the base set_meta is deliberately not chained.
        self.set_column_names(dataset)

    def set_column_names(self, dataset):
        """Use the first line (sample labels) as column_names metadata."""
        if dataset.has_data():
            fh = open( dataset.file_name )
            header = fh.readline()
            fh.close()
            # first line contains the names
            dataset.metadata.column_names = header.strip().split('\t')
            dataset.metadata.comment_lines = 1
1034 | |
class QiimePCA(Tabular):
    """
    Principal Coordinate Analysis Data
    The principal coordinate (PC) axes (columns) for each sample (rows).
    Pairs of PCs can then be graphed to view the relationships between samples.
    The bottom of the output file contains the eigenvalues and % variation explained for each PC.
    Example:
    pc vector number	1	2	3
    PC.354	-0.309063936588	0.0398252112257	0.0744672231759
    PC.355	-0.106593922619	0.141125998277	0.0780204374172
    PC.356	-0.219869362955	0.00917241121781	0.0357281314115


    eigvals	0.480220500471	0.163567082874	0.125594470811
    % variation explained	51.6955484555	17.6079322939
    """
    # No custom sniffing or metadata; only the file extension distinguishes it.
    file_ext = 'qiimepca'
1052 | |
class QiimeParams(Tabular):
    """
    QIIME workflow parameters file ('script:parameter<TAB>value' lines).
    Example:
    ###pick_otus_through_otu_table.py parameters###

    # OTU picker parameters
    pick_otus:otu_picking_method	uclust
    pick_otus:clustering_algorithm	furthest

    # Representative set picker parameters
    pick_rep_set:rep_set_picking_method	first
    pick_rep_set:sort_by	otu
    """
    # No custom sniffing or metadata; only the file extension distinguishes it.
    file_ext = 'qiimeparams'
1066 | |
class QiimePrefs(data.Text):
    """
    A text file, containing coloring preferences to be used by make_distance_histograms.py, make_2d_plots.py and make_3d_plots.py.
    Example:
    {
    'background_color':'black',

    'sample_coloring':
    	{
    		'Treatment':
    		{
    			'column':'Treatment',
    			'colors':(('red',(0,100,100)),('blue',(240,100,100)))
    		},
    		'DOB':
    		{
    			'column':'DOB',
    			'colors':(('red',(0,100,100)),('blue',(240,100,100)))
    		}
    	},
    'MONTE_CARLO_GROUP_DISTANCES':
    	{
    		'Treatment': 10,
    		'DOB': 10
    	}
    }
    """
    # Plain text datatype; only the file extension distinguishes it.
    file_ext = 'qiimeprefs'
1095 | |
class QiimeTaxaSummary(Tabular):
    """
    Per-sample taxon abundance summary.
    Example:
    Taxon	PC.354	PC.355	PC.356
    Root;Bacteria;Actinobacteria	0.0	0.177	0.955
    Root;Bacteria;Firmicutes	0.177	0.0	0.444
    Root;Bacteria;Proteobacteria	0.955	0.444	0.0
    """
    MetadataElement( name="column_names", default=[], desc="Column Names", readonly=False, visible=True, no_value=[] )
    file_ext = 'qiimetaxsummary'

    def set_column_names(self, dataset):
        """Capture the 'Taxon ...' header line as column_names metadata."""
        if dataset.has_data():
            fh = open( dataset.file_name )
            header = fh.readline()
            fh.close()
            if header.startswith('Taxon'):
                dataset.metadata.column_names = header.strip().split('\t')

    def set_meta(self, dataset, overwrite=True, skip=None, max_data_lines=None, **kwd):
        # Standard tabular metadata first, then the taxon header columns.
        Tabular.set_meta(self, dataset, overwrite, skip, max_data_lines)
        self.set_column_names(dataset)
1117 | |
if __name__ == '__main__':
    # Run any doctests embedded in this module; with no arguments,
    # doctest.testmod() defaults to the running __main__ module.
    import doctest
    doctest.testmod()
1121 |