comparison lib/galaxy/datatypes/metagenomics.py @ 0:e5c3175506b7 default tip

Initial tool configs for qiime, most need work.
author Jim Johnson <jj@umn.edu>
date Sun, 17 Jul 2011 10:30:11 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:e5c3175506b7
1 """
2 metagenomics datatypes
3 James E Johnson - University of Minnesota
4 for Mothur
5 """
6
7 import data
8 import logging, os, sys, time, tempfile, shutil, string, glob, re
9 import galaxy.model
10 from galaxy.datatypes import metadata
11 from galaxy.datatypes import tabular
12 from galaxy.datatypes import sequence
13 from galaxy.datatypes.metadata import MetadataElement
14 from galaxy.datatypes.tabular import Tabular
15 from galaxy.datatypes.sequence import Fasta
16 from galaxy import util
17 from galaxy.datatypes.images import Html
18 from sniff import *
19
20 log = logging.getLogger(__name__)
21
22
23 ## Mothur Classes
24
class Otu( Tabular ):
    """Mothur OTU (operational taxonomic unit) file."""
    file_ext = 'otu'

    def sniff( self, filename ):
        """
        Determines whether the file is a otu (operational taxonomic unit) format.
        Each data line: label<TAB>count[<TAB>value(1..count)]
        """
        fh = None
        try:
            fh = open( filename )
            count = 0
            while True:
                line = fh.readline()
                line = line.strip()
                if not line:
                    break  # EOF (or blank line)
                if line[0] != '@':
                    linePieces = line.split('\t')
                    if len(linePieces) < 2:
                        return False
                    try:
                        # column 2 holds the number of values that follow it
                        check = int(linePieces[1])
                        if check + 2 != len(linePieces):
                            return False
                    except ValueError:
                        return False
                    count += 1
                    if count == 5:
                        return True
            if 0 < count < 5:
                return True
        except Exception:
            # unreadable file or unexpected content: not an otu file
            pass
        finally:
            # guard: fh is None when open() itself failed (the original
            # called fh.close() unconditionally and raised NameError)
            if fh:
                fh.close()
        return False
62
class OtuList( Otu ):
    """Mothur list file; validated with the same layout checks as Otu."""
    file_ext = 'list'
65
class Sabund( Otu ):
    """Mothur species-abundance file."""
    file_ext = 'sabund'

    def sniff( self, filename ):
        """
        Determines whether the file is a otu (operational taxonomic unit) format:
        label<TAB>count[<TAB>value(1..n)]
        """
        fh = None
        try:
            fh = open( filename )
            count = 0
            while True:
                line = fh.readline()
                line = line.strip()
                if not line:
                    break  # EOF (or blank line)
                if line[0] != '@':
                    linePieces = line.split('\t')
                    if len(linePieces) < 2:
                        return False
                    try:
                        # column 2 holds the number of integer abundance values that follow
                        check = int(linePieces[1])
                        if check + 2 != len(linePieces):
                            return False
                        for i in range( 2, len(linePieces) ):
                            ival = int( linePieces[i] )
                    except ValueError:
                        return False
                    count += 1
                    if count >= 5:
                        return True
            if 0 < count < 5:
                return True
        except Exception:
            pass
        finally:
            # guard: fh is None when open() failed (original raised NameError here)
            if fh:
                fh.close()
        return False
107
class Rabund( Sabund ):
    """Mothur rank-abundance file; validated with the same checks as Sabund."""
    file_ext = 'rabund'
110
class GroupAbund( Otu ):
    """Mothur group-abundance (shared-style) file."""
    file_ext = 'grpabund'

    def init_meta( self, dataset, copy_from=None ):
        Otu.init_meta( self, dataset, copy_from=copy_from )

    def set_meta( self, dataset, overwrite=True, skip=1, max_data_lines=100000, **kwd ):
        """Set metadata, detecting whether the file starts with a header line."""
        if dataset.has_data():
            fh = None
            try:
                fh = open( dataset.file_name )
                linePieces = fh.readline().strip().split('\t')
                # Mothur >= 1.20 writes a 'label<TAB>Group...' heading line
                if len(linePieces) >= 2 and linePieces[0] == 'label' and linePieces[1] == 'Group':
                    skip = 1
                else:
                    skip = 0
            finally:
                # guard: fh is None when open() failed (original raised NameError here)
                if fh:
                    fh.close()
        Otu.set_meta( self, dataset, overwrite, skip, max_data_lines, **kwd )

    def sniff( self, filename, vals_are_int=False ):
        """
        Determines whether the file is a otu (operational taxonomic unit) Shared format:
        label<TAB>group<TAB>count[<TAB>value(1..n)]
        The first line is column headings as of Mothur v 1.20.

        :param vals_are_int: when True require integer abundance values, otherwise floats
        """
        log.info( "sniff GroupAbund vals_are_int %s" % vals_are_int )
        fh = None
        try:
            fh = open( filename )
            count = 0
            while True:
                line = fh.readline()
                line = line.strip()
                if not line:
                    break  # EOF (or blank line)
                if line[0] != '@':
                    linePieces = line.split('\t')
                    if len(linePieces) < 3:
                        return False
                    # skip value checks on the optional first heading line
                    if count > 0 or linePieces[0] != 'label':
                        try:
                            check = int(linePieces[2])
                            if check + 3 != len(linePieces):
                                return False
                            for i in range( 3, len(linePieces) ):
                                if vals_are_int:
                                    ival = int( linePieces[i] )
                                else:
                                    fval = float( linePieces[i] )
                        except ValueError:
                            return False
                    count += 1
                    if count >= 5:
                        return True
            if 0 < count < 5:
                return True
        except Exception:
            pass
        finally:
            if fh:
                fh.close()
        return False
172 return False
173
class SharedRabund( GroupAbund ):
    """Mothur shared file: per-group OTU abundances with integer values."""
    file_ext = 'shared'

    def sniff( self, filename ):
        """
        Determines whether the file is a otu (operational taxonomic unit) Shared format:
        label<TAB>group<TAB>count[<TAB>value(1..n)]
        The first line is column headings as of Mothur v 1.20.
        """
        result = GroupAbund.sniff( self, filename, True )
        log.info( "is SharedRabund %s" % result )
        return result
188
189
class RelAbund( GroupAbund ):
    """Mothur relative-abundance file: per-group OTU abundances with float values."""
    file_ext = 'relabund'

    def sniff( self, filename ):
        """
        Determines whether the file is a otu (operational taxonomic unit) Relative
        Abundance format: label<TAB>group<TAB>count[<TAB>value(1..n)]
        The first line is column headings as of Mothur v 1.20.
        """
        result = GroupAbund.sniff( self, filename, False )
        log.info( "is RelAbund %s" % result )
        return result
203
class SecondaryStructureMap(Tabular):
    file_ext = 'map'

    def __init__(self, **kwd):
        """Initialize secondary structure map datatype"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['Map']

    def sniff( self, filename ):
        """
        Determines whether the file is a secondary structure map format:
        a single column with an integer value which indicates the row that this
        row maps to. Check that if structMap[10] = 380 then structMap[380] = 10.
        """
        fh = None
        try:
            fh = open( filename )
            line_num = 0
            count = 0
            rowidxmap = {}
            while True:
                line = fh.readline()
                line_num += 1
                line = line.strip()
                if not line:
                    break  # EOF (or blank line)
                try:
                    pointer = int(line)
                except ValueError:
                    return False
                if pointer > 0:
                    if pointer > line_num:
                        # remember the forward reference so the paired row can be validated
                        rowidxmap[line_num] = pointer
                    # original used '&' which bound tighter than '<' and also
                    # indexed rowidxmap unguardedly; use 'and' plus .get()
                    elif pointer < line_num and rowidxmap.get(pointer) != line_num:
                        return False
                count += 1
            # original tested an undefined 'count' variable, so valid files
            # always fell through to False; count valid data lines instead
            if count > 0:
                return True
        except Exception:
            pass
        finally:
            # guard: fh is None when open() failed
            if fh:
                fh.close()
        return False
245
class SequenceAlignment( Fasta ):
    """Mothur aligned fasta: all sequence lines have the same length."""
    file_ext = 'align'

    def __init__(self, **kwd):
        Fasta.__init__( self, **kwd )

    def sniff( self, filename ):
        """
        Determines whether the file is in Mothur align fasta format.
        Each sequence line must be the same length.
        """
        fh = None
        try:
            fh = open( filename )
            # original bound 'len = -1', shadowing the builtin so the later
            # len(line) call raised TypeError and sniff always returned False
            seq_len = -1
            while True:
                line = fh.readline()
                if not line:
                    break  # EOF
                line = line.strip()
                if line:  # first non-empty line
                    if line.startswith( '>' ):
                        # the next line must be non-empty sequence data, not another header
                        line = fh.readline().strip()
                        if line == '' or line.startswith( '>' ):
                            break
                        if seq_len < 0:
                            seq_len = len( line )
                        elif seq_len != len( line ):
                            return False
                    else:
                        break  # non-empty line that is not a fasta header
            if seq_len > 0:
                return True
        except Exception:
            pass
        finally:
            if fh:
                fh.close()
        return False
285
class AlignCheck( Tabular ):
    # NOTE(review): a second, simpler 'class AlignCheck' appears later in this
    # module and shadows this definition at import time — confirm which is intended.
    file_ext = 'align.check'

    def __init__(self, **kwd):
        """Initialize AlignCheck datatype"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['name','pound','dash','plus','equal','loop','tilde','total']
        self.column_types = ['str','int','int','int','int','int','int','int']
        self.comment_lines = 1

    def set_meta( self, dataset, overwrite=True, **kwd ):
        """Count data lines (excluding the single header line) and record column metadata."""
        data_lines = 0
        if dataset.has_data():
            dataset_fh = open( dataset.file_name )
            try:
                # iterate directly over the file; close even if reading fails
                for line in dataset_fh:
                    data_lines += 1
            finally:
                dataset_fh.close()
        dataset.metadata.comment_lines = 1
        dataset.metadata.data_lines = data_lines - 1 if data_lines > 0 else 0
        dataset.metadata.column_names = self.column_names
        dataset.metadata.column_types = self.column_types
309
class AlignReport(Tabular):
    """
    Alignment report table, e.g.:
    QueryName QueryLength TemplateName ... LongestInsert SimBtwnQuery&Template
    AY457915  501         82283        ... 0             97.6
    """
    file_ext = 'align.report'

    def __init__(self, **kwd):
        """Initialize AlignReport datatype"""
        Tabular.__init__( self, **kwd )
        self.column_names = [
            'QueryName', 'QueryLength', 'TemplateName', 'TemplateLength',
            'SearchMethod', 'SearchScore', 'AlignmentMethod',
            'QueryStart', 'QueryEnd', 'TemplateStart', 'TemplateEnd',
            'PairwiseAlignmentLength', 'GapsInQuery', 'GapsInTemplate',
            'LongestInsert', 'SimBtwnQuery&Template',
        ]
323
class BellerophonChimera( Tabular ):
    file_ext = 'bellerophon.chimera'

    def __init__(self, **kwd):
        """Initialize BellerophonChimera datatype"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['Name', 'Score', 'Left', 'Right']
330
class SecondaryStructureMatch(Tabular):
    """
    Per-sequence secondary structure match counts, e.g.:
    name    pound dash plus equal loop tilde total
    9_1_12  42    68   8    28    275  420   872
    9_1_14  36    68   6    26    266  422   851
    """
    def __init__(self, **kwd):
        """Initialize SecondaryStructureMatch datatype"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['name', 'pound', 'dash', 'plus', 'equal', 'loop', 'tilde', 'total']
344
class DistanceMatrix(data.Text):
    """Base datatype for Mothur distance matrix files."""
    file_ext = 'dist'

    # metadata: number of sequences represented in the matrix
    MetadataElement( name="sequence_count", default=0, desc="Number of sequences", readonly=False, optional=True, no_value=0 )
349
350
class LowerTriangleDistanceMatrix(DistanceMatrix):
    file_ext = 'lower.dist'

    def __init__(self, **kwd):
        """Initialize lower-triangle distance matrix datatype"""
        DistanceMatrix.__init__( self, **kwd )

    def sniff( self, filename ):
        """
        Determines whether the file is a lower-triangle distance matrix (phylip) format.
        The first line has the number of sequences in the matrix.
        The remaining lines have the sequence name followed by a list of distances
        from all preceding sequences, e.g.:
        5
        U68589
        U68590 0.3371
        U68591 0.3609 0.3782
        U68592 0.4155 0.3197 0.4148
        U68593 0.2872 0.1690 0.3361 0.2842
        """
        # NOTE(review): this check requires exactly 3 tab-separated columns per line,
        # which does not match the variable-width example above — confirm intent.
        fh = None
        try:
            fh = open( filename )
            count = 0
            while True:
                line = fh.readline()
                line = line.strip()
                if not line:
                    break  # EOF (or blank line)
                if line[0] != '@':
                    linePieces = line.split('\t')
                    if len(linePieces) != 3:
                        return False
                    try:
                        check = float(linePieces[2])
                    except ValueError:
                        return False
                    count += 1
                    if count == 5:
                        return True
            if 0 < count < 5:
                return True
        except Exception:
            pass
        finally:
            # guard: fh is None when open() failed (original raised NameError here)
            if fh:
                fh.close()
        return False
397
class SquareDistanceMatrix(DistanceMatrix, Tabular):
    file_ext = 'square.dist'
    sequence_count = -1

    def __init__(self, **kwd):
        """Initialize square distance matrix datatype"""
        Tabular.__init__( self, **kwd )

    def init_meta( self, dataset, copy_from=None ):
        data.Text.init_meta( self, dataset, copy_from=copy_from )

    def set_meta( self, dataset, overwrite=True, skip=None, **kwd ):
        # NOTE(review): sets 'sequences' although the declared MetadataElement is
        # 'sequence_count' — confirm which name downstream code expects.
        dataset.metadata.sequences = 0

    def sniff( self, filename ):
        """
        Determines whether the file is a square distance matrix
        (column-formatted distance matrix) format.
        The first line has the number of sequences in the matrix.
        The following lines have the sequence name in the first column plus a
        column for the distance to each sequence, in row order, e.g.:
        3
        U68589 0.0000 0.3371 0.3610
        U68590 0.3371 0.0000 0.3783
        U68591 0.3610 0.3783 0.0000
        """
        fh = None
        try:
            fh = open( filename )
            count = 0
            # first line holds the sequence count
            line = fh.readline().strip()
            seq_cnt = int( line )
            # each data row: name column + one distance column per sequence
            # (original assigned 'sequence_count' but read 'seq_cnt' -> NameError)
            col_cnt = seq_cnt + 1
            while True:
                line = fh.readline()
                line = line.strip()
                if not line:
                    break  # EOF (or blank line)
                if line[0] != '@':
                    linePieces = line.split('\t')
                    if len(linePieces) != col_cnt:
                        return False
                    try:
                        for i in range(1, col_cnt):
                            check = float(linePieces[i])
                    except ValueError:
                        return False
                    count += 1
                    if count == 5:
                        return True
            if 0 < count < 5:
                return True
        except Exception:
            pass
        finally:
            # guard: fh is None when open() failed
            if fh:
                fh.close()
        return False
454
class PairwiseDistanceMatrix(DistanceMatrix, Tabular):
    file_ext = 'pair.dist'

    def __init__(self, **kwd):
        """Initialize pairwise distance matrix datatype"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['Sequence', 'Sequence', 'Distance']
        self.column_types = ['str', 'str', 'float']
        self.comment_lines = 1

    def sniff( self, filename ):
        """
        Determines whether the file is a pairwise distance matrix
        (column-formatted distance matrix) format.
        The first and second columns have the sequence names and the third
        column is the distance between those sequences.
        """
        fh = None
        try:
            fh = open( filename )
            count = 0
            while True:
                line = fh.readline()
                line = line.strip()
                if not line:
                    break  # EOF (or blank line)
                if line[0] != '@':
                    linePieces = line.split('\t')
                    if len(linePieces) != 3:
                        return False
                    try:
                        check = float(linePieces[2])
                    except ValueError:
                        return False
                    count += 1
                    if count == 5:
                        return True
            if 0 < count < 5:
                return True
        except Exception:
            pass
        finally:
            # guard: fh is None when open() failed (original raised NameError here)
            if fh:
                fh.close()
        return False
497
class AlignCheck(Tabular):
    # NOTE(review): this redefines the earlier AlignCheck class in this module
    # (which also set column_types, comment_lines and a set_meta override);
    # this later definition wins at import time — confirm which is intended.
    file_ext = 'align.check'
    def __init__(self, **kwd):
        """Initialize secondary structure map datatype"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['name','pound','dash','plus','equal','loop','tilde','total']
        self.columns = 8
505
class Names(Tabular):
    file_ext = 'names'

    def __init__(self, **kwd):
        """Name file: a representative sequence (col 1) and the comma-separated sequences it represents (col 2)."""
        Tabular.__init__( self, **kwd )
        self.columns = 2
        self.column_names = ['name', 'representatives']
513
class Summary(Tabular):
    file_ext = 'summary'

    def __init__(self, **kwd):
        """Summarizes the quality of sequences in an unaligned or aligned fasta-formatted sequence file."""
        Tabular.__init__( self, **kwd )
        self.columns = 6
        self.column_names = ['seqname', 'start', 'end', 'nbases', 'ambigs', 'polymer']
521
class Group(Tabular):
    file_ext = 'groups'

    def __init__(self, **kwd):
        """Group file: a sequence name (col 1) and the group it belongs to (col 2)."""
        Tabular.__init__( self, **kwd )
        self.columns = 2
        self.column_names = ['name', 'group']
529
class Design(Tabular):
    file_ext = 'design'

    def __init__(self, **kwd):
        """Design file: a group (col 1) and a grouping (col 2), providing a way to merge groups."""
        Tabular.__init__( self, **kwd )
        self.columns = 2
        self.column_names = ['group', 'grouping']
537
class AccNos(Tabular):
    file_ext = 'accnos'

    def __init__(self, **kwd):
        """A single-column list of names."""
        Tabular.__init__( self, **kwd )
        self.columns = 1
        self.column_names = ['name']
545
class Oligos( data.Text ):
    file_ext = 'oligos'

    def sniff( self, filename ):
        """
        Determines whether the file is a Mothur oligos file. Data lines are either
        forward|reverse<TAB>sequence  or  barcode<TAB>sequence<TAB>name
        '#' lines are comments.
        """
        fh = None
        try:
            fh = open( filename )
            count = 0
            while True:
                line = fh.readline()
                line = line.strip()
                if not line:
                    break  # EOF (or blank line)
                if line[0] != '#':
                    linePieces = line.split('\t')
                    if len(linePieces) == 2 and re.match('forward|reverse', linePieces[0]):
                        count += 1
                    elif len(linePieces) == 3 and re.match('barcode', linePieces[0]):
                        count += 1
                    else:
                        return False
                    if count > 20:
                        return True
            if count > 0:
                return True
        except Exception:
            pass
        finally:
            # guard: fh is None when open() failed (original raised NameError here)
            if fh:
                fh.close()
        return False
581
class Frequency(Tabular):
    file_ext = 'freq'

    def __init__(self, **kwd):
        """Position/frequency table used for chimera analysis."""
        Tabular.__init__( self, **kwd )
        self.column_names = ['position', 'frequency']
        self.column_types = ['int', 'float']

    def sniff( self, filename ):
        """
        Determines whether the file is a frequency tabular format for chimera analysis:
        #1.14.0
        0 0.000
        1 0.000
        ...
        155 0.975
        """
        fh = None
        try:
            fh = open( filename )
            count = 0
            while True:
                line = fh.readline()
                line = line.strip()
                if not line:
                    break  # EOF (or blank line)
                if line[0] != '#':
                    try:
                        linePieces = line.split('\t')
                        i = int(linePieces[0])
                        f = float(linePieces[1])
                    except Exception:
                        # short line or non-numeric values: not a freq file
                        return False
                    count += 1
                    if count > 20:
                        return True
            if count > 0:
                return True
        except Exception:
            pass
        finally:
            # guard: fh is None when open() failed (original raised NameError here)
            if fh:
                fh.close()
        return False
626
class Quantile(Tabular):
    file_ext = 'quan'
    # NOTE(review): the desc texts for 'filtered' and 'masked' were swapped in the
    # original declarations; each flag is matched to its own description here.
    MetadataElement( name="filtered", default=False, no_value=False, optional=True, desc="Quantiles calculated using a frequency filter", readonly=True )
    MetadataElement( name="masked", default=False, no_value=False, optional=True, desc="Quantiles calculated using a mask", readonly=True )

    def __init__(self, **kwd):
        """Quantiles for chimera analysis"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['num', 'ten', 'twentyfive', 'fifty', 'seventyfive', 'ninetyfive', 'ninetynine']
        self.column_types = ['int', 'float', 'float', 'float', 'float', 'float', 'float']

    def set_meta( self, dataset, overwrite=True, skip=None, **kwd ):
        # intentionally only logs; metadata is left untouched — confirm
        log.info( "Mothur Quantile set_meta %s" % kwd )

    def sniff( self, filename ):
        """
        Determines whether the file is a quantiles tabular format for chimera analysis:
        1 0 0 0 0 0 0
        2 0.309198 0.309198 0.37161 0.37161 0.37161 0.37161
        3 0.510982 0.563213 0.693529 0.858939 1.07442 1.20608
        ...
        """
        fh = None
        try:
            fh = open( filename )
            count = 0
            while True:
                line = fh.readline()
                line = line.strip()
                if not line:
                    break  # EOF (or blank line)
                if line[0] != '#':
                    try:
                        linePieces = line.split('\t')
                        i = int(linePieces[0])
                        # columns 2-7 must all be floats
                        for idx in range(1, 7):
                            f = float(linePieces[idx])
                    except Exception:
                        return False
                    count += 1
                    if count > 10:
                        return True
            if count > 0:
                return True
        except Exception:
            pass
        finally:
            # guard: fh is None when open() failed (original raised NameError here)
            if fh:
                fh.close()
        return False
678
class FilteredQuantile(Quantile):
    file_ext = 'filtered.quan'

    def __init__(self, **kwd):
        """Quantiles for chimera analysis, with a frequency filter applied"""
        Quantile.__init__( self, **kwd )
        self.filtered = True
685
class MaskedQuantile(Quantile):
    file_ext = 'masked.quan'

    def __init__(self, **kwd):
        """Quantiles for chimera analysis, with a mask applied (no filter)"""
        Quantile.__init__( self, **kwd )
        self.filtered = False
        self.masked = True
693
class FilteredMaskedQuantile(Quantile):
    file_ext = 'filtered.masked.quan'

    def __init__(self, **kwd):
        """Quantiles for chimera analysis, with both a frequency filter and a mask applied"""
        Quantile.__init__( self, **kwd )
        self.filtered = True
        self.masked = True
701
class LaneMask(data.Text):
    file_ext = 'filter'

    def sniff( self, filename ):
        """
        Determines whether the file is a lane mask filter:
        one line consisting only of zeros and ones.
        """
        fh = None
        try:
            fh = open( filename )
            while True:
                buff = fh.read(1000)
                if not buff:
                    break  # EOF
                # original matched the undefined name 'line' instead of 'buff',
                # so sniff always raised and returned False
                if not re.match('^[01]+$', buff):
                    return False
            return True
        except Exception:
            pass
        finally:
            # original called the undefined close(fh); also guard against a
            # failed open() leaving fh unset
            if fh:
                fh.close()
        return False
724
class SequenceTaxonomy(Tabular):
    """
    A table with 2 columns:
    - SequenceName
    - Taxonomy (semicolon-separated taxonomy in descending order)
    Example:
    X56533.1 Eukaryota;Alveolata;Ciliophora;Intramacronucleata;Oligohymenophorea;Hymenostomatida;Tetrahymenina;Glaucomidae;Glaucoma;
    X97975.1 Eukaryota;Parabasalidea;Trichomonada;Trichomonadida;unclassified_Trichomonadida;
    AF052717.1 Eukaryota;Parabasalidea;
    """
    file_ext = 'seq.taxonomy'

    def __init__(self, **kwd):
        Tabular.__init__( self, **kwd )
        self.column_names = ['name', 'taxonomy']

    def sniff( self, filename ):
        """
        Determines whether the file is a SequenceTaxonomy
        """
        fh = None
        try:
            # one or more taxon levels: name, optional (count), terminated by ';'
            pat = r'^([^ \t\n\r\f\v;]+([(]\d+[)])?[;])+$'
            fh = open( filename )
            count = 0
            while True:
                line = fh.readline()
                if not line:
                    break  # EOF
                line = line.strip()
                if line:
                    fields = line.split('\t')
                    if len(fields) != 2:
                        return False
                    if not re.match(pat, fields[1]):
                        return False
                    count += 1
                    if count > 10:
                        break
            if count > 0:
                return True
        except Exception:
            pass
        finally:
            # guard: fh is None when open() failed (original raised NameError here)
            if fh:
                fh.close()
        return False
769
class RDPSequenceTaxonomy(SequenceTaxonomy):
    """
    A table with 2 columns:
    - SequenceName
    - Taxonomy (semicolon-separated taxonomy in descending order, RDP requires exactly 6 levels deep)
    Example:
    AB001518.1 Bacteria;Bacteroidetes;Sphingobacteria;Sphingobacteriales;unclassified_Sphingobacteriales;
    AB001724.1 Bacteria;Cyanobacteria;Cyanobacteria;Family_II;GpIIa;
    AB001774.1 Bacteria;Chlamydiae;Chlamydiae;Chlamydiales;Chlamydiaceae;Chlamydophila;
    """
    file_ext = 'rdp.taxonomy'

    def sniff( self, filename ):
        """
        Determines whether the file is an RDP-style SequenceTaxonomy
        """
        fh = None
        try:
            # exactly 6 taxon levels, each terminated by ';'
            pat = r'^([^ \t\n\r\f\v;]+([(]\d+[)])?[;]){6}$'
            fh = open( filename )
            count = 0
            while True:
                line = fh.readline()
                if not line:
                    break  # EOF
                line = line.strip()
                if line:
                    fields = line.split('\t')
                    if len(fields) != 2:
                        return False
                    if not re.match(pat, fields[1]):
                        return False
                    count += 1
                    if count > 10:
                        break
            if count > 0:
                return True
        except Exception:
            pass
        finally:
            # guard: fh is None when open() failed (original raised NameError here)
            if fh:
                fh.close()
        return False
810
class ConsensusTaxonomy(Tabular):
    file_ext = 'cons.taxonomy'

    def __init__(self, **kwd):
        """Consensus taxonomy per OTU: OTU id, sequence count and taxonomy string."""
        Tabular.__init__( self, **kwd )
        self.column_names = ['OTU', 'count', 'taxonomy']
817
class TaxonomySummary(Tabular):
    file_ext = 'tax.summary'

    def __init__(self, **kwd):
        """A summary of taxon classification."""
        Tabular.__init__( self, **kwd )
        self.column_names = ['taxlevel', 'rankID', 'taxon', 'daughterlevels', 'total']
824
class Phylip(data.Text):
    file_ext = 'phy'

    def sniff( self, filename ):
        """
        Determines whether the file is in Phylip format (Interleaved or Sequential).
        The first line of the input file contains the number of species and the
        number of characters, in free format, separated by blanks (not by
        commas). The information for each species follows, starting with a
        ten-character species name (which can include punctuation marks and blanks),
        and continuing with the characters for that species.
        http://evolution.genetics.washington.edu/phylip/doc/main.html#inputfiles
        Interleaved Example:
            6   39
            Archaeopt CGATGCTTAC CGCCGATGCT
            HesperorniCGTTACTCGT TGTCGTTACT
            BaluchitheTAATGTTAAT TGTTAATGTT
            B. virginiTAATGTTCGT TGTTAATGTT
            BrontosaurCAAAACCCAT CATCAAAACC
            B.subtilisGGCAGCCAAT CACGGCAGCC

            TACCGCCGAT GCTTACCGC
            CGTTGTCGTT ACTCGTTGT
            AATTGTTAAT GTTAATTGT
            CGTTGTTAAT GTTCGTTGT
            CATCATCAAA ACCCATCAT
            AATCACGGCA GCCAATCAC
        """
        fh = None
        try:
            fh = open( filename )
            # counts line: number of species and sequence length
            line = fh.readline().strip()
            linePieces = line.split()
            count = int( linePieces[0] )
            seq_len = int( linePieces[1] )
            # TODO check data lines:
            #   - name is the first 10 characters
            #   - remainder is the sequence; nucleic base or amino acid
            #     1-char designators (spaces allowed)
            return True
        except Exception:
            pass
        finally:
            # original called the undefined close(fh); also guard a failed open()
            if fh:
                fh.close()
        return False
878
879
class Axes(Tabular):
    file_ext = 'axes'

    def __init__(self, **kwd):
        """Initialize axes datatype"""
        Tabular.__init__( self, **kwd )

    def sniff( self, filename ):
        """
        Determines whether the file is an axes format.
        The first line may have column headings.
        The following lines have the name in the first column plus float columns for each axis.
        ==> 98_sq_phylip_amazon.fn.unique.pca.axes <==
        group   axis1   axis2
        forest  0.000000        0.145743
        pasture 0.145743        0.000000

        ==> 98_sq_phylip_amazon.nmds.axes <==
                axis1   axis2
        U68589  0.262608        -0.077498
        U68590  0.027118        0.195197
        U68591  0.329854        0.014395
        """
        fh = None
        try:
            fh = open( filename )
            count = 0
            # deliberately discard the first line: it may be a heading row with
            # fewer columns than the data rows (nmds example above)
            line = fh.readline()
            line = line.strip()
            col_cnt = None
            while True:
                line = fh.readline()
                line = line.strip()
                if not line:
                    break  # EOF (or blank line)
                fields = line.split('\t')
                if col_cnt is None:
                    # column count is taken from the first remaining line
                    col_cnt = len(fields)
                else:
                    if len(fields) != col_cnt:
                        return False
                    try:
                        for i in range(1, col_cnt):
                            check = float(fields[i])
                    except ValueError:
                        return False
                    count += 1
                    if count > 10:
                        return True
            if count > 0:
                return True
        except Exception:
            pass
        finally:
            # guard: fh is None when open() failed (original raised NameError here)
            if fh:
                fh.close()
        return False
935
936 ## Qiime Classes
937
class QiimeMetadataMapping(Tabular):
    MetadataElement( name="column_names", default=[], desc="Column Names", readonly=False, visible=True, no_value=[] )
    file_ext = 'qiimemapping'

    def __init__(self, **kwd):
        """
        http://qiime.sourceforge.net/documentation/file_formats.html#mapping-file-overview
        Information about the samples necessary to perform the data analysis.
        """
        Tabular.__init__( self, **kwd )

    def sniff( self, filename ):
        """
        Determines whether the file is a qiime mapping file.
        Just checking for an appropriate header line for now, could be improved.
        """
        fh = None
        try:
            pat = r'#SampleID(\t[a-zA-Z][a-zA-Z0-9_]*)*\tDescription'
            fh = open( filename )
            while True:
                # original read from the undefined name 'dataset_fh' and never
                # broke at EOF (infinite loop); read from fh and stop on ''
                line = fh.readline()
                if not line:
                    break  # EOF
                if re.match(pat, line):
                    return True
        except Exception:
            pass
        finally:
            # original called the undefined close(fh); guard a failed open()
            if fh:
                fh.close()
        return False

    def set_column_names(self, dataset):
        """Record the header names from the '#SampleID' line as column metadata."""
        if dataset.has_data():
            dataset_fh = open( dataset.file_name )
            try:
                line = dataset_fh.readline()
                if line.startswith('#SampleID'):
                    dataset.metadata.column_names = line.strip().split('\t')
            finally:
                dataset_fh.close()

    def set_meta( self, dataset, overwrite=True, skip=None, max_data_lines=None, **kwd ):
        Tabular.set_meta(self, dataset, overwrite, skip, max_data_lines)
        self.set_column_names(dataset)
979
class QiimeOTU(Tabular):
    """
    Associates OTUs with sequence IDs: OTU id in column 1, followed by the
    tab-separated IDs of the member sequences.
    Example:
    0	FLP3FBN01C2MYD	FLP3FBN01B2ALM
    1	FLP3FBN01DF6NE	FLP3FBN01CKW1J	FLP3FBN01CHVM4
    2	FLP3FBN01AXQ2Z
    """
    file_ext = 'qiimeotu'
989
class QiimeOTUTable(Tabular):
    """
    #Full OTU Counts
    #OTU ID	PC.354	PC.355	PC.356	Consensus Lineage
    0	0	1	0	Root;Bacteria;Firmicutes;"Clostridia";Clostridiales
    1	1	3	1	Root;Bacteria
    2	0	2	2	Root;Bacteria;Bacteroidetes
    """
    MetadataElement( name="column_names", default=[], desc="Column Names", readonly=False, visible=True, no_value=[] )
    file_ext = 'qiimeotutable'

    def init_meta( self, dataset, copy_from=None ):
        tabular.Tabular.init_meta( self, dataset, copy_from=copy_from )

    def set_meta( self, dataset, overwrite=True, skip=None, **kwd ):
        self.set_column_names(dataset)

    def set_column_names(self, dataset):
        """Read the '#OTU ID' heading (second line) into column_names metadata."""
        if dataset.has_data():
            dataset_fh = open( dataset.file_name )
            try:
                # first line is the '#Full OTU Counts' banner; headings are on line 2
                line = dataset_fh.readline()
                line = dataset_fh.readline()
                if line.startswith('#OTU ID'):
                    dataset.metadata.column_names = line.strip().split('\t')
            finally:
                # close even when reading raises (original leaked on error)
                dataset_fh.close()
            dataset.metadata.comment_lines = 2
1013
class QiimeDistanceMatrix(Tabular):
    """
    	PC.354	PC.355	PC.356
    PC.354	0.0	3.177	1.955
    PC.355	3.177	0.0	3.444
    PC.356	1.955	3.444	0.0
    """
    file_ext = 'qiimedistmat'

    def init_meta( self, dataset, copy_from=None ):
        tabular.Tabular.init_meta( self, dataset, copy_from=copy_from )

    def set_meta( self, dataset, overwrite=True, skip=None, **kwd ):
        self.set_column_names(dataset)

    def set_column_names(self, dataset):
        """The first line of the matrix holds the sample names."""
        if dataset.has_data():
            dataset_fh = open( dataset.file_name )
            try:
                line = dataset_fh.readline()
                dataset.metadata.column_names = line.strip().split('\t')
            finally:
                # close even when reading raises (original leaked on error)
                dataset_fh.close()
            dataset.metadata.comment_lines = 1
1034
class QiimePCA(Tabular):
    """
    Principal Coordinate Analysis Data
    The principal coordinate (PC) axes (columns) for each sample (rows).
    Pairs of PCs can then be graphed to view the relationships between samples.
    The bottom of the output file contains the eigenvalues and % variation
    explained for each PC.
    Example:
    pc vector number	1	2	3
    PC.354	-0.309063936588	0.0398252112257	0.0744672231759
    PC.355	-0.106593922619	0.141125998277	0.0780204374172
    PC.356	-0.219869362955	0.00917241121781	0.0357281314115


    eigvals	0.480220500471	0.163567082874	0.125594470811
    % variation explained	51.6955484555	17.6079322939
    """
    file_ext = 'qiimepca'
1052
class QiimeParams(Tabular):
    """
    Qiime workflow parameters file: 'script:option value' lines, '#' comments.
    Example:
    ###pick_otus_through_otu_table.py parameters###

    # OTU picker parameters
    pick_otus:otu_picking_method	uclust
    pick_otus:clustering_algorithm	furthest

    # Representative set picker parameters
    pick_rep_set:rep_set_picking_method	first
    pick_rep_set:sort_by	otu
    """
    file_ext = 'qiimeparams'
1066
class QiimePrefs(data.Text):
    """
    A text file containing coloring preferences to be used by
    make_distance_histograms.py, make_2d_plots.py and make_3d_plots.py.
    Example:
    {
    'background_color':'black',

    'sample_coloring':
        {
            'Treatment':
            {
                'column':'Treatment',
                'colors':(('red',(0,100,100)),('blue',(240,100,100)))
            },
            'DOB':
            {
                'column':'DOB',
                'colors':(('red',(0,100,100)),('blue',(240,100,100)))
            }
        },
    'MONTE_CARLO_GROUP_DISTANCES':
        {
            'Treatment': 10,
            'DOB': 10
        }
    }
    """
    file_ext = 'qiimeprefs'
1095
class QiimeTaxaSummary(Tabular):
    """
    Taxon	PC.354	PC.355	PC.356
    Root;Bacteria;Actinobacteria	0.0	0.177	0.955
    Root;Bacteria;Firmicutes	0.177	0.0	0.444
    Root;Bacteria;Proteobacteria	0.955	0.444	0.0
    """
    MetadataElement( name="column_names", default=[], desc="Column Names", readonly=False, visible=True, no_value=[] )
    file_ext = 'qiimetaxsummary'

    def set_column_names(self, dataset):
        """Record the heading names from the 'Taxon' line as column metadata."""
        if dataset.has_data():
            dataset_fh = open( dataset.file_name )
            try:
                line = dataset_fh.readline()
                if line.startswith('Taxon'):
                    dataset.metadata.column_names = line.strip().split('\t')
            finally:
                # close even when reading raises (original leaked on error)
                dataset_fh.close()

    def set_meta( self, dataset, overwrite=True, skip=None, max_data_lines=None, **kwd ):
        Tabular.set_meta(self, dataset, overwrite, skip, max_data_lines)
        self.set_column_names(dataset)
1117
if __name__ == '__main__':
    # run this module's doctests when executed directly
    import doctest
    import sys
    doctest.testmod( sys.modules[__name__] )
1121