Mercurial > repos > jjohnson > mothur_toolsuite
view mothur/lib/galaxy/datatypes/metagenomics.py @ 21:97e35ab2887c
Changed np_shannon calculator to npshannon and posted bug to mothur.org
author | Jim Johnson <jj@umn.edu> |
---|---|
date | Tue, 17 Jan 2012 14:42:27 -0600 |
parents | 57df76d861e4 |
children | bfbaf823be4c |
line wrap: on
line source
""" metagenomics datatypes James E Johnson - University of Minnesota for Mothur """ import logging, os, os.path, sys, time, tempfile, shutil, string, glob, re import galaxy.model from galaxy.datatypes import data from galaxy.datatypes.sniff import * from galaxy.datatypes import metadata from galaxy.datatypes import tabular from galaxy.datatypes import sequence from galaxy.datatypes.metadata import MetadataElement from galaxy.datatypes.data import Text from galaxy.datatypes.tabular import Tabular from galaxy.datatypes.sequence import Fasta from galaxy import util from galaxy.datatypes.images import Html log = logging.getLogger(__name__) ## Mothur Classes class Otu( Tabular ): file_ext = 'otu' def sniff( self, filename ): """ Determines whether the file is a otu (operational taxonomic unit) format """ try: fh = open( filename ) count = 0 while True: line = fh.readline() line = line.strip() if not line: break #EOF if line: if line[0] != '@': linePieces = line.split('\t') if len(linePieces) < 2: return False try: check = int(linePieces[1]) if check + 2 != len(linePieces): return False except ValueError: return False count += 1 if count == 5: return True fh.close() if count < 5 and count > 0: return True except: pass finally: fh.close() return False class OtuList( Otu ): file_ext = 'list' class Sabund( Otu ): file_ext = 'sabund' def sniff( self, filename ): """ Determines whether the file is a otu (operational taxonomic unit) format label<TAB>count[<TAB>value(1..n)] """ try: fh = open( filename ) count = 0 while True: line = fh.readline() line = line.strip() if not line: break #EOF if line: if line[0] != '@': linePieces = line.split('\t') if len(linePieces) < 2: return False try: check = int(linePieces[1]) if check + 2 != len(linePieces): return False for i in range( 2, len(linePieces)): ival = int(linePieces[i]) except ValueError: return False count += 1 if count >= 5: return True fh.close() if count < 5 and count > 0: return True except: pass finally: fh.close() 
return False class Rabund( Sabund ): file_ext = 'rabund' class GroupAbund( Otu ): file_ext = 'grpabund' def init_meta( self, dataset, copy_from=None ): Otu.init_meta( self, dataset, copy_from=copy_from ) def set_meta( self, dataset, overwrite = True, skip=1, max_data_lines = 100000, **kwd ): # See if file starts with header line if dataset.has_data(): try: fh = open( dataset.file_name ) line = fh.readline() line = line.strip() linePieces = line.split('\t') if linePieces[0] == 'label' and linePieces[1] == 'Group': skip=1 else: skip=0 finally: fh.close() Otu.set_meta( self, dataset, overwrite, skip, max_data_lines, **kwd) def sniff( self, filename, vals_are_int=False): """ Determines whether the file is a otu (operational taxonomic unit) Shared format label<TAB>group<TAB>count[<TAB>value(1..n)] The first line is column headings as of Mothur v 1.20 """ log.info( "sniff GroupAbund vals_are_int %s" % vals_are_int) try: fh = open( filename ) count = 0 while True: line = fh.readline() line = line.strip() if not line: break #EOF if line: if line[0] != '@': linePieces = line.split('\t') if len(linePieces) < 3: return False if count > 0 or linePieces[0] != 'label': try: check = int(linePieces[2]) if check + 3 != len(linePieces): return False for i in range( 3, len(linePieces)): if vals_are_int: ival = int(linePieces[i]) else: fval = float(linePieces[i]) except ValueError: return False count += 1 if count >= 5: return True fh.close() if count < 5 and count > 0: return True except: pass finally: fh.close() return False class SharedRabund( GroupAbund ): file_ext = 'shared' def sniff( self, filename ): """ Determines whether the file is a otu (operational taxonomic unit) Shared format label<TAB>group<TAB>count[<TAB>value(1..n)] The first line is column headings as of Mothur v 1.20 """ # return GroupAbund.sniff(self,filename,True) isme = GroupAbund.sniff(self,filename,True) log.info( "is SharedRabund %s" % isme) return isme class RelAbund( GroupAbund ): file_ext = 'relabund' def 
sniff( self, filename ): """ Determines whether the file is a otu (operational taxonomic unit) Relative Abundance format label<TAB>group<TAB>count[<TAB>value(1..n)] The first line is column headings as of Mothur v 1.20 """ # return GroupAbund.sniff(self,filename,False) isme = GroupAbund.sniff(self,filename,False) log.info( "is RelAbund %s" % isme) return isme class SecondaryStructureMap(Tabular): file_ext = 'map' def __init__(self, **kwd): """Initialize secondary structure map datatype""" Tabular.__init__( self, **kwd ) self.column_names = ['Map'] def sniff( self, filename ): """ Determines whether the file is a secondary structure map format A single column with an integer value which indicates the row that this row maps to. check you make sure is structMap[10] = 380 then structMap[380] = 10. """ try: fh = open( filename ) line_num = 0 rowidxmap = {} while True: line = fh.readline() line_num += 1 line = line.strip() if not line: break #EOF if line: try: pointer = int(line) if pointer > 0: if pointer > line_num: rowidxmap[line_num] = pointer elif pointer < line_num & rowidxmap[pointer] != line_num: return False except ValueError: return False fh.close() if count < 5 and count > 0: return True except: pass finally: fh.close() return False class SequenceAlignment( Fasta ): file_ext = 'align' def __init__(self, **kwd): Fasta.__init__( self, **kwd ) """Initialize AlignCheck datatype""" def sniff( self, filename ): """ Determines whether the file is in Mothur align fasta format Each sequence line must be the same length """ try: fh = open( filename ) len = -1 while True: line = fh.readline() if not line: break #EOF line = line.strip() if line: #first non-empty line if line.startswith( '>' ): #The next line.strip() must not be '', nor startwith '>' line = fh.readline().strip() if line == '' or line.startswith( '>' ): break if len < 0: len = len(line) elif len != len(line): return False else: break #we found a non-empty line, but its not a fasta header if len > 0: return 
True except: pass finally: fh.close() return False class AlignCheck( Tabular ): file_ext = 'align.check' def __init__(self, **kwd): """Initialize AlignCheck datatype""" Tabular.__init__( self, **kwd ) self.column_names = ['name','pound','dash','plus','equal','loop','tilde','total'] self.column_types = ['str','int','int','int','int','int','int','int'] self.comment_lines = 1 def set_meta( self, dataset, overwrite = True, **kwd ): # Tabular.set_meta( self, dataset, overwrite = overwrite, first_line_is_header = True, skip = 1 ) data_lines = 0 if dataset.has_data(): dataset_fh = open( dataset.file_name ) while True: line = dataset_fh.readline() if not line: break data_lines += 1 dataset_fh.close() dataset.metadata.comment_lines = 1 dataset.metadata.data_lines = data_lines - 1 if data_lines > 0 else 0 dataset.metadata.column_names = self.column_names dataset.metadata.column_types = self.column_types class AlignReport(Tabular): """ QueryName QueryLength TemplateName TemplateLength SearchMethod SearchScore AlignmentMethod QueryStart QueryEnd TemplateStart TemplateEnd PairwiseAlignmentLength GapsInQuery GapsInTemplate LongestInsert SimBtwnQuery&Template AY457915 501 82283 1525 kmer 89.07 needleman 5 501 1 499 499 2 0 0 97.6 """ file_ext = 'align.report' def __init__(self, **kwd): """Initialize AlignCheck datatype""" Tabular.__init__( self, **kwd ) self.column_names = ['QueryName','QueryLength','TemplateName','TemplateLength','SearchMethod','SearchScore', 'AlignmentMethod','QueryStart','QueryEnd','TemplateStart','TemplateEnd', 'PairwiseAlignmentLength','GapsInQuery','GapsInTemplate','LongestInsert','SimBtwnQuery&Template' ] class BellerophonChimera( Tabular ): file_ext = 'bellerophon.chimera' def __init__(self, **kwd): """Initialize AlignCheck datatype""" Tabular.__init__( self, **kwd ) self.column_names = ['Name','Score','Left','Right'] class SecondaryStructureMatch(Tabular): """ name pound dash plus equal loop tilde total 9_1_12 42 68 8 28 275 420 872 9_1_14 36 68 6 26 266 
422 851 9_1_15 44 68 8 28 276 418 873 9_1_16 34 72 6 30 267 430 860 9_1_18 46 80 2 36 261 """ def __init__(self, **kwd): """Initialize SecondaryStructureMatch datatype""" Tabular.__init__( self, **kwd ) self.column_names = ['name','pound','dash','plus','equal','loop','tilde','total'] class DistanceMatrix( Text ): file_ext = 'dist' """Add metadata elements""" MetadataElement( name="sequence_count", default=0, desc="Number of sequences", readonly=False, optional=True, no_value=0 ) class LowerTriangleDistanceMatrix(DistanceMatrix): file_ext = 'lower.dist' def __init__(self, **kwd): """Initialize secondary structure map datatype""" DistanceMatrix.__init__( self, **kwd ) def sniff( self, filename ): """ Determines whether the file is a lower-triangle distance matrix (phylip) format The first line has the number of sequences in the matrix. The remaining lines have the sequence name followed by a list of distances from all preceeding sequences 5 U68589 U68590 0.3371 U68591 0.3609 0.3782 U68592 0.4155 0.3197 0.4148 U68593 0.2872 0.1690 0.3361 0.2842 """ try: fh = open( filename ) count = 0 while True: line = fh.readline() line = line.strip() if not line: break #EOF if line: if line[0] != '@': linePieces = line.split('\t') if len(linePieces) != 3: return False try: check = float(linePieces[2]) except ValueError: return False count += 1 if count == 5: return True fh.close() if count < 5 and count > 0: return True except: pass finally: fh.close() return False class SquareDistanceMatrix(DistanceMatrix,Tabular): file_ext = 'square.dist' sequence_count = -1 def __init__(self, **kwd): """Initialize secondary structure map datatype""" Tabular.__init__( self, **kwd ) def init_meta( self, dataset, copy_from=None ): Text.init_meta( self, dataset, copy_from=copy_from ) def set_meta( self, dataset, overwrite = True, skip = None, **kwd ): dataset.metadata.sequences = 0 def sniff( self, filename ): """ Determines whether the file is a square distance matrix (Column-formatted distance 
matrix) format The first line has the number of sequences in the matrix. The following lines have the sequence name in the first column plus a column for the distance to each sequence in the row order in which they appear in the matrix. 3 U68589 0.0000 0.3371 0.3610 U68590 0.3371 0.0000 0.3783 U68590 0.3371 0.0000 0.3783 """ try: fh = open( filename ) count = 0 line = fh.readline() line = line.strip() sequence_count = int(line) col_cnt = seq_cnt + 1 while True: line = fh.readline() line = line.strip() if not line: break #EOF if line: if line[0] != '@': linePieces = line.split('\t') if len(linePieces) != col_cnt : return False try: for i in range(1, col_cnt): check = float(linePieces[i]) except ValueError: return False count += 1 if count == 5: return True fh.close() if count < 5 and count > 0: return True except: pass finally: fh.close() return False class PairwiseDistanceMatrix(DistanceMatrix,Tabular): file_ext = 'pair.dist' def __init__(self, **kwd): """Initialize secondary structure map datatype""" Tabular.__init__( self, **kwd ) self.column_names = ['Sequence','Sequence','Distance'] self.column_types = ['str','str','float'] self.comment_lines = 1 def sniff( self, filename ): """ Determines whether the file is a pairwise distance matrix (Column-formatted distance matrix) format The first and second columns have the sequence names and the third column is the distance between those sequences. 
""" try: fh = open( filename ) count = 0 while True: line = fh.readline() line = line.strip() if not line: break #EOF if line: if line[0] != '@': linePieces = line.split('\t') if len(linePieces) != 3: return False try: check = float(linePieces[2]) except ValueError: return False count += 1 if count == 5: return True fh.close() if count < 5 and count > 0: return True except: pass finally: fh.close() return False class AlignCheck(Tabular): file_ext = 'align.check' def __init__(self, **kwd): """Initialize secondary structure map datatype""" Tabular.__init__( self, **kwd ) self.column_names = ['name','pound','dash','plus','equal','loop','tilde','total'] self.columns = 8 class Names(Tabular): file_ext = 'names' def __init__(self, **kwd): """Name file shows the relationship between a representative sequence(col 1) and the sequences(comma-separated) it represents(col 2)""" Tabular.__init__( self, **kwd ) self.column_names = ['name','representatives'] self.columns = 2 class Summary(Tabular): file_ext = 'summary' def __init__(self, **kwd): """summarizes the quality of sequences in an unaligned or aligned fasta-formatted sequence file""" Tabular.__init__( self, **kwd ) self.column_names = ['seqname','start','end','nbases','ambigs','polymer'] self.columns = 6 class Group(Tabular): file_ext = 'groups' def __init__(self, **kwd): """Name file shows the relationship between a representative sequence(col 1) and the sequences it represents(col 2)""" Tabular.__init__( self, **kwd ) self.column_names = ['name','group'] self.columns = 2 class Design(Tabular): file_ext = 'design' def __init__(self, **kwd): """Name file shows the relationship between a group(col 1) and a grouping (col 2), providing a way to merge groups.""" Tabular.__init__( self, **kwd ) self.column_names = ['group','grouping'] self.columns = 2 class AccNos(Tabular): file_ext = 'accnos' def __init__(self, **kwd): """A list of names""" Tabular.__init__( self, **kwd ) self.column_names = ['name'] self.columns = 1 class 
Oligos( Text ): file_ext = 'oligos' def sniff( self, filename ): """ Determines whether the file is a otu (operational taxonomic unit) format """ try: fh = open( filename ) count = 0 while True: line = fh.readline() line = line.strip() if not line: break #EOF else: if line[0] != '#': linePieces = line.split('\t') if len(linePieces) == 2 and re.match('forward|reverse',linePieces[0]): count += 1 continue elif len(linePieces) == 3 and re.match('barcode',linePieces[0]): count += 1 continue else: return False if count > 20: return True if count > 0: return True except: pass finally: fh.close() return False class Frequency(Tabular): file_ext = 'freq' def __init__(self, **kwd): """A list of names""" Tabular.__init__( self, **kwd ) self.column_names = ['position','frequency'] self.column_types = ['int','float'] def sniff( self, filename ): """ Determines whether the file is a frequency tabular format for chimera analysis #1.14.0 0 0.000 1 0.000 ... 155 0.975 """ try: fh = open( filename ) count = 0 while True: line = fh.readline() line = line.strip() if not line: break #EOF else: if line[0] != '#': try: linePieces = line.split('\t') i = int(linePieces[0]) f = float(linePieces[1]) count += 1 continue except: return False if count > 20: return True if count > 0: return True except: pass finally: fh.close() return False class Quantile(Tabular): file_ext = 'quan' MetadataElement( name="filtered", default=False, no_value=False, optional=True , desc="Quantiles calculated using a mask", readonly=True) MetadataElement( name="masked", default=False, no_value=False, optional=True , desc="Quantiles calculated using a frequency filter", readonly=True) def __init__(self, **kwd): """Quantiles for chimera analysis""" Tabular.__init__( self, **kwd ) self.column_names = ['num','ten','twentyfive','fifty','seventyfive','ninetyfive','ninetynine'] self.column_types = ['int','float','float','float','float','float','float'] def set_meta( self, dataset, overwrite = True, skip = None, **kwd ): 
log.info( "Mothur Quantile set_meta %s" % kwd) def sniff( self, filename ): """ Determines whether the file is a quantiles tabular format for chimera analysis 1 0 0 0 0 0 0 2 0.309198 0.309198 0.37161 0.37161 0.37161 0.37161 3 0.510982 0.563213 0.693529 0.858939 1.07442 1.20608 ... """ try: fh = open( filename ) count = 0 while True: line = fh.readline() line = line.strip() if not line: break #EOF else: if line[0] != '#': try: linePieces = line.split('\t') i = int(linePieces[0]) f = float(linePieces[1]) f = float(linePieces[2]) f = float(linePieces[3]) f = float(linePieces[4]) f = float(linePieces[5]) f = float(linePieces[6]) count += 1 continue except: return False if count > 10: return True if count > 0: return True except: pass finally: fh.close() return False class FilteredQuantile(Quantile): file_ext = 'filtered.quan' def __init__(self, **kwd): """Quantiles for chimera analysis""" Quantile.__init__( self, **kwd ) self.filtered = True class MaskedQuantile(Quantile): file_ext = 'masked.quan' def __init__(self, **kwd): """Quantiles for chimera analysis""" Quantile.__init__( self, **kwd ) self.masked = True self.filtered = False class FilteredMaskedQuantile(Quantile): file_ext = 'filtered.masked.quan' def __init__(self, **kwd): """Quantiles for chimera analysis""" Quantile.__init__( self, **kwd ) self.masked = True self.filtered = True class LaneMask(Text): file_ext = 'filter' def sniff( self, filename ): """ Determines whether the file is a lane mask filter: 1 line consisting of zeros and ones. """ try: fh = open( filename ) while True: buff = fh.read(1000) if not buff: break #EOF else: if not re.match('^[01]+$',line): return False return True except: pass finally: close(fh) return False class RefTaxonomy(Tabular): file_ext = 'ref.taxonomy' """ A table with 2 or 3 columns: - SequenceName - Taxonomy (semicolon-separated taxonomy in descending order) - integer ? 
Example: 2-column ( http://www.mothur.org/wiki/Taxonomy_outline ) X56533.1 Eukaryota;Alveolata;Ciliophora;Intramacronucleata;Oligohymenophorea;Hymenostomatida;Tetrahymenina;Glaucomidae;Glaucoma; X97975.1 Eukaryota;Parabasalidea;Trichomonada;Trichomonadida;unclassified_Trichomonadida; AF052717.1 Eukaryota;Parabasalidea; Example: 3-column ( http://vamps.mbl.edu/resources/databases.php ) v3_AA008 Bacteria;Firmicutes;Bacilli;Lactobacillales;Streptococcaceae;Streptococcus 5 v3_AA016 Bacteria 120 v3_AA019 Archaea;Crenarchaeota;Marine_Group_I 1 """ def __init__(self, **kwd): Tabular.__init__( self, **kwd ) self.column_names = ['name','taxonomy'] def sniff( self, filename ): """ Determines whether the file is a SequenceTaxonomy """ try: pat = '^([^ \t\n\r\x0c\x0b;]+([(]\\d+[)])?(;[^ \t\n\r\x0c\x0b;]+([(]\\d+[)])?)*(;)?)$' fh = open( filename ) count = 0 while True: line = fh.readline() if not line: break #EOF line = line.strip() if line: fields = line.split('\t') if 2 <= len(fields) <= 3: return False if not re.match(pat,fields[1]): return False count += 1 if count > 10: break if count > 0: return True except: pass finally: fh.close() return False class SequenceTaxonomy(RefTaxonomy): file_ext = 'seq.taxonomy' """ A table with 2 columns: - SequenceName - Taxonomy (semicolon-separated taxonomy in descending order) Example: X56533.1 Eukaryota;Alveolata;Ciliophora;Intramacronucleata;Oligohymenophorea;Hymenostomatida;Tetrahymenina;Glaucomidae;Glaucoma; X97975.1 Eukaryota;Parabasalidea;Trichomonada;Trichomonadida;unclassified_Trichomonadida; AF052717.1 Eukaryota;Parabasalidea; """ def __init__(self, **kwd): Tabular.__init__( self, **kwd ) self.column_names = ['name','taxonomy'] def sniff( self, filename ): """ Determines whether the file is a SequenceTaxonomy """ try: pat = '^([^ \t\n\r\f\v;]+([(]\d+[)])?[;])+$' fh = open( filename ) count = 0 while True: line = fh.readline() if not line: break #EOF line = line.strip() if line: fields = line.split('\t') if len(fields) != 2: 
return False if not re.match(pat,fields[1]): return False count += 1 if count > 10: break if count > 0: return True except: pass finally: fh.close() return False class RDPSequenceTaxonomy(SequenceTaxonomy): file_ext = 'rdp.taxonomy' """ A table with 2 columns: - SequenceName - Taxonomy (semicolon-separated taxonomy in descending order, RDP requires exactly 6 levels deep) Example: AB001518.1 Bacteria;Bacteroidetes;Sphingobacteria;Sphingobacteriales;unclassified_Sphingobacteriales; AB001724.1 Bacteria;Cyanobacteria;Cyanobacteria;Family_II;GpIIa; AB001774.1 Bacteria;Chlamydiae;Chlamydiae;Chlamydiales;Chlamydiaceae;Chlamydophila; """ def sniff( self, filename ): """ Determines whether the file is a SequenceTaxonomy """ try: pat = '^([^ \t\n\r\f\v;]+([(]\d+[)])?[;]){6}$' fh = open( filename ) count = 0 while True: line = fh.readline() if not line: break #EOF line = line.strip() if line: fields = line.split('\t') if len(fields) != 2: return False if not re.match(pat,fields[1]): return False count += 1 if count > 10: break if count > 0: return True except: pass finally: fh.close() return False class ConsensusTaxonomy(Tabular): file_ext = 'cons.taxonomy' def __init__(self, **kwd): """A list of names""" Tabular.__init__( self, **kwd ) self.column_names = ['OTU','count','taxonomy'] class TaxonomySummary(Tabular): file_ext = 'tax.summary' def __init__(self, **kwd): """A Summary of taxon classification""" Tabular.__init__( self, **kwd ) self.column_names = ['taxlevel','rankID','taxon','daughterlevels','total'] class Phylip(Text): file_ext = 'phy' def sniff( self, filename ): """ Determines whether the file is in Phylip format (Interleaved or Sequential) The first line of the input file contains the number of species and the number of characters, in free format, separated by blanks (not by commas). 
The information for each species follows, starting with a ten-character species name (which can include punctuation marks and blanks), and continuing with the characters for that species. http://evolution.genetics.washington.edu/phylip/doc/main.html#inputfiles Interleaved Example: 6 39 Archaeopt CGATGCTTAC CGCCGATGCT HesperorniCGTTACTCGT TGTCGTTACT BaluchitheTAATGTTAAT TGTTAATGTT B. virginiTAATGTTCGT TGTTAATGTT BrontosaurCAAAACCCAT CATCAAAACC B.subtilisGGCAGCCAAT CACGGCAGCC TACCGCCGAT GCTTACCGC CGTTGTCGTT ACTCGTTGT AATTGTTAAT GTTAATTGT CGTTGTTAAT GTTCGTTGT CATCATCAAA ACCCATCAT AATCACGGCA GCCAATCAC """ try: fh = open( filename ) # counts line line = fh.readline().strip() linePieces = line.split() count = int(linePieces[0]) seq_len = int(linePieces[1]) # data lines """ TODO check data lines while True: line = fh.readline() # name is the first 10 characters name = line[0:10] seq = line[10:].strip() # nucleic base or amino acid 1-char designators (spaces allowed) bases = ''.join(seq.split()) # float per base (each separated by space) """ return True except: pass finally: close(fh) return False class Axes(Tabular): file_ext = 'axes' def __init__(self, **kwd): """Initialize axes datatype""" Tabular.__init__( self, **kwd ) def sniff( self, filename ): """ Determines whether the file is an axes format The first line may have column headings. The following lines have the name in the first column plus float columns for each axis. 
==> 98_sq_phylip_amazon.fn.unique.pca.axes <== group axis1 axis2 forest 0.000000 0.145743 pasture 0.145743 0.000000 ==> 98_sq_phylip_amazon.nmds.axes <== axis1 axis2 U68589 0.262608 -0.077498 U68590 0.027118 0.195197 U68591 0.329854 0.014395 """ try: fh = open( filename ) count = 0 line = fh.readline() line = line.strip() col_cnt = None while True: line = fh.readline() line = line.strip() if not line: break #EOF if line: fields = line.split('\t') if col_cnt == None: # ignore values in first line as they may be column headings col_cnt = len(fields) else: if len(fields) != col_cnt : return False try: for i in range(1, col_cnt): check = float(fields[i]) except ValueError: return False count += 1 if count > 10: return True if count > 0: return True except: pass finally: fh.close() return False class SffFlow(Tabular): MetadataElement( name="flow_values", default="", no_value="", optional=True , desc="Total number of flow values", readonly=True) MetadataElement( name="flow_order", default="TACG", no_value="TACG", desc="Total number of flow values", readonly=False) file_ext = 'sff.flow' """ The first line is the total number of flow values - 800 for Titanium data. For GS FLX it would be 400. Following lines contain: - SequenceName - the number of useable flows as defined by 454's software - the flow intensity for each base going in the order of TACG. Example: 800 GQY1XT001CQL4K 85 1.04 0.00 1.00 0.02 0.03 1.02 0.05 ... GQY1XT001CQIRF 84 1.02 0.06 0.98 0.06 0.09 1.05 0.07 ... GQY1XT001CF5YW 88 1.02 0.02 1.01 0.04 0.06 1.02 0.03 ... 
""" def __init__(self, **kwd): Tabular.__init__( self, **kwd ) def set_meta( self, dataset, overwrite = True, skip = 1, max_data_lines = None, **kwd ): Tabular.set_meta(self, dataset, overwrite, 1, max_data_lines) try: fh = open( dataset.file_name ) line = fh.readline() line = line.strip() flow_values = int(line) dataset.metadata.flow_values = flow_values finally: fh.close() def make_html_table( self, dataset, skipchars=[] ): """Create HTML table, used for displaying peek""" out = ['<table cellspacing="0" cellpadding="3">'] comments = [] try: # Generate column header out.append('<tr>') out.append( '<th>%d. Name</th>' % 1 ) out.append( '<th>%d. Flows</th>' % 2 ) for i in range( 3, dataset.metadata.columns+1 ): base = dataset.metadata.flow_order[(i+1)%4] out.append( '<th>%d. %d %s</th>' % (i-2,base) ) out.append('</tr>') out.append( self.make_html_peek_rows( dataset, skipchars=skipchars ) ) out.append( '</table>' ) out = "".join( out ) except Exception, exc: out = "Can't create peek %s" % str( exc ) return out class Newick( Text ): """ The Newick Standard for representing trees in computer-readable form makes use of the correspondence between trees and nested parentheses. 
http://evolution.genetics.washington.edu/phylip/newicktree.html http://en.wikipedia.org/wiki/Newick_format Example: (B,(A,C,E),D); or example with branch lengths: (B:6.0,(A:5.0,C:3.0,E:4.0):5.0,D:11.0); or an example with embedded comments but no branch lengths: ((a [&&PRIME S=x], b [&&PRIME S=y]), c [&&PRIME S=z]); Example with named interior noe: (B:6.0,(A:5.0,C:3.0,E:4.0)Ancestor1:5.0,D:11.0); """ file_ext = 'tre' def __init__(self, **kwd): Text.__init__( self, **kwd ) def sniff( self, filename ): ## TODO """ Determine whether the file is in Newick format Note: Last non-space char of a tree should be a semicolon: ';' Usually the first char will be a open parenthesis: '(' (,,(,)); no nodes are named (A,B,(C,D)); leaf nodes are named (A,B,(C,D)E)F; all nodes are named (:0.1,:0.2,(:0.3,:0.4):0.5); all but root node have a distance to parent (:0.1,:0.2,(:0.3,:0.4):0.5):0.0; all have a distance to parent (A:0.1,B:0.2,(C:0.3,D:0.4):0.5); distances and leaf names (popular) (A:0.1,B:0.2,(C:0.3,D:0.4)E:0.5)F; distances and all names ((B:0.2,(C:0.3,D:0.4)E:0.5)F:0.1)A; a tree rooted on a leaf node (rare) """ if not os.path.exists(filename): return False try: ## For now, guess this is a Newick file if it starts with a '(' and ends with a ';' flen = os.path.getsize(filename) fh = open( filename ) len = min(flen,2000) # check end of the file for a semicolon fh.seek(-len,os.SEEK_END) buf = fh.read(len).strip() buf = buf.strip() if not buf.endswith(';'): return False # See if this starts with a open parenthesis if len < flen: fh.seek(0) buf = fh.read(len).strip() if buf.startswith('('): return True except: pass finally: close(fh) return False class Nhx( Newick ): """ New Hampshire eXtended Newick with embedded The Newick Standard for representing trees in computer-readable form makes use of the correspondence between trees and nested parentheses. 
http://evolution.genetics.washington.edu/phylip/newicktree.html http://en.wikipedia.org/wiki/Newick_format Example: (gene1_Hu[&&NHX:S=Hu_Homo_sapiens], (gene2_Hu[&&NHX:S=Hu_Homo_sapiens], gene2_Mu[&&NHX:S=Mu_Mus_musculus])); """ file_ext = 'nhx' class Nexus( Text ): """ http://en.wikipedia.org/wiki/Nexus_file Example: #NEXUS BEGIN TAXA; Dimensions NTax=4; TaxLabels fish frog snake mouse; END; BEGIN CHARACTERS; Dimensions NChar=20; Format DataType=DNA; Matrix fish ACATA GAGGG TACCT CTAAG frog ACATA GAGGG TACCT CTAAG snake ACATA GAGGG TACCT CTAAG mouse ACATA GAGGG TACCT CTAAG END; BEGIN TREES; Tree best=(fish, (frog, (snake, mouse))); END; """ file_ext = 'nex' def __init__(self, **kwd): Text.__init__( self, **kwd ) def sniff( self, filename ): """ Determines whether the file is in nexus format First line should be: #NEXUS """ try: fh = open( filename ) count = 0 line = fh.readline() line = line.strip() if line and line == '#NEXUS': fh.close() return True except: pass finally: fh.close() return False ## Qiime Classes class QiimeMetadataMapping(Tabular): MetadataElement( name="column_names", default=[], desc="Column Names", readonly=False, visible=True, no_value=[] ) file_ext = 'qiimemapping' def __init__(self, **kwd): """ http://qiime.sourceforge.net/documentation/file_formats.html#mapping-file-overview Information about the samples necessary to perform the data analysis. 
# self.column_names = ['#SampleID','BarcodeSequence','LinkerPrimerSequence','Description'] """ Tabular.__init__( self, **kwd ) def sniff( self, filename ): """ Determines whether the file is a qiime mapping file Just checking for an appropriate header line for now, could be improved """ try: pat = '#SampleID(\t[a-zA-Z][a-zA-Z0-9_]*)*\tDescription' fh = open( filename ) while True: line = dataset_fh.readline() if re.match(pat,line): return True except: pass finally: close(fh) return False def set_column_names(self, dataset): if dataset.has_data(): dataset_fh = open( dataset.file_name ) line = dataset_fh.readline() if line.startswith('#SampleID'): dataset.metadata.column_names = line.strip().split('\t'); dataset_fh.close() def set_meta( self, dataset, overwrite = True, skip = None, max_data_lines = None, **kwd ): Tabular.set_meta(self, dataset, overwrite, skip, max_data_lines) self.set_column_names(dataset) class QiimeOTU(Tabular): """ Associates OTUs with sequence IDs Example: 0 FLP3FBN01C2MYD FLP3FBN01B2ALM 1 FLP3FBN01DF6NE FLP3FBN01CKW1J FLP3FBN01CHVM4 2 FLP3FBN01AXQ2Z """ file_ext = 'qiimeotu' class QiimeOTUTable(Tabular): """ #Full OTU Counts #OTU ID PC.354 PC.355 PC.356 Consensus Lineage 0 0 1 0 Root;Bacteria;Firmicutes;"Clostridia";Clostridiales 1 1 3 1 Root;Bacteria 2 0 2 2 Root;Bacteria;Bacteroidetes """ MetadataElement( name="column_names", default=[], desc="Column Names", readonly=False, visible=True, no_value=[] ) file_ext = 'qiimeotutable' def init_meta( self, dataset, copy_from=None ): tabular.Tabular.init_meta( self, dataset, copy_from=copy_from ) def set_meta( self, dataset, overwrite = True, skip = None, **kwd ): self.set_column_names(dataset) def set_column_names(self, dataset): if dataset.has_data(): dataset_fh = open( dataset.file_name ) line = dataset_fh.readline() line = dataset_fh.readline() if line.startswith('#OTU ID'): dataset.metadata.column_names = line.strip().split('\t'); dataset_fh.close() dataset.metadata.comment_lines = 2 class 
QiimeDistanceMatrix(Tabular): """ PC.354 PC.355 PC.356 PC.354 0.0 3.177 1.955 PC.355 3.177 0.0 3.444 PC.356 1.955 3.444 0.0 """ file_ext = 'qiimedistmat' def init_meta( self, dataset, copy_from=None ): tabular.Tabular.init_meta( self, dataset, copy_from=copy_from ) def set_meta( self, dataset, overwrite = True, skip = None, **kwd ): self.set_column_names(dataset) def set_column_names(self, dataset): if dataset.has_data(): dataset_fh = open( dataset.file_name ) line = dataset_fh.readline() # first line contains the names dataset.metadata.column_names = line.strip().split('\t'); dataset_fh.close() dataset.metadata.comment_lines = 1 class QiimePCA(Tabular): """ Principal Coordinate Analysis Data The principal coordinate (PC) axes (columns) for each sample (rows). Pairs of PCs can then be graphed to view the relationships between samples. The bottom of the output file contains the eigenvalues and % variation explained for each PC. Example: pc vector number 1 2 3 PC.354 -0.309063936588 0.0398252112257 0.0744672231759 PC.355 -0.106593922619 0.141125998277 0.0780204374172 PC.356 -0.219869362955 0.00917241121781 0.0357281314115 eigvals 0.480220500471 0.163567082874 0.125594470811 % variation explained 51.6955484555 17.6079322939 """ file_ext = 'qiimepca' class QiimeParams(Tabular): """ ###pick_otus_through_otu_table.py parameters### # OTU picker parameters pick_otus:otu_picking_method uclust pick_otus:clustering_algorithm furthest # Representative set picker parameters pick_rep_set:rep_set_picking_method first pick_rep_set:sort_by otu """ file_ext = 'qiimeparams' class QiimePrefs(Text): """ A text file, containing coloring preferences to be used by make_distance_histograms.py, make_2d_plots.py and make_3d_plots.py. 
Example: { 'background_color':'black', 'sample_coloring': { 'Treatment': { 'column':'Treatment', 'colors':(('red',(0,100,100)),('blue',(240,100,100))) }, 'DOB': { 'column':'DOB', 'colors':(('red',(0,100,100)),('blue',(240,100,100))) } }, 'MONTE_CARLO_GROUP_DISTANCES': { 'Treatment': 10, 'DOB': 10 } } """ file_ext = 'qiimeprefs' class QiimeTaxaSummary(Tabular): """ Taxon PC.354 PC.355 PC.356 Root;Bacteria;Actinobacteria 0.0 0.177 0.955 Root;Bacteria;Firmicutes 0.177 0.0 0.444 Root;Bacteria;Proteobacteria 0.955 0.444 0.0 """ MetadataElement( name="column_names", default=[], desc="Column Names", readonly=False, visible=True, no_value=[] ) file_ext = 'qiimetaxsummary' def set_column_names(self, dataset): if dataset.has_data(): dataset_fh = open( dataset.file_name ) line = dataset_fh.readline() if line.startswith('Taxon'): dataset.metadata.column_names = line.strip().split('\t'); dataset_fh.close() def set_meta( self, dataset, overwrite = True, skip = None, max_data_lines = None, **kwd ): Tabular.set_meta(self, dataset, overwrite, skip, max_data_lines) self.set_column_names(dataset) if __name__ == '__main__': import doctest, sys doctest.testmod(sys.modules[__name__])