Mercurial > repos > jjohnson > mothur_toolsuite
view mothur/lib/galaxy/datatypes/metagenomics.py @ 7:7bfe1f843858
Support Mothur v1.20
trim.seqs - added name parameter and optional trim.names output
phylo.diversity - group optional, put group and groups in conditional - breaks
get.lineage remove.lineage - allow multiple taxons
dist.shared - added processors
consensus.seqs - add cutoff parameter
trim.seqs,phylo.diversity,get.lineage,remove.lineage,dist.shared,consensus.seqs
new tools - chimera.uchime deunique.tree count.seqs
shared/relabund files - Column headings
refactor lib/galaxy/datatypes/metagenomics.py
add filters to label and group selects in tool configs
mothur_wrapper.py updated with new tools params
author | Jim Johnson <jj@umn.edu> |
---|---|
date | Mon, 27 Jun 2011 10:12:25 -0500 |
parents | e990ac8a0f58 |
children | a6189f58fedb |
line wrap: on
line source
"""
metagenomics datatypes
James E Johnson - University of Minnesota
for Mothur
"""
# Harmless on Python >= 2.6; lets the "with" blocks below also work on 2.5.
from __future__ import with_statement

import data
import logging, os, sys, time, tempfile, shutil, string, glob, re
import galaxy.model
from galaxy.datatypes import metadata
from galaxy.datatypes import tabular
from galaxy.datatypes import sequence
from galaxy.datatypes.metadata import MetadataElement
from galaxy.datatypes.tabular import Tabular
from galaxy.datatypes.sequence import Fasta
from galaxy import util
from galaxy.datatypes.images import Html
from sniff import *

log = logging.getLogger(__name__)


## Sniffing helpers (private to this module)

def _rows(fh, skip_blank=False):
    """
    Yield the stripped, non-empty lines of the open file fh.

    By default reading stops at the first blank line or at end of file.
    With skip_blank=True blank lines are skipped and reading continues
    to end of file.
    """
    while True:
        raw = fh.readline()
        if not raw:
            return  # EOF
        text = raw.strip()
        if text:
            yield text
        elif not skip_blank:
            return


def _sniff_rows(filename, row_is_valid, max_rows=5, comment_char=None, skip_blank=False):
    """
    Return True if the leading data rows of filename all look valid.

    Each line that does not start with comment_char is split on tabs and
    passed to row_is_valid(fields, row_index).  A falsy result, or any
    exception raised while reading or validating, means "not this format"
    and yields False.  Checking stops successfully after max_rows valid rows,
    or at the end of the input provided at least one row was seen.
    """
    try:
        with open(filename) as fh:
            count = 0
            for line in _rows(fh, skip_blank):
                if comment_char and line.startswith(comment_char):
                    continue
                if not row_is_valid(line.split('\t'), count):
                    return False
                count += 1
                if count >= max_rows:
                    return True
            return count > 0
    except Exception:
        # Sniffing is best effort: unreadable or malformed input is simply
        # reported as not matching.
        return False


def _sniff_distance_matrix(filename, field_count, max_rows=5):
    """
    Return True if filename looks like a phylip-style distance matrix.

    The first line must be an integer (the number of sequences).  Every
    following row that does not start with '@' must have exactly
    field_count(row_index, sequence_count) tab-separated fields, and every
    field after the name must parse as a float.  At most max_rows rows are
    checked; at least one row is required.
    """
    try:
        with open(filename) as fh:
            seq_count = int(fh.readline().strip())
            row = 0
            for line in _rows(fh):
                if line.startswith('@'):
                    continue
                fields = line.split('\t')
                if len(fields) != field_count(row, seq_count):
                    return False
                for value in fields[1:]:
                    float(value)  # ValueError -> not a distance
                row += 1
                if row >= max_rows:
                    return True
            return row > 0
    except Exception:
        return False


## Mothur Classes

class Otu(Tabular):
    file_ext = 'otu'

    def sniff(self, filename):
        """
        Determines whether the file is a otu (operational taxonomic unit) format
        label<TAB>count[<TAB>value(1..n)]
        Lines starting with '@' are ignored; up to 5 rows are checked.
        """
        def row_is_valid(fields, row_idx):
            # the second column announces how many value columns follow it
            return len(fields) >= 2 and int(fields[1]) + 2 == len(fields)
        return _sniff_rows(filename, row_is_valid, max_rows=5, comment_char='@')


class OtuList(Otu):
    file_ext = 'list'


class Sabund(Otu):
    file_ext = 'sabund'

    def sniff(self, filename):
        """
        Determines whether the file is a otu (operational taxonomic unit) format
        label<TAB>count[<TAB>value(1..n)]
        Unlike Otu, every value column must be an integer.
        """
        def row_is_valid(fields, row_idx):
            if len(fields) < 2 or int(fields[1]) + 2 != len(fields):
                return False
            for value in fields[2:]:
                int(value)  # ValueError -> not an abundance count
            return True
        return _sniff_rows(filename, row_is_valid, max_rows=5, comment_char='@')


class Rabund(Sabund):
    file_ext = 'rabund'


class GroupAbund(Otu):
    file_ext = 'grpabund'

    def init_meta(self, dataset, copy_from=None):
        Otu.init_meta(self, dataset, copy_from=copy_from)

    def set_meta(self, dataset, overwrite=True, skip=1, max_data_lines=100000, **kwd):
        """
        Set metadata, skipping the first line only when it is the
        'label<TAB>Group...' column heading line.
        """
        if dataset.has_data():
            with open(dataset.file_name) as fh:
                fields = fh.readline().strip().split('\t')
            # a first line with fewer than two fields cannot be the heading
            if len(fields) >= 2 and fields[0] == 'label' and fields[1] == 'Group':
                skip = 1
            else:
                skip = 0
        Otu.set_meta(self, dataset, overwrite, skip, max_data_lines, **kwd)

    def sniff(self, filename, vals_are_int=False):
        """
        Determines whether the file is a otu (operational taxonomic unit) Shared format
        label<TAB>group<TAB>count[<TAB>value(1..n)]
        The first line is column headings as of Mothur v 1.20

        vals_are_int: when True the value columns must be integers,
        otherwise they only need to parse as floats.
        """
        log.debug("sniff GroupAbund vals_are_int %s", vals_are_int)
        if vals_are_int:
            convert = int
        else:
            convert = float

        def row_is_valid(fields, row_idx):
            if len(fields) < 3:
                return False
            if row_idx == 0 and fields[0] == 'label':
                return True  # column heading line, nothing numeric to check
            if int(fields[2]) + 3 != len(fields):
                return False
            for value in fields[3:]:
                convert(value)  # ValueError -> wrong value type
            return True
        return _sniff_rows(filename, row_is_valid, max_rows=5, comment_char='@')


class SharedRabund(GroupAbund):
    file_ext = 'shared'

    def sniff(self, filename):
        """
        Determines whether the file is a otu (operational taxonomic unit) Shared format
        label<TAB>group<TAB>count[<TAB>value(1..n)]
        The first line is column headings as of Mothur v 1.20
        """
        isme = GroupAbund.sniff(self, filename, True)
        log.debug("is SharedRabund %s", isme)
        return isme


class RelAbund(GroupAbund):
    file_ext = 'relabund'

    def sniff(self, filename):
        """
        Determines whether the file is a otu (operational taxonomic unit) Relative Abundance format
        label<TAB>group<TAB>count[<TAB>value(1..n)]
        The first line is column headings as of Mothur v 1.20
        """
        isme = GroupAbund.sniff(self, filename, False)
        log.debug("is RelAbund %s", isme)
        return isme


class SecondaryStructureMap(Tabular):
    file_ext = 'map'

    def __init__(self, **kwd):
        """Initialize secondary structure map datatype"""
        Tabular.__init__(self, **kwd)
        self.column_names = ['Map']

    def sniff(self, filename):
        """
        Determines whether the file is a secondary structure map format
        A single column with an integer value which indicates the row that this row maps to.
        Pairing must be reciprocal: if structMap[10] = 380 then structMap[380] = 10.
        Values <= 0 and rows that point at themselves are not checked.
        """
        try:
            with open(filename) as fh:
                # row number -> partner row that has not pointed back yet
                unanswered = {}
                line_num = 0
                for line in _rows(fh):
                    line_num += 1
                    pointer = int(line)  # ValueError -> not a map file
                    if pointer <= 0:
                        continue
                    if pointer > line_num:
                        unanswered[line_num] = pointer
                    elif pointer < line_num:
                        # the earlier row must have pointed forward to this one
                        if unanswered.pop(pointer, None) != line_num:
                            return False
                # every forward reference must have been answered
                return line_num > 0 and not unanswered
        except Exception:
            return False


class SequenceAlignment(Fasta):
    file_ext = 'align'

    def __init__(self, **kwd):
        """Initialize SequenceAlignment datatype"""
        Fasta.__init__(self, **kwd)

    def sniff(self, filename):
        """
        Determines whether the file is in Mothur align fasta format
        Each sequence line must be the same length
        """
        try:
            with open(filename) as fh:
                seq_len = -1
                while True:
                    line = fh.readline()
                    if not line:
                        break  # EOF
                    line = line.strip()
                    if not line:
                        continue
                    if not line.startswith('>'):
                        break  # a non-empty line that is not a fasta header
                    # the line after a header must be the sequence itself
                    line = fh.readline().strip()
                    if line == '' or line.startswith('>'):
                        break
                    if seq_len < 0:
                        seq_len = len(line)
                    elif seq_len != len(line):
                        return False
                return seq_len > 0
        except Exception:
            return False


class AlignCheck(Tabular):
    file_ext = 'align.check'

    def __init__(self, **kwd):
        """Initialize AlignCheck datatype"""
        Tabular.__init__(self, **kwd)
        self.column_names = ['name','pound','dash','plus','equal','loop','tilde','total']
        self.column_types = ['str','int','int','int','int','int','int','int']
        self.comment_lines = 1
        self.columns = 8

    def set_meta(self, dataset, overwrite=True, **kwd):
        """
        Record the fixed column layout; the first line of the file is the
        heading line, so it is counted as a comment rather than as data.
        """
        data_lines = 0
        if dataset.has_data():
            with open(dataset.file_name) as dataset_fh:
                while dataset_fh.readline():
                    data_lines += 1
        dataset.metadata.comment_lines = 1
        dataset.metadata.data_lines = data_lines - 1 if data_lines > 0 else 0
        dataset.metadata.column_names = self.column_names
        dataset.metadata.column_types = self.column_types


class AlignReport(Tabular):
    """
    QueryName  QueryLength  TemplateName  TemplateLength  SearchMethod  SearchScore  AlignmentMethod  QueryStart  QueryEnd  TemplateStart  TemplateEnd  PairwiseAlignmentLength  GapsInQuery  GapsInTemplate  LongestInsert  SimBtwnQuery&Template
    AY457915   501          82283         1525            kmer          89.07        needleman        5           501       1              499          499                      2            0               0              97.6
    """
    file_ext = 'align.report'

    def __init__(self, **kwd):
        """Initialize AlignReport datatype"""
        Tabular.__init__(self, **kwd)
        self.column_names = ['QueryName','QueryLength','TemplateName','TemplateLength','SearchMethod','SearchScore',
                             'AlignmentMethod','QueryStart','QueryEnd','TemplateStart','TemplateEnd',
                             'PairwiseAlignmentLength','GapsInQuery','GapsInTemplate','LongestInsert','SimBtwnQuery&Template'
                             ]


class BellerophonChimera(Tabular):
    file_ext = 'bellerophon.chimera'

    def __init__(self, **kwd):
        """Initialize BellerophonChimera datatype"""
        Tabular.__init__(self, **kwd)
        self.column_names = ['Name','Score','Left','Right']


class SecondaryStructureMatch(Tabular):
    """
    name    pound  dash  plus  equal  loop  tilde  total
    9_1_12  42     68    8     28     275   420    872
    9_1_14  36     68    6     26     266   422    851
    9_1_15  44     68    8     28     276   418    873
    9_1_16  34     72    6     30     267   430    860
    9_1_18  46     80    2     36     261
    """
    def __init__(self, **kwd):
        """Initialize SecondaryStructureMatch datatype"""
        Tabular.__init__(self, **kwd)
        self.column_names = ['name','pound','dash','plus','equal','loop','tilde','total']


class DistanceMatrix(data.Text):
    file_ext = 'dist'
    # Add metadata elements
    MetadataElement( name="sequence_count", default=0, desc="Number of sequences", readonly=False, optional=True, no_value=0 )


class LowerTriangleDistanceMatrix(DistanceMatrix):
    file_ext = 'lower.dist'

    def __init__(self, **kwd):
        """Initialize lower-triangle distance matrix datatype"""
        DistanceMatrix.__init__(self, **kwd)

    def sniff(self, filename):
        """
        Determines whether the file is a lower-triangle distance matrix (phylip) format
        The first line has the number of sequences in the matrix.
        The remaining lines have the sequence name followed by a list of distances from all preceeding sequences
                5
                U68589
                U68590  0.3371
                U68591  0.3609  0.3782
                U68592  0.4155  0.3197  0.4148
                U68593  0.2872  0.1690  0.3361  0.2842
        """
        # row i (0-based) holds the name plus one distance per preceding sequence
        return _sniff_distance_matrix(filename, lambda row, seq_count: row + 1)


class SquareDistanceMatrix(DistanceMatrix, Tabular):
    file_ext = 'square.dist'
    sequence_count = -1

    def __init__(self, **kwd):
        """Initialize square distance matrix datatype"""
        Tabular.__init__(self, **kwd)

    def init_meta(self, dataset, copy_from=None):
        data.Text.init_meta(self, dataset, copy_from=copy_from)

    def set_meta(self, dataset, overwrite=True, skip=None, **kwd):
        # NOTE(review): the metadata element declared on DistanceMatrix is
        # named "sequence_count"; "sequences" may be a slip - confirm before renaming.
        dataset.metadata.sequences = 0

    def sniff(self, filename):
        """
        Determines whether the file is a square distance matrix (Column-formatted distance matrix) format
        The first line has the number of sequences in the matrix.
        The following lines have the sequence name in the first column plus a column for the distance to each sequence
        in the row order in which they appear in the matrix.
               3
               U68589  0.0000  0.3371  0.3610
               U68590  0.3371  0.0000  0.3783
               U68590  0.3371  0.0000  0.3783
        """
        # every row holds the name plus one distance per sequence
        return _sniff_distance_matrix(filename, lambda row, seq_count: seq_count + 1)


class PairwiseDistanceMatrix(DistanceMatrix, Tabular):
    file_ext = 'pair.dist'

    def __init__(self, **kwd):
        """Initialize pairwise distance matrix datatype"""
        Tabular.__init__(self, **kwd)
        self.column_names = ['Sequence','Sequence','Distance']
        self.column_types = ['str','str','float']
        self.comment_lines = 1

    def sniff(self, filename):
        """
        Determines whether the file is a pairwise distance matrix (Column-formatted distance matrix) format
        The first and second columns have the sequence names and the third column is the distance between those sequences.
        """
        def row_is_valid(fields, row_idx):
            if len(fields) != 3:
                return False
            float(fields[2])  # ValueError -> not a distance
            return True
        return _sniff_rows(filename, row_is_valid, max_rows=5, comment_char='@')


# AlignCheck used to be declared a second time here, silently replacing the
# definition above; the two declarations are now merged into that one class.


class Names(Tabular):
    file_ext = 'names'

    def __init__(self, **kwd):
        """Name file shows the relationship between a representative sequence(col 1) and the sequences(comma-separated) it represents(col 2)"""
        Tabular.__init__(self, **kwd)
        self.column_names = ['name','representatives']
        self.columns = 2


class Summary(Tabular):
    file_ext = 'summary'

    def __init__(self, **kwd):
        """summarizes the quality of sequences in an unaligned or aligned fasta-formatted sequence file"""
        Tabular.__init__(self, **kwd)
        self.column_names = ['seqname','start','end','nbases','ambigs','polymer']
        self.columns = 6


class Group(Tabular):
    file_ext = 'groups'

    def __init__(self, **kwd):
        """Group file: a name (col 1) and its group (col 2)"""
        Tabular.__init__(self, **kwd)
        self.column_names = ['name','group']
        self.columns = 2


class Design(Tabular):
    file_ext = 'design'

    def __init__(self, **kwd):
        """Design file shows the relationship between a group(col 1) and a grouping (col 2), providing a way to merge groups."""
        Tabular.__init__(self, **kwd)
        self.column_names = ['group','grouping']
        self.columns = 2


class AccNos(Tabular):
    file_ext = 'accnos'

    def __init__(self, **kwd):
        """A list of names"""
        Tabular.__init__(self, **kwd)
        self.column_names = ['name']
        self.columns = 1


class Oligos(data.Text):
    file_ext = 'oligos'

    def sniff(self, filename):
        """
        Determines whether the file is an oligos file:
        forward|reverse<TAB>sequence  or  barcode<TAB>sequence<TAB>name
        Lines starting with '#' are ignored; checking stops after 21 valid rows.
        """
        def row_is_valid(fields, row_idx):
            if len(fields) == 2:
                return re.match('forward|reverse', fields[0]) is not None
            if len(fields) == 3:
                return re.match('barcode', fields[0]) is not None
            return False
        return _sniff_rows(filename, row_is_valid, max_rows=21, comment_char='#')


class Frequency(Tabular):
    file_ext = 'freq'

    def __init__(self, **kwd):
        """Position/frequency table for chimera analysis"""
        Tabular.__init__(self, **kwd)
        self.column_names = ['position','frequency']
        self.column_types = ['int','float']

    def sniff(self, filename):
        """
        Determines whether the file is a frequency tabular format for chimera analysis
        #1.14.0
        0       0.000
        1       0.000
        ...
        155     0.975
        """
        def row_is_valid(fields, row_idx):
            # ValueError/IndexError here mean "not a frequency row"
            int(fields[0])
            float(fields[1])
            return True
        return _sniff_rows(filename, row_is_valid, max_rows=21, comment_char='#')


class Quantile(Tabular):
    file_ext = 'quan'
    MetadataElement( name="filtered", default=False, no_value=False, optional=True , desc="Quantiles calculated using a frequency filter", readonly=True)
    MetadataElement( name="masked", default=False, no_value=False, optional=True , desc="Quantiles calculated using a mask", readonly=True)

    def __init__(self, **kwd):
        """Quantiles for chimera analysis"""
        Tabular.__init__(self, **kwd)
        self.column_names = ['num','ten','twentyfive','fifty','seventyfive','ninetyfive','ninetynine']
        self.column_types = ['int','float','float','float','float','float','float']

    def set_meta(self, dataset, overwrite=True, skip=None, **kwd):
        # NOTE(review): this only logs and sets no metadata - confirm that
        # bypassing Tabular.set_meta is intended.
        log.info( "Mothur Quantile set_meta %s" % kwd)

    def sniff(self, filename):
        """
        Determines whether the file is a quantiles tabular format for chimera analysis
        1       0       0       0       0       0       0
        2       0.309198        0.309198        0.37161 0.37161 0.37161 0.37161
        3       0.510982        0.563213        0.693529        0.858939        1.07442 1.20608
        ...
        """
        def row_is_valid(fields, row_idx):
            # ValueError/IndexError here mean "not a quantile row"
            int(fields[0])
            for col in range(1, 7):
                float(fields[col])
            return True
        return _sniff_rows(filename, row_is_valid, max_rows=11, comment_char='#')


class FilteredQuantile(Quantile):
    file_ext = 'filtered.quan'

    def __init__(self, **kwd):
        """Quantiles for chimera analysis"""
        Quantile.__init__(self, **kwd)
        self.filtered = True


class MaskedQuantile(Quantile):
    file_ext = 'masked.quan'

    def __init__(self, **kwd):
        """Quantiles for chimera analysis"""
        Quantile.__init__(self, **kwd)
        self.masked = True
        self.filtered = False


class FilteredMaskedQuantile(Quantile):
    file_ext = 'filtered.masked.quan'

    def __init__(self, **kwd):
        """Quantiles for chimera analysis"""
        Quantile.__init__(self, **kwd)
        self.masked = True
        self.filtered = True


class LaneMask(data.Text):
    file_ext = 'filter'

    def sniff(self, filename):
        """
        Determines whether the file is a lane mask filter:  1 line consisting of zeros and ones.
        """
        try:
            with open(filename) as fh:
                seen_digits = False
                line_ended = False
                while True:
                    buff = fh.read(1000)
                    if not buff:
                        break  # EOF
                    if line_ended:
                        # only line terminators may follow the mask line
                        if buff.strip('\r\n'):
                            return False
                        continue
                    body = buff.rstrip('\r\n')
                    line_ended = len(body) != len(buff)
                    if body:
                        if not re.match('^[01]+$', body):
                            return False
                        seen_digits = True
                return seen_digits
        except Exception:
            return False


class SequenceTaxonomy(Tabular):
    """
    A table with 2 columns:
    - SequenceName
    - Taxonomy (semicolon-separated taxonomy in descending order)
    Example:
      X56533.1        Eukaryota;Alveolata;Ciliophora;Intramacronucleata;Oligohymenophorea;Hymenostomatida;Tetrahymenina;Glaucomidae;Glaucoma;
      X97975.1        Eukaryota;Parabasalidea;Trichomonada;Trichomonadida;unclassified_Trichomonadida;
      AF052717.1      Eukaryota;Parabasalidea;
    """
    file_ext = 'seq.taxonomy'
    # regular expression the taxonomy column must match (one or more levels)
    _taxonomy_pattern = r'^([^ \t\n\r\f\v;]+([(]\d+[)])?[;])+$'

    def __init__(self, **kwd):
        Tabular.__init__(self, **kwd)
        self.column_names = ['name','taxonomy']

    def sniff(self, filename):
        """
        Determines whether the file is a SequenceTaxonomy
        Blank lines are skipped; up to 11 rows are checked.
        """
        pat = self._taxonomy_pattern

        def row_is_valid(fields, row_idx):
            return len(fields) == 2 and re.match(pat, fields[1]) is not None
        return _sniff_rows(filename, row_is_valid, max_rows=11, skip_blank=True)


class RDPSequenceTaxonomy(SequenceTaxonomy):
    """
    A table with 2 columns:
    - SequenceName
    - Taxonomy (semicolon-separated taxonomy in descending order, RDP requires exactly 6 levels deep)
    Example:
      AB001518.1      Bacteria;Bacteroidetes;Sphingobacteria;Sphingobacteriales;unclassified_Sphingobacteriales;
      AB001724.1      Bacteria;Cyanobacteria;Cyanobacteria;Family_II;GpIIa;
      AB001774.1      Bacteria;Chlamydiae;Chlamydiae;Chlamydiales;Chlamydiaceae;Chlamydophila;
    """
    file_ext = 'rdp.taxonomy'
    # same as SequenceTaxonomy, but exactly 6 levels
    _taxonomy_pattern = r'^([^ \t\n\r\f\v;]+([(]\d+[)])?[;]){6}$'

    def sniff(self, filename):
        """
        Determines whether the file is a RDPSequenceTaxonomy
        """
        return SequenceTaxonomy.sniff(self, filename)


class ConsensusTaxonomy(Tabular):
    file_ext = 'cons.taxonomy'

    def __init__(self, **kwd):
        """Consensus taxonomy for each OTU"""
        Tabular.__init__(self, **kwd)
        self.column_names = ['OTU','count','taxonomy']


class TaxonomySummary(Tabular):
    file_ext = 'tax.summary'

    def __init__(self, **kwd):
        """A Summary of taxon classification"""
        Tabular.__init__(self, **kwd)
        self.column_names = ['taxlevel','rankID','taxon','daughterlevels','total']


class Phylip(data.Text):
    file_ext = 'phy'

    def sniff(self, filename):
        """
        Determines whether the file is in Phylip format (Interleaved or Sequential)
        The first line of the input file contains the number of species and the
        number of characters, in free format, separated by blanks (not by
        commas). The information for each species follows, starting with a
        ten-character species name (which can include punctuation marks and blanks),
        and continuing with the characters for that species.
        http://evolution.genetics.washington.edu/phylip/doc/main.html#inputfiles
        Interleaved Example:
            6   39
        Archaeopt CGATGCTTAC CGCCGATGCT
        HesperorniCGTTACTCGT TGTCGTTACT
        BaluchitheTAATGTTAAT TGTTAATGTT
        B. virginiTAATGTTCGT TGTTAATGTT
        BrontosaurCAAAACCCAT CATCAAAACC
        B.subtilisGGCAGCCAAT CACGGCAGCC

        TACCGCCGAT GCTTACCGC
        CGTTGTCGTT ACTCGTTGT
        AATTGTTAAT GTTAATTGT
        CGTTGTTAAT GTTCGTTGT
        CATCATCAAA ACCCATCAT
        AATCACGGCA GCCAATCAC

        Only the counts line is checked.
        """
        try:
            with open(filename) as fh:
                # counts line
                pieces = fh.readline().strip().split()
            # both must be integers: number of species, number of characters
            int(pieces[0])
            int(pieces[1])
            # TODO check data lines:
            #   name is the first 10 characters of each line
            #   the rest holds nucleic base or amino acid 1-char designators
            #   (spaces allowed)
            return True
        except Exception:
            return False


class Axes(Tabular):
    file_ext = 'axes'

    def __init__(self, **kwd):
        """Initialize axes datatype"""
        Tabular.__init__(self, **kwd)

    def sniff(self, filename):
        """
        Determines whether the file is an axes format
        The first line may have column headings.
        The following lines have the name in the first column plus float columns for each axis.
        ==> 98_sq_phylip_amazon.fn.unique.pca.axes <==
           group   axis1   axis2
           forest  0.000000        0.145743
           pasture 0.145743        0.000000

        ==> 98_sq_phylip_amazon.nmds.axes <==
                   axis1   axis2
           U68589  0.262608        -0.077498
           U68590  0.027118        0.195197
           U68591  0.329854        0.014395
        """
        try:
            with open(filename) as fh:
                fh.readline()  # first line may be column headings: ignored
                col_cnt = None
                count = 0
                for line in _rows(fh):
                    fields = line.split('\t')
                    if col_cnt is None:
                        # NOTE(review): this row only fixes the column count;
                        # its values are not validated and it is not counted.
                        col_cnt = len(fields)
                        continue
                    if len(fields) != col_cnt:
                        return False
                    for value in fields[1:]:
                        float(value)  # ValueError -> not an axis value
                    count += 1
                    if count > 10:
                        return True
                return count > 0
        except Exception:
            return False


## Qiime Classes

class QiimeMetadataMapping(Tabular):
    MetadataElement( name="column_names", default=[], desc="Column Names", readonly=False, visible=True, no_value=[] )
    file_ext = 'qiimemapping'

    def __init__(self, **kwd):
        """
        http://qiime.sourceforge.net/documentation/file_formats.html#mapping-file-overview
        Information about the samples necessary to perform the data analysis.
        # self.column_names = ['#SampleID','BarcodeSequence','LinkerPrimerSequence','Description']
        """
        Tabular.__init__(self, **kwd)

    def sniff(self, filename):
        """
        Determines whether the file is a qiime mapping file
        Just checking for an appropriate header line for now, could be improved
        """
        pat = r'#SampleID(\t[a-zA-Z][a-zA-Z0-9_]*)*\tDescription'
        try:
            with open(filename) as fh:
                while True:
                    line = fh.readline()
                    if not line:
                        return False  # EOF without a header line
                    if re.match(pat, line):
                        return True
                    # the header starts with '#', so once a data line shows up
                    # there is no point reading any further
                    if line.strip() and not line.startswith('#'):
                        return False
        except Exception:
            return False

    def set_column_names(self, dataset):
        """Take the column names from the '#SampleID' header line, if present."""
        if dataset.has_data():
            with open(dataset.file_name) as dataset_fh:
                line = dataset_fh.readline()
            if line.startswith('#SampleID'):
                dataset.metadata.column_names = line.strip().split('\t')

    def set_meta(self, dataset, overwrite=True, skip=None, max_data_lines=None, **kwd):
        Tabular.set_meta(self, dataset, overwrite, skip, max_data_lines)
        self.set_column_names(dataset)


class QiimeOTU(Tabular):
    """
    Associates OTUs with sequence IDs
    Example:
    0       FLP3FBN01C2MYD  FLP3FBN01B2ALM
    1       FLP3FBN01DF6NE  FLP3FBN01CKW1J  FLP3FBN01CHVM4
    2       FLP3FBN01AXQ2Z
    """
    file_ext = 'qiimeotu'


class QiimeOTUTable(Tabular):
    """
        #Full OTU Counts
        #OTU ID PC.354  PC.355  PC.356  Consensus Lineage
        0       0       1       0       Root;Bacteria;Firmicutes;"Clostridia";Clostridiales
        1       1       3       1       Root;Bacteria
        2       0       2       2       Root;Bacteria;Bacteroidetes
    """
    MetadataElement( name="column_names", default=[], desc="Column Names", readonly=False, visible=True, no_value=[] )
    file_ext = 'qiimeotutable'

    def init_meta(self, dataset, copy_from=None):
        tabular.Tabular.init_meta(self, dataset, copy_from=copy_from)

    def set_meta(self, dataset, overwrite=True, skip=None, **kwd):
        self.set_column_names(dataset)

    def set_column_names(self, dataset):
        """Take the column names from the '#OTU ID' line (second line of the file)."""
        if dataset.has_data():
            with open(dataset.file_name) as dataset_fh:
                dataset_fh.readline()  # first line is the title comment
                line = dataset_fh.readline()
            if line.startswith('#OTU ID'):
                dataset.metadata.column_names = line.strip().split('\t')
            dataset.metadata.comment_lines = 2


class QiimeDistanceMatrix(Tabular):
    """
            PC.354  PC.355  PC.356
    PC.354  0.0     3.177   1.955
    PC.355  3.177   0.0     3.444
    PC.356  1.955   3.444   0.0
    """
    file_ext = 'qiimedistmat'

    def init_meta(self, dataset, copy_from=None):
        tabular.Tabular.init_meta(self, dataset, copy_from=copy_from)

    def set_meta(self, dataset, overwrite=True, skip=None, **kwd):
        self.set_column_names(dataset)

    def set_column_names(self, dataset):
        """Take the column names from the first line of the file."""
        if dataset.has_data():
            with open(dataset.file_name) as dataset_fh:
                line = dataset_fh.readline()
            # first line contains the names
            dataset.metadata.column_names = line.strip().split('\t')
            dataset.metadata.comment_lines = 1


class QiimePCA(Tabular):
    """
    Principal Coordinate Analysis Data
    The principal coordinate (PC) axes (columns) for each sample (rows).
    Pairs of PCs can then be graphed to view the relationships between samples.
    The bottom of the output file contains the eigenvalues and % variation explained for each PC.
    Example:
    pc vector number        1       2       3
    PC.354  -0.309063936588 0.0398252112257 0.0744672231759
    PC.355  -0.106593922619 0.141125998277  0.0780204374172
    PC.356  -0.219869362955 0.00917241121781        0.0357281314115


    eigvals 0.480220500471  0.163567082874  0.125594470811
    % variation explained   51.6955484555   17.6079322939
    """
    file_ext = 'qiimepca'


class QiimeParams(Tabular):
    """
    ###pick_otus_through_otu_table.py parameters###

    # OTU picker parameters
    pick_otus:otu_picking_method    uclust
    pick_otus:clustering_algorithm  furthest

    # Representative set picker parameters
    pick_rep_set:rep_set_picking_method     first
    pick_rep_set:sort_by    otu
    """
    file_ext = 'qiimeparams'


class QiimePrefs(data.Text):
    """
    A text file, containing coloring preferences to be used by make_distance_histograms.py, make_2d_plots.py and make_3d_plots.py.
    Example:
    {
    'background_color':'black',

    'sample_coloring':
            {
                    'Treatment':
                    {
                            'column':'Treatment',
                            'colors':(('red',(0,100,100)),('blue',(240,100,100)))
                    },
                    'DOB':
                    {
                            'column':'DOB',
                            'colors':(('red',(0,100,100)),('blue',(240,100,100)))
                    }
            },
    'MONTE_CARLO_GROUP_DISTANCES':
            {
                    'Treatment': 10,
                    'DOB': 10
            }
    }
    """
    file_ext = 'qiimeprefs'


class QiimeTaxaSummary(Tabular):
    """
        Taxon   PC.354  PC.355  PC.356
        Root;Bacteria;Actinobacteria    0.0     0.177   0.955
        Root;Bacteria;Firmicutes        0.177   0.0     0.444
        Root;Bacteria;Proteobacteria    0.955   0.444   0.0
    """
    MetadataElement( name="column_names", default=[], desc="Column Names", readonly=False, visible=True, no_value=[] )
    file_ext = 'qiimetaxsummary'

    def set_column_names(self, dataset):
        """Take the column names from the 'Taxon' header line, if present."""
        if dataset.has_data():
            with open(dataset.file_name) as dataset_fh:
                line = dataset_fh.readline()
            if line.startswith('Taxon'):
                dataset.metadata.column_names = line.strip().split('\t')

    def set_meta(self, dataset, overwrite=True, skip=None, max_data_lines=None, **kwd):
        Tabular.set_meta(self, dataset, overwrite, skip, max_data_lines)
        self.set_column_names(dataset)


if __name__ == '__main__':
    import doctest, sys
    doctest.testmod(sys.modules[__name__])