Mercurial > repos > jjohnson > qiime
diff lib/galaxy/datatypes/metagenomics.py @ 0:e5c3175506b7 default tip
Initial tool configs for qiime, most need work.
author: Jim Johnson <jj@umn.edu>
date: Sun, 17 Jul 2011 10:30:11 -0500
parents: (none)
children: (none)
line wrap: on
line diff
"""
metagenomics datatypes
James E Johnson - University of Minnesota
for Mothur
"""

import data
import logging, os, sys, time, tempfile, shutil, string, glob, re
import galaxy.model
from galaxy.datatypes import metadata
from galaxy.datatypes import tabular
from galaxy.datatypes import sequence
from galaxy.datatypes.metadata import MetadataElement
from galaxy.datatypes.tabular import Tabular
from galaxy.datatypes.sequence import Fasta
from galaxy import util
from galaxy.datatypes.images import Html
from sniff import *

log = logging.getLogger(__name__)


## Mothur Classes

class Otu( Tabular ):
    """Mothur OTU list: label<TAB>count[<TAB>value(1..count)]"""
    file_ext = 'otu'

    def sniff( self, filename ):
        """
        Determines whether the file is an otu (operational taxonomic unit) format.
        Each data line must be label<TAB>count followed by exactly `count` values.
        """
        # open before the try so a failed open cannot break the finally clause
        fh = open( filename )
        try:
            count = 0
            while True:
                line = fh.readline()
                if not line:
                    break  # EOF
                line = line.strip()
                if line and line[0] != '@':
                    pieces = line.split( '\t' )
                    if len( pieces ) < 2:
                        return False
                    try:
                        # column 2 declares how many columns follow it
                        if int( pieces[1] ) + 2 != len( pieces ):
                            return False
                    except ValueError:
                        return False
                    count += 1
                    if count == 5:
                        return True
            # short files are accepted as long as every line validated
            return 0 < count < 5
        except Exception:
            return False
        finally:
            fh.close()


class OtuList( Otu ):
    file_ext = 'list'


class Sabund( Otu ):
    """Mothur species-abundance format: label<TAB>count[<TAB>value(1..count)]"""
    file_ext = 'sabund'

    def sniff( self, filename ):
        """
        Determines whether the file is in sabund format:
        label<TAB>count[<TAB>value(1..n)] where every value is an integer.
        """
        fh = open( filename )
        try:
            count = 0
            while True:
                line = fh.readline()
                if not line:
                    break  # EOF
                line = line.strip()
                if line and line[0] != '@':
                    pieces = line.split( '\t' )
                    if len( pieces ) < 2:
                        return False
                    try:
                        if int( pieces[1] ) + 2 != len( pieces ):
                            return False
                        # every abundance value must be an integer
                        for val in pieces[2:]:
                            int( val )
                    except ValueError:
                        return False
                    count += 1
                    if count >= 5:
                        return True
            return 0 < count < 5
        except Exception:
            return False
        finally:
            fh.close()


class Rabund( Sabund ):
    file_ext = 'rabund'


class GroupAbund( Otu ):
    file_ext = 'grpabund'

    def init_meta( self, dataset, copy_from=None ):
        Otu.init_meta( self, dataset, copy_from=copy_from )

    def set_meta( self, dataset, overwrite=True, skip=1, max_data_lines=100000, **kwd ):
        """Peek at the first line to decide whether a 'label Group' header is present."""
        if dataset.has_data():
            fh = open( dataset.file_name )
            try:
                pieces = fh.readline().strip().split( '\t' )
                if len( pieces ) > 1 and pieces[0] == 'label' and pieces[1] == 'Group':
                    skip = 1
                else:
                    skip = 0
            finally:
                fh.close()
        Otu.set_meta( self, dataset, overwrite, skip, max_data_lines, **kwd )

    def sniff( self, filename, vals_are_int=False ):
        """
        Determines whether the file is a otu (operational taxonomic unit) Shared format
        label<TAB>group<TAB>count[<TAB>value(1..n)]
        The first line is column headings as of Mothur v 1.20
        """
        log.info( "sniff GroupAbund vals_are_int %s" % vals_are_int )
        fh = open( filename )
        try:
            count = 0
            while True:
                line = fh.readline()
                if not line:
                    break  # EOF
                line = line.strip()
                if line and line[0] != '@':
                    pieces = line.split( '\t' )
                    if len( pieces ) < 3:
                        return False
                    # allow a 'label ...' heading line only as the first line
                    if count > 0 or pieces[0] != 'label':
                        try:
                            if int( pieces[2] ) + 3 != len( pieces ):
                                return False
                            for val in pieces[3:]:
                                if vals_are_int:
                                    int( val )
                                else:
                                    float( val )
                        except ValueError:
                            return False
                    count += 1
                    if count >= 5:
                        return True
            return 0 < count < 5
        except Exception:
            return False
        finally:
            fh.close()


class SharedRabund( GroupAbund ):
    file_ext = 'shared'

    def sniff( self, filename ):
        """
        Determines whether the file is a otu (operational taxonomic unit) Shared format
        label<TAB>group<TAB>count[<TAB>value(1..n)]
        The first line is column headings as of Mothur v 1.20
        """
        # shared counts are integers
        isme = GroupAbund.sniff( self, filename, True )
        log.info( "is SharedRabund %s" % isme )
        return isme


class RelAbund( GroupAbund ):
    file_ext = 'relabund'

    def sniff( self, filename ):
        """
        Determines whether the file is a otu (operational taxonomic unit) Relative Abundance format
        label<TAB>group<TAB>count[<TAB>value(1..n)]
        The first line is column headings as of Mothur v 1.20
        """
        # relative abundances are floats
        isme = GroupAbund.sniff( self, filename, False )
        log.info( "is RelAbund %s" % isme )
        return isme


class SecondaryStructureMap( Tabular ):
    file_ext = 'map'

    def __init__( self, **kwd ):
        """Initialize secondary structure map datatype"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['Map']

    def sniff( self, filename ):
        """
        Determines whether the file is a secondary structure map format:
        a single integer column where each row points at its partner row,
        i.e. if structMap[10] == 380 then structMap[380] must be 10.
        """
        fh = open( filename )
        try:
            line_num = 0
            rowidxmap = {}
            while True:
                line = fh.readline()
                if not line:
                    break  # EOF
                line_num += 1
                line = line.strip()
                if line:
                    try:
                        pointer = int( line )
                    except ValueError:
                        return False
                    if pointer > 0:
                        if pointer > line_num:
                            # remember the forward reference; verified when we reach it
                            rowidxmap[line_num] = pointer
                        elif pointer < line_num and rowidxmap.get( pointer ) != line_num:
                            # back-reference must close a previously seen forward reference
                            return False
            return line_num > 0
        except Exception:
            return False
        finally:
            fh.close()


class SequenceAlignment( Fasta ):
    """Mothur aligned fasta: all sequences padded to the same length."""
    file_ext = 'align'

    def __init__( self, **kwd ):
        Fasta.__init__( self, **kwd )

    def sniff( self, filename ):
        """
        Determines whether the file is in Mothur align fasta format.
        Each sequence line must be the same length.
        """
        fh = open( filename )
        try:
            # do NOT name this 'len' - the original shadowed the builtin
            seq_len = -1
            while True:
                line = fh.readline()
                if not line:
                    break  # EOF
                line = line.strip()
                if line:  # first non-empty line
                    if line.startswith( '>' ):
                        # the line after a header must be non-empty sequence data
                        line = fh.readline().strip()
                        if line == '' or line.startswith( '>' ):
                            break
                        if seq_len < 0:
                            seq_len = len( line )
                        elif seq_len != len( line ):
                            return False
                    else:
                        break  # non-empty line that is not a fasta header
            return seq_len > 0
        except Exception:
            return False
        finally:
            fh.close()


class AlignCheck( Tabular ):
    file_ext = 'align.check'

    def __init__( self, **kwd ):
        """Initialize AlignCheck datatype"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['name', 'pound', 'dash', 'plus', 'equal', 'loop', 'tilde', 'total']
        self.column_types = ['str', 'int', 'int', 'int', 'int', 'int', 'int', 'int']
        self.comment_lines = 1

    def set_meta( self, dataset, overwrite=True, **kwd ):
        """Count data lines; the file has a single header line."""
        data_lines = 0
        if dataset.has_data():
            dataset_fh = open( dataset.file_name )
            try:
                for _line in dataset_fh:
                    data_lines += 1
            finally:
                dataset_fh.close()
        dataset.metadata.comment_lines = 1
        dataset.metadata.data_lines = data_lines - 1 if data_lines > 0 else 0
        dataset.metadata.column_names = self.column_names
        dataset.metadata.column_types = self.column_types


class AlignReport( Tabular ):
    """
    QueryName QueryLength TemplateName TemplateLength SearchMethod SearchScore AlignmentMethod QueryStart QueryEnd TemplateStart TemplateEnd PairwiseAlignmentLength GapsInQuery GapsInTemplate LongestInsert SimBtwnQuery&Template
    AY457915 501 82283 1525 kmer 89.07 needleman 5 501 1 499 499 2 0 0 97.6
    """
    file_ext = 'align.report'

    def __init__( self, **kwd ):
        """Initialize AlignReport datatype"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['QueryName', 'QueryLength', 'TemplateName', 'TemplateLength', 'SearchMethod', 'SearchScore',
                             'AlignmentMethod', 'QueryStart', 'QueryEnd', 'TemplateStart', 'TemplateEnd',
                             'PairwiseAlignmentLength', 'GapsInQuery', 'GapsInTemplate', 'LongestInsert', 'SimBtwnQuery&Template'
                             ]


class BellerophonChimera( Tabular ):
    file_ext = 'bellerophon.chimera'

    def __init__( self, **kwd ):
        """Initialize BellerophonChimera datatype"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['Name', 'Score', 'Left', 'Right']


class SecondaryStructureMatch( Tabular ):
    """
    name pound dash plus equal loop tilde total
    9_1_12 42 68 8 28 275 420 872
    9_1_14 36 68 6 26 266 422 851
    """

    def __init__( self, **kwd ):
        """Initialize SecondaryStructureMatch datatype"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['name', 'pound', 'dash', 'plus', 'equal', 'loop', 'tilde', 'total']


class DistanceMatrix( data.Text ):
    """Base class for mothur distance matrices."""
    file_ext = 'dist'
    MetadataElement( name="sequence_count", default=0, desc="Number of sequences", readonly=False, optional=True, no_value=0 )


class LowerTriangleDistanceMatrix( DistanceMatrix ):
    file_ext = 'lower.dist'

    def __init__( self, **kwd ):
        """Initialize lower-triangle distance matrix datatype"""
        DistanceMatrix.__init__( self, **kwd )

    def sniff( self, filename ):
        """
        Determines whether the file is a lower-triangle distance matrix (phylip) format.
        The first line has the number of sequences in the matrix.
        The remaining lines have the sequence name followed by a list of distances from all preceeding sequences:
        5
        U68589
        U68590 0.3371
        U68591 0.3609 0.3782
        U68592 0.4155 0.3197 0.4148
        U68593 0.2872 0.1690 0.3361 0.2842
        """
        fh = open( filename )
        try:
            # first line is the sequence count
            first = fh.readline().strip().split()
            try:
                int( first[0] )
            except ( ValueError, IndexError ):
                return False
            row = 0
            while True:
                line = fh.readline()
                if not line:
                    break  # EOF
                line = line.strip()
                if line:
                    row += 1
                    pieces = line.split()
                    # row n holds a name plus n-1 distances
                    if len( pieces ) != row:
                        return False
                    try:
                        for val in pieces[1:]:
                            float( val )
                    except ValueError:
                        return False
                    if row >= 5:
                        return True
            return row > 0
        except Exception:
            return False
        finally:
            fh.close()


class SquareDistanceMatrix( DistanceMatrix, Tabular ):
    file_ext = 'square.dist'
    sequence_count = -1

    def __init__( self, **kwd ):
        """Initialize square distance matrix datatype"""
        Tabular.__init__( self, **kwd )

    def init_meta( self, dataset, copy_from=None ):
        data.Text.init_meta( self, dataset, copy_from=copy_from )

    def set_meta( self, dataset, overwrite=True, skip=None, **kwd ):
        dataset.metadata.sequences = 0

    def sniff( self, filename ):
        """
        Determines whether the file is a square distance matrix (Column-formatted distance matrix) format.
        The first line has the number of sequences in the matrix.
        Each following line has the sequence name plus one distance column per sequence:
        3
        U68589 0.0000 0.3371 0.3610
        U68590 0.3371 0.0000 0.3783
        U68591 0.3610 0.3783 0.0000
        """
        fh = open( filename )
        try:
            count = 0
            line = fh.readline().strip()
            sequence_count = int( line )
            # name column + one distance per sequence (original referenced undefined seq_cnt)
            col_cnt = sequence_count + 1
            while True:
                line = fh.readline()
                if not line:
                    break  # EOF
                line = line.strip()
                if line and line[0] != '@':
                    pieces = line.split( '\t' )
                    if len( pieces ) != col_cnt:
                        return False
                    try:
                        for val in pieces[1:col_cnt]:
                            float( val )
                    except ValueError:
                        return False
                    count += 1
                    if count == 5:
                        return True
            return 0 < count < 5
        except Exception:
            return False
        finally:
            fh.close()


class PairwiseDistanceMatrix( DistanceMatrix, Tabular ):
    file_ext = 'pair.dist'

    def __init__( self, **kwd ):
        """Initialize pairwise distance matrix datatype"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['Sequence', 'Sequence', 'Distance']
        self.column_types = ['str', 'str', 'float']
        self.comment_lines = 1

    def sniff( self, filename ):
        """
        Determines whether the file is a pairwise distance matrix (Column-formatted distance matrix) format.
        The first and second columns have the sequence names and the third column is the distance between those sequences.
        """
        fh = open( filename )
        try:
            count = 0
            while True:
                line = fh.readline()
                if not line:
                    break  # EOF
                line = line.strip()
                if line and line[0] != '@':
                    pieces = line.split( '\t' )
                    if len( pieces ) != 3:
                        return False
                    try:
                        float( pieces[2] )
                    except ValueError:
                        return False
                    count += 1
                    if count == 5:
                        return True
            return 0 < count < 5
        except Exception:
            return False
        finally:
            fh.close()


# NOTE: this second AlignCheck definition shadows the one defined earlier in this
# module (last definition wins at import time).  Retained as-is to preserve the
# module's effective binding; the duplicate should be reconciled upstream.
class AlignCheck( Tabular ):
    file_ext = 'align.check'

    def __init__( self, **kwd ):
        """Initialize AlignCheck datatype"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['name', 'pound', 'dash', 'plus', 'equal', 'loop', 'tilde', 'total']
        self.columns = 8


class Names( Tabular ):
    file_ext = 'names'

    def __init__( self, **kwd ):
        """Name file shows the relationship between a representative sequence(col 1) and the sequences(comma-separated) it represents(col 2)"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['name', 'representatives']
        self.columns = 2


class Summary( Tabular ):
    file_ext = 'summary'

    def __init__( self, **kwd ):
        """summarizes the quality of sequences in an unaligned or aligned fasta-formatted sequence file"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['seqname', 'start', 'end', 'nbases', 'ambigs', 'polymer']
        self.columns = 6


class Group( Tabular ):
    file_ext = 'groups'

    def __init__( self, **kwd ):
        """Group file maps a sequence name (col 1) to the group it belongs to (col 2)"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['name', 'group']
        self.columns = 2


class Design( Tabular ):
    file_ext = 'design'

    def __init__( self, **kwd ):
        """Design file shows the relationship between a group(col 1) and a grouping (col 2), providing a way to merge groups."""
        Tabular.__init__( self, **kwd )
        self.column_names = ['group', 'grouping']
        self.columns = 2


class AccNos( Tabular ):
    file_ext = 'accnos'

    def __init__( self, **kwd ):
        """A list of names"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['name']
        self.columns = 1


class Oligos( data.Text ):
    """Mothur oligos file describing primers and barcodes."""
    file_ext = 'oligos'

    def sniff( self, filename ):
        """
        Determines whether the file is a mothur oligos format:
        forward|reverse<TAB>sequence  or  barcode<TAB>sequence<TAB>group
        """
        fh = open( filename )
        try:
            count = 0
            while True:
                line = fh.readline()
                if not line:
                    break  # EOF
                line = line.strip()
                if line and line[0] != '#':
                    pieces = line.split( '\t' )
                    if len( pieces ) == 2 and re.match( 'forward|reverse', pieces[0] ):
                        count += 1
                    elif len( pieces ) == 3 and re.match( 'barcode', pieces[0] ):
                        count += 1
                    else:
                        return False
                    if count > 20:
                        return True
            return count > 0
        except Exception:
            return False
        finally:
            fh.close()


class Frequency( Tabular ):
    file_ext = 'freq'

    def __init__( self, **kwd ):
        """Frequency-by-position table used by chimera analysis"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['position', 'frequency']
        self.column_types = ['int', 'float']

    def sniff( self, filename ):
        """
        Determines whether the file is a frequency tabular format for chimera analysis
        #1.14.0
        0 0.000
        1 0.000
        ...
        155 0.975
        """
        fh = open( filename )
        try:
            count = 0
            while True:
                line = fh.readline()
                if not line:
                    break  # EOF
                line = line.strip()
                if line and line[0] != '#':
                    pieces = line.split( '\t' )
                    if len( pieces ) < 2:
                        return False
                    try:
                        int( pieces[0] )
                        float( pieces[1] )
                    except ValueError:
                        return False
                    count += 1
                    if count > 20:
                        return True
            return count > 0
        except Exception:
            return False
        finally:
            fh.close()


class Quantile( Tabular ):
    file_ext = 'quan'
    MetadataElement( name="filtered", default=False, no_value=False, optional=True, desc="Quantiles calculated using a mask", readonly=True )
    MetadataElement( name="masked", default=False, no_value=False, optional=True, desc="Quantiles calculated using a frequency filter", readonly=True )

    def __init__( self, **kwd ):
        """Quantiles for chimera analysis"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['num', 'ten', 'twentyfive', 'fifty', 'seventyfive', 'ninetyfive', 'ninetynine']
        self.column_types = ['int', 'float', 'float', 'float', 'float', 'float', 'float']

    def set_meta( self, dataset, overwrite=True, skip=None, **kwd ):
        log.info( "Mothur Quantile set_meta %s" % kwd )

    def sniff( self, filename ):
        """
        Determines whether the file is a quantiles tabular format for chimera analysis
        1 0 0 0 0 0 0
        2 0.309198 0.309198 0.37161 0.37161 0.37161 0.37161
        3 0.510982 0.563213 0.693529 0.858939 1.07442 1.20608
        ...
        """
        fh = open( filename )
        try:
            count = 0
            while True:
                line = fh.readline()
                if not line:
                    break  # EOF
                line = line.strip()
                if line and line[0] != '#':
                    pieces = line.split( '\t' )
                    # an int followed by six float quantile columns
                    if len( pieces ) < 7:
                        return False
                    try:
                        int( pieces[0] )
                        for val in pieces[1:7]:
                            float( val )
                    except ValueError:
                        return False
                    count += 1
                    if count > 10:
                        return True
            return count > 0
        except Exception:
            return False
        finally:
            fh.close()


class FilteredQuantile( Quantile ):
    file_ext = 'filtered.quan'

    def __init__( self, **kwd ):
        """Quantiles for chimera analysis"""
        Quantile.__init__( self, **kwd )
        self.filtered = True


class MaskedQuantile( Quantile ):
    file_ext = 'masked.quan'

    def __init__( self, **kwd ):
        """Quantiles for chimera analysis"""
        Quantile.__init__( self, **kwd )
        self.masked = True
        self.filtered = False


class FilteredMaskedQuantile( Quantile ):
    file_ext = 'filtered.masked.quan'

    def __init__( self, **kwd ):
        """Quantiles for chimera analysis"""
        Quantile.__init__( self, **kwd )
        self.masked = True
        self.filtered = True


class LaneMask( data.Text ):
    """Mothur lane mask filter: one line consisting only of zeros and ones."""
    file_ext = 'filter'

    def sniff( self, filename ):
        """
        Determines whether the file is a lane mask filter: 1 line consisting of zeros and ones.
        """
        fh = open( filename )
        try:
            seen = False
            while True:
                # read in chunks so huge filters are never fully resident
                buff = fh.read( 1000 )
                if not buff:
                    break  # EOF
                chunk = buff.strip()
                if chunk:
                    # original tested an undefined 'line' variable here
                    if not re.match( '^[01]+$', chunk ):
                        return False
                    seen = True
            return seen
        except Exception:
            return False
        finally:
            fh.close()


class SequenceTaxonomy( Tabular ):
    """
    A table with 2 columns:
    - SequenceName
    - Taxonomy (semicolon-separated taxonomy in descending order)
    Example:
    X56533.1  Eukaryota;Alveolata;Ciliophora;Intramacronucleata;Oligohymenophorea;Hymenostomatida;Tetrahymenina;Glaucomidae;Glaucoma;
    X97975.1  Eukaryota;Parabasalidea;Trichomonada;Trichomonadida;unclassified_Trichomonadida;
    AF052717.1 Eukaryota;Parabasalidea;
    """
    file_ext = 'seq.taxonomy'

    def __init__( self, **kwd ):
        Tabular.__init__( self, **kwd )
        self.column_names = ['name', 'taxonomy']

    def sniff( self, filename ):
        """
        Determines whether the file is a SequenceTaxonomy
        """
        fh = open( filename )
        try:
            # one or more ';'-terminated taxon names, optionally with a (confidence)
            pat = re.compile( r'^([^ \t\n\r\f\v;]+([(]\d+[)])?[;])+$' )
            count = 0
            while True:
                line = fh.readline()
                if not line:
                    break  # EOF
                line = line.strip()
                if line:
                    fields = line.split( '\t' )
                    if len( fields ) != 2:
                        return False
                    if not pat.match( fields[1] ):
                        return False
                    count += 1
                    if count > 10:
                        break
            return count > 0
        except Exception:
            return False
        finally:
            fh.close()


class RDPSequenceTaxonomy( SequenceTaxonomy ):
    """
    A table with 2 columns:
    - SequenceName
    - Taxonomy (semicolon-separated taxonomy in descending order, RDP requires exactly 6 levels deep)
    Example:
    AB001518.1  Bacteria;Bacteroidetes;Sphingobacteria;Sphingobacteriales;unclassified_Sphingobacteriales;
    AB001724.1  Bacteria;Cyanobacteria;Cyanobacteria;Family_II;GpIIa;
    AB001774.1  Bacteria;Chlamydiae;Chlamydiae;Chlamydiales;Chlamydiaceae;Chlamydophila;
    """
    file_ext = 'rdp.taxonomy'

    def sniff( self, filename ):
        """
        Determines whether the file is an RDP SequenceTaxonomy (exactly 6 levels)
        """
        fh = open( filename )
        try:
            # exactly six ';'-terminated taxon levels
            pat = re.compile( r'^([^ \t\n\r\f\v;]+([(]\d+[)])?[;]){6}$' )
            count = 0
            while True:
                line = fh.readline()
                if not line:
                    break  # EOF
                line = line.strip()
                if line:
                    fields = line.split( '\t' )
                    if len( fields ) != 2:
                        return False
                    if not pat.match( fields[1] ):
                        return False
                    count += 1
                    if count > 10:
                        break
            return count > 0
        except Exception:
            return False
        finally:
            fh.close()


class ConsensusTaxonomy( Tabular ):
    file_ext = 'cons.taxonomy'

    def __init__( self, **kwd ):
        """Consensus taxonomy assigned to each OTU"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['OTU', 'count', 'taxonomy']


class TaxonomySummary( Tabular ):
    file_ext = 'tax.summary'

    def __init__( self, **kwd ):
        """A Summary of taxon classification"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['taxlevel', 'rankID', 'taxon', 'daughterlevels', 'total']


class Phylip( data.Text ):
    file_ext = 'phy'

    def sniff( self, filename ):
        """
        Determines whether the file is in Phylip format (Interleaved or Sequential)
        The first line of the input file contains the number of species and the
        number of characters, in free format, separated by blanks (not by
        commas). The information for each species follows, starting with a
        ten-character species name (which can include punctuation marks and blanks),
        and continuing with the characters for that species.
        http://evolution.genetics.washington.edu/phylip/doc/main.html#inputfiles
        """
        fh = open( filename )
        try:
            # counts line: <num species> <num characters>
            pieces = fh.readline().split()
            count = int( pieces[0] )
            seq_len = int( pieces[1] )
            # TODO check data lines:
            #   name is the first 10 characters,
            #   remainder is sequence (nucleic/amino 1-char codes, spaces allowed)
            return count > 0 and seq_len > 0
        except ( ValueError, IndexError ):
            return False
        finally:
            fh.close()


class Axes( Tabular ):
    file_ext = 'axes'

    def __init__( self, **kwd ):
        """Initialize axes datatype"""
        Tabular.__init__( self, **kwd )

    def sniff( self, filename ):
        """
        Determines whether the file is an axes format.
        The first line may have column headings.
        The following lines have the name in the first column plus float columns for each axis.
        ==> 98_sq_phylip_amazon.fn.unique.pca.axes <==
        group   axis1   axis2
        forest  0.000000        0.145743
        pasture 0.145743        0.000000
        """
        fh = open( filename )
        try:
            count = 0
            col_cnt = None
            # the first line may be column headings - skip it
            fh.readline()
            while True:
                line = fh.readline()
                if not line:
                    break  # EOF
                line = line.strip()
                if line:
                    fields = line.split( '\t' )
                    if col_cnt is None:
                        # first data line fixes the expected column count
                        col_cnt = len( fields )
                    elif len( fields ) != col_cnt:
                        return False
                    try:
                        for val in fields[1:col_cnt]:
                            float( val )
                    except ValueError:
                        return False
                    count += 1
                    if count > 10:
                        return True
            return count > 0
        except Exception:
            return False
        finally:
            fh.close()


## Qiime Classes

class QiimeMetadataMapping( Tabular ):
    MetadataElement( name="column_names", default=[], desc="Column Names", readonly=False, visible=True, no_value=[] )
    file_ext = 'qiimemapping'

    def __init__( self, **kwd ):
        """
        Qiime mapping file: information about the samples necessary to perform the data analysis.
        http://qiime.sourceforge.net/documentation/file_formats.html#mapping-file-overview
        """
        Tabular.__init__( self, **kwd )

    def sniff( self, filename ):
        """
        Determines whether the file is a qiime mapping file.
        Just checking for an appropriate header line for now, could be improved.
        """
        fh = open( filename )
        try:
            pat = r'#SampleID(\t[a-zA-Z][a-zA-Z0-9_]*)*\tDescription'
            while True:
                # original read from an undefined handle and never broke at EOF
                line = fh.readline()
                if not line:
                    break  # EOF
                if re.match( pat, line ):
                    return True
            return False
        except Exception:
            return False
        finally:
            fh.close()

    def set_column_names( self, dataset ):
        """Populate metadata.column_names from the '#SampleID' header line."""
        if dataset.has_data():
            dataset_fh = open( dataset.file_name )
            try:
                line = dataset_fh.readline()
                if line.startswith( '#SampleID' ):
                    dataset.metadata.column_names = line.strip().split( '\t' )
            finally:
                dataset_fh.close()

    def set_meta( self, dataset, overwrite=True, skip=None, max_data_lines=None, **kwd ):
        Tabular.set_meta( self, dataset, overwrite, skip, max_data_lines )
        self.set_column_names( dataset )


class QiimeOTU( Tabular ):
    """
    Associates OTUs with sequence IDs
    Example:
    0    FLP3FBN01C2MYD FLP3FBN01B2ALM
    1    FLP3FBN01DF6NE FLP3FBN01CKW1J FLP3FBN01CHVM4
    2    FLP3FBN01AXQ2Z
    """
    file_ext = 'qiimeotu'


class QiimeOTUTable( Tabular ):
    """
    #Full OTU Counts
    #OTU ID PC.354  PC.355  PC.356  Consensus Lineage
    0       0       1       0       Root;Bacteria;Firmicutes;"Clostridia";Clostridiales
    1       1       3       1       Root;Bacteria
    2       0       2       2       Root;Bacteria;Bacteroidetes
    """
    MetadataElement( name="column_names", default=[], desc="Column Names", readonly=False, visible=True, no_value=[] )
    file_ext = 'qiimeotutable'

    def init_meta( self, dataset, copy_from=None ):
        tabular.Tabular.init_meta( self, dataset, copy_from=copy_from )

    def set_meta( self, dataset, overwrite=True, skip=None, **kwd ):
        self.set_column_names( dataset )

    def set_column_names( self, dataset ):
        """Column names live on the 2nd line: '#OTU ID<TAB>...'."""
        if dataset.has_data():
            dataset_fh = open( dataset.file_name )
            try:
                dataset_fh.readline()
                line = dataset_fh.readline()
                if line.startswith( '#OTU ID' ):
                    dataset.metadata.column_names = line.strip().split( '\t' )
            finally:
                dataset_fh.close()
            dataset.metadata.comment_lines = 2


class QiimeDistanceMatrix( Tabular ):
    """
            PC.354  PC.355  PC.356
    PC.354  0.0     3.177   1.955
    PC.355  3.177   0.0     3.444
    PC.356  1.955   3.444   0.0
    """
    file_ext = 'qiimedistmat'

    def init_meta( self, dataset, copy_from=None ):
        tabular.Tabular.init_meta( self, dataset, copy_from=copy_from )

    def set_meta( self, dataset, overwrite=True, skip=None, **kwd ):
        self.set_column_names( dataset )

    def set_column_names( self, dataset ):
        """The first line holds the sample names used as column names."""
        if dataset.has_data():
            dataset_fh = open( dataset.file_name )
            try:
                line = dataset_fh.readline()
                dataset.metadata.column_names = line.strip().split( '\t' )
            finally:
                dataset_fh.close()
            dataset.metadata.comment_lines = 1


class QiimePCA( Tabular ):
    """
    Principal Coordinate Analysis Data
    The principal coordinate (PC) axes (columns) for each sample (rows).
    Pairs of PCs can then be graphed to view the relationships between samples.
    The bottom of the output file contains the eigenvalues and % variation explained for each PC.
    Example:
    pc vector number        1       2       3
    PC.354  -0.309063936588 0.0398252112257 0.0744672231759
    PC.355  -0.106593922619 0.141125998277  0.0780204374172
    PC.356  -0.219869362955 0.00917241121781        0.0357281314115

    eigvals 0.480220500471  0.163567082874  0.125594470811
    % variation explained   51.6955484555   17.6079322939
    """
    file_ext = 'qiimepca'


class QiimeParams( Tabular ):
    """
    ###pick_otus_through_otu_table.py parameters###

    # OTU picker parameters
    pick_otus:otu_picking_method    uclust
    pick_otus:clustering_algorithm  furthest

    # Representative set picker parameters
    pick_rep_set:rep_set_picking_method     first
    pick_rep_set:sort_by    otu
    """
    file_ext = 'qiimeparams'


class QiimePrefs( data.Text ):
    """
    A text file, containing coloring preferences to be used by make_distance_histograms.py, make_2d_plots.py and make_3d_plots.py.
    Example:
    {
    'background_color':'black',

    'sample_coloring':
        {
            'Treatment':
            {
                'column':'Treatment',
                'colors':(('red',(0,100,100)),('blue',(240,100,100)))
            },
            'DOB':
            {
                'column':'DOB',
                'colors':(('red',(0,100,100)),('blue',(240,100,100)))
            }
        },
    'MONTE_CARLO_GROUP_DISTANCES':
        {
            'Treatment': 10,
            'DOB': 10
        }
    }
    """
    file_ext = 'qiimeprefs'


class QiimeTaxaSummary( Tabular ):
    """
    Taxon   PC.354  PC.355  PC.356
    Root;Bacteria;Actinobacteria    0.0     0.177   0.955
    Root;Bacteria;Firmicutes        0.177   0.0     0.444
    Root;Bacteria;Proteobacteria    0.955   0.444   0.0
    """
    MetadataElement( name="column_names", default=[], desc="Column Names", readonly=False, visible=True, no_value=[] )
    file_ext = 'qiimetaxsummary'

    def set_column_names( self, dataset ):
        """Populate metadata.column_names from the 'Taxon ...' header line."""
        if dataset.has_data():
            dataset_fh = open( dataset.file_name )
            try:
                line = dataset_fh.readline()
                if line.startswith( 'Taxon' ):
                    dataset.metadata.column_names = line.strip().split( '\t' )
            finally:
                dataset_fh.close()

    def set_meta( self, dataset, overwrite=True, skip=None, max_data_lines=None, **kwd ):
        Tabular.set_meta( self, dataset, overwrite, skip, max_data_lines )
        self.set_column_names( dataset )


if __name__ == '__main__':
    import doctest, sys
    doctest.testmod( sys.modules[__name__] )