Mercurial > repos > jjohnson > mothur_toolsuite
view mothur/lib/galaxy/datatypes/metagenomics.py @ 21:97e35ab2887c
Changed np_shannon calculator to npshannon and posted bug to mothur.org
author | Jim Johnson <jj@umn.edu> |
---|---|
date | Tue, 17 Jan 2012 14:42:27 -0600 |
parents | 57df76d861e4 |
children | bfbaf823be4c |
line wrap: on
line source
""" metagenomics datatypes James E Johnson - University of Minnesota for Mothur """ import logging, os, os.path, sys, time, tempfile, shutil, string, glob, re import galaxy.model from galaxy.datatypes import data from galaxy.datatypes.sniff import * from galaxy.datatypes import metadata from galaxy.datatypes import tabular from galaxy.datatypes import sequence from galaxy.datatypes.metadata import MetadataElement from galaxy.datatypes.data import Text from galaxy.datatypes.tabular import Tabular from galaxy.datatypes.sequence import Fasta from galaxy import util from galaxy.datatypes.images import Html log = logging.getLogger(__name__) ## Mothur Classes class Otu( Tabular ): file_ext = 'otu' def sniff( self, filename ): """ Determines whether the file is a otu (operational taxonomic unit) format """ try: fh = open( filename ) count = 0 while True: line = fh.readline() line = line.strip() if not line: break #EOF if line: if line[0] != '@': linePieces = line.split('\t') if len(linePieces) < 2: return False try: check = int(linePieces[1]) if check + 2 != len(linePieces): return False except ValueError: return False count += 1 if count == 5: return True fh.close() if count < 5 and count > 0: return True except: pass finally: fh.close() return False class OtuList( Otu ): file_ext = 'list' class Sabund( Otu ): file_ext = 'sabund' def sniff( self, filename ): """ Determines whether the file is a otu (operational taxonomic unit) format label<TAB>count[<TAB>value(1..n)] """ try: fh = open( filename ) count = 0 while True: line = fh.readline() line = line.strip() if not line: break #EOF if line: if line[0] != '@': linePieces = line.split('\t') if len(linePieces) < 2: return False try: check = int(linePieces[1]) if check + 2 != len(linePieces): return False for i in range( 2, len(linePieces)): ival = int(linePieces[i]) except ValueError: return False count += 1 if count >= 5: return True fh.close() if count < 5 and count > 0: return True except: pass finally: fh.close() 
return False class Rabund( Sabund ): file_ext = 'rabund' class GroupAbund( Otu ): file_ext = 'grpabund' def init_meta( self, dataset, copy_from=None ): Otu.init_meta( self, dataset, copy_from=copy_from ) def set_meta( self, dataset, overwrite = True, skip=1, max_data_lines = 100000, **kwd ): # See if file starts with header line if dataset.has_data(): try: fh = open( dataset.file_name ) line = fh.readline() line = line.strip() linePieces = line.split('\t') if linePieces[0] == 'label' and linePieces[1] == 'Group': skip=1 else: skip=0 finally: fh.close() Otu.set_meta( self, dataset, overwrite, skip, max_data_lines, **kwd) def sniff( self, filename, vals_are_int=False): """ Determines whether the file is a otu (operational taxonomic unit) Shared format label<TAB>group<TAB>count[<TAB>value(1..n)] The first line is column headings as of Mothur v 1.20 """ log.info( "sniff GroupAbund vals_are_int %s" % vals_are_int) try: fh = open( filename ) count = 0 while True: line = fh.readline() line = line.strip() if not line: break #EOF if line: if line[0] != '@': linePieces = line.split('\t') if len(linePieces) < 3: return False if count > 0 or linePieces[0] != 'label': try: check = int(linePieces[2]) if check + 3 != len(linePieces): return False for i in range( 3, len(linePieces)): if vals_are_int: ival = int(linePieces[i]) else: fval = float(linePieces[i]) except ValueError: return False count += 1 if count >= 5: return True fh.close() if count < 5 and count > 0: return True except: pass finally: fh.close() return False class SharedRabund( GroupAbund ): file_ext = 'shared' def sniff( self, filename ): """ Determines whether the file is a otu (operational taxonomic unit) Shared format label<TAB>group<TAB>count[<TAB>value(1..n)] The first line is column headings as of Mothur v 1.20 """ # return GroupAbund.sniff(self,filename,True) isme = GroupAbund.sniff(self,filename,True) log.info( "is SharedRabund %s" % isme) return isme class RelAbund( GroupAbund ): file_ext = 'relabund' def 
sniff( self, filename ): """ Determines whether the file is a otu (operational taxonomic unit) Relative Abundance format label<TAB>group<TAB>count[<TAB>value(1..n)] The first line is column headings as of Mothur v 1.20 """ # return GroupAbund.sniff(self,filename,False) isme = GroupAbund.sniff(self,filename,False) log.info( "is RelAbund %s" % isme) return isme class SecondaryStructureMap(Tabular): file_ext = 'map' def __init__(self, **kwd): """Initialize secondary structure map datatype""" Tabular.__init__( self, **kwd ) self.column_names = ['Map'] def sniff( self, filename ): """ Determines whether the file is a secondary structure map format A single column with an integer value which indicates the row that this row maps to. check you make sure is structMap[10] = 380 then structMap[380] = 10. """ try: fh = open( filename ) line_num = 0 rowidxmap = {} while True: line = fh.readline() line_num += 1 line = line.strip() if not line: break #EOF if line: try: pointer = int(line) if pointer > 0: if pointer > line_num: rowidxmap[line_num] = pointer elif pointer < line_num & rowidxmap[pointer] != line_num: return False except ValueError: return False fh.close() if count < 5 and count > 0: return True except: pass finally: fh.close() return False class SequenceAlignment( Fasta ): file_ext = 'align' def __init__(self, **kwd): Fasta.__init__( self, **kwd ) """Initialize AlignCheck datatype""" def sniff( self, filename ): """ Determines whether the file is in Mothur align fasta format Each sequence line must be the same length """ try: fh = open( filename ) len = -1 while True: line = fh.readline() if not line: break #EOF line = line.strip() if line: #first non-empty line if line.startswith( '>' ): #The next line.strip() must not be '', nor startwith '>' line = fh.readline().strip() if line == '' or line.startswith( '>' ): break if len < 0: len = len(line) elif len != len(line): return False else: break #we found a non-empty line, but its not a fasta header if len > 0: return 
True except: pass finally: fh.close() return False class AlignCheck( Tabular ): file_ext = 'align.check' def __init__(self, **kwd): """Initialize AlignCheck datatype""" Tabular.__init__( self, **kwd ) self.column_names = ['name','pound','dash','plus','equal','loop','tilde','total'] self.column_types = ['str','int','int','int','int','int','int','int'] self.comment_lines = 1 def set_meta( self, dataset, overwrite = True, **kwd ): # Tabular.set_meta( self, dataset, overwrite = overwrite, first_line_is_header = True, skip = 1 ) data_lines = 0 if dataset.has_data(): dataset_fh = open( dataset.file_name ) while True: line = dataset_fh.readline() if not line: break data_lines += 1 dataset_fh.close() dataset.metadata.comment_lines = 1 dataset.metadata.data_lines = data_lines - 1 if data_lines > 0 else 0 dataset.metadata.column_names = self.column_names dataset.metadata.column_types = self.column_types class AlignReport(Tabular): """ QueryName QueryLength TemplateName TemplateLength SearchMethod SearchScore AlignmentMethod QueryStart QueryEnd TemplateStart TemplateEnd PairwiseAlignmentLength GapsInQuery GapsInTemplate LongestInsert SimBtwnQuery&Template AY457915 501 82283 1525 kmer 89.07 needleman 5 501 1 499 499 2 0 0 97.6 """ file_ext = 'align.report' def __init__(self, **kwd): """Initialize AlignCheck datatype""" Tabular.__init__( self, **kwd ) self.column_names = ['QueryName','QueryLength','TemplateName','TemplateLength','SearchMethod','SearchScore', 'AlignmentMethod','QueryStart','QueryEnd','TemplateStart','TemplateEnd', 'PairwiseAlignmentLength','GapsInQuery','GapsInTemplate','LongestInsert','SimBtwnQuery&Template' ] class BellerophonChimera( Tabular ): file_ext = 'bellerophon.chimera' def __init__(self, **kwd): """Initialize AlignCheck datatype""" Tabular.__init__( self, **kwd ) self.column_names = ['Name','Score','Left','Right'] class SecondaryStructureMatch(Tabular): """ name pound dash plus equal loop tilde total 9_1_12 42 68 8 28 275 420 872 9_1_14 36 68 6 26 266 
422 851 9_1_15 44 68 8 28 276 418 873 9_1_16 34 72 6 30 267 430 860 9_1_18 46 80 2 36 261 """ def __init__(self, **kwd): """Initialize SecondaryStructureMatch datatype""" Tabular.__init__( self, **kwd ) self.column_names = ['name','pound','dash','plus','equal','loop','tilde','total'] class DistanceMatrix( Text ): file_ext = 'dist' """Add metadata elements""" MetadataElement( name="sequence_count", default=0, desc="Number of sequences", readonly=False, optional=True, no_value=0 ) class LowerTriangleDistanceMatrix(DistanceMatrix): file_ext = 'lower.dist' def __init__(self, **kwd): """Initialize secondary structure map datatype""" DistanceMatrix.__init__( self, **kwd ) def sniff( self, filename ): """ Determines whether the file is a lower-triangle distance matrix (phylip) format The first line has the number of sequences in the matrix. The remaining lines have the sequence name followed by a list of distances from all preceeding sequences 5 U68589 U68590 0.3371 U68591 0.3609 0.3782 U68592 0.4155 0.3197 0.4148 U68593 0.2872 0.1690 0.3361 0.2842 """ try: fh = open( filename ) count = 0 while True: line = fh.readline() line = line.strip() if not line: break #EOF if line: if line[0] != '@': linePieces = line.split('\t') if len(linePieces) != 3: return False try: check = float(linePieces[2]) except ValueError: return False count += 1 if count == 5: return True fh.close() if count < 5 and count > 0: return True except: pass finally: fh.close() return False class SquareDistanceMatrix(DistanceMatrix,Tabular): file_ext = 'square.dist' sequence_count = -1 def __init__(self, **kwd): """Initialize secondary structure map datatype""" Tabular.__init__( self, **kwd ) def init_meta( self, dataset, copy_from=None ): Text.init_meta( self, dataset, copy_from=copy_from ) def set_meta( self, dataset, overwrite = True, skip = None, **kwd ): dataset.metadata.sequences = 0 def sniff( self, filename ): """ Determines whether the file is a square distance matrix (Column-formatted distance 
matrix) format The first line has the number of sequences in the matrix. The following lines have the sequence name in the first column plus a column for the distance to each sequence in the row order in which they appear in the matrix. 3 U68589 0.0000 0.3371 0.3610 U68590 0.3371 0.0000 0.3783 U68590 0.3371 0.0000 0.3783 """ try: fh = open( filename ) count = 0 line = fh.readline() line = line.strip() sequence_count = int(line) col_cnt = seq_cnt + 1 while True: line = fh.readline() line = line.strip() if not line: break #EOF if line: if line[0] != '@': linePieces = line.split('\t') if len(linePieces) != col_cnt : return False try: for i in range(1, col_cnt): check = float(linePieces[i]) except ValueError: return False count += 1 if count == 5: return True fh.close() if count < 5 and count > 0: return True except: pass finally: fh.close() return False class PairwiseDistanceMatrix(DistanceMatrix,Tabular): file_ext = 'pair.dist' def __init__(self, **kwd): """Initialize secondary structure map datatype""" Tabular.__init__( self, **kwd ) self.column_names = ['Sequence','Sequence','Distance'] self.column_types = ['str','str','float'] self.comment_lines = 1 def sniff( self, filename ): """ Determines whether the file is a pairwise distance matrix (Column-formatted distance matrix) format The first and second columns have the sequence names and the third column is the distance between those sequences. 
""" try: fh = open( filename ) count = 0 while True: line = fh.readline() line = line.strip() if not line: break #EOF if line: if line[0] != '@': linePieces = line.split('\t') if len(linePieces) != 3: return False try: check = float(linePieces[2]) except ValueError: return False count += 1 if count == 5: return True fh.close() if count < 5 and count > 0: return True except: pass finally: fh.close() return False class AlignCheck(Tabular): file_ext = 'align.check' def __init__(self, **kwd): """Initialize secondary structure map datatype""" Tabular.__init__( self, **kwd ) self.column_names = ['name','pound','dash','plus','equal','loop','tilde','total'] self.columns = 8 class Names(Tabular): file_ext = 'names' def __init__(self, **kwd): """Name file shows the relationship between a representative sequence(col 1) and the sequences(comma-separated) it represents(col 2)""" Tabular.__init__( self, **kwd ) self.column_names = ['name','representatives'] self.columns = 2 class Summary(Tabular): file_ext = 'summary' def __init__(self, **kwd): """summarizes the quality of sequences in an unaligned or aligned fasta-formatted sequence file""" Tabular.__init__( self, **kwd ) self.column_names = ['seqname','start','end','nbases','ambigs','polymer'] self.columns = 6 class Group(Tabular): file_ext = 'groups' def __init__(self, **kwd): """Name file shows the relationship between a representative sequence(col 1) and the sequences it represents(col 2)""" Tabular.__init__( self, **kwd ) self.column_names = ['name','group'] self.columns = 2 class Design(Tabular): file_ext = 'design' def __init__(self, **kwd): """Name file shows the relationship between a group(col 1) and a grouping (col 2), providing a way to merge groups.""" Tabular.__init__( self, **kwd ) self.column_names = ['group','grouping'] self.columns = 2 class AccNos(Tabular): file_ext = 'accnos' def __init__(self, **kwd): """A list of names""" Tabular.__init__( self, **kwd ) self.column_names = ['name'] self.columns = 1 class 
Oligos( Text ): file_ext = 'oligos' def sniff( self, filename ): """ Determines whether the file is a otu (operational taxonomic unit) format """ try: fh = open( filename ) count = 0 while True: line = fh.readline() line = line.strip() if not line: break #EOF else: if line[0] != '#': linePieces = line.split('\t') if len(linePieces) == 2 and re.match('forward|reverse',linePieces[0]): count += 1 continue elif len(linePieces) == 3 and re.match('barcode',linePieces[0]): count += 1 continue else: return False if count > 20: return True if count > 0: return True except: pass finally: fh.close() return False class Frequency(Tabular): file_ext = 'freq' def __init__(self, **kwd): """A list of names""" Tabular.__init__( self, **kwd ) self.column_names = ['position','frequency'] self.column_types = ['int','float'] def sniff( self, filename ): """ Determines whether the file is a frequency tabular format for chimera analysis #1.14.0 0 0.000 1 0.000 ... 155 0.975 """ try: fh = open( filename ) count = 0 while True: line = fh.readline() line = line.strip() if not line: break #EOF else: if line[0] != '#': try: linePieces = line.split('\t') i = int(linePieces[0]) f = float(linePieces[1]) count += 1 continue except: return False if count > 20: return True if count > 0: return True except: pass finally: fh.close() return False class Quantile(Tabular): file_ext = 'quan' MetadataElement( name="filtered", default=False, no_value=False, optional=True , desc="Quantiles calculated using a mask", readonly=True) MetadataElement( name="masked", default=False, no_value=False, optional=True , desc="Quantiles calculated using a frequency filter", readonly=True) def __init__(self, **kwd): """Quantiles for chimera analysis""" Tabular.__init__( self, **kwd ) self.column_names = ['num','ten','twentyfive','fifty','seventyfive','ninetyfive','ninetynine'] self.column_types = ['int','float','float','float','float','float','float'] def set_meta( self, dataset, overwrite = True, skip = None, **kwd ): 
log.info( "Mothur Quantile set_meta %s" % kwd) def sniff( self, filename ): """ Determines whether the file is a quantiles tabular format for chimera analysis 1 0 0 0 0 0 0 2 0.309198 0.309198 0.37161 0.37161 0.37161 0.37161 3 0.510982 0.563213 0.693529 0.858939 1.07442 1.20608 ... """ try: fh = open( filename ) count = 0 while True: line = fh.readline() line = line.strip() if not line: break #EOF else: if line[0] != '#': try: linePieces = line.split('\t') i = int(linePieces[0]) f = float(linePieces[1]) f = float(linePieces[2]) f = float(linePieces[3]) f = float(linePieces[4]) f = float(linePieces[5]) f = float(linePieces[6]) count += 1 continue except: return False if count > 10: return True if count > 0: return True except: pass finally: fh.close() return False class FilteredQuantile(Quantile): file_ext = 'filtered.quan' def __init__(self, **kwd): """Quantiles for chimera analysis""" Quantile.__init__( self, **kwd ) self.filtered = True class MaskedQuantile(Quantile): file_ext = 'masked.quan' def __init__(self, **kwd): """Quantiles for chimera analysis""" Quantile.__init__( self, **kwd ) self.masked = True self.filtered = False class FilteredMaskedQuantile(Quantile): file_ext = 'filtered.masked.quan' def __init__(self, **kwd): """Quantiles for chimera analysis""" Quantile.__init__( self, **kwd ) self.masked = True self.filtered = True class LaneMask(Text): file_ext = 'filter' def sniff( self, filename ): """ Determines whether the file is a lane mask filter: 1 line consisting of zeros and ones. """ try: fh = open( filename ) while True: buff = fh.read(1000) if not buff: break #EOF else: if not re.match('^[01]+$',line): return False return True except: pass finally: close(fh) return False class RefTaxonomy(Tabular): file_ext = 'ref.taxonomy' """ A table with 2 or 3 columns: - SequenceName - Taxonomy (semicolon-separated taxonomy in descending order) - integer ? 
Example: 2-column ( http://www.mothur.org/wiki/Taxonomy_outline ) X56533.1 Eukaryota;Alveolata;Ciliophora;Intramacronucleata;Oligohymenophorea;Hymenostomatida;Tetrahymenina;Glaucomidae;Glaucoma; X97975.1 Eukaryota;Parabasalidea;Trichomonada;Trichomonadida;unclassified_Trichomonadida; AF052717.1 Eukaryota;Parabasalidea; Example: 3-column ( http://vamps.mbl.edu/resources/databases.php ) v3_AA008 Bacteria;Firmicutes;Bacilli;Lactobacillales;Streptococcaceae;Streptococcus 5 v3_AA016 Bacteria 120 v3_AA019 Archaea;Crenarchaeota;Marine_Group_I 1 """ def __init__(self, **kwd): Tabular.__init__( self, **kwd ) self.column_names = ['name','taxonomy'] def sniff( self, filename ): """ Determines whether the file is a SequenceTaxonomy """ try: pat = '^([^ \t\n\r\x0c\x0b;]+([(]\\d+[)])?(;[^ \t\n\r\x0c\x0b;]+([(]\\d+[)])?)*(;)?)$' fh = open( filename ) count = 0 while True: line = fh.readline() if not line: break #EOF line = line.strip() if line: fields = line.split('\t') if 2 <= len(fields) <= 3: return False if not re.match(pat,fields[1]): return False count += 1 if count > 10: break if count > 0: return True except: pass finally: fh.close() return False class SequenceTaxonomy(RefTaxonomy): file_ext = 'seq.taxonomy' """ A table with 2 columns: - SequenceName - Taxonomy (semicolon-separated taxonomy in descending order) Example: X56533.1 Eukaryota;Alveolata;Ciliophora;Intramacronucleata;Oligohymenophorea;Hymenostomatida;Tetrahymenina;Glaucomidae;Glaucoma; X97975.1 Eukaryota;Parabasalidea;Trichomonada;Trichomonadida;unclassified_Trichomonadida; AF052717.1 Eukaryota;Parabasalidea; """ def __init__(self, **kwd): Tabular.__init__( self, **kwd ) self.column_names = ['name','taxonomy'] def sniff( self, filename ): """ Determines whether the file is a SequenceTaxonomy """ try: pat = '^([^ \t\n\r\f\v;]+([(]\d+[)])?[;])+$' fh = open( filename ) count = 0 while True: line = fh.readline() if not line: break #EOF line = line.strip() if line: fields = line.split('\t') if len(fields) != 2: 
return False if not re.match(pat,fields[1]): return False count += 1 if count > 10: break if count > 0: return True except: pass finally: fh.close() return False class RDPSequenceTaxonomy(SequenceTaxonomy): file_ext = 'rdp.taxonomy' """ A table with 2 columns: - SequenceName - Taxonomy (semicolon-separated taxonomy in descending order, RDP requires exactly 6 levels deep) Example: AB001518.1 Bacteria;Bacteroidetes;Sphingobacteria;Sphingobacteriales;unclassified_Sphingobacteriales; AB001724.1 Bacteria;Cyanobacteria;Cyanobacteria;Family_II;GpIIa; AB001774.1 Bacteria;Chlamydiae;Chlamydiae;Chlamydiales;Chlamydiaceae;Chlamydophila; """ def sniff( self, filename ): """ Determines whether the file is a SequenceTaxonomy """ try: pat = '^([^ \t\n\r\f\v;]+([(]\d+[)])?[;]){6}$' fh = open( filename ) count = 0 while True: line = fh.readline() if not line: break #EOF line = line.strip() if line: fields = line.split('\t') if len(fields) != 2: return False if not re.match(pat,fields[1]): return False count += 1 if count > 10: break if count > 0: return True except: pass finally: fh.close() return False class ConsensusTaxonomy(Tabular): file_ext = 'cons.taxonomy' def __init__(self, **kwd): """A list of names""" Tabular.__init__( self, **kwd ) self.column_names = ['OTU','count','taxonomy'] class TaxonomySummary(Tabular): file_ext = 'tax.summary' def __init__(self, **kwd): """A Summary of taxon classification""" Tabular.__init__( self, **kwd ) self.column_names = ['taxlevel','rankID','taxon','daughterlevels','total'] class Phylip(Text): file_ext = 'phy' def sniff( self, filename ): """ Determines whether the file is in Phylip format (Interleaved or Sequential) The first line of the input file contains the number of species and the number of characters, in free format, separated by blanks (not by commas). 
The information for each species follows, starting with a ten-character species name (which can include punctuation marks and blanks), and continuing with the characters for that species. http://evolution.genetics.washington.edu/phylip/doc/main.html#inputfiles Interleaved Example: 6 39 Archaeopt CGATGCTTAC CGCCGATGCT HesperorniCGTTACTCGT TGTCGTTACT BaluchitheTAATGTTAAT TGTTAATGTT B. virginiTAATGTTCGT TGTTAATGTT BrontosaurCAAAACCCAT CATCAAAACC B.subtilisGGCAGCCAAT CACGGCAGCC TACCGCCGAT GCTTACCGC CGTTGTCGTT ACTCGTTGT AATTGTTAAT GTTAATTGT CGTTGTTAAT GTTCGTTGT CATCATCAAA ACCCATCAT AATCACGGCA GCCAATCAC """ try: fh = open( filename ) # counts line line = fh.readline().strip() linePieces = line.split() count = int(linePieces[0]) seq_len = int(linePieces[1]) # data lines """ TODO check data lines while True: line = fh.readline() # name is the first 10 characters name = line[0:10] seq = line[10:].strip() # nucleic base or amino acid 1-char designators (spaces allowed) bases = ''.join(seq.split()) # float per base (each separated by space) """ return True except: pass finally: close(fh) return False class Axes(Tabular): file_ext = 'axes' def __init__(self, **kwd): """Initialize axes datatype""" Tabular.__init__( self, **kwd ) def sniff( self, filename ): """ Determines whether the file is an axes format The first line may have column headings. The following lines have the name in the first column plus float columns for each axis. 
==> 98_sq_phylip_amazon.fn.unique.pca.axes <== group axis1 axis2 forest 0.000000 0.145743 pasture 0.145743 0.000000 ==> 98_sq_phylip_amazon.nmds.axes <== axis1 axis2 U68589 0.262608 -0.077498 U68590 0.027118 0.195197 U68591 0.329854 0.014395 """ try: fh = open( filename ) count = 0 line = fh.readline() line = line.strip() col_cnt = None while True: line = fh.readline() line = line.strip() if not line: break #EOF if line: fields = line.split('\t') if col_cnt == None: # ignore values in first line as they may be column headings col_cnt = len(fields) else: if len(fields) != col_cnt : return False try: for i in range(1, col_cnt): check = float(fields[i]) except ValueError: return False count += 1 if count > 10: return True if count > 0: return True except: pass finally: fh.close() return False class SffFlow(Tabular): MetadataElement( name="flow_values", default="", no_value="", optional=True , desc="Total number of flow values", readonly=True) MetadataElement( name="flow_order", default="TACG", no_value="TACG", desc="Total number of flow values", readonly=False) file_ext = 'sff.flow' """ The first line is the total number of flow values - 800 for Titanium data. For GS FLX it would be 400. Following lines contain: - SequenceName - the number of useable flows as defined by 454's software - the flow intensity for each base going in the order of TACG. Example: 800 GQY1XT001CQL4K 85 1.04 0.00 1.00 0.02 0.03 1.02 0.05 ... GQY1XT001CQIRF 84 1.02 0.06 0.98 0.06 0.09 1.05 0.07 ... GQY1XT001CF5YW 88 1.02 0.02 1.01 0.04 0.06 1.02 0.03 ... 
""" def __init__(self, **kwd): Tabular.__init__( self, **kwd ) def set_meta( self, dataset, overwrite = True, skip = 1, max_data_lines = None, **kwd ): Tabular.set_meta(self, dataset, overwrite, 1, max_data_lines) try: fh = open( dataset.file_name ) line = fh.readline() line = line.strip() flow_values = int(line) dataset.metadata.flow_values = flow_values finally: fh.close() def make_html_table( self, dataset, skipchars=[] ): """Create HTML table, used for displaying peek""" out = ['<table cellspacing="0" cellpadding="3">'] comments = [] try: # Generate column header out.append('<tr>') out.append( '<th>%d. Name</th>' % 1 ) out.append( '<th>%d. Flows</th>' % 2 ) for i in range( 3, dataset.metadata.columns+1 ): base = dataset.metadata.flow_order[(i+1)%4] out.append( '<th>%d. %d %s</th>' % (i-2,base) ) out.append('</tr>') out.append( self.make_html_peek_rows( dataset, skipchars=skipchars ) ) out.append( '</table>' ) out = "".join( out ) except Exception, exc: out = "Can't create peek %s" % str( exc ) return out class Newick( Text ): """ The Newick Standard for representing trees in computer-readable form makes use of the correspondence between trees and nested parentheses. 
http://evolution.genetics.washington.edu/phylip/newicktree.html http://en.wikipedia.org/wiki/Newick_format Example: (B,(A,C,E),D); or example with branch lengths: (B:6.0,(A:5.0,C:3.0,E:4.0):5.0,D:11.0); or an example with embedded comments but no branch lengths: ((a [&&PRIME S=x], b [&&PRIME S=y]), c [&&PRIME S=z]); Example with named interior noe: (B:6.0,(A:5.0,C:3.0,E:4.0)Ancestor1:5.0,D:11.0); """ file_ext = 'tre' def __init__(self, **kwd): Text.__init__( self, **kwd ) def sniff( self, filename ): ## TODO """ Determine whether the file is in Newick format Note: Last non-space char of a tree should be a semicolon: ';' Usually the first char will be a open parenthesis: '(' (,,(,)); no nodes are named (A,B,(C,D)); leaf nodes are named (A,B,(C,D)E)F; all nodes are named (:0.1,:0.2,(:0.3,:0.4):0.5); all but root node have a distance to parent (:0.1,:0.2,(:0.3,:0.4):0.5):0.0; all have a distance to parent (A:0.1,B:0.2,(C:0.3,D:0.4):0.5); distances and leaf names (popular) (A:0.1,B:0.2,(C:0.3,D:0.4)E:0.5)F; distances and all names ((B:0.2,(C:0.3,D:0.4)E:0.5)F:0.1)A; a tree rooted on a leaf node (rare) """ if not os.path.exists(filename): return False try: ## For now, guess this is a Newick file if it starts with a '(' and ends with a ';' flen = os.path.getsize(filename) fh = open( filename ) len = min(flen,2000) # check end of the file for a semicolon fh.seek(-len,os.SEEK_END) buf = fh.read(len).strip() buf = buf.strip() if not buf.endswith(';'): return False # See if this starts with a open parenthesis if len < flen: fh.seek(0) buf = fh.read(len).strip() if buf.startswith('('): return True except: pass finally: close(fh) return False class Nhx( Newick ): """ New Hampshire eXtended Newick with embedded The Newick Standard for representing trees in computer-readable form makes use of the correspondence between trees and nested parentheses. 
http://evolution.genetics.washington.edu/phylip/newicktree.html http://en.wikipedia.org/wiki/Newick_format Example: (gene1_Hu[&&NHX:S=Hu_Homo_sapiens], (gene2_Hu[&&NHX:S=Hu_Homo_sapiens], gene2_Mu[&&NHX:S=Mu_Mus_musculus])); """ file_ext = 'nhx' class Nexus( Text ): """ http://en.wikipedia.org/wiki/Nexus_file Example: #NEXUS BEGIN TAXA; Dimensions NTax=4; TaxLabels fish frog snake mouse; END; BEGIN CHARACTERS; Dimensions NChar=20; Format DataType=DNA; Matrix fish ACATA GAGGG TACCT CTAAG frog ACATA GAGGG TACCT CTAAG snake ACATA GAGGG TACCT CTAAG mouse ACATA GAGGG TACCT CTAAG END; BEGIN TREES; Tree best=(fish, (frog, (snake, mouse))); END; """ file_ext = 'nex' def __init__(self, **kwd): Text.__init__( self, **kwd ) def sniff( self, filename ): """ Determines whether the file is in nexus format First line should be: #NEXUS """ try: fh = open( filename ) count = 0 line = fh.readline() line = line.strip() if line and line == '#NEXUS': fh.close() return True except: pass finally: fh.close() return False ## Qiime Classes class QiimeMetadataMapping(Tabular): MetadataElement( name="column_names", default=[], desc="Column Names", readonly=False, visible=True, no_value=[] ) file_ext = 'qiimemapping' def __init__(self, **kwd): """ http://qiime.sourceforge.net/documentation/file_formats.html#mapping-file-overview Information about the samples necessary to perform the data analysis. 
# self.column_names = ['#SampleID','BarcodeSequence','LinkerPrimerSequence','Description'] """ Tabular.__init__( self, **kwd ) def sniff( self, filename ): """ Determines whether the file is a qiime mapping file Just checking for an appropriate header line for now, could be improved """ try: pat = '#SampleID(\t[a-zA-Z][a-zA-Z0-9_]*)*\tDescription' fh = open( filename ) while True: line = dataset_fh.readline() if re.match(pat,line): return True except: pass finally: close(fh) return False def set_column_names(self, dataset): if dataset.has_data(): dataset_fh = open( dataset.file_name ) line = dataset_fh.readline() if line.startswith('#SampleID'): dataset.metadata.column_names = line.strip().split('\t'); dataset_fh.close() def set_meta( self, dataset, overwrite = True, skip = None, max_data_lines = None, **kwd ): Tabular.set_meta(self, dataset, overwrite, skip, max_data_lines) self.set_column_names(dataset) class QiimeOTU(Tabular): """ Associates OTUs with sequence IDs Example: 0 FLP3FBN01C2MYD FLP3FBN01B2ALM 1 FLP3FBN01DF6NE FLP3FBN01CKW1J FLP3FBN01CHVM4 2 FLP3FBN01AXQ2Z """ file_ext = 'qiimeotu' class QiimeOTUTable(Tabular): """ #Full OTU Counts #OTU ID PC.354 PC.355 PC.356 Consensus Lineage 0 0 1 0 Root;Bacteria;Firmicutes;"Clostridia";Clostridiales 1 1 3 1 Root;Bacteria 2 0 2 2 Root;Bacteria;Bacteroidetes """ MetadataElement( name="column_names", default=[], desc="Column Names", readonly=False, visible=True, no_value=[] ) file_ext = 'qiimeotutable' def init_meta( self, dataset, copy_from=None ): tabular.Tabular.init_meta( self, dataset, copy_from=copy_from ) def set_meta( self, dataset, overwrite = True, skip = None, **kwd ): self.set_column_names(dataset) def set_column_names(self, dataset): if dataset.has_data(): dataset_fh = open( dataset.file_name ) line = dataset_fh.readline() line = dataset_fh.readline() if line.startswith('#OTU ID'): dataset.metadata.column_names = line.strip().split('\t'); dataset_fh.close() dataset.metadata.comment_lines = 2 class 
QiimeDistanceMatrix(Tabular): """ PC.354 PC.355 PC.356 PC.354 0.0 3.177 1.955 PC.355 3.177 0.0 3.444 PC.356 1.955 3.444 0.0 """ file_ext = 'qiimedistmat' def init_meta( self, dataset, copy_from=None ): tabular.Tabular.init_meta( self, dataset, copy_from=copy_from ) def set_meta( self, dataset, overwrite = True, skip = None, **kwd ): self.set_column_names(dataset) def set_column_names(self, dataset): if dataset.has_data(): dataset_fh = open( dataset.file_name ) line = dataset_fh.readline() # first line contains the names dataset.metadata.column_names = line.strip().split('\t'); dataset_fh.close() dataset.metadata.comment_lines = 1 class QiimePCA(Tabular): """ Principal Coordinate Analysis Data The principal coordinate (PC) axes (columns) for each sample (rows). Pairs of PCs can then be graphed to view the relationships between samples. The bottom of the output file contains the eigenvalues and % variation explained for each PC. Example: pc vector number 1 2 3 PC.354 -0.309063936588 0.0398252112257 0.0744672231759 PC.355 -0.106593922619 0.141125998277 0.0780204374172 PC.356 -0.219869362955 0.00917241121781 0.0357281314115 eigvals 0.480220500471 0.163567082874 0.125594470811 % variation explained 51.6955484555 17.6079322939 """ file_ext = 'qiimepca' class QiimeParams(Tabular): """ ###pick_otus_through_otu_table.py parameters### # OTU picker parameters pick_otus:otu_picking_method uclust pick_otus:clustering_algorithm furthest # Representative set picker parameters pick_rep_set:rep_set_picking_method first pick_rep_set:sort_by otu """ file_ext = 'qiimeparams' class QiimePrefs(Text): """ A text file, containing coloring preferences to be used by make_distance_histograms.py, make_2d_plots.py and make_3d_plots.py. 
Example: { 'background_color':'black', 'sample_coloring': { 'Treatment': { 'column':'Treatment', 'colors':(('red',(0,100,100)),('blue',(240,100,100))) }, 'DOB': { 'column':'DOB', 'colors':(('red',(0,100,100)),('blue',(240,100,100))) } }, 'MONTE_CARLO_GROUP_DISTANCES': { 'Treatment': 10, 'DOB': 10 } } """ file_ext = 'qiimeprefs' class QiimeTaxaSummary(Tabular): """ Taxon PC.354 PC.355 PC.356 Root;Bacteria;Actinobacteria 0.0 0.177 0.955 Root;Bacteria;Firmicutes 0.177 0.0 0.444 Root;Bacteria;Proteobacteria 0.955 0.444 0.0 """ MetadataElement( name="column_names", default=[], desc="Column Names", readonly=False, visible=True, no_value=[] ) file_ext = 'qiimetaxsummary' def set_column_names(self, dataset): if dataset.has_data(): dataset_fh = open( dataset.file_name ) line = dataset_fh.readline() if line.startswith('Taxon'): dataset.metadata.column_names = line.strip().split('\t'); dataset_fh.close() def set_meta( self, dataset, overwrite = True, skip = None, max_data_lines = None, **kwd ): Tabular.set_meta(self, dataset, overwrite, skip, max_data_lines) self.set_column_names(dataset) if __name__ == '__main__': import doctest, sys doctest.testmod(sys.modules[__name__])