# Mercurial > repos > jjohnson > mothur_toolsuite

"""
metagenomics datatypes
James E Johnson - University of Minnesota
for Mothur
"""

import data
import logging, os, sys, time, tempfile, shutil, string, glob, re
import galaxy.model
from galaxy.datatypes import metadata
from galaxy.datatypes import tabular
from galaxy.datatypes import sequence
from galaxy.datatypes.metadata import MetadataElement
from galaxy.datatypes.tabular import Tabular
from galaxy.datatypes.sequence import Fasta
from galaxy import util
from galaxy.datatypes.images import Html
from sniff import *

log = logging.getLogger(__name__)


## Mothur Classes

class Otu( data.Text ):
    """Mothur otu (operational taxonomic unit) text datatype."""
    file_ext = 'otu'

    def sniff( self, filename ):
        """
        Determines whether the file is a otu (operational taxonomic unit) format

        Each data line is tab separated and its second column is an integer
        that must equal the number of columns that follow it:
        label<TAB>count[<TAB>value(1..count)]
        Lines starting with '@' are skipped.  At most 5 data lines are
        examined, and the first blank line ends the scan just like EOF.

        Returns True when at least one data line was seen and every examined
        data line is well formed; False otherwise, including when the file
        cannot be read.
        """
        fh = None
        try:
            fh = open( filename )
            count = 0
            for line in fh:
                line = line.strip()
                if not line:
                    break  # blank line: stop scanning
                if line[0] == '@':
                    continue
                linePieces = line.split('\t')
                if len(linePieces) < 2:
                    return False
                try:
                    if int(linePieces[1]) + 2 != len(linePieces):
                        return False
                except ValueError:
                    return False
                count += 1
                if count == 5:
                    return True
            return count > 0
        except Exception:
            # Sniffing is best effort: a file we cannot read is simply not this type.
            return False
        finally:
            # fh is still None when open() itself failed, so guard the close.
            if fh is not None:
                fh.close()

class OtuList( Otu ):
    """Otu subclass that only changes the file extension; sniffing is inherited from Otu."""
    file_ext = 'list'

class Sabund( Otu ):
    """Otu subclass whose data values must all be integers."""
    file_ext = 'sabund'

    def sniff( self, filename ):
        """
        Determines whether the file is a otu (operational taxonomic unit) format
        label<TAB>count[<TAB>value(1..n)]

        The second column must be an integer equal to the number of columns
        that follow it, and each of those columns must parse as an integer.
        Lines starting with '@' are skipped.  At most 5 data lines are
        examined, and the first blank line ends the scan just like EOF.

        Returns True when at least one data line was seen and every examined
        data line is well formed; False otherwise, including when the file
        cannot be read.
        """
        fh = None
        try:
            fh = open( filename )
            count = 0
            for line in fh:
                line = line.strip()
                if not line:
                    break  # blank line: stop scanning
                if line[0] == '@':
                    continue
                linePieces = line.split('\t')
                if len(linePieces) < 2:
                    return False
                try:
                    if int(linePieces[1]) + 2 != len(linePieces):
                        return False
                    for piece in linePieces[2:]:
                        int(piece)  # only checking that the value parses
                except ValueError:
                    return False
                count += 1
                if count >= 5:
                    return True
            return count > 0
        except Exception:
            # Sniffing is best effort: a file we cannot read is simply not this type.
            return False
        finally:
            # fh is still None when open() itself failed, so guard the close.
            if fh is not None:
                fh.close()

class Rabund( Sabund ):
    """Sabund subclass that only changes the file extension; sniffing is inherited from Sabund."""
    file_ext = 'rabund'


class SharedRabund( Rabund ):
    """Rabund variant with an extra group column before the count."""
    file_ext = 'shared'

    def sniff( self, filename ):
        """
        Determines whether the file is a otu (operational taxonomic unit) Shared format
        label<TAB>group<TAB>count[<TAB>value(1..n)]

        The third column must be an integer equal to the number of columns
        that follow it, and each of those columns must parse as an integer.
        Lines starting with '@' are skipped.  At most 5 data lines are
        examined, and the first blank line ends the scan just like EOF.

        Returns True when at least one data line was seen and every examined
        data line is well formed; False otherwise, including when the file
        cannot be read.
        """
        fh = None
        try:
            fh = open( filename )
            count = 0
            for line in fh:
                line = line.strip()
                if not line:
                    break  # blank line: stop scanning
                if line[0] == '@':
                    continue
                linePieces = line.split('\t')
                if len(linePieces) < 3:
                    return False
                try:
                    if int(linePieces[2]) + 3 != len(linePieces):
                        return False
                    for piece in linePieces[3:]:
                        int(piece)  # only checking that the value parses
                except ValueError:
                    return False
                count += 1
                if count >= 5:
                    return True
            return count > 0
        except Exception:
            # Sniffing is best effort: a file we cannot read is simply not this type.
            return False
        finally:
            # fh is still None when open() itself failed, so guard the close.
            if fh is not None:
                fh.close()

class RelAbund( Rabund ):
    """Rabund variant with a group column and floating point values."""
    file_ext = 'relabund'

    def sniff( self, filename ):
        """
        Determines whether the file is a otu (operational taxonomic unit) Relative Abundance format
        label<TAB>group<TAB>count[<TAB>value(1..n)]

        The third column must be an integer equal to the number of columns
        that follow it, and each of those columns must parse as a float.
        Lines starting with '@' are skipped.  At most 5 data lines are
        examined, and the first blank line ends the scan just like EOF.

        Returns True when at least one data line was seen and every examined
        data line is well formed; False otherwise, including when the file
        cannot be read.
        """
        fh = None
        try:
            fh = open( filename )
            count = 0
            for line in fh:
                line = line.strip()
                if not line:
                    break  # blank line: stop scanning
                if line[0] == '@':
                    continue
                linePieces = line.split('\t')
                if len(linePieces) < 3:
                    return False
                try:
                    if int(linePieces[2]) + 3 != len(linePieces):
                        return False
                    for piece in linePieces[3:]:
                        float(piece)  # only checking that the value parses
                except ValueError:
                    return False
                count += 1
                if count >= 5:
                    return True
            return count > 0
        except Exception:
            # Sniffing is best effort: a file we cannot read is simply not this type.
            return False
        finally:
            # fh is still None when open() itself failed, so guard the close.
            if fh is not None:
                fh.close()

class SecondaryStructureMap(Tabular):
    """Single-column tabular datatype in which each row holds the number of the row it maps to."""
    file_ext = 'map'
    def __init__(self, **kwd):
        """Initialize secondary structure map datatype"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['Map']

    def sniff( self, filename ):
        """
        Determines whether the file is a secondary structure map format
        A single column with an integer value which indicates the row that this row maps to.
        The mapping must be symmetric: if structMap[10] = 380 then structMap[380] = 10.

        Rows are numbered from 1.  A forward pointer (greater than its own row
        number) is remembered; a backward pointer (between 1 and its own row
        number) must point at a row that pointed forward to it.  Values that
        are zero, negative or equal to their own row number are not checked.
        The first blank line ends the scan just like EOF.

        Returns True when at least one row was read and no row failed the
        checks; False otherwise, including when the file cannot be read.
        """
        fh = None
        try:
            fh = open( filename )
            line_num = 0
            rowidxmap = {}
            for line in fh:
                line = line.strip()
                if not line:
                    break  # blank line: stop scanning
                line_num += 1
                try:
                    pointer = int(line)
                except ValueError:
                    return False
                if pointer > line_num:
                    rowidxmap[line_num] = pointer
                elif 0 < pointer < line_num and rowidxmap.get(pointer) != line_num:
                    # The earlier row did not point forward to this one.
                    return False
            return line_num > 0
        except Exception:
            # Sniffing is best effort: a file we cannot read is simply not this type.
            return False
        finally:
            # fh is still None when open() itself failed, so guard the close.
            if fh is not None:
                fh.close()

class SequenceAlignment( Fasta ):
    """Fasta datatype for aligned sequences, where every sequence line has the same length."""
    # NOTE(review): the Alignment class later in this module also uses file_ext 'align'.
    file_ext = 'align'
    def __init__(self, **kwd):
        """Initialize SequenceAlignment datatype"""
        Fasta.__init__( self, **kwd )

    def sniff( self, filename ):
        """
        Determines whether the file is in Mothur align fasta format
        Each sequence line must be the same length

        Blank lines are skipped.  For every '>' header line the line that
        follows it is taken as the sequence and its length is compared with
        the first sequence seen.  The scan stops at the first line that is
        neither blank nor a header, or at a header not followed by sequence.

        Returns True when at least one sequence was read and no compared
        sequence length differed; False otherwise, including when the file
        cannot be read.
        """
        fh = None
        try:
            fh = open( filename )
            # Named seq_len so the builtin len() stays callable below.
            seq_len = -1
            while True:
                line = fh.readline()
                if not line:
                    break  # EOF
                line = line.strip()
                if not line:
                    continue  # skip blank lines
                if not line.startswith( '>' ):
                    break  # non-empty line that is not a fasta header
                # The line after a header must be sequence: not empty, not another header.
                sequence = fh.readline().strip()
                if sequence == '' or sequence.startswith( '>' ):
                    break
                if seq_len < 0:
                    seq_len = len(sequence)
                elif seq_len != len(sequence):
                    return False
            return seq_len > 0
        except Exception:
            # Sniffing is best effort: a file we cannot read is simply not this type.
            return False
        finally:
            # fh is still None when open() itself failed, so guard the close.
            if fh is not None:
                fh.close()

class AlignCheck( Tabular ):
    """
    Tabular datatype with one header line and eight fixed columns
    (a name followed by seven integer counts).

    NOTE(review): a second class named AlignCheck is defined later in this
    module and replaces this one at import time; the two should be consolidated.
    """
    file_ext = 'align.check'
    def __init__(self, **kwd):
        """Initialize AlignCheck datatype"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['name','pound','dash','plus','equal','loop','tilde','total']
        self.column_types = ['str','int','int','int','int','int','int','int']
        self.comment_lines = 1

    def set_meta( self, dataset, overwrite = True, **kwd ):
        """
        Set line-count and column metadata on the dataset.

        Counts the lines of the dataset file and records one of them as the
        comment (header) line and the rest as data lines.  The column names
        and types are the fixed ones assigned in __init__.  The overwrite
        argument and any extra keywords are accepted but not used.
        """
        # Tabular.set_meta( self, dataset, overwrite = overwrite, first_line_is_header = True, skip = 1 )
        data_lines = 0
        if dataset.has_data():
            dataset_fh = open( dataset.file_name )
            try:
                for line in dataset_fh:
                    data_lines += 1
            finally:
                # Close the handle even when reading fails part way through.
                dataset_fh.close()
        dataset.metadata.comment_lines = 1
        # The first line is the header, so it is not counted as data.
        dataset.metadata.data_lines = max( data_lines - 1, 0 )
        dataset.metadata.column_names = self.column_names
        dataset.metadata.column_types = self.column_types

class AlignReport(Tabular):
    """
    Tabular datatype with the sixteen fixed column names assigned in __init__.

    Example header line and data row:
QueryName	QueryLength	TemplateName	TemplateLength	SearchMethod	SearchScore	AlignmentMethod	QueryStart	QueryEnd	TemplateStart	TemplateEnd	PairwiseAlignmentLength	GapsInQuery	GapsInTemplate	LongestInsert	SimBtwnQuery&Template
AY457915	501		82283		1525		kmer		89.07		needleman	5		501		1		499		499			2		0		0		97.6
    """
    file_ext = 'align.report'
    def __init__(self, **kwd):
        """Initialize AlignReport datatype and its column names"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['QueryName','QueryLength','TemplateName','TemplateLength','SearchMethod','SearchScore',
                             'AlignmentMethod','QueryStart','QueryEnd','TemplateStart','TemplateEnd',
                             'PairwiseAlignmentLength','GapsInQuery','GapsInTemplate','LongestInsert','SimBtwnQuery&Template'
                             ]

class BellerophonChimera( Tabular ):
    """Tabular datatype with the fixed columns Name, Score, Left and Right."""
    file_ext = 'bellerophon.chimera'
    def __init__(self, **kwd):
        """Initialize BellerophonChimera datatype and its column names"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['Name','Score','Left','Right']

class SecondaryStructureMatch(Tabular):
    """
    Tabular datatype with eight fixed column names; no file_ext is set on this class.

    Example content:
	name	pound	dash	plus	equal	loop	tilde	total
	9_1_12	42	68	8	28	275	420	872
	9_1_14	36	68	6	26	266	422	851
	9_1_15	44	68	8	28	276	418	873
	9_1_16	34	72	6	30	267	430	860
	9_1_18	46	80	2	36	261
    """
    def __init__(self, **kwd):
        """Initialize SecondaryStructureMatch datatype"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['name','pound','dash','plus','equal','loop','tilde','total']

class DistanceMatrix(data.Text):
    """Base text datatype for the distance matrix classes; declares a sequence_count metadata element."""
    file_ext = 'dist'
    """Add metadata elements"""
    # Optional, writable metadata element; both its default and its no_value are 0.
    MetadataElement( name="sequence_count", default=0, desc="Number of sequences", readonly=False, optional=True, no_value=0 )


class LowerTriangleDistanceMatrix(DistanceMatrix):
    """Distance matrix stored as a lower triangle (phylip style)."""
    file_ext = 'lower.dist'
    def __init__(self, **kwd):
        """Initialize lower-triangle distance matrix datatype"""
        DistanceMatrix.__init__( self, **kwd )

    def sniff( self, filename ):
        """
        Determines whether the file is a lower-triangle distance matrix (phylip) format
        The first line has the number of sequences in the matrix.
        The remaining lines have the sequence name followed by a list of distances from all preceeding sequences
                5
                U68589
                U68590	0.3371
                U68591	0.3609	0.3782
                U68592	0.4155	0.3197	0.4148
                U68593	0.2872	0.1690	0.3361	0.2842

        The first line must parse as an integer.  Row k (counting from zero)
        must then have exactly k + 1 tab separated fields, and every field
        after the name must parse as a float.  At most 5 rows are examined,
        and the first blank line ends the scan just like EOF.

        Returns True when at least one row was read and every examined row is
        well formed; False otherwise, including when the file cannot be read.
        """
        fh = None
        try:
            fh = open( filename )
            try:
                int( fh.readline().strip() )  # sequence count header
            except ValueError:
                return False
            count = 0
            while True:
                line = fh.readline().strip()
                if not line:
                    break  # EOF or blank line
                linePieces = line.split('\t')
                # Each row holds its name plus one distance per preceding row.
                if len(linePieces) != count + 1:
                    return False
                try:
                    for piece in linePieces[1:]:
                        float(piece)  # only checking that the value parses
                except ValueError:
                    return False
                count += 1
                if count == 5:
                    return True
            return count > 0
        except Exception:
            # Sniffing is best effort: a file we cannot read is simply not this type.
            return False
        finally:
            # fh is still None when open() itself failed, so guard the close.
            if fh is not None:
                fh.close()

class SquareDistanceMatrix(DistanceMatrix,Tabular):
    """Distance matrix stored as a full square, one row per sequence."""
    file_ext = 'square.dist'
    sequence_count = -1

    def __init__(self, **kwd):
        """Initialize square distance matrix datatype"""
        Tabular.__init__( self, **kwd )

    def init_meta( self, dataset, copy_from=None ):
        """Delegate metadata initialization directly to data.Text.init_meta."""
        data.Text.init_meta( self, dataset, copy_from=copy_from )

    def set_meta( self, dataset, overwrite = True, skip = None, **kwd ):
        """Set the 'sequences' metadata value to 0; no file content is read."""
        # NOTE(review): the element declared on DistanceMatrix is named
        # 'sequence_count', not 'sequences' - confirm which name is intended.
        dataset.metadata.sequences = 0

    def sniff( self, filename ):
        """
        Determines whether the file is a square distance matrix (Column-formatted distance matrix) format
        The first line has the number of sequences in the matrix.
        The following lines have the sequence name in the first column plus a column for the distance to each sequence
        in the row order in which they appear in the matrix.
               3
               U68589  0.0000  0.3371  0.3610
               U68590  0.3371  0.0000  0.3783
               U68590  0.3371  0.0000  0.3783

        The first line must parse as an integer n.  Every following data line
        must have exactly n + 1 tab separated fields, and every field after
        the name must parse as a float.  Lines starting with '@' are skipped.
        At most 5 data lines are examined, and the first blank line ends the
        scan just like EOF.

        Returns True when at least one data line was seen and every examined
        data line is well formed; False otherwise, including when the file
        cannot be read.
        """
        fh = None
        try:
            fh = open( filename )
            try:
                sequence_count = int( fh.readline().strip() )
            except ValueError:
                return False
            col_cnt = sequence_count + 1  # name column plus one distance per sequence
            count = 0
            while True:
                line = fh.readline().strip()
                if not line:
                    break  # EOF or blank line
                if line[0] == '@':
                    continue
                linePieces = line.split('\t')
                if len(linePieces) != col_cnt:
                    return False
                try:
                    for piece in linePieces[1:]:
                        float(piece)  # only checking that the value parses
                except ValueError:
                    return False
                count += 1
                if count == 5:
                    return True
            return count > 0
        except Exception:
            # Sniffing is best effort: a file we cannot read is simply not this type.
            return False
        finally:
            # fh is still None when open() itself failed, so guard the close.
            if fh is not None:
                fh.close()

class PairwiseDistanceMatrix(DistanceMatrix,Tabular):
    """Distance matrix stored as three columns: two sequence names and their distance."""
    file_ext = 'pair.dist'
    def __init__(self, **kwd):
        """Initialize pairwise distance matrix datatype and its columns"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['Sequence','Sequence','Distance']
        self.column_types = ['str','str','float']
        self.comment_lines = 1

    def sniff( self, filename ):
        """
        Determines whether the file is a pairwise distance matrix (Column-formatted distance matrix) format
        The first and second columns have the sequence names and the third column is the distance between those sequences.

        Every data line must have exactly 3 tab separated fields, the third
        of which must parse as a float.  Lines starting with '@' are skipped.
        At most 5 data lines are examined, and the first blank line ends the
        scan just like EOF.

        Returns True when at least one data line was seen and every examined
        data line is well formed; False otherwise, including when the file
        cannot be read.
        """
        fh = None
        try:
            fh = open( filename )
            count = 0
            for line in fh:
                line = line.strip()
                if not line:
                    break  # blank line: stop scanning
                if line[0] == '@':
                    continue
                linePieces = line.split('\t')
                if len(linePieces) != 3:
                    return False
                try:
                    float(linePieces[2])  # only checking that the value parses
                except ValueError:
                    return False
                count += 1
                if count == 5:
                    return True
            return count > 0
        except Exception:
            # Sniffing is best effort: a file we cannot read is simply not this type.
            return False
        finally:
            # fh is still None when open() itself failed, so guard the close.
            if fh is not None:
                fh.close()

class Alignment(Tabular):
    """Tabular datatype with eight fixed column names."""
    # NOTE(review): SequenceAlignment earlier in this module also uses file_ext 'align'.
    file_ext = 'align'
    def __init__(self, **kwd):
        """Initialize Alignment datatype and its column names"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['name','pound','dash','plus','equal','loop','tilde','total']

class AlignCheck(Tabular):
    """
    Tabular datatype with one header line and eight fixed columns
    (a name followed by seven integer counts).

    NOTE(review): AlignCheck is also defined earlier in this module.  This
    later definition is the one bound to the name at import time, so it
    carries the column types, comment line count and set_meta of the earlier
    one.  The two definitions should be consolidated into one.
    """
    file_ext = 'align.check'
    def __init__(self, **kwd):
        """Initialize AlignCheck datatype"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['name','pound','dash','plus','equal','loop','tilde','total']
        self.column_types = ['str','int','int','int','int','int','int','int']
        self.comment_lines = 1

    def set_meta( self, dataset, overwrite = True, **kwd ):
        """
        Set line-count and column metadata on the dataset.

        Counts the lines of the dataset file and records one of them as the
        comment (header) line and the rest as data lines.  The column names
        and types are the fixed ones assigned in __init__.  The overwrite
        argument and any extra keywords are accepted but not used.
        """
        data_lines = 0
        if dataset.has_data():
            dataset_fh = open( dataset.file_name )
            try:
                for line in dataset_fh:
                    data_lines += 1
            finally:
                # Close the handle even when reading fails part way through.
                dataset_fh.close()
        dataset.metadata.comment_lines = 1
        # The first line is the header, so it is not counted as data.
        dataset.metadata.data_lines = max( data_lines - 1, 0 )
        dataset.metadata.column_names = self.column_names
        dataset.metadata.column_types = self.column_types
class Names(Tabular):
    """Tabular datatype with the two columns 'name' and 'representatives'."""
    file_ext = 'names'
    def __init__(self, **kwd):
        """Name file shows the relationship between a representative sequence(col 1)  and the sequences it represents(col 2)"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['name','representatives']

class Summary(Tabular):
    """Tabular datatype with six fixed column names describing one sequence per row."""
    file_ext = 'summary'
    def __init__(self, **kwd):
        """Initialize Summary datatype and its column names"""
        Tabular.__init__( self, **kwd )
        # presumably the columns of a Mothur sequence summary - TODO confirm
        self.column_names = ['seqname','start','end','nbases','ambigs','polymer']

class Group(Tabular):
    """Tabular datatype with the two columns 'name' and 'group'."""
    file_ext = 'groups'
    def __init__(self, **kwd):
        """Initialize Group datatype and its column names"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['name','group']

class AccNos(Tabular):
    """Tabular datatype with the single column 'name'."""
    file_ext = 'accnos'
    def __init__(self, **kwd):
        """A list of names"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['name']

class Oligos( data.Text ):
    """Text datatype listing forward/reverse and barcode entries, one per line."""
    file_ext = 'oligos'

    def sniff( self, filename ):
        """
        Determines whether the file is an oligos format

        Lines starting with '#' are skipped.  Every other line must be tab
        separated and be either 2 fields whose first field starts with
        'forward' or 'reverse', or 3 fields whose first field starts with
        'barcode'.  The scan stops with True once more than 20 such lines
        have been accepted, and the first blank line ends the scan just
        like EOF.

        Returns True when at least one line was accepted and none was
        rejected; False otherwise, including when the file cannot be read.
        """
        fh = None
        try:
            fh = open( filename )
            count = 0
            for line in fh:
                line = line.strip()
                if not line:
                    break  # blank line: stop scanning
                if line[0] == '#':
                    continue
                linePieces = line.split('\t')
                if len(linePieces) == 2 and re.match('forward|reverse',linePieces[0]):
                    count += 1
                elif len(linePieces) == 3 and re.match('barcode',linePieces[0]):
                    count += 1
                else:
                    return False
                # Enough evidence; no need to read the rest of the file.
                if count > 20:
                    return True
            return count > 0
        except Exception:
            # Sniffing is best effort: a file we cannot read is simply not this type.
            return False
        finally:
            # fh is still None when open() itself failed, so guard the close.
            if fh is not None:
                fh.close()

class Frequency(Tabular):
    """Tabular datatype with an integer 'position' column and a float 'frequency' column."""
    file_ext = 'freq'
    def __init__(self, **kwd):
        """Initialize Frequency datatype and its columns"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['position','frequency']
        self.column_types = ['int','float']

    def sniff( self, filename ):
        """
        Determines whether the file is a frequency tabular format for chimera analysis
        #1.14.0
        0	0.000
        1	0.000
        ...
        155	0.975

        Lines starting with '#' are skipped.  Every other line must be tab
        separated with a first field that parses as an integer and a second
        field that parses as a float.  The scan stops with True once more
        than 20 such lines have been accepted, and the first blank line ends
        the scan just like EOF.

        Returns True when at least one line was accepted and none was
        rejected; False otherwise, including when the file cannot be read.
        """
        fh = None
        try:
            fh = open( filename )
            count = 0
            for line in fh:
                line = line.strip()
                if not line:
                    break  # blank line: stop scanning
                if line[0] == '#':
                    continue
                linePieces = line.split('\t')
                try:
                    int(linePieces[0])
                    float(linePieces[1])
                except (ValueError, IndexError):
                    # Unparseable value, or fewer than two columns.
                    return False
                count += 1
                # Enough evidence; no need to read the rest of the file.
                if count > 20:
                    return True
            return count > 0
        except Exception:
            # Sniffing is best effort: a file we cannot read is simply not this type.
            return False
        finally:
            # fh is still None when open() itself failed, so guard the close.
            if fh is not None:
                fh.close()

class Quantile(Tabular):
    """Tabular datatype with an integer column followed by six float quantile columns."""
    file_ext = 'quan'
    # The two descriptions were previously attached to the wrong element.
    MetadataElement( name="filtered", default=False, no_value=False, optional=True , desc="Quantiles calculated using a frequency filter", readonly=True)
    MetadataElement( name="masked", default=False, no_value=False, optional=True , desc="Quantiles calculated using a mask", readonly=True)
    def __init__(self, **kwd):
        """Quantiles for chimera analysis"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['num','ten','twentyfive','fifty','seventyfive','ninetyfive','ninetynine']
        self.column_types = ['int','float','float','float','float','float','float']

    def set_meta( self, dataset, overwrite = True, skip = None, **kwd ):
        """Log the extra keyword arguments; no metadata is set on the dataset."""
        log.info( "Mothur Quantile set_meta %s" % kwd)

    def sniff( self, filename ):
        """
        Determines whether the file is a quantiles tabular format for chimera analysis
        1	0	0	0	0	0	0
        2       0.309198        0.309198        0.37161 0.37161 0.37161 0.37161
        3       0.510982        0.563213        0.693529        0.858939        1.07442 1.20608
        ...

        Lines starting with '#' are skipped.  Every other line must be tab
        separated with at least 7 fields: the first must parse as an integer
        and the next six as floats.  The scan stops with True once more than
        10 such lines have been accepted, and the first blank line ends the
        scan just like EOF.

        Returns True when at least one line was accepted and none was
        rejected; False otherwise, including when the file cannot be read.
        """
        fh = None
        try:
            fh = open( filename )
            count = 0
            for line in fh:
                line = line.strip()
                if not line:
                    break  # blank line: stop scanning
                if line[0] == '#':
                    continue
                linePieces = line.split('\t')
                if len(linePieces) < 7:
                    return False
                try:
                    int(linePieces[0])
                    for piece in linePieces[1:7]:
                        float(piece)  # only checking that the value parses
                except ValueError:
                    return False
                count += 1
                # Enough evidence; no need to read the rest of the file.
                if count > 10:
                    return True
            return count > 0
        except Exception:
            # Sniffing is best effort: a file we cannot read is simply not this type.
            return False
        finally:
            # fh is still None when open() itself failed, so guard the close.
            if fh is not None:
                fh.close()

class FilteredQuantile(Quantile):
    """Quantile subclass with its own extension that flags itself as filtered."""
    file_ext = 'filtered.quan'
    def __init__(self, **kwd):
        """Quantiles for chimera analysis"""
        Quantile.__init__( self, **kwd )
        # NOTE(review): this sets an attribute on the datatype instance; whether it
        # reaches the 'filtered' metadata element of a dataset is not visible here.
        self.filtered = True

class MaskedQuantile(Quantile):
    """Quantile subclass with its own extension that flags itself as masked and not filtered."""
    file_ext = 'masked.quan'
    def __init__(self, **kwd):
        """Quantiles for chimera analysis"""
        Quantile.__init__( self, **kwd )
        # NOTE(review): these set attributes on the datatype instance; whether they
        # reach the metadata elements of a dataset is not visible here.
        self.masked = True
        self.filtered = False

class FilteredMaskedQuantile(Quantile):
    """Quantile subclass with its own extension that flags itself as both masked and filtered."""
    file_ext = 'filtered.masked.quan'
    def __init__(self, **kwd):
        """Quantiles for chimera analysis"""
        Quantile.__init__( self, **kwd )
        # NOTE(review): these set attributes on the datatype instance; whether they
        # reach the metadata elements of a dataset is not visible here.
        self.masked = True
        self.filtered = True

class LaneMask(data.Text):
    """Text datatype holding a single line of zeros and ones."""
    file_ext = 'filter'

    def sniff( self, filename ):
        """
        Determines whether the file is a lane mask filter:  1 line consisting of zeros and ones.

        The file is read in chunks of 1000 characters.  It must contain at
        least one '0' or '1', only those two characters, and at most one
        newline, which must be the last character of the file.

        Returns True when the content matches; False otherwise, including
        when the file is empty or cannot be read.
        """
        fh = None
        try:
            fh = open( filename )
            seen_digits = False
            line_ended = False
            while True:
                buff = fh.read(1000)
                if not buff:
                    break  # EOF
                if line_ended:
                    # There is content after the newline that ended the line.
                    return False
                if buff.endswith('\n'):
                    line_ended = True
                    buff = buff[:-1]
                if buff:
                    # \Z instead of $ so a leftover newline is never accepted.
                    if not re.match(r'[01]+\Z', buff):
                        return False
                    seen_digits = True
            return seen_digits
        except Exception:
            # Sniffing is best effort: a file we cannot read is simply not this type.
            return False
        finally:
            # fh is still None when open() itself failed, so guard the close.
            if fh is not None:
                fh.close()

class SequenceTaxonomy(Tabular):
    """Tabular datatype with the two columns 'name' and 'taxonomy'."""
    file_ext = 'taxonomy'
    def __init__(self, **kwd):
        """Initialize SequenceTaxonomy datatype and its column names"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['name','taxonomy']

class ConsensusTaxonomy(Tabular):
    """Tabular datatype with the three columns 'OTU', 'count' and 'taxonomy'."""
    file_ext = 'cons.taxonomy'
    def __init__(self, **kwd):
        """Initialize ConsensusTaxonomy datatype and its column names"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['OTU','count','taxonomy']

class TaxonomySummary(Tabular):
    """Tabular datatype with five fixed column names summarizing taxon classification."""
    file_ext = 'tax.summary'
    def __init__(self, **kwd):
        """A Summary of taxon classification"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['taxlevel','rankID','taxon','daughterlevels','total']

class Phylip(data.Text):
    """Text datatype for Phylip input files."""
    file_ext = 'phy'

    def sniff( self, filename ):
        """
        Determines whether the file is in Phylip format (Interleaved or Sequential)
        The first line of the input file contains the number of species and the
        number of characters, in free format, separated by blanks (not by
        commas). The information for each species follows, starting with a
        ten-character species name (which can include punctuation marks and blanks),
        and continuing with the characters for that species.
        http://evolution.genetics.washington.edu/phylip/doc/main.html#inputfiles
        Interleaved Example:
            6   39
        Archaeopt CGATGCTTAC CGCCGATGCT
        HesperorniCGTTACTCGT TGTCGTTACT
        BaluchitheTAATGTTAAT TGTTAATGTT
        B. virginiTAATGTTCGT TGTTAATGTT
        BrontosaurCAAAACCCAT CATCAAAACC
        B.subtilisGGCAGCCAAT CACGGCAGCC

        TACCGCCGAT GCTTACCGC
        CGTTGTCGTT ACTCGTTGT
        AATTGTTAAT GTTAATTGT
        CGTTGTTAAT GTTCGTTGT
        CATCATCAAA ACCCATCAT
        AATCACGGCA GCCAATCAC

        Only the first line is checked: its first two whitespace separated
        fields must both parse as integers.  The data lines are not examined.

        Returns True when the first line passes; False otherwise, including
        when the file cannot be read.
        """
        fh = None
        try:
            fh = open( filename )
            # counts line: number of species, then number of characters
            linePieces = fh.readline().split()
            int(linePieces[0])
            int(linePieces[1])
            # TODO check data lines:
            #   the name is the first 10 characters of a line
            #   the rest is nucleic base or amino acid 1-char designators (spaces allowed)
            #   or a float per base (each separated by a space)
            return True
        except Exception:
            # Covers a short or non-numeric counts line as well as read errors.
            return False
        finally:
            # fh is still None when open() itself failed, so guard the close.
            if fh is not None:
                fh.close()


## Qiime Classes

class MetadataMapping(Tabular):
    """Tabular datatype for a qiime mapping file; column names are read from its '#SampleID' header line."""
    MetadataElement( name="column_names", default=[], desc="Column Names", readonly=False, visible=True, no_value=[] )
    file_ext = 'mapping'

    def __init__(self, **kwd):
        """
        http://qiime.sourceforge.net/documentation/file_formats.html#mapping-file-overview
        Information about the samples necessary to perform the data analysis.
        # self.column_names = ['#SampleID','BarcodeSequence','LinkerPrimerSequence','Description']
        """
        Tabular.__init__( self, **kwd )

    def sniff( self, filename ):
        """
        Determines whether the file is a qiime mapping file
        Just checking for an appropriate header line for now, could be improved

        Looks for a line that starts with '#SampleID', continues with zero
        or more tab separated identifiers and then a tab and 'Description'.
        Only the leading run of blank and '#' lines is searched, so a large
        file of another type is not read to the end.

        Returns True when such a header line is found; False otherwise,
        including when the file cannot be read.
        """
        pat = '#SampleID(\t[a-zA-Z][a-zA-Z0-9_]*)*\tDescription'
        fh = None
        try:
            fh = open( filename )
            for line in fh:
                if re.match(pat,line):
                    return True
                if line.strip() and not line.startswith('#'):
                    break  # reached data without seeing a header
            return False
        except Exception:
            # Sniffing is best effort: a file we cannot read is simply not this type.
            return False
        finally:
            # fh is still None when open() itself failed, so guard the close.
            if fh is not None:
                fh.close()

    def set_column_names(self, dataset):
        """Set the column_names metadata from the first line when it starts with '#SampleID'."""
        if dataset.has_data():
            dataset_fh = open( dataset.file_name )
            try:
                line = dataset_fh.readline()
                if line.startswith('#SampleID'):
                    dataset.metadata.column_names = line.strip().split('\t')
            finally:
                # Close the handle even when reading fails.
                dataset_fh.close()

    def set_meta( self, dataset, overwrite = True, skip = None, max_data_lines = None, **kwd ):
        """Run Tabular.set_meta, then take the column names from the header line."""
        Tabular.set_meta(self, dataset, overwrite, skip, max_data_lines)
        self.set_column_names(dataset)

# When run as a script, execute any doctests found in this module.
if __name__ == '__main__':
    import doctest, sys
    doctest.testmod(sys.modules[__name__])
# author	jjohnson
# date	Tue, 07 Jun 2011 17:32:23 -0400
# parents
# children	fcc0778f6987