Mercurial > repos > jjohnson > mothur_toolsuite
diff mothur/lib/galaxy/datatypes/metagenomics.py @ 0:3202a38e44d9
Migrated tool version 1.15.1 from old tool shed archive to new tool shed repository
author | jjohnson |
---|---|
date | Tue, 07 Jun 2011 17:32:23 -0400 |
parents | |
children | fcc0778f6987 |
line wrap: on
line diff
"""
metagenomics datatypes
James E Johnson - University of Minnesota
for Mothur
"""

import data
import logging, os, sys, time, tempfile, shutil, string, glob, re
import galaxy.model
from galaxy.datatypes import metadata
from galaxy.datatypes import tabular
from galaxy.datatypes import sequence
from galaxy.datatypes.metadata import MetadataElement
from galaxy.datatypes.tabular import Tabular
from galaxy.datatypes.sequence import Fasta
from galaxy import util
from galaxy.datatypes.images import Html
from sniff import *

log = logging.getLogger(__name__)


## Mothur Classes

class Otu( data.Text ):
    file_ext = 'otu'

    def sniff( self, filename ):
        """
        Determines whether the file is an otu (operational taxonomic unit) format:
        label<TAB>count<TAB>name(1..count)
        Returns True after 5 valid data lines, or at EOF with 1-4 valid lines.
        """
        # NOTE(fix): open outside the try/finally so a failed open cannot
        # trigger a NameError on fh.close() in the finally clause.
        try:
            fh = open( filename )
        except Exception:
            return False
        try:
            count = 0
            while True:
                line = fh.readline().strip()
                if not line:
                    break  # EOF (a blank line also stops the scan, as in the original)
                if line[0] != '@':
                    pieces = line.split( '\t' )
                    if len( pieces ) < 2:
                        return False
                    try:
                        # column 2 gives the number of member names that follow it
                        if int( pieces[1] ) + 2 != len( pieces ):
                            return False
                    except ValueError:
                        return False
                    count += 1
                    if count == 5:
                        return True
            return 0 < count < 5
        except Exception:
            return False
        finally:
            fh.close()


class OtuList( Otu ):
    file_ext = 'list'


class Sabund( Otu ):
    file_ext = 'sabund'

    def sniff( self, filename ):
        """
        Determines whether the file is in species-abundance (sabund) format:
        label<TAB>count[<TAB>value(1..count)]
        where every value is an integer.
        """
        try:
            fh = open( filename )
        except Exception:
            return False
        try:
            count = 0
            while True:
                line = fh.readline().strip()
                if not line:
                    break  # EOF
                if line[0] != '@':
                    pieces = line.split( '\t' )
                    if len( pieces ) < 2:
                        return False
                    try:
                        if int( pieces[1] ) + 2 != len( pieces ):
                            return False
                        for val in pieces[2:]:
                            int( val )
                    except ValueError:
                        return False
                    count += 1
                    if count >= 5:
                        return True
            return 0 < count < 5
        except Exception:
            return False
        finally:
            fh.close()


class Rabund( Sabund ):
    file_ext = 'rabund'


class SharedRabund( Rabund ):
    file_ext = 'shared'

    def sniff( self, filename ):
        """
        Determines whether the file is an OTU Shared format:
        label<TAB>group<TAB>count[<TAB>value(1..count)]
        where every value is an integer.
        """
        try:
            fh = open( filename )
        except Exception:
            return False
        try:
            count = 0
            while True:
                line = fh.readline().strip()
                if not line:
                    break  # EOF
                if line[0] != '@':
                    pieces = line.split( '\t' )
                    if len( pieces ) < 3:
                        return False
                    try:
                        if int( pieces[2] ) + 3 != len( pieces ):
                            return False
                        for val in pieces[3:]:
                            int( val )
                    except ValueError:
                        return False
                    count += 1
                    if count >= 5:
                        return True
            return 0 < count < 5
        except Exception:
            return False
        finally:
            fh.close()


class RelAbund( Rabund ):
    file_ext = 'relabund'

    def sniff( self, filename ):
        """
        Determines whether the file is an OTU Relative Abundance format:
        label<TAB>group<TAB>count[<TAB>value(1..count)]
        where every value is a float.
        """
        try:
            fh = open( filename )
        except Exception:
            return False
        try:
            count = 0
            while True:
                line = fh.readline().strip()
                if not line:
                    break  # EOF
                if line[0] != '@':
                    pieces = line.split( '\t' )
                    if len( pieces ) < 3:
                        return False
                    try:
                        if int( pieces[2] ) + 3 != len( pieces ):
                            return False
                        for val in pieces[3:]:
                            float( val )
                    except ValueError:
                        return False
                    count += 1
                    if count >= 5:
                        return True
            return 0 < count < 5
        except Exception:
            return False
        finally:
            fh.close()


class SecondaryStructureMap( Tabular ):
    file_ext = 'map'

    def __init__( self, **kwd ):
        """Initialize secondary structure map datatype"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['Map']

    def sniff( self, filename ):
        """
        Determines whether the file is a secondary structure map format:
        a single column of integers where each row maps to another row,
        i.e. if structMap[10] = 380 then structMap[380] = 10 (0 = unpaired).
        """
        try:
            fh = open( filename )
        except Exception:
            return False
        try:
            line_num = 0
            rowidxmap = {}
            while True:
                line = fh.readline().strip()
                line_num += 1
                if not line:
                    break  # EOF
                try:
                    pointer = int( line )
                except ValueError:
                    return False
                if pointer > 0:
                    if pointer > line_num:
                        # remember the forward reference so the back
                        # reference can be checked when we reach it
                        rowidxmap[line_num] = pointer
                    # BUGFIX: original used bitwise '&' instead of 'and' and
                    # indexed rowidxmap[] directly (possible KeyError)
                    elif pointer < line_num and rowidxmap.get( pointer ) != line_num:
                        return False
            # BUGFIX: original referenced an undefined 'count' here;
            # accept any file that contained at least one valid row
            return line_num > 1
        except Exception:
            return False
        finally:
            fh.close()


class SequenceAlignment( Fasta ):
    file_ext = 'align'

    def __init__( self, **kwd ):
        """Initialize SequenceAlignment datatype"""
        Fasta.__init__( self, **kwd )

    def sniff( self, filename ):
        """
        Determines whether the file is in Mothur align fasta format:
        each sequence line must be the same length.
        """
        try:
            fh = open( filename )
        except Exception:
            return False
        try:
            # BUGFIX: the original bound 'len = -1', shadowing the builtin,
            # so len(line) always raised and the sniffer always failed
            aln_len = -1
            while True:
                line = fh.readline()
                if not line:
                    break  # EOF
                line = line.strip()
                if line:  # first non-empty line
                    if line.startswith( '>' ):
                        # the next stripped line must not be empty nor a new header
                        line = fh.readline().strip()
                        if line == '' or line.startswith( '>' ):
                            break
                        if aln_len < 0:
                            aln_len = len( line )
                        elif aln_len != len( line ):
                            return False
                    else:
                        break  # non-empty line, but not a fasta header
            return aln_len > 0
        except Exception:
            return False
        finally:
            fh.close()


class AlignCheck( Tabular ):
    file_ext = 'align.check'

    def __init__( self, **kwd ):
        """Initialize AlignCheck datatype"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['name', 'pound', 'dash', 'plus', 'equal', 'loop', 'tilde', 'total']
        self.column_types = ['str', 'int', 'int', 'int', 'int', 'int', 'int', 'int']
        self.comment_lines = 1

    def set_meta( self, dataset, overwrite = True, **kwd ):
        # Count the data lines ourselves: the first line is a header that
        # Tabular.set_meta would otherwise misinterpret.
        data_lines = 0
        if dataset.has_data():
            dataset_fh = open( dataset.file_name )
            try:
                for line in dataset_fh:
                    data_lines += 1
            finally:
                dataset_fh.close()
        dataset.metadata.comment_lines = 1
        dataset.metadata.data_lines = data_lines - 1 if data_lines > 0 else 0
        dataset.metadata.column_names = self.column_names
        dataset.metadata.column_types = self.column_types


class AlignReport( Tabular ):
    """
QueryName	QueryLength	TemplateName	TemplateLength	SearchMethod	SearchScore	AlignmentMethod	QueryStart	QueryEnd	TemplateStart	TemplateEnd	PairwiseAlignmentLength	GapsInQuery	GapsInTemplate	LongestInsert	SimBtwnQuery&Template
AY457915	501	82283	1525	kmer	89.07	needleman	5	501	1	499	499	2	0	0	97.6
    """
    file_ext = 'align.report'

    def __init__( self, **kwd ):
        """Initialize AlignReport datatype"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['QueryName', 'QueryLength', 'TemplateName', 'TemplateLength', 'SearchMethod', 'SearchScore',
                             'AlignmentMethod', 'QueryStart', 'QueryEnd', 'TemplateStart', 'TemplateEnd',
                             'PairwiseAlignmentLength', 'GapsInQuery', 'GapsInTemplate', 'LongestInsert', 'SimBtwnQuery&Template'
                             ]


class BellerophonChimera( Tabular ):
    file_ext = 'bellerophon.chimera'

    def __init__( self, **kwd ):
        """Initialize BellerophonChimera datatype"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['Name', 'Score', 'Left', 'Right']


class SecondaryStructureMatch( Tabular ):
    """
    name	pound	dash	plus	equal	loop	tilde	total
    9_1_12	42	68	8	28	275	420	872
    9_1_14	36	68	6	26	266	422	851
    9_1_15	44	68	8	28	276	418	873
    """
    def __init__( self, **kwd ):
        """Initialize SecondaryStructureMatch datatype"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['name', 'pound', 'dash', 'plus', 'equal', 'loop', 'tilde', 'total']


class DistanceMatrix( data.Text ):
    file_ext = 'dist'
    """Add metadata elements"""
    MetadataElement( name="sequence_count", default=0, desc="Number of sequences", readonly=False, optional=True, no_value=0 )


class LowerTriangleDistanceMatrix( DistanceMatrix ):
    file_ext = 'lower.dist'

    def __init__( self, **kwd ):
        """Initialize lower-triangle distance matrix datatype"""
        DistanceMatrix.__init__( self, **kwd )

    def sniff( self, filename ):
        """
        Determines whether the file is a lower-triangle distance matrix (phylip) format.
        The first line has the number of sequences in the matrix.
        Row k then has a sequence name followed by k-1 distances (one per
        preceding sequence), so the expected column count grows by one per row:
        5
        U68589
        U68590	0.3371
        U68591	0.3609	0.3782
        """
        try:
            fh = open( filename )
        except Exception:
            return False
        try:
            # header: sequence count
            try:
                int( fh.readline().strip() )
            except ValueError:
                return False
            # BUGFIX: the original required exactly 3 columns on every row,
            # which contradicts the triangular format documented above
            row = 0
            while True:
                line = fh.readline().strip()
                if not line:
                    break  # EOF
                pieces = line.split()
                if len( pieces ) != row + 1:
                    return False
                try:
                    for val in pieces[1:]:
                        float( val )
                except ValueError:
                    return False
                row += 1
                if row == 5:
                    return True
            return 0 < row < 5
        except Exception:
            return False
        finally:
            fh.close()


class SquareDistanceMatrix( DistanceMatrix, Tabular ):
    file_ext = 'square.dist'
    sequence_count = -1

    def __init__( self, **kwd ):
        """Initialize square distance matrix datatype"""
        Tabular.__init__( self, **kwd )

    def init_meta( self, dataset, copy_from=None ):
        data.Text.init_meta( self, dataset, copy_from=copy_from )

    def set_meta( self, dataset, overwrite = True, skip = None, **kwd ):
        dataset.metadata.sequences = 0

    def sniff( self, filename ):
        """
        Determines whether the file is a square distance matrix format.
        The first line has the number of sequences in the matrix.
        Each following line has the sequence name plus one distance column per
        sequence, in matrix row order:
        3
        U68589	0.0000	0.3371	0.3610
        U68590	0.3371	0.0000	0.3783
        """
        try:
            fh = open( filename )
        except Exception:
            return False
        try:
            try:
                seq_cnt = int( fh.readline().strip() )
            except ValueError:
                return False
            # BUGFIX: the original computed 'seq_cnt + 1' from an undefined
            # name, so the sniffer always raised and returned False
            col_cnt = seq_cnt + 1
            count = 0
            while True:
                line = fh.readline().strip()
                if not line:
                    break  # EOF
                if line[0] != '@':
                    pieces = line.split( '\t' )
                    if len( pieces ) != col_cnt:
                        return False
                    try:
                        for val in pieces[1:]:
                            float( val )
                    except ValueError:
                        return False
                    count += 1
                    if count == 5:
                        return True
            return 0 < count < 5
        except Exception:
            return False
        finally:
            fh.close()


class PairwiseDistanceMatrix( DistanceMatrix, Tabular ):
    file_ext = 'pair.dist'

    def __init__( self, **kwd ):
        """Initialize pairwise distance matrix datatype"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['Sequence', 'Sequence', 'Distance']
        self.column_types = ['str', 'str', 'float']
        self.comment_lines = 1

    def sniff( self, filename ):
        """
        Determines whether the file is a pairwise (column-formatted) distance
        matrix: two sequence-name columns followed by a float distance column.
        """
        try:
            fh = open( filename )
        except Exception:
            return False
        try:
            count = 0
            while True:
                line = fh.readline().strip()
                if not line:
                    break  # EOF
                if line[0] != '@':
                    pieces = line.split( '\t' )
                    if len( pieces ) != 3:
                        return False
                    try:
                        float( pieces[2] )
                    except ValueError:
                        return False
                    count += 1
                    if count == 5:
                        return True
            return 0 < count < 5
        except Exception:
            return False
        finally:
            fh.close()


class Alignment( Tabular ):
    # NOTE(review): shares file_ext 'align' with SequenceAlignment above —
    # looks like leftover duplication in the original; kept for compatibility.
    file_ext = 'align'

    def __init__( self, **kwd ):
        """Initialize Alignment datatype"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['name', 'pound', 'dash', 'plus', 'equal', 'loop', 'tilde', 'total']


# BUGFIX: the original defined a second, weaker AlignCheck class here that
# silently replaced the richer one above (with column_types and set_meta).
# The duplicate has been removed; the first definition is a strict superset.


class Names( Tabular ):
    file_ext = 'names'

    def __init__( self, **kwd ):
        """Name file shows the relationship between a representative sequence (col 1) and the sequences it represents (col 2)"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['name', 'representatives']


class Summary( Tabular ):
    file_ext = 'summary'

    def __init__( self, **kwd ):
        """Summary of sequence start/end positions, length, ambiguous bases and homopolymers"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['seqname', 'start', 'end', 'nbases', 'ambigs', 'polymer']


class Group( Tabular ):
    file_ext = 'groups'

    def __init__( self, **kwd ):
        """Group file maps each sequence name (col 1) to its group (col 2)"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['name', 'group']


class AccNos( Tabular ):
    file_ext = 'accnos'

    def __init__( self, **kwd ):
        """A list of names"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['name']


class Oligos( data.Text ):
    file_ext = 'oligos'

    def sniff( self, filename ):
        """
        Determines whether the file is a Mothur oligos file:
        forward|reverse<TAB>primer   or   barcode<TAB>sequence<TAB>group
        Lines starting with '#' are comments.
        """
        try:
            fh = open( filename )
        except Exception:
            return False
        try:
            count = 0
            while True:
                line = fh.readline().strip()
                if not line:
                    break  # EOF
                if line[0] != '#':
                    pieces = line.split( '\t' )
                    if len( pieces ) == 2 and re.match( 'forward|reverse', pieces[0] ):
                        count += 1
                    elif len( pieces ) == 3 and re.match( 'barcode', pieces[0] ):
                        count += 1
                    else:
                        return False
                    if count > 20:
                        return True
            return count > 0
        except Exception:
            return False
        finally:
            fh.close()


class Frequency( Tabular ):
    file_ext = 'freq'

    def __init__( self, **kwd ):
        """Per-position base frequencies used in chimera analysis"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['position', 'frequency']
        self.column_types = ['int', 'float']

    def sniff( self, filename ):
        """
        Determines whether the file is a frequency tabular format for chimera analysis:
        #1.14.0
        0	0.000
        1	0.000
        ...
        155	0.975
        """
        try:
            fh = open( filename )
        except Exception:
            return False
        try:
            count = 0
            while True:
                line = fh.readline().strip()
                if not line:
                    break  # EOF
                if line[0] != '#':
                    try:
                        pieces = line.split( '\t' )
                        int( pieces[0] )
                        float( pieces[1] )
                    except ( ValueError, IndexError ):
                        return False
                    count += 1
                    if count > 20:
                        return True
            return count > 0
        except Exception:
            return False
        finally:
            fh.close()


class Quantile( Tabular ):
    file_ext = 'quan'
    MetadataElement( name="filtered", default=False, no_value=False, optional=True, desc="Quantiles calculated using a mask", readonly=True )
    MetadataElement( name="masked", default=False, no_value=False, optional=True, desc="Quantiles calculated using a frequency filter", readonly=True )

    def __init__( self, **kwd ):
        """Quantiles for chimera analysis"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['num', 'ten', 'twentyfive', 'fifty', 'seventyfive', 'ninetyfive', 'ninetynine']
        self.column_types = ['int', 'float', 'float', 'float', 'float', 'float', 'float']

    def set_meta( self, dataset, overwrite = True, skip = None, **kwd ):
        log.info( "Mothur Quantile set_meta %s" % kwd )

    def sniff( self, filename ):
        """
        Determines whether the file is a quantiles tabular format for chimera analysis:
        an int column followed by six float columns, e.g.
        1	0	0	0	0	0	0
        2	0.309198	0.309198	0.37161	0.37161	0.37161	0.37161
        """
        try:
            fh = open( filename )
        except Exception:
            return False
        try:
            count = 0
            while True:
                line = fh.readline().strip()
                if not line:
                    break  # EOF
                if line[0] != '#':
                    try:
                        pieces = line.split( '\t' )
                        int( pieces[0] )
                        for val in pieces[1:7]:
                            float( val )
                        if len( pieces ) < 7:
                            return False
                    except ( ValueError, IndexError ):
                        return False
                    count += 1
                    if count > 10:
                        return True
            return count > 0
        except Exception:
            return False
        finally:
            fh.close()


class FilteredQuantile( Quantile ):
    file_ext = 'filtered.quan'

    def __init__( self, **kwd ):
        """Quantiles calculated after applying a frequency filter"""
        Quantile.__init__( self, **kwd )
        self.filtered = True


class MaskedQuantile( Quantile ):
    file_ext = 'masked.quan'

    def __init__( self, **kwd ):
        """Quantiles calculated using a mask"""
        Quantile.__init__( self, **kwd )
        self.masked = True
        self.filtered = False


class FilteredMaskedQuantile( Quantile ):
    file_ext = 'filtered.masked.quan'

    def __init__( self, **kwd ):
        """Quantiles calculated using both a mask and a frequency filter"""
        Quantile.__init__( self, **kwd )
        self.masked = True
        self.filtered = True


class LaneMask( data.Text ):
    file_ext = 'filter'

    def sniff( self, filename ):
        """
        Determines whether the file is a lane mask filter:
        1 line consisting only of zeros and ones.
        """
        try:
            fh = open( filename )
        except Exception:
            return False
        try:
            while True:
                buff = fh.read( 1000 )
                if not buff:
                    break  # EOF
                # BUGFIX: the original tested an undefined name 'line'
                # instead of the buffer just read, and called close(fh)
                if not re.match( '^[01]*$', buff.strip() ):
                    return False
            return True
        except Exception:
            return False
        finally:
            fh.close()


class SequenceTaxonomy( Tabular ):
    file_ext = 'taxonomy'

    def __init__( self, **kwd ):
        """Maps a sequence name to its taxonomy string"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['name', 'taxonomy']


class ConsensusTaxonomy( Tabular ):
    file_ext = 'cons.taxonomy'

    def __init__( self, **kwd ):
        """Consensus taxonomy per OTU"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['OTU', 'count', 'taxonomy']


class TaxonomySummary( Tabular ):
    file_ext = 'tax.summary'

    def __init__( self, **kwd ):
        """A summary of taxon classification"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['taxlevel', 'rankID', 'taxon', 'daughterlevels', 'total']


class Phylip( data.Text ):
    file_ext = 'phy'

    def sniff( self, filename ):
        """
        Determines whether the file is in Phylip format (Interleaved or Sequential).
        The first line of the input file contains the number of species and the
        number of characters, in free format, separated by blanks.  Each species
        then starts with a ten-character name followed by its characters.
        http://evolution.genetics.washington.edu/phylip/doc/main.html#inputfiles
        """
        try:
            fh = open( filename )
        except Exception:
            return False
        try:
            # counts line: <n_species> <n_chars>
            pieces = fh.readline().strip().split()
            try:
                int( pieces[0] )
                int( pieces[1] )
            except ( ValueError, IndexError ):
                return False
            # TODO: also validate the data lines (10-char name + sequence)
            return True
        except Exception:
            return False
        finally:
            # BUGFIX: original called the nonexistent builtin close(fh)
            fh.close()


## Qiime Classes

class MetadataMapping( Tabular ):
    MetadataElement( name="column_names", default=[], desc="Column Names", readonly=False, visible=True, no_value=[] )
    file_ext = 'mapping'

    def __init__( self, **kwd ):
        """
        http://qiime.sourceforge.net/documentation/file_formats.html#mapping-file-overview
        Information about the samples necessary to perform the data analysis.
        """
        Tabular.__init__( self, **kwd )

    def sniff( self, filename ):
        """
        Determines whether the file is a qiime mapping file.
        Just checking for an appropriate header line for now, could be improved.
        """
        pat = '#SampleID(\t[a-zA-Z][a-zA-Z0-9_]*)*\tDescription'
        try:
            fh = open( filename )
        except Exception:
            return False
        try:
            while True:
                # BUGFIX: original read from an undefined 'dataset_fh' and
                # never broke at EOF, looping forever on non-matching files
                line = fh.readline()
                if not line:
                    break  # EOF
                if re.match( pat, line ):
                    return True
            return False
        except Exception:
            return False
        finally:
            fh.close()

    def set_column_names( self, dataset ):
        # Pull the column names from the '#SampleID...' header line, if present.
        if dataset.has_data():
            dataset_fh = open( dataset.file_name )
            try:
                line = dataset_fh.readline()
                if line.startswith( '#SampleID' ):
                    dataset.metadata.column_names = line.strip().split( '\t' )
            finally:
                dataset_fh.close()

    def set_meta( self, dataset, overwrite = True, skip = None, max_data_lines = None, **kwd ):
        Tabular.set_meta( self, dataset, overwrite, skip, max_data_lines )
        self.set_column_names( dataset )


if __name__ == '__main__':
    import doctest, sys
    doctest.testmod( sys.modules[__name__] )