Mercurial > repos > jjohnson > mothur_toolsuite
comparison mothur/lib/galaxy/datatypes/metagenomics.py @ 0:3202a38e44d9
Migrated tool version 1.15.1 from old tool shed archive to new tool shed repository
author | jjohnson |
---|---|
date | Tue, 07 Jun 2011 17:32:23 -0400 |
parents | |
children | fcc0778f6987 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:3202a38e44d9 |
---|---|
1 """ | |
2 metagenomics datatypes | |
3 James E Johnson - University of Minnesota | |
4 for Mothur | |
5 """ | |
6 | |
7 import data | |
8 import logging, os, sys, time, tempfile, shutil, string, glob, re | |
9 import galaxy.model | |
10 from galaxy.datatypes import metadata | |
11 from galaxy.datatypes import tabular | |
12 from galaxy.datatypes import sequence | |
13 from galaxy.datatypes.metadata import MetadataElement | |
14 from galaxy.datatypes.tabular import Tabular | |
15 from galaxy.datatypes.sequence import Fasta | |
16 from galaxy import util | |
17 from galaxy.datatypes.images import Html | |
18 from sniff import * | |
19 | |
# Module-level logger shared by all datatype classes in this module.
log = logging.getLogger(__name__)
21 | |
22 | |
23 ## Mothur Classes | |
24 | |
class Otu( data.Text ):
    """Mothur OTU (operational taxonomic unit) datatype."""
    file_ext = 'otu'

    def sniff( self, filename ):
        """
        Determines whether the file is in otu (operational taxonomic unit) format.

        Each data line is: label<TAB>count<TAB>name(1..count), i.e. the number
        of trailing columns must equal the integer in column 2.  Returns True
        after 5 valid data lines, or at EOF/first blank line when at least one
        valid line was seen.
        """
        # BUG FIX: guard fh so the finally clause cannot raise NameError when
        # open() itself fails (the original referenced fh unconditionally).
        fh = None
        try:
            fh = open( filename )
            count = 0
            while True:
                line = fh.readline().strip()
                if not line:
                    break  # EOF or blank line ends the scan
                if line[0] != '@':
                    linePieces = line.split('\t')
                    if len(linePieces) < 2:
                        return False
                    try:
                        check = int(linePieces[1])
                    except ValueError:
                        return False
                    # label + count + <count> member names
                    if check + 2 != len(linePieces):
                        return False
                    count += 1
                    if count == 5:
                        return True
            if 0 < count < 5:
                return True
        except Exception:
            # Unreadable file: fall through and report "not this format".
            pass
        finally:
            if fh:
                fh.close()
        return False
62 | |
class OtuList( Otu ):
    # Same layout and sniffer as Otu; only the extension differs.
    file_ext = 'list'
65 | |
class Sabund( Otu ):
    """Mothur species-abundance (sabund) datatype."""
    file_ext = 'sabund'

    def sniff( self, filename ):
        """
        Determines whether the file is in sabund format:
        label<TAB>count[<TAB>value(1..count)]
        where every value column must parse as an integer.
        """
        # BUG FIX: guard fh so finally cannot raise NameError if open() fails.
        fh = None
        try:
            fh = open( filename )
            count = 0
            while True:
                line = fh.readline().strip()
                if not line:
                    break  # EOF or blank line
                if line[0] != '@':
                    linePieces = line.split('\t')
                    if len(linePieces) < 2:
                        return False
                    try:
                        check = int(linePieces[1])
                        if check + 2 != len(linePieces):
                            return False
                        for piece in linePieces[2:]:
                            int(piece)  # each abundance value must be an int
                    except ValueError:
                        return False
                    count += 1
                    if count >= 5:
                        return True
            if 0 < count < 5:
                return True
        except Exception:
            pass
        finally:
            if fh:
                fh.close()
        return False
106 | |
class Rabund( Sabund ):
    # Ranked-abundance variant; same column layout and sniffer as Sabund.
    file_ext = 'rabund'
109 | |
110 | |
class SharedRabund( Rabund ):
    """Mothur shared ranked-abundance datatype."""
    file_ext = 'shared'

    def sniff( self, filename ):
        """
        Determines whether the file is in shared (OTU) format:
        label<TAB>group<TAB>count[<TAB>value(1..count)]
        where every value column must parse as an integer.
        """
        # BUG FIX: guard fh so finally cannot raise NameError if open() fails.
        fh = None
        try:
            fh = open( filename )
            count = 0
            while True:
                line = fh.readline().strip()
                if not line:
                    break  # EOF or blank line
                if line[0] != '@':
                    linePieces = line.split('\t')
                    if len(linePieces) < 3:
                        return False
                    try:
                        check = int(linePieces[2])
                        if check + 3 != len(linePieces):
                            return False
                        for piece in linePieces[3:]:
                            int(piece)  # each abundance value must be an int
                    except ValueError:
                        return False
                    count += 1
                    if count >= 5:
                        return True
            if 0 < count < 5:
                return True
        except Exception:
            pass
        finally:
            if fh:
                fh.close()
        return False
151 | |
class RelAbund( Rabund ):
    """Mothur relative-abundance datatype."""
    file_ext = 'relabund'

    def sniff( self, filename ):
        """
        Determines whether the file is in relative abundance format:
        label<TAB>group<TAB>count[<TAB>value(1..count)]
        where every value column must parse as a float.
        """
        # BUG FIX: guard fh so finally cannot raise NameError if open() fails.
        fh = None
        try:
            fh = open( filename )
            count = 0
            while True:
                line = fh.readline().strip()
                if not line:
                    break  # EOF or blank line
                if line[0] != '@':
                    linePieces = line.split('\t')
                    if len(linePieces) < 3:
                        return False
                    try:
                        check = int(linePieces[2])
                        if check + 3 != len(linePieces):
                            return False
                        for piece in linePieces[3:]:
                            float(piece)  # relative abundances are floats
                    except ValueError:
                        return False
                    count += 1
                    if count >= 5:
                        return True
            if 0 < count < 5:
                return True
        except Exception:
            pass
        finally:
            if fh:
                fh.close()
        return False
192 | |
class SecondaryStructureMap(Tabular):
    """Mothur secondary-structure map: one integer per line pointing at the
    paired row (0 for unpaired)."""
    file_ext = 'map'

    def __init__(self, **kwd):
        """Initialize secondary structure map datatype"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['Map']

    def sniff( self, filename ):
        """
        Determines whether the file is a secondary structure map format:
        a single integer column where pairings are symmetric, i.e. if
        structMap[10] == 380 then structMap[380] == 10 (0 means unpaired).
        """
        # BUG FIX: guard fh so finally cannot raise NameError if open() fails.
        fh = None
        try:
            fh = open( filename )
            line_num = 0
            rowidxmap = {}
            while True:
                line = fh.readline()
                line_num += 1
                line = line.strip()
                if not line:
                    break  # EOF or blank line
                try:
                    pointer = int(line)
                except ValueError:
                    return False
                if pointer > 0:
                    if pointer > line_num:
                        # Remember the forward reference so its symmetry can
                        # be checked when we reach that row.
                        rowidxmap[line_num] = pointer
                    # BUG FIX: the original used bitwise '&', which binds
                    # tighter than '<' and silently corrupted this symmetry
                    # test; logical 'and' is intended.  .get() also avoids a
                    # KeyError on back-references that were never recorded.
                    elif pointer < line_num and rowidxmap.get(pointer) != line_num:
                        return False
            # BUG FIX: the original tested an undefined 'count' variable here,
            # which always raised NameError (hidden by the bare except), so
            # this sniffer could never return True.  Accept when at least one
            # data line was read.
            return line_num > 1
        except Exception:
            pass
        finally:
            if fh:
                fh.close()
        return False
234 | |
class SequenceAlignment( Fasta ):
    """Mothur aligned fasta: every sequence line has the same length."""
    file_ext = 'align'

    def __init__(self, **kwd):
        """Initialize SequenceAlignment datatype"""
        Fasta.__init__( self, **kwd )

    def sniff( self, filename ):
        """
        Determines whether the file is in Mothur align fasta format:
        fasta records in which every sequence line has the same length.
        """
        # BUG FIX: guard fh so finally cannot raise NameError if open() fails.
        fh = None
        try:
            fh = open( filename )
            # BUG FIX: the original bound 'len = -1', shadowing the builtin,
            # so the later len(line) call raised TypeError and this sniffer
            # always returned False.
            seq_len = -1
            while True:
                line = fh.readline()
                if not line:
                    break  # EOF
                line = line.strip()
                if line:
                    if line.startswith( '>' ):
                        # A header must be followed by a non-empty,
                        # non-header sequence line.
                        line = fh.readline().strip()
                        if line == '' or line.startswith( '>' ):
                            break
                        if seq_len < 0:
                            seq_len = len(line)
                        elif seq_len != len(line):
                            return False
                    else:
                        break  # non-empty line that is not a fasta header
            if seq_len > 0:
                return True
        except Exception:
            pass
        finally:
            if fh:
                fh.close()
        return False
274 | |
class AlignCheck( Tabular ):
    """Tabular report produced by Mothur's align.check command."""
    file_ext = 'align.check'

    def __init__(self, **kwd):
        """Initialize AlignCheck datatype"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['name','pound','dash','plus','equal','loop','tilde','total']
        self.column_types = ['str','int','int','int','int','int','int','int']
        self.comment_lines = 1

    def set_meta( self, dataset, overwrite = True, **kwd ):
        """Set metadata directly: one header comment line, the rest are data."""
        total_lines = 0
        if dataset.has_data():
            dataset_fh = open( dataset.file_name )
            for _ in dataset_fh:
                total_lines += 1
            dataset_fh.close()
        dataset.metadata.comment_lines = 1
        # Everything after the single header line counts as data.
        dataset.metadata.data_lines = total_lines - 1 if total_lines > 0 else 0
        dataset.metadata.column_names = self.column_names
        dataset.metadata.column_types = self.column_types
298 | |
class AlignReport(Tabular):
    """
    Tabular report from Mothur's align.seqs command, e.g.:
    QueryName QueryLength TemplateName ... LongestInsert SimBtwnQuery&Template
    AY457915  501         82283        ... 0             97.6
    """
    file_ext = 'align.report'

    def __init__(self, **kwd):
        """Initialize AlignReport datatype"""
        Tabular.__init__( self, **kwd )
        self.column_names = [
            'QueryName', 'QueryLength', 'TemplateName', 'TemplateLength',
            'SearchMethod', 'SearchScore', 'AlignmentMethod', 'QueryStart',
            'QueryEnd', 'TemplateStart', 'TemplateEnd',
            'PairwiseAlignmentLength', 'GapsInQuery', 'GapsInTemplate',
            'LongestInsert', 'SimBtwnQuery&Template',
        ]
312 | |
class BellerophonChimera( Tabular ):
    """Chimera detection report from Mothur's chimera.bellerophon command."""
    file_ext = 'bellerophon.chimera'

    def __init__(self, **kwd):
        """Initialize BellerophonChimera datatype"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['Name','Score','Left','Right']
319 | |
class SecondaryStructureMatch(Tabular):
    """
    Tabular secondary-structure match report, e.g.:
        name    pound   dash    plus    equal   loop    tilde   total
        9_1_12  42      68      8       28      275     420     872
        9_1_14  36      68      6       26      266     422     851
    """
    def __init__(self, **kwd):
        """Initialize SecondaryStructureMatch datatype"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['name','pound','dash','plus','equal','loop','tilde','total']
333 | |
class DistanceMatrix(data.Text):
    """Base class for Mothur distance-matrix datatypes."""
    file_ext = 'dist'

    # Metadata: the number of sequences described by the matrix.
    MetadataElement( name="sequence_count", default=0, desc="Number of sequences", readonly=False, optional=True, no_value=0 )
338 | |
339 | |
class LowerTriangleDistanceMatrix(DistanceMatrix):
    """Phylip lower-triangle distance matrix."""
    file_ext = 'lower.dist'

    def __init__(self, **kwd):
        """Initialize lower-triangle distance matrix datatype"""
        DistanceMatrix.__init__( self, **kwd )

    def sniff( self, filename ):
        """
        Determines whether the file is a lower-triangle distance matrix
        (phylip) format.
        The first line has the number of sequences in the matrix.
        The remaining lines have the sequence name followed by a list of
        distances from all preceding sequences, e.g.:
        5
        U68589
        U68590 0.3371
        U68591 0.3609 0.3782
        U68592 0.4155 0.3197 0.4148
        U68593 0.2872 0.1690 0.3361 0.2842

        BUG FIX: the original sniffer required every line to have exactly
        three tab-separated columns, which rejects the documented format
        above (even its own docstring example fails at the header line) and
        instead matched the pairwise (pair.dist) layout.
        """
        # Guard fh so finally cannot raise NameError if open() fails.
        fh = None
        try:
            fh = open( filename )
            # Header line: the sequence count.
            seq_cnt = int(fh.readline().strip())
            row = 0
            while True:
                line = fh.readline().strip()
                if not line:
                    break  # EOF or blank line
                pieces = line.split()
                # Data row i (1-based) holds a name plus i-1 distances.
                if len(pieces) != row + 1:
                    return False
                try:
                    for piece in pieces[1:]:
                        float(piece)
                except ValueError:
                    return False
                row += 1
                if row == 5:
                    return True
            if 0 < row <= seq_cnt:
                return True
        except Exception:
            # Includes a non-integer header line (ValueError from int()).
            pass
        finally:
            if fh:
                fh.close()
        return False
386 | |
class SquareDistanceMatrix(DistanceMatrix,Tabular):
    """Square (phylip) distance matrix."""
    file_ext = 'square.dist'
    sequence_count = -1

    def __init__(self, **kwd):
        """Initialize square distance matrix datatype"""
        Tabular.__init__( self, **kwd )

    def init_meta( self, dataset, copy_from=None ):
        data.Text.init_meta( self, dataset, copy_from=copy_from )

    def set_meta( self, dataset, overwrite = True, skip = None, **kwd ):
        # NOTE(review): this assigns 'sequences' although the metadata element
        # declared on DistanceMatrix is 'sequence_count' -- confirm intent.
        dataset.metadata.sequences = 0

    def sniff( self, filename ):
        """
        Determines whether the file is a square distance matrix
        (Column-formatted distance matrix) format.
        The first line has the number of sequences in the matrix.
        Each following line has the sequence name in the first column plus
        one distance column per sequence, in matrix row order, e.g.:
        3
        U68589 0.0000 0.3371 0.3610
        U68590 0.3371 0.0000 0.3783
        U68591 0.3610 0.3783 0.0000
        """
        # Guard fh so finally cannot raise NameError if open() fails.
        fh = None
        try:
            fh = open( filename )
            count = 0
            sequence_count = int(fh.readline().strip())
            # BUG FIX: the original computed 'seq_cnt + 1' with an undefined
            # name 'seq_cnt', so every call raised NameError (hidden by the
            # bare except) and this sniffer always returned False.
            col_cnt = sequence_count + 1
            while True:
                line = fh.readline().strip()
                if not line:
                    break  # EOF or blank line
                if line[0] != '@':
                    linePieces = line.split('\t')
                    if len(linePieces) != col_cnt:
                        return False
                    try:
                        for i in range(1, col_cnt):
                            float(linePieces[i])
                    except ValueError:
                        return False
                    count += 1
                    if count == 5:
                        return True
            if 0 < count < 5:
                return True
        except Exception:
            pass
        finally:
            if fh:
                fh.close()
        return False
443 | |
class PairwiseDistanceMatrix(DistanceMatrix,Tabular):
    """Column-formatted (pairwise) distance matrix."""
    file_ext = 'pair.dist'

    def __init__(self, **kwd):
        """Initialize pairwise distance matrix datatype"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['Sequence','Sequence','Distance']
        self.column_types = ['str','str','float']
        self.comment_lines = 1

    def sniff( self, filename ):
        """
        Determines whether the file is a pairwise distance matrix
        (Column-formatted distance matrix) format: the first two columns
        hold sequence names and the third the distance between them.
        """
        # BUG FIX: guard fh so finally cannot raise NameError if open() fails.
        fh = None
        try:
            fh = open( filename )
            count = 0
            while True:
                line = fh.readline().strip()
                if not line:
                    break  # EOF or blank line
                if line[0] != '@':
                    linePieces = line.split('\t')
                    if len(linePieces) != 3:
                        return False
                    try:
                        float(linePieces[2])
                    except ValueError:
                        return False
                    count += 1
                    if count == 5:
                        return True
            if 0 < count < 5:
                return True
        except Exception:
            pass
        finally:
            if fh:
                fh.close()
        return False
486 | |
class Alignment(Tabular):
    # NOTE(review): 'align' is also claimed by SequenceAlignment above, and
    # these column names duplicate AlignCheck's -- confirm which class is
    # actually intended to own this extension.
    file_ext = 'align'
    def __init__(self, **kwd):
        """Initialize Alignment datatype"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['name','pound','dash','plus','equal','loop','tilde','total']
493 | |
class AlignCheck(Tabular):
    # NOTE(review): this duplicates (and, being defined later, silently
    # replaces) the richer AlignCheck class defined earlier in this module,
    # discarding its column_types, comment_lines and set_meta logic --
    # confirm whether this second definition should be removed.
    file_ext = 'align.check'
    def __init__(self, **kwd):
        """Initialize AlignCheck datatype"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['name','pound','dash','plus','equal','loop','tilde','total']
500 | |
class Names(Tabular):
    """Mothur names file: representative sequence mapped to its members."""
    file_ext = 'names'

    def __init__(self, **kwd):
        """Two columns: a representative sequence (col 1) and the sequences it represents (col 2)."""
        Tabular.__init__( self, **kwd )
        self.column_names = ['name','representatives']
507 | |
class Summary(Tabular):
    file_ext = 'summary'
    def __init__(self, **kwd):
        """Per-sequence summary table: name, start/end positions, base counts, ambiguities, homopolymer length."""
        # NOTE(review): the original docstring here was copy-pasted from the
        # Names datatype and described the wrong file format.
        Tabular.__init__( self, **kwd )
        self.column_names = ['seqname','start','end','nbases','ambigs','polymer']
514 | |
class Group(Tabular):
    file_ext = 'groups'
    def __init__(self, **kwd):
        """Group file: maps each sequence name (col 1) to its group/sample (col 2)."""
        # NOTE(review): the original docstring here was copy-pasted from the
        # Names datatype and described the wrong file format.
        Tabular.__init__( self, **kwd )
        self.column_names = ['name','group']
521 | |
class AccNos(Tabular):
    """Mothur accnos file: a single column of sequence names."""
    file_ext = 'accnos'

    def __init__(self, **kwd):
        """A list of names"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['name']
528 | |
class Oligos( data.Text ):
    """Mothur oligos file describing primers and barcodes."""
    file_ext = 'oligos'

    def sniff( self, filename ):
        """
        Determines whether the file is a Mothur oligos file.
        Non-comment lines are either:
          forward|reverse<TAB>sequence
          barcode<TAB>sequence<TAB>group
        """
        # BUG FIX: guard fh so finally cannot raise NameError if open() fails.
        fh = None
        try:
            fh = open( filename )
            count = 0
            while True:
                line = fh.readline().strip()
                if not line:
                    break  # EOF or blank line
                if line[0] != '#':
                    linePieces = line.split('\t')
                    if len(linePieces) == 2 and re.match('forward|reverse',linePieces[0]):
                        count += 1
                    elif len(linePieces) == 3 and re.match('barcode',linePieces[0]):
                        count += 1
                    else:
                        return False
                if count > 20:
                    return True  # enough evidence; stop scanning early
            if count > 0:
                return True
        except Exception:
            pass
        finally:
            if fh:
                fh.close()
        return False
564 | |
class Frequency(Tabular):
    """Position/frequency table used for chimera analysis."""
    file_ext = 'freq'

    def __init__(self, **kwd):
        """Two columns: alignment position and base frequency."""
        Tabular.__init__( self, **kwd )
        self.column_names = ['position','frequency']
        self.column_types = ['int','float']

    def sniff( self, filename ):
        """
        Determines whether the file is a frequency tabular format for
        chimera analysis, e.g.:
        #1.14.0
        0    0.000
        1    0.000
        ...
        155  0.975
        """
        # BUG FIX: guard fh so finally cannot raise NameError if open() fails.
        fh = None
        try:
            fh = open( filename )
            count = 0
            while True:
                line = fh.readline().strip()
                if not line:
                    break  # EOF or blank line
                if line[0] != '#':
                    linePieces = line.split('\t')
                    try:
                        int(linePieces[0])
                        float(linePieces[1])
                    except (ValueError, IndexError):
                        # Narrowed from a bare except: only a malformed or
                        # too-short row should reject the file here.
                        return False
                    count += 1
                if count > 20:
                    return True  # enough evidence; stop scanning early
            if count > 0:
                return True
        except Exception:
            pass
        finally:
            if fh:
                fh.close()
        return False
609 | |
class Quantile(Tabular):
    """Quantiles for chimera analysis."""
    file_ext = 'quan'
    # BUG FIX: the desc strings of these two metadata elements were swapped
    # in the original ('filtered' claimed to use a mask and vice versa).
    MetadataElement( name="filtered", default=False, no_value=False, optional=True , desc="Quantiles calculated using a frequency filter", readonly=True)
    MetadataElement( name="masked", default=False, no_value=False, optional=True , desc="Quantiles calculated using a mask", readonly=True)

    def __init__(self, **kwd):
        """Quantiles for chimera analysis"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['num','ten','twentyfive','fifty','seventyfive','ninetyfive','ninetynine']
        self.column_types = ['int','float','float','float','float','float','float']

    def set_meta( self, dataset, overwrite = True, skip = None, **kwd ):
        # NOTE(review): only logs; no metadata is actually set here.
        log.info( "Mothur Quantile set_meta %s" % kwd)

    def sniff( self, filename ):
        """
        Determines whether the file is a quantiles tabular format for
        chimera analysis: an integer followed by six float columns, e.g.:
        1 0 0 0 0 0 0
        2 0.309198 0.309198 0.37161 0.37161 0.37161 0.37161
        3 0.510982 0.563213 0.693529 0.858939 1.07442 1.20608
        """
        # BUG FIX: guard fh so finally cannot raise NameError if open() fails.
        fh = None
        try:
            fh = open( filename )
            count = 0
            while True:
                line = fh.readline().strip()
                if not line:
                    break  # EOF or blank line
                if line[0] != '#':
                    linePieces = line.split('\t')
                    if len(linePieces) < 7:
                        return False
                    try:
                        int(linePieces[0])
                        for piece in linePieces[1:7]:
                            float(piece)
                    except ValueError:
                        return False
                    count += 1
                if count > 10:
                    return True  # enough evidence; stop scanning early
            if count > 0:
                return True
        except Exception:
            pass
        finally:
            if fh:
                fh.close()
        return False
661 | |
class FilteredQuantile(Quantile):
    # Quantiles computed after applying a frequency filter.
    file_ext = 'filtered.quan'
    def __init__(self, **kwd):
        """Quantiles for chimera analysis"""
        Quantile.__init__( self, **kwd )
        # NOTE(review): sets an instance attribute, not the 'filtered'
        # metadata element declared on Quantile -- confirm intent.
        self.filtered = True
668 | |
class MaskedQuantile(Quantile):
    # Quantiles computed using a mask (no frequency filter).
    file_ext = 'masked.quan'
    def __init__(self, **kwd):
        """Quantiles for chimera analysis"""
        Quantile.__init__( self, **kwd )
        # NOTE(review): sets instance attributes, not the metadata elements
        # declared on Quantile -- confirm intent.
        self.masked = True
        self.filtered = False
676 | |
class FilteredMaskedQuantile(Quantile):
    # Quantiles computed using both a frequency filter and a mask.
    file_ext = 'filtered.masked.quan'
    def __init__(self, **kwd):
        """Quantiles for chimera analysis"""
        Quantile.__init__( self, **kwd )
        # NOTE(review): sets instance attributes, not the metadata elements
        # declared on Quantile -- confirm intent.
        self.masked = True
        self.filtered = True
684 | |
class LaneMask(data.Text):
    """Lane mask filter: a single line consisting of zeros and ones."""
    file_ext = 'filter'

    def sniff( self, filename ):
        """
        Determines whether the file is a lane mask filter: 1 line consisting
        of zeros and ones.
        """
        # BUG FIX: the original matched the undefined name 'line' instead of
        # the read buffer (NameError on every call) and its finally clause
        # called the undefined 'close(fh)', so this sniffer always raised
        # instead of returning.  fh is also guarded so finally cannot raise
        # when open() itself fails.
        fh = None
        try:
            fh = open( filename )
            saw_data = False
            while True:
                buff = fh.read(1000)
                if not buff:
                    break  # EOF
                # strip() so the single line's trailing newline does not fail
                # the 0/1 test; any interior newline (a second line) still
                # rejects the file.
                if not re.match('^[01]+$', buff.strip()):
                    return False
                saw_data = True
            return saw_data
        except Exception:
            pass
        finally:
            if fh:
                fh.close()
        return False
707 | |
class SequenceTaxonomy(Tabular):
    """Per-sequence taxonomy assignment."""
    file_ext = 'taxonomy'

    def __init__(self, **kwd):
        """Two columns: sequence name and its taxonomy string."""
        Tabular.__init__( self, **kwd )
        self.column_names = ['name','taxonomy']
714 | |
class ConsensusTaxonomy(Tabular):
    """Consensus taxonomy per OTU."""
    file_ext = 'cons.taxonomy'

    def __init__(self, **kwd):
        """Three columns: OTU label, member count, and consensus taxonomy."""
        Tabular.__init__( self, **kwd )
        self.column_names = ['OTU','count','taxonomy']
721 | |
class TaxonomySummary(Tabular):
    """Summary of taxon classifications."""
    file_ext = 'tax.summary'

    def __init__(self, **kwd):
        """A Summary of taxon classification"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['taxlevel','rankID','taxon','daughterlevels','total']
728 | |
class Phylip(data.Text):
    """Phylip multiple-sequence alignment (interleaved or sequential)."""
    file_ext = 'phy'

    def sniff( self, filename ):
        """
        Determines whether the file is in Phylip format (Interleaved or
        Sequential).
        The first line of the input file contains the number of species and
        the number of characters, in free format, separated by blanks (not
        by commas).  The information for each species follows, starting with
        a ten-character species name (which can include punctuation marks
        and blanks), and continuing with the characters for that species.
        http://evolution.genetics.washington.edu/phylip/doc/main.html#inputfiles

        Only the header line is validated; the original TODO to validate the
        species data lines is preserved below.
        """
        # BUG FIX: the original finally clause called the undefined
        # 'close(fh)', so every failed sniff raised NameError instead of
        # returning False; fh is also guarded so finally cannot raise when
        # open() itself fails.
        fh = None
        try:
            fh = open( filename )
            # Header: species count and sequence length, blank-separated.
            linePieces = fh.readline().strip().split()
            count = int(linePieces[0])
            seq_len = int(linePieces[1])
            # TODO: check the data lines -- each record starts with a
            # 10-character name followed by sequence characters (spaces
            # allowed within the sequence).
            return True
        except Exception:
            # Missing/short header or non-integer fields: not phylip.
            pass
        finally:
            if fh:
                fh.close()
        return False
782 | |
783 | |
784 ## Qiime Classes | |
785 | |
class MetadataMapping(Tabular):
    """
    Qiime metadata mapping file:
    http://qiime.sourceforge.net/documentation/file_formats.html#mapping-file-overview
    Information about the samples necessary to perform the data analysis.
    """
    MetadataElement( name="column_names", default=[], desc="Column Names", readonly=False, visible=True, no_value=[] )
    file_ext = 'mapping'

    def __init__(self, **kwd):
        """
        Columns typically include:
        #SampleID, BarcodeSequence, LinkerPrimerSequence, ..., Description
        """
        Tabular.__init__( self, **kwd )

    def sniff( self, filename ):
        """
        Determines whether the file is a qiime mapping file.
        Just checking for an appropriate header line for now, could be improved.
        """
        # BUG FIX: the original read from the undefined name 'dataset_fh'
        # (NameError on every call); even if fixed, it would loop forever at
        # EOF because readline() returning '' never broke the loop; and its
        # finally clause called the undefined 'close(fh)'.
        fh = None
        try:
            pat = '#SampleID(\t[a-zA-Z][a-zA-Z0-9_]*)*\tDescription'
            fh = open( filename )
            while True:
                line = fh.readline()
                if not line:
                    break  # EOF
                if re.match(pat, line):
                    return True
        except Exception:
            pass
        finally:
            if fh:
                fh.close()
        return False

    def set_column_names(self, dataset):
        # Pull column names from the '#SampleID...' header line, if present.
        if dataset.has_data():
            dataset_fh = open( dataset.file_name )
            line = dataset_fh.readline()
            if line.startswith('#SampleID'):
                dataset.metadata.column_names = line.strip().split('\t')
            dataset_fh.close()

    def set_meta( self, dataset, overwrite = True, skip = None, max_data_lines = None, **kwd ):
        Tabular.set_meta(self, dataset, overwrite, skip, max_data_lines)
        self.set_column_names(dataset)
827 | |
# Run this module's doctests when executed directly.
if __name__ == '__main__':
    import doctest
    import sys

    doctest.testmod(sys.modules[__name__])
831 |