mothur_toolsuite: mothur/lib/galaxy/datatypes/metagenomics.py comparison

comparison mothur/lib/galaxy/datatypes/metagenomics.py @ 30:a90d1915a176

metagenomics.py - require ref.taxonomy sniff to find at least 1 multi-level tax assignment with semicolon separators

author	Jim Johnson <jj@umn.edu>
date	Thu, 30 May 2013 08:59:17 -0500
parents	9c0cd3b92295
children	ec8df51e841a

comparison

equal deleted inserted replaced

-:9c0cd3b92295
+:a90d1915a176
 return False
 class OtuList( Otu ):
 file_ext = 'list'
 def __init__(self, **kwd):
+"""
+# http://www.mothur.org/wiki/List_file
+The first column is a label that represents the distance that sequences were assigned to OTUs.
+The number in the second column is the number of OTUs that have been formed.
+Subsequent columns contain the names of sequences within each OTU separated by a comma.
+distance_label	otu_count	OTU1	OTU2	OTUn
+"""
 Otu.__init__( self, **kwd )
 def init_meta( self, dataset, copy_from=None ):
 Otu.init_meta( self, dataset, copy_from=copy_from )
 def set_meta( self, dataset, overwrite = True, **kwd ):
 Otu.set_meta(self,dataset, overwrite = True, **kwd )
 """
 class Sabund( Otu ):
 file_ext = 'sabund'
 def __init__(self, **kwd):
+"""
+# http://www.mothur.org/wiki/Sabund_file
+"""
 Otu.__init__( self, **kwd )
 def init_meta( self, dataset, copy_from=None ):
 Otu.init_meta( self, dataset, copy_from=copy_from )
 def sniff( self, filename ):
 """
 return False
 class Rabund( Sabund ):
 file_ext = 'rabund'
 def __init__(self, **kwd):
+"""
+# http://www.mothur.org/wiki/Rabund_file
+"""
 Sabund.__init__( self, **kwd )
 def init_meta( self, dataset, copy_from=None ):
 Sabund.init_meta( self, dataset, copy_from=copy_from )
 class GroupAbund( Otu ):
 return False
 class SharedRabund( GroupAbund ):
 file_ext = 'shared'
 def __init__(self, **kwd):
+"""
+# http://www.mothur.org/wiki/Shared_file
+A shared file is analogous to an rabund file.
+The data in a shared file represent the number of times that an OTU is observed in multiple samples.
+The structure of a shared file is analogous to an rabund file.
+The first column contains the label for the comparison - this will be the value for the first column of each line from the original list file.
+The second column contains the group name that designates where the data is coming from for that row.
+The third column is the number of OTUs that were found between each of the groups and is the number of columns that follow.
+Finally, the remaining columns indicate the number of sequences that belonged to each OTU from that group.
+"""
 GroupAbund.__init__( self, **kwd )
 def init_meta( self, dataset, copy_from=None ):
 GroupAbund.init_meta( self, dataset, copy_from=copy_from )
 def sniff( self, filename ):
 """
 class RelAbund( GroupAbund ):
 file_ext = 'relabund'
 def __init__(self, **kwd):
+"""
+# http://www.mothur.org/wiki/Relabund_file
+The structure of a relabund file is analogous to a shared file.
+The first column contains the label for the comparison - this will be the value for the first column of each line from the original list file (e.g. final.an.list).
+The second column contains the group name that designates where the data is coming from for that row. Next is the number of OTUs that were found between each of the groups and is the number of columns that follow.
+Finally, the remaining columns indicate the relative abundance of each OTU from that group.
+"""
 GroupAbund.__init__( self, **kwd )
 def init_meta( self, dataset, copy_from=None ):
 GroupAbund.init_meta( self, dataset, copy_from=copy_from )
 def sniff( self, filename ):
 """
 self.columns = 8
 class Names(Tabular):
 file_ext = 'names'
 def __init__(self, **kwd):
-"""Name file shows the relationship between a representative sequence(col 1)  and the sequences(comma-separated) it represents(col 2)"""
+"""
+# http://www.mothur.org/wiki/Name_file
+Name file shows the relationship between a representative sequence (col 1) and the sequences (comma-separated) it represents (col 2)
+"""
 Tabular.__init__( self, **kwd )
 self.column_names = ['name','representatives']
 self.columns = 2
 class Summary(Tabular):
 class Group(Tabular):
 file_ext = 'groups'
 MetadataElement( name="groups", default=[], desc="Group Names", readonly=True, visible=True, no_value=[] )
 def __init__(self, **kwd):
-"""Group file assigns sequence (col 1)  to a group (col 2)"""
+"""
+# http://www.mothur.org/wiki/Groups_file
+Group file assigns sequence (col 1)  to a group (col 2)
+"""
 Tabular.__init__( self, **kwd )
 self.column_names = ['name','group']
 self.columns = 2
 def set_meta( self, dataset, overwrite = True, skip = None, max_data_lines = None, **kwd ):
 Tabular.set_meta(self, dataset, overwrite, skip, max_data_lines)
 fh.close()
 class Design(Group):
 file_ext = 'design'
 def __init__(self, **kwd):
-"""Design file shows the relationship between a group(col 1) and a grouping (col 2), providing a way to merge groups."""
+"""
+# http://www.mothur.org/wiki/Design_File
+Design file shows the relationship between a group(col 1) and a grouping (col 2), providing a way to merge groups.
+"""
 Group.__init__( self, **kwd )
 class AccNos(Tabular):
 file_ext = 'accnos'
 def __init__(self, **kwd):
 class Oligos( Text ):
 file_ext = 'oligos'
 def sniff( self, filename ):
 """
+# http://www.mothur.org/wiki/Oligos_File
 Determines whether the file is an OTU (operational taxonomic unit) format
 """
 try:
 fh = open( filename )
 count = 0
 MetadataElement( name="groups", default=[], desc="Group Names", readonly=True, visible=True, no_value=[] )
 file_ext = 'count_table'
 def __init__(self, **kwd):
 """
+# http://www.mothur.org/wiki/Count_File
 A table with first column names and following columns integer counts
 # Example 1:
 Representative_Sequence total
 U68630  1
 U68595  1
 close(fh)
 class RefTaxonomy(Tabular):
 file_ext = 'ref.taxonomy'
 """
+# http://www.mothur.org/wiki/Taxonomy_outline
 A table with 2 or 3 columns:
 - SequenceName
 - Taxonomy (semicolon-separated taxonomy in descending order)
 - integer ?
 Example: 2-column ( http://www.mothur.org/wiki/Taxonomy_outline )
 Tabular.__init__( self, **kwd )
 self.column_names = ['name','taxonomy']
 def sniff( self, filename ):
 """
-Determines whether the file is a SequenceTaxonomy
+Determines whether the file is a Reference Taxonomy
 """
 try:
 pat = '^([^ \t\n\r\x0c\x0b;]+([(]\\d+[)])?(;[^ \t\n\r\x0c\x0b;]+([(]\\d+[)])?)*(;)?)$'
 fh = open( filename )
 count = 0
+# VAMPS taxonomy files do not require a semicolon after the last taxonomy category,
+# but assume the file will have some multi-level taxonomy assignments
+found_semicolons = False
 while True:
 line = fh.readline()
 if not line:
 break #EOF
 line = line.strip()
 fields = line.split('\t')
 if not (2 <= len(fields) <= 3):
 return False
 if not re.match(pat,fields[1]):
 return False
+if not found_semicolons and str(fields[1]).count(';') > 0:
+found_semicolons = True
 if len(fields) == 3:
 check = int(fields[2])
 count += 1
-if count > 10:
+if count > 100:
 break
 if count > 0:
-return True
+# This will be true if at least one entry
+# has semicolons in the 2nd column
+return found_semicolons
 except:
 pass
 finally:
 fh.close()
 return False
 class SequenceTaxonomy(RefTaxonomy):
 file_ext = 'seq.taxonomy'
 """
+# http://www.mothur.org/wiki/Taxonomy_outline
 A table with 2 columns:
 - SequenceName
 - Taxonomy (semicolon-separated taxonomy in descending order)
 Example:
 X56533.1        Eukaryota;Alveolata;Ciliophora;Intramacronucleata;Oligohymenophorea;Hymenostomatida;Tetrahymenina;Glaucomidae;Glaucoma;
 fh = open( filename )
 count = 0
 line = fh.readline()
 line = line.strip()
 col_cnt = None
+all_integers = True
 while True:
 line = fh.readline()
 line = line.strip()
 if not line:
 break #EOF
 if len(fields) != col_cnt :
 return False
 try:
 for i in range(1, col_cnt):
 check = float(fields[i])
+# Also test for whether value is an integer
+try:
+check = int(fields[i])
+except ValueError:
+all_integers = False
 except ValueError:
 return False
 count += 1
 if count > 10:
-return True
+break
 if count > 0:
-return True
+if not all_integers:
+# At least one value was a float
+return True
+else:
+return False
 except:
 pass
 finally:
 fh.close()
 return False
 class SffFlow(Tabular):
 MetadataElement( name="flow_values", default="", no_value="", optional=True , desc="Total number of flow values", readonly=True)
 MetadataElement( name="flow_order", default="TACG", no_value="TACG", desc="Total number of flow values", readonly=False)
 file_ext = 'sff.flow'
 """
+# http://www.mothur.org/wiki/Flow_file
 The first line is the total number of flow values - 800 for Titanium data. For GS FLX it would be 400.
 Following lines contain:
 - SequenceName
 - the number of useable flows as defined by 454's software
 - the flow intensity for each base going in the order of TACG.

Mercurial > repos > jjohnson > mothur_toolsuite

comparison mothur/lib/galaxy/datatypes/metagenomics.py @ 30:a90d1915a176