changeset 30:a90d1915a176

metagenomics.py - require ref.taxonomy sniff to find at least 1 multi-level tax assignment with semicolon separators
author Jim Johnson <jj@umn.edu>
date Thu, 30 May 2013 08:59:17 -0500
parents 9c0cd3b92295
children a3eed59297ea
files mothur/lib/galaxy/datatypes/metagenomics.py
diffstat 1 files changed, 69 insertions(+), 8 deletions(-) [+]
line wrap: on
line diff
--- a/mothur/lib/galaxy/datatypes/metagenomics.py	Tue May 28 07:43:37 2013 -0500
+++ b/mothur/lib/galaxy/datatypes/metagenomics.py	Thu May 30 08:59:17 2013 -0500
@@ -90,6 +90,13 @@
 class OtuList( Otu ):
     file_ext = 'list'
     def __init__(self, **kwd):
+        """
+        # http://www.mothur.org/wiki/List_file
+        The first column is a label that represents the distance that sequences were assigned to OTUs.
+        The number in the second column is the number of OTUs that have been formed. 
+        Subsequent columns contain the names of sequences within each OTU separated by a comma.
+        distance_label	otu_count	OTU1	OTU2	OTUn
+        """
         Otu.__init__( self, **kwd )
     def init_meta( self, dataset, copy_from=None ):
         Otu.init_meta( self, dataset, copy_from=copy_from )
@@ -105,6 +112,9 @@
 class Sabund( Otu ):
     file_ext = 'sabund'
     def __init__(self, **kwd):
+        """
+        # http://www.mothur.org/wiki/Sabund_file
+        """
         Otu.__init__( self, **kwd )
     def init_meta( self, dataset, copy_from=None ):
         Otu.init_meta( self, dataset, copy_from=copy_from )
@@ -150,6 +160,9 @@
 class Rabund( Sabund ):
     file_ext = 'rabund'
     def __init__(self, **kwd):
+        """
+        # http://www.mothur.org/wiki/Rabund_file
+        """
         Sabund.__init__( self, **kwd )
     def init_meta( self, dataset, copy_from=None ):
         Sabund.init_meta( self, dataset, copy_from=copy_from )
@@ -254,6 +267,16 @@
 class SharedRabund( GroupAbund ):
     file_ext = 'shared'
     def __init__(self, **kwd):
+        """
+        # http://www.mothur.org/wiki/Shared_file
+        A shared file is analogous to an rabund file. 
+        The data in a shared file represent the number of times that an OTU is observed in multiple samples. 
+        The structure of a shared file is analogous to an rabund file. 
+        The first column contains the label for the comparison - this will be the value for the first column of each line from the original list file. 
+        The second column contains the group name that designates where the data is coming from for that row. 
+        The third column is the number of OTUs that were found between each of the groups and is the number of columns that follow. 
+        Finally, the remaining columns indicate the number of sequences that belonged to each OTU from that group. 
+        """
         GroupAbund.__init__( self, **kwd )
     def init_meta( self, dataset, copy_from=None ):
         GroupAbund.init_meta( self, dataset, copy_from=copy_from )
@@ -271,6 +294,13 @@
 class RelAbund( GroupAbund ):
     file_ext = 'relabund'
     def __init__(self, **kwd):
+        """
+        # http://www.mothur.org/wiki/Relabund_file
+        The structure of a relabund file is analogous to a shared file.
+        The first column contains the label for the comparison - this will be the value for the first column of each line from the original list file (e.g. final.an.list). 
+        The second column contains the group name that designates where the data is coming from for that row. Next is the number of OTUs that were found between each of the groups and is the number of columns that follow. 
+        Finally, the remaining columns indicate the relative abundance of each OTU from that group.
+        """
         GroupAbund.__init__( self, **kwd )
     def init_meta( self, dataset, copy_from=None ):
         GroupAbund.init_meta( self, dataset, copy_from=copy_from )
@@ -625,7 +655,10 @@
 class Names(Tabular):
     file_ext = 'names'
     def __init__(self, **kwd):
-        """Name file shows the relationship between a representative sequence(col 1)  and the sequences(comma-separated) it represents(col 2)"""
+        """
+        # http://www.mothur.org/wiki/Name_file
+        Name file shows the relationship between a representative sequence (col 1) and the comma-separated sequences it represents (col 2)
+        """
         Tabular.__init__( self, **kwd )
         self.column_names = ['name','representatives']
         self.columns = 2
@@ -642,7 +675,10 @@
     file_ext = 'groups'
     MetadataElement( name="groups", default=[], desc="Group Names", readonly=True, visible=True, no_value=[] )
     def __init__(self, **kwd):
-        """Group file assigns sequence (col 1)  to a group (col 2)"""
+        """
+        # http://www.mothur.org/wiki/Groups_file
+        Group file assigns a sequence (col 1) to a group (col 2)
+        """
         Tabular.__init__( self, **kwd )
         self.column_names = ['name','group']
         self.columns = 2
@@ -662,7 +698,10 @@
 class Design(Group):
     file_ext = 'design'
     def __init__(self, **kwd):
-        """Design file shows the relationship between a group(col 1) and a grouping (col 2), providing a way to merge groups."""
+        """
+        # http://www.mothur.org/wiki/Design_File
+        Design file shows the relationship between a group (col 1) and a grouping (col 2), providing a way to merge groups.
+        """
         Group.__init__( self, **kwd )
 
 class AccNos(Tabular):
@@ -678,6 +717,7 @@
 
     def sniff( self, filename ):
         """
+        # http://www.mothur.org/wiki/Oligos_File
         Determines whether the file is a otu (operational taxonomic unit) format
         """
         try:
@@ -856,6 +896,7 @@
 
     def __init__(self, **kwd):
         """
+        # http://www.mothur.org/wiki/Count_File
         A table with first column names and following columns integer counts
         # Example 1:
         Representative_Sequence total   
@@ -901,6 +942,7 @@
 class RefTaxonomy(Tabular):
     file_ext = 'ref.taxonomy'
     """
+        # http://www.mothur.org/wiki/Taxonomy_outline
         A table with 2 or 3 columns:
         - SequenceName
         - Taxonomy (semicolon-separated taxonomy in descending order)
@@ -920,12 +962,15 @@
 
     def sniff( self, filename ):
         """
-        Determines whether the file is a SequenceTaxonomy
+        Determines whether the file is a Reference Taxonomy
         """
         try:
             pat = '^([^ \t\n\r\x0c\x0b;]+([(]\\d+[)])?(;[^ \t\n\r\x0c\x0b;]+([(]\\d+[)])?)*(;)?)$'
             fh = open( filename )
             count = 0
+            # VAMPS taxonomy files do not require a semicolon after the last taxonomy category,
+            # but assume the file will have some multi-level taxonomy assignments
+            found_semicolons = False
             while True:
                 line = fh.readline()
                 if not line:
@@ -937,13 +982,17 @@
                         return False
                     if not re.match(pat,fields[1]):
                         return False
+                    if not found_semicolons and str(fields[1]).count(';') > 0:
+                        found_semicolons = True
                     if len(fields) == 3:
                         check = int(fields[2])
                     count += 1
-                    if count > 10:
+                    if count > 100:
                         break
             if count > 0:
-                return True
+                # This will be true if at least one entry
+                # has semicolons in the 2nd column
+                return found_semicolons
         except:
             pass
         finally:
@@ -953,6 +1002,7 @@
 class SequenceTaxonomy(RefTaxonomy):
     file_ext = 'seq.taxonomy'
     """
+        # http://www.mothur.org/wiki/Taxonomy_outline
         A table with 2 columns:
         - SequenceName
         - Taxonomy (semicolon-separated taxonomy in descending order)
@@ -1133,6 +1183,7 @@
             line = fh.readline()
             line = line.strip()
             col_cnt = None
+            all_integers = True
             while True:
                 line = fh.readline()
                 line = line.strip()
@@ -1151,13 +1202,22 @@
                         try:
                             for i in range(1, col_cnt):
                                 check = float(fields[i])
+                                # Also test for whether value is an integer
+                                try:
+                                    check = int(fields[i])
+                                except ValueError:
+                                    all_integers = False
                         except ValueError:
                             return False
                         count += 1
                     if count > 10:
-                        return True
+                        break
             if count > 0:
-                return True
+                if not all_integers:
+                    # At least one value was a float
+                    return True
+                else:
+                    return False
         except:
             pass
         finally:
@@ -1169,6 +1229,7 @@
     MetadataElement( name="flow_order", default="TACG", no_value="TACG", desc="Total number of flow values", readonly=False)
     file_ext = 'sff.flow'
     """
+        # http://www.mothur.org/wiki/Flow_file
         The first line is the total number of flow values - 800 for Titanium data. For GS FLX it would be 400. 
         Following lines contain:
         - SequenceName