Mercurial > repos > jjohnson > mothur_toolsuite
changeset 30:a90d1915a176
metagenomics.py - require ref.taxonomy sniff to find at least 1 multi-level tax assignment with semicolon separators
author | Jim Johnson <jj@umn.edu> |
---|---|
date | Thu, 30 May 2013 08:59:17 -0500 |
parents | 9c0cd3b92295 |
children | a3eed59297ea |
files | mothur/lib/galaxy/datatypes/metagenomics.py |
diffstat | 1 files changed, 69 insertions(+), 8 deletions(-) [+] |
line wrap: on
line diff
--- a/mothur/lib/galaxy/datatypes/metagenomics.py Tue May 28 07:43:37 2013 -0500 +++ b/mothur/lib/galaxy/datatypes/metagenomics.py Thu May 30 08:59:17 2013 -0500 @@ -90,6 +90,13 @@ class OtuList( Otu ): file_ext = 'list' def __init__(self, **kwd): + """ + # http://www.mothur.org/wiki/List_file + The first column is a label that represents the distance that sequences were assigned to OTUs. + The number in the second column is the number of OTUs that have been formed. + Subsequent columns contain the names of sequences within each OTU separated by a comma. + distance_label otu_count OTU1 OTU2 OTUn + """ Otu.__init__( self, **kwd ) def init_meta( self, dataset, copy_from=None ): Otu.init_meta( self, dataset, copy_from=copy_from ) @@ -105,6 +112,9 @@ class Sabund( Otu ): file_ext = 'sabund' def __init__(self, **kwd): + """ + # http://www.mothur.org/wiki/Sabund_file + """ Otu.__init__( self, **kwd ) def init_meta( self, dataset, copy_from=None ): Otu.init_meta( self, dataset, copy_from=copy_from ) @@ -150,6 +160,9 @@ class Rabund( Sabund ): file_ext = 'rabund' def __init__(self, **kwd): + """ + # http://www.mothur.org/wiki/Rabund_file + """ Sabund.__init__( self, **kwd ) def init_meta( self, dataset, copy_from=None ): Sabund.init_meta( self, dataset, copy_from=copy_from ) @@ -254,6 +267,16 @@ class SharedRabund( GroupAbund ): file_ext = 'shared' def __init__(self, **kwd): + """ + # http://www.mothur.org/wiki/Shared_file + A shared file is analogous to an rabund file. + The data in a shared file represent the number of times that an OTU is observed in multiple samples. + The structure of a shared file is analogous to an rabund file. + The first column contains the label for the comparison - this will be the value for the first column of each line from the original list file. + The second column contains the group name that designates where the data is coming from for that row. 
+ The third column is the number of OTUs that were found between each of the groups and is the number of columns that follow. + Finally, the remaining columns indicate the number of sequences that belonged to each OTU from that group. + """ GroupAbund.__init__( self, **kwd ) def init_meta( self, dataset, copy_from=None ): GroupAbund.init_meta( self, dataset, copy_from=copy_from ) @@ -271,6 +294,13 @@ class RelAbund( GroupAbund ): file_ext = 'relabund' def __init__(self, **kwd): + """ + # http://www.mothur.org/wiki/Relabund_file + The structure of a relabund file is analogous to a shared file. + The first column contains the label for the comparison - this will be the value for the first column of each line from the original list file (e.g. final.an.list). + The second column contains the group name that designates where the data is coming from for that row. Next is the number of OTUs that were found between each of the groups and is the number of columns that follow. + Finally, the remaining columns indicate the relative abundance of each OTU from that group. 
+ """ GroupAbund.__init__( self, **kwd ) def init_meta( self, dataset, copy_from=None ): GroupAbund.init_meta( self, dataset, copy_from=copy_from ) @@ -625,7 +655,10 @@ class Names(Tabular): file_ext = 'names' def __init__(self, **kwd): - """Name file shows the relationship between a representative sequence(col 1) and the sequences(comma-separated) it represents(col 2)""" + """ + # http://www.mothur.org/wiki/Name_file + Name file shows the relationship between a representative sequence(col 1) and the sequences(comma-separated) it represents(col 2) + """ Tabular.__init__( self, **kwd ) self.column_names = ['name','representatives'] self.columns = 2 @@ -642,7 +675,10 @@ file_ext = 'groups' MetadataElement( name="groups", default=[], desc="Group Names", readonly=True, visible=True, no_value=[] ) def __init__(self, **kwd): - """Group file assigns sequence (col 1) to a group (col 2)""" + """ + # http://www.mothur.org/wiki/Groups_file + Group file assigns sequence (col 1) to a group (col 2) + """ Tabular.__init__( self, **kwd ) self.column_names = ['name','group'] self.columns = 2 @@ -662,7 +698,10 @@ class Design(Group): file_ext = 'design' def __init__(self, **kwd): - """Design file shows the relationship between a group(col 1) and a grouping (col 2), providing a way to merge groups.""" + """ + # http://www.mothur.org/wiki/Design_File + Design file shows the relationship between a group(col 1) and a grouping (col 2), providing a way to merge groups. 
+ """ Group.__init__( self, **kwd ) class AccNos(Tabular): @@ -678,6 +717,7 @@ def sniff( self, filename ): """ + # http://www.mothur.org/wiki/Oligos_File Determines whether the file is a otu (operational taxonomic unit) format """ try: @@ -856,6 +896,7 @@ def __init__(self, **kwd): """ + # http://www.mothur.org/wiki/Count_File A table with first column names and following columns integer counts # Example 1: Representative_Sequence total @@ -901,6 +942,7 @@ class RefTaxonomy(Tabular): file_ext = 'ref.taxonomy' """ + # http://www.mothur.org/wiki/Taxonomy_outline A table with 2 or 3 columns: - SequenceName - Taxonomy (semicolon-separated taxonomy in descending order) @@ -920,12 +962,15 @@ def sniff( self, filename ): """ - Determines whether the file is a SequenceTaxonomy + Determines whether the file is a Reference Taxonomy """ try: pat = '^([^ \t\n\r\x0c\x0b;]+([(]\\d+[)])?(;[^ \t\n\r\x0c\x0b;]+([(]\\d+[)])?)*(;)?)$' fh = open( filename ) count = 0 + # VAMPS taxonomy files do not require a semicolon after the last taxonomy category + # but assume the file will have some multi-level taxonomy assignments + found_semicolons = False while True: line = fh.readline() if not line: @@ -937,13 +982,17 @@ return False if not re.match(pat,fields[1]): return False + if not found_semicolons and str(fields[1]).count(';') > 0: + found_semicolons = True if len(fields) == 3: check = int(fields[2]) count += 1 - if count > 10: + if count > 100: break if count > 0: - return True + # This will be true if at least one entry + # has semicolons in the 2nd column + return found_semicolons except: pass finally: @@ -953,6 +1002,7 @@ class SequenceTaxonomy(RefTaxonomy): file_ext = 'seq.taxonomy' """ + # http://www.mothur.org/wiki/Taxonomy_outline A table with 2 columns: - SequenceName - Taxonomy (semicolon-separated taxonomy in descending order) @@ -1133,6 +1183,7 @@ line = fh.readline() line = line.strip() col_cnt = None + all_integers = True while True: line = fh.readline() line = 
line.strip() @@ -1151,13 +1202,22 @@ try: for i in range(1, col_cnt): check = float(fields[i]) + # Also test for whether value is an integer + try: + check = int(fields[i]) + except ValueError: + all_integers = False except ValueError: return False count += 1 if count > 10: - return True + break if count > 0: - return True + if not all_integers: + # At least one value was a float + return True + else: + return False except: pass finally: @@ -1169,6 +1229,7 @@ MetadataElement( name="flow_order", default="TACG", no_value="TACG", desc="Total number of flow values", readonly=False) file_ext = 'sff.flow' """ + # http://www.mothur.org/wiki/Flow_file The first line is the total number of flow values - 800 for Titanium data. For GS FLX it would be 400. Following lines contain: - SequenceName