comparison mothur/lib/galaxy/datatypes/metagenomics.py @ 30:a90d1915a176

metagenomics.py - require ref.taxonomy sniff to find at least 1 multi-level tax assignment with semicolon separators
author Jim Johnson <jj@umn.edu>
date Thu, 30 May 2013 08:59:17 -0500
parents 9c0cd3b92295
children ec8df51e841a
comparison
equal deleted inserted replaced
29:9c0cd3b92295 30:a90d1915a176
88 return False 88 return False
89 89
90 class OtuList( Otu ): 90 class OtuList( Otu ):
91 file_ext = 'list' 91 file_ext = 'list'
92 def __init__(self, **kwd): 92 def __init__(self, **kwd):
93 """
94 # http://www.mothur.org/wiki/List_file
95 The first column is a label that represents the distance that sequences were assigned to OTUs.
96 The number in the second column is the number of OTUs that have been formed.
97 Subsequent columns contain the names of sequences within each OTU separated by a comma.
98 distance_label otu_count OTU1 OTU2 OTUn
99 """
93 Otu.__init__( self, **kwd ) 100 Otu.__init__( self, **kwd )
94 def init_meta( self, dataset, copy_from=None ): 101 def init_meta( self, dataset, copy_from=None ):
95 Otu.init_meta( self, dataset, copy_from=copy_from ) 102 Otu.init_meta( self, dataset, copy_from=copy_from )
96 def set_meta( self, dataset, overwrite = True, **kwd ): 103 def set_meta( self, dataset, overwrite = True, **kwd ):
97 Otu.set_meta(self,dataset, overwrite = True, **kwd ) 104 Otu.set_meta(self,dataset, overwrite = True, **kwd )
103 """ 110 """
104 111
105 class Sabund( Otu ): 112 class Sabund( Otu ):
106 file_ext = 'sabund' 113 file_ext = 'sabund'
107 def __init__(self, **kwd): 114 def __init__(self, **kwd):
115 """
116 # http://www.mothur.org/wiki/Sabund_file
117 """
108 Otu.__init__( self, **kwd ) 118 Otu.__init__( self, **kwd )
109 def init_meta( self, dataset, copy_from=None ): 119 def init_meta( self, dataset, copy_from=None ):
110 Otu.init_meta( self, dataset, copy_from=copy_from ) 120 Otu.init_meta( self, dataset, copy_from=copy_from )
111 def sniff( self, filename ): 121 def sniff( self, filename ):
112 """ 122 """
148 return False 158 return False
149 159
150 class Rabund( Sabund ): 160 class Rabund( Sabund ):
151 file_ext = 'rabund' 161 file_ext = 'rabund'
152 def __init__(self, **kwd): 162 def __init__(self, **kwd):
163 """
164 # http://www.mothur.org/wiki/Rabund_file
165 """
153 Sabund.__init__( self, **kwd ) 166 Sabund.__init__( self, **kwd )
154 def init_meta( self, dataset, copy_from=None ): 167 def init_meta( self, dataset, copy_from=None ):
155 Sabund.init_meta( self, dataset, copy_from=copy_from ) 168 Sabund.init_meta( self, dataset, copy_from=copy_from )
156 169
157 class GroupAbund( Otu ): 170 class GroupAbund( Otu ):
252 return False 265 return False
253 266
254 class SharedRabund( GroupAbund ): 267 class SharedRabund( GroupAbund ):
255 file_ext = 'shared' 268 file_ext = 'shared'
256 def __init__(self, **kwd): 269 def __init__(self, **kwd):
270 """
271 # http://www.mothur.org/wiki/Shared_file
272 A shared file is analogous to an rabund file.
273 The data in a shared file represent the number of times that an OTU is observed in multiple samples.
274 The structure of a shared file is analogous to an rabund file.
275 The first column contains the label for the comparison - this will be the value for the first column of each line from the original list file.
276 The second column contains the group name that designates where the data is coming from for that row.
277 The third column is the number of OTUs that were found between each of the groups and is the number of columns that follow.
278 Finally, the remaining columns indicate the number of sequences that belonged to each OTU from that group.
279 """
257 GroupAbund.__init__( self, **kwd ) 280 GroupAbund.__init__( self, **kwd )
258 def init_meta( self, dataset, copy_from=None ): 281 def init_meta( self, dataset, copy_from=None ):
259 GroupAbund.init_meta( self, dataset, copy_from=copy_from ) 282 GroupAbund.init_meta( self, dataset, copy_from=copy_from )
260 def sniff( self, filename ): 283 def sniff( self, filename ):
261 """ 284 """
269 292
270 293
271 class RelAbund( GroupAbund ): 294 class RelAbund( GroupAbund ):
272 file_ext = 'relabund' 295 file_ext = 'relabund'
273 def __init__(self, **kwd): 296 def __init__(self, **kwd):
297 """
298 # http://www.mothur.org/wiki/Relabund_file
299 The structure of a relabund file is analogous to a shared file.
300 The first column contains the label for the comparison - this will be the value for the first column of each line from the original list file (e.g. final.an.list).
301 The second column contains the group name that designates where the data is coming from for that row. Next is the number of OTUs that were found between each of the groups and is the number of columns that follow.
302 Finally, the remaining columns indicate the relative abundance of each OTU from that group.
303 """
274 GroupAbund.__init__( self, **kwd ) 304 GroupAbund.__init__( self, **kwd )
275 def init_meta( self, dataset, copy_from=None ): 305 def init_meta( self, dataset, copy_from=None ):
276 GroupAbund.init_meta( self, dataset, copy_from=copy_from ) 306 GroupAbund.init_meta( self, dataset, copy_from=copy_from )
277 def sniff( self, filename ): 307 def sniff( self, filename ):
278 """ 308 """
623 self.columns = 8 653 self.columns = 8
624 654
625 class Names(Tabular): 655 class Names(Tabular):
626 file_ext = 'names' 656 file_ext = 'names'
627 def __init__(self, **kwd): 657 def __init__(self, **kwd):
628 """Name file shows the relationship between a representative sequence(col 1) and the sequences(comma-separated) it represents(col 2)""" 658 """
659 # http://www.mothur.org/wiki/Name_file
660 Name file shows the relationship between a representative sequence(col 1) and the sequences(comma-separated) it represents(col 2)
661 """
629 Tabular.__init__( self, **kwd ) 662 Tabular.__init__( self, **kwd )
630 self.column_names = ['name','representatives'] 663 self.column_names = ['name','representatives']
631 self.columns = 2 664 self.columns = 2
632 665
633 class Summary(Tabular): 666 class Summary(Tabular):
640 673
641 class Group(Tabular): 674 class Group(Tabular):
642 file_ext = 'groups' 675 file_ext = 'groups'
643 MetadataElement( name="groups", default=[], desc="Group Names", readonly=True, visible=True, no_value=[] ) 676 MetadataElement( name="groups", default=[], desc="Group Names", readonly=True, visible=True, no_value=[] )
644 def __init__(self, **kwd): 677 def __init__(self, **kwd):
645 """Group file assigns sequence (col 1) to a group (col 2)""" 678 """
679 # http://www.mothur.org/wiki/Groups_file
680 Group file assigns sequence (col 1) to a group (col 2)
681 """
646 Tabular.__init__( self, **kwd ) 682 Tabular.__init__( self, **kwd )
647 self.column_names = ['name','group'] 683 self.column_names = ['name','group']
648 self.columns = 2 684 self.columns = 2
649 def set_meta( self, dataset, overwrite = True, skip = None, max_data_lines = None, **kwd ): 685 def set_meta( self, dataset, overwrite = True, skip = None, max_data_lines = None, **kwd ):
650 Tabular.set_meta(self, dataset, overwrite, skip, max_data_lines) 686 Tabular.set_meta(self, dataset, overwrite, skip, max_data_lines)
660 fh.close() 696 fh.close()
661 697
662 class Design(Group): 698 class Design(Group):
663 file_ext = 'design' 699 file_ext = 'design'
664 def __init__(self, **kwd): 700 def __init__(self, **kwd):
665 """Design file shows the relationship between a group(col 1) and a grouping (col 2), providing a way to merge groups.""" 701 """
702 # http://www.mothur.org/wiki/Design_File
703 Design file shows the relationship between a group(col 1) and a grouping (col 2), providing a way to merge groups.
704 """
666 Group.__init__( self, **kwd ) 705 Group.__init__( self, **kwd )
667 706
668 class AccNos(Tabular): 707 class AccNos(Tabular):
669 file_ext = 'accnos' 708 file_ext = 'accnos'
670 def __init__(self, **kwd): 709 def __init__(self, **kwd):
676 class Oligos( Text ): 715 class Oligos( Text ):
677 file_ext = 'oligos' 716 file_ext = 'oligos'
678 717
679 def sniff( self, filename ): 718 def sniff( self, filename ):
680 """ 719 """
720 # http://www.mothur.org/wiki/Oligos_File
681 Determines whether the file is in mothur oligos format 721 Determines whether the file is in mothur oligos format
682 """ 722 """
683 try: 723 try:
684 fh = open( filename ) 724 fh = open( filename )
685 count = 0 725 count = 0
854 MetadataElement( name="groups", default=[], desc="Group Names", readonly=True, visible=True, no_value=[] ) 894 MetadataElement( name="groups", default=[], desc="Group Names", readonly=True, visible=True, no_value=[] )
855 file_ext = 'count_table' 895 file_ext = 'count_table'
856 896
857 def __init__(self, **kwd): 897 def __init__(self, **kwd):
858 """ 898 """
899 # http://www.mothur.org/wiki/Count_File
859 A table with first column names and following columns integer counts 900 A table with first column names and following columns integer counts
860 # Example 1: 901 # Example 1:
861 Representative_Sequence total 902 Representative_Sequence total
862 U68630 1 903 U68630 1
863 U68595 1 904 U68595 1
899 close(fh) 940 close(fh)
900 941
901 class RefTaxonomy(Tabular): 942 class RefTaxonomy(Tabular):
902 file_ext = 'ref.taxonomy' 943 file_ext = 'ref.taxonomy'
903 """ 944 """
945 # http://www.mothur.org/wiki/Taxonomy_outline
904 A table with 2 or 3 columns: 946 A table with 2 or 3 columns:
905 - SequenceName 947 - SequenceName
906 - Taxonomy (semicolon-separated taxonomy in descending order) 948 - Taxonomy (semicolon-separated taxonomy in descending order)
907 - integer ? 949 - integer ?
908 Example: 2-column ( http://www.mothur.org/wiki/Taxonomy_outline ) 950 Example: 2-column ( http://www.mothur.org/wiki/Taxonomy_outline )
918 Tabular.__init__( self, **kwd ) 960 Tabular.__init__( self, **kwd )
919 self.column_names = ['name','taxonomy'] 961 self.column_names = ['name','taxonomy']
920 962
921 def sniff( self, filename ): 963 def sniff( self, filename ):
922 """ 964 """
923 Determines whether the file is a SequenceTaxonomy 965 Determines whether the file is a Reference Taxonomy
924 """ 966 """
925 try: 967 try:
926 pat = '^([^ \t\n\r\x0c\x0b;]+([(]\\d+[)])?(;[^ \t\n\r\x0c\x0b;]+([(]\\d+[)])?)*(;)?)$' 968 pat = '^([^ \t\n\r\x0c\x0b;]+([(]\\d+[)])?(;[^ \t\n\r\x0c\x0b;]+([(]\\d+[)])?)*(;)?)$'
927 fh = open( filename ) 969 fh = open( filename )
928 count = 0 970 count = 0
971 # VAMPS taxonomy files do not require a semicolon after the last taxonomy category
972 # but assume the file will have some multi-level taxonomy assignments
973 found_semicolons = False
929 while True: 974 while True:
930 line = fh.readline() 975 line = fh.readline()
931 if not line: 976 if not line:
932 break #EOF 977 break #EOF
933 line = line.strip() 978 line = line.strip()
935 fields = line.split('\t') 980 fields = line.split('\t')
936 if not (2 <= len(fields) <= 3): 981 if not (2 <= len(fields) <= 3):
937 return False 982 return False
938 if not re.match(pat,fields[1]): 983 if not re.match(pat,fields[1]):
939 return False 984 return False
985 if not found_semicolons and str(fields[1]).count(';') > 0:
986 found_semicolons = True
940 if len(fields) == 3: 987 if len(fields) == 3:
941 check = int(fields[2]) 988 check = int(fields[2])
942 count += 1 989 count += 1
943 if count > 10: 990 if count > 100:
944 break 991 break
945 if count > 0: 992 if count > 0:
946 return True 993 # This will be true if at least one entry
994 # has semicolons in the 2nd column
995 return found_semicolons
947 except: 996 except:
948 pass 997 pass
949 finally: 998 finally:
950 fh.close() 999 fh.close()
951 return False 1000 return False
952 1001
953 class SequenceTaxonomy(RefTaxonomy): 1002 class SequenceTaxonomy(RefTaxonomy):
954 file_ext = 'seq.taxonomy' 1003 file_ext = 'seq.taxonomy'
955 """ 1004 """
1005 # http://www.mothur.org/wiki/Taxonomy_outline
956 A table with 2 columns: 1006 A table with 2 columns:
957 - SequenceName 1007 - SequenceName
958 - Taxonomy (semicolon-separated taxonomy in descending order) 1008 - Taxonomy (semicolon-separated taxonomy in descending order)
959 Example: 1009 Example:
960 X56533.1 Eukaryota;Alveolata;Ciliophora;Intramacronucleata;Oligohymenophorea;Hymenostomatida;Tetrahymenina;Glaucomidae;Glaucoma; 1010 X56533.1 Eukaryota;Alveolata;Ciliophora;Intramacronucleata;Oligohymenophorea;Hymenostomatida;Tetrahymenina;Glaucomidae;Glaucoma;
1131 fh = open( filename ) 1181 fh = open( filename )
1132 count = 0 1182 count = 0
1133 line = fh.readline() 1183 line = fh.readline()
1134 line = line.strip() 1184 line = line.strip()
1135 col_cnt = None 1185 col_cnt = None
1186 all_integers = True
1136 while True: 1187 while True:
1137 line = fh.readline() 1188 line = fh.readline()
1138 line = line.strip() 1189 line = line.strip()
1139 if not line: 1190 if not line:
1140 break #EOF 1191 break #EOF
1149 if len(fields) != col_cnt : 1200 if len(fields) != col_cnt :
1150 return False 1201 return False
1151 try: 1202 try:
1152 for i in range(1, col_cnt): 1203 for i in range(1, col_cnt):
1153 check = float(fields[i]) 1204 check = float(fields[i])
1205 # Also test for whether value is an integer
1206 try:
1207 check = int(fields[i])
1208 except ValueError:
1209 all_integers = False
1154 except ValueError: 1210 except ValueError:
1155 return False 1211 return False
1156 count += 1 1212 count += 1
1157 if count > 10: 1213 if count > 10:
1158 return True 1214 break
1159 if count > 0: 1215 if count > 0:
1160 return True 1216 if not all_integers:
1217 # At least one value was a float
1218 return True
1219 else:
1220 return False
1161 except: 1221 except:
1162 pass 1222 pass
1163 finally: 1223 finally:
1164 fh.close() 1224 fh.close()
1165 return False 1225 return False
1167 class SffFlow(Tabular): 1227 class SffFlow(Tabular):
1168 MetadataElement( name="flow_values", default="", no_value="", optional=True , desc="Total number of flow values", readonly=True) 1228 MetadataElement( name="flow_values", default="", no_value="", optional=True , desc="Total number of flow values", readonly=True)
1169 MetadataElement( name="flow_order", default="TACG", no_value="TACG", desc="Total number of flow values", readonly=False) 1229 MetadataElement( name="flow_order", default="TACG", no_value="TACG", desc="Total number of flow values", readonly=False)
1170 file_ext = 'sff.flow' 1230 file_ext = 'sff.flow'
1171 """ 1231 """
1232 # http://www.mothur.org/wiki/Flow_file
1172 The first line is the total number of flow values - 800 for Titanium data. For GS FLX it would be 400. 1233 The first line is the total number of flow values - 800 for Titanium data. For GS FLX it would be 400.
1173 Following lines contain: 1234 Following lines contain:
1174 - SequenceName 1235 - SequenceName
1175 - the number of useable flows as defined by 454's software 1236 - the number of useable flows as defined by 454's software
1176 - the flow intensity for each base going in the order of TACG. 1237 - the flow intensity for each base going in the order of TACG.