Mercurial > repos > jjohnson > mothur_toolsuite
comparison mothur/lib/galaxy/datatypes/metagenomics.py @ 30:a90d1915a176
metagenomics.py - require ref.taxonomy sniff to find at least 1 multi-level tax assignment with semicolon separators
author | Jim Johnson <jj@umn.edu> |
---|---|
date | Thu, 30 May 2013 08:59:17 -0500 |
parents | 9c0cd3b92295 |
children | ec8df51e841a |
comparison
equal
deleted
inserted
replaced
29:9c0cd3b92295 | 30:a90d1915a176 |
---|---|
88 return False | 88 return False |
89 | 89 |
90 class OtuList( Otu ): | 90 class OtuList( Otu ): |
91 file_ext = 'list' | 91 file_ext = 'list' |
92 def __init__(self, **kwd): | 92 def __init__(self, **kwd): |
93 """ | |
94 # http://www.mothur.org/wiki/List_file | |
95 The first column is a label that represents the distance that sequences were assigned to OTUs. | |
96 The number in the second column is the number of OTUs that have been formed. | |
97 Subsequent columns contain the names of sequences within each OTU separated by a comma. | |
98 distance_label otu_count OTU1 OTU2 OTUn | |
99 """ | |
93 Otu.__init__( self, **kwd ) | 100 Otu.__init__( self, **kwd ) |
94 def init_meta( self, dataset, copy_from=None ): | 101 def init_meta( self, dataset, copy_from=None ): |
95 Otu.init_meta( self, dataset, copy_from=copy_from ) | 102 Otu.init_meta( self, dataset, copy_from=copy_from ) |
96 def set_meta( self, dataset, overwrite = True, **kwd ): | 103 def set_meta( self, dataset, overwrite = True, **kwd ): |
97 Otu.set_meta(self,dataset, overwrite = True, **kwd ) | 104 Otu.set_meta(self,dataset, overwrite = True, **kwd ) |
103 """ | 110 """ |
104 | 111 |
105 class Sabund( Otu ): | 112 class Sabund( Otu ): |
106 file_ext = 'sabund' | 113 file_ext = 'sabund' |
107 def __init__(self, **kwd): | 114 def __init__(self, **kwd): |
115 """ | |
116 # http://www.mothur.org/wiki/Sabund_file | |
117 """ | |
108 Otu.__init__( self, **kwd ) | 118 Otu.__init__( self, **kwd ) |
109 def init_meta( self, dataset, copy_from=None ): | 119 def init_meta( self, dataset, copy_from=None ): |
110 Otu.init_meta( self, dataset, copy_from=copy_from ) | 120 Otu.init_meta( self, dataset, copy_from=copy_from ) |
111 def sniff( self, filename ): | 121 def sniff( self, filename ): |
112 """ | 122 """ |
148 return False | 158 return False |
149 | 159 |
150 class Rabund( Sabund ): | 160 class Rabund( Sabund ): |
151 file_ext = 'rabund' | 161 file_ext = 'rabund' |
152 def __init__(self, **kwd): | 162 def __init__(self, **kwd): |
163 """ | |
164 # http://www.mothur.org/wiki/Rabund_file | |
165 """ | |
153 Sabund.__init__( self, **kwd ) | 166 Sabund.__init__( self, **kwd ) |
154 def init_meta( self, dataset, copy_from=None ): | 167 def init_meta( self, dataset, copy_from=None ): |
155 Sabund.init_meta( self, dataset, copy_from=copy_from ) | 168 Sabund.init_meta( self, dataset, copy_from=copy_from ) |
156 | 169 |
157 class GroupAbund( Otu ): | 170 class GroupAbund( Otu ): |
252 return False | 265 return False |
253 | 266 |
254 class SharedRabund( GroupAbund ): | 267 class SharedRabund( GroupAbund ): |
255 file_ext = 'shared' | 268 file_ext = 'shared' |
256 def __init__(self, **kwd): | 269 def __init__(self, **kwd): |
270 """ | |
271 # http://www.mothur.org/wiki/Shared_file | |
272 A shared file is analogous to an rabund file. | |
273 The data in a shared file represent the number of times that an OTU is observed in multiple samples. | |
274 The structure of a shared file is analogous to an rabund file. | |
275 The first column contains the label for the comparison - this will be the value for the first column of each line from the original list file. | |
276 The second column contains the group name that designates where the data is coming from for that row. | |
277 The third column is the number of OTUs that were found between each of the groups and is the number of columns that follow. | |
278 Finally, the remaining columns indicate the number of sequences that belonged to each OTU from that group. | |
279 """ | |
257 GroupAbund.__init__( self, **kwd ) | 280 GroupAbund.__init__( self, **kwd ) |
258 def init_meta( self, dataset, copy_from=None ): | 281 def init_meta( self, dataset, copy_from=None ): |
259 GroupAbund.init_meta( self, dataset, copy_from=copy_from ) | 282 GroupAbund.init_meta( self, dataset, copy_from=copy_from ) |
260 def sniff( self, filename ): | 283 def sniff( self, filename ): |
261 """ | 284 """ |
269 | 292 |
270 | 293 |
271 class RelAbund( GroupAbund ): | 294 class RelAbund( GroupAbund ): |
272 file_ext = 'relabund' | 295 file_ext = 'relabund' |
273 def __init__(self, **kwd): | 296 def __init__(self, **kwd): |
297 """ | |
298 # http://www.mothur.org/wiki/Relabund_file | |
299 The structure of a relabund file is analogous to a shared file. | |
300 The first column contains the label for the comparison - this will be the value for the first column of each line from the original list file (e.g. final.an.list). | |
301 The second column contains the group name that designates where the data is coming from for that row. Next is the number of OTUs that were found between each of the groups and is the number of columns that follow. | |
302 Finally, the remaining columns indicate the relative abundance of each OTU from that group. | |
303 """ | |
274 GroupAbund.__init__( self, **kwd ) | 304 GroupAbund.__init__( self, **kwd ) |
275 def init_meta( self, dataset, copy_from=None ): | 305 def init_meta( self, dataset, copy_from=None ): |
276 GroupAbund.init_meta( self, dataset, copy_from=copy_from ) | 306 GroupAbund.init_meta( self, dataset, copy_from=copy_from ) |
277 def sniff( self, filename ): | 307 def sniff( self, filename ): |
278 """ | 308 """ |
623 self.columns = 8 | 653 self.columns = 8 |
624 | 654 |
625 class Names(Tabular): | 655 class Names(Tabular): |
626 file_ext = 'names' | 656 file_ext = 'names' |
627 def __init__(self, **kwd): | 657 def __init__(self, **kwd): |
628 """Name file shows the relationship between a representative sequence(col 1) and the sequences(comma-separated) it represents(col 2)""" | 658 """ |
659 # http://www.mothur.org/wiki/Name_file | |
660 Name file shows the relationship between a representative sequence(col 1) and the sequences(comma-separated) it represents(col 2) | |
661 """ | |
629 Tabular.__init__( self, **kwd ) | 662 Tabular.__init__( self, **kwd ) |
630 self.column_names = ['name','representatives'] | 663 self.column_names = ['name','representatives'] |
631 self.columns = 2 | 664 self.columns = 2 |
632 | 665 |
633 class Summary(Tabular): | 666 class Summary(Tabular): |
640 | 673 |
641 class Group(Tabular): | 674 class Group(Tabular): |
642 file_ext = 'groups' | 675 file_ext = 'groups' |
643 MetadataElement( name="groups", default=[], desc="Group Names", readonly=True, visible=True, no_value=[] ) | 676 MetadataElement( name="groups", default=[], desc="Group Names", readonly=True, visible=True, no_value=[] ) |
644 def __init__(self, **kwd): | 677 def __init__(self, **kwd): |
645 """Group file assigns sequence (col 1) to a group (col 2)""" | 678 """ |
679 # http://www.mothur.org/wiki/Groups_file | |
680 Group file assigns sequence (col 1) to a group (col 2) | |
681 """ | |
646 Tabular.__init__( self, **kwd ) | 682 Tabular.__init__( self, **kwd ) |
647 self.column_names = ['name','group'] | 683 self.column_names = ['name','group'] |
648 self.columns = 2 | 684 self.columns = 2 |
649 def set_meta( self, dataset, overwrite = True, skip = None, max_data_lines = None, **kwd ): | 685 def set_meta( self, dataset, overwrite = True, skip = None, max_data_lines = None, **kwd ): |
650 Tabular.set_meta(self, dataset, overwrite, skip, max_data_lines) | 686 Tabular.set_meta(self, dataset, overwrite, skip, max_data_lines) |
660 fh.close() | 696 fh.close() |
661 | 697 |
662 class Design(Group): | 698 class Design(Group): |
663 file_ext = 'design' | 699 file_ext = 'design' |
664 def __init__(self, **kwd): | 700 def __init__(self, **kwd): |
665 """Design file shows the relationship between a group(col 1) and a grouping (col 2), providing a way to merge groups.""" | 701 """ |
702 # http://www.mothur.org/wiki/Design_File | |
703 Design file shows the relationship between a group(col 1) and a grouping (col 2), providing a way to merge groups. | |
704 """ | |
666 Group.__init__( self, **kwd ) | 705 Group.__init__( self, **kwd ) |
667 | 706 |
668 class AccNos(Tabular): | 707 class AccNos(Tabular): |
669 file_ext = 'accnos' | 708 file_ext = 'accnos' |
670 def __init__(self, **kwd): | 709 def __init__(self, **kwd): |
676 class Oligos( Text ): | 715 class Oligos( Text ): |
677 file_ext = 'oligos' | 716 file_ext = 'oligos' |
678 | 717 |
679 def sniff( self, filename ): | 718 def sniff( self, filename ): |
680 """ | 719 """ |
720 # http://www.mothur.org/wiki/Oligos_File | |
681 Determines whether the file is an oligos file | 721 Determines whether the file is an oligos file |
682 """ | 722 """ |
683 try: | 723 try: |
684 fh = open( filename ) | 724 fh = open( filename ) |
685 count = 0 | 725 count = 0 |
854 MetadataElement( name="groups", default=[], desc="Group Names", readonly=True, visible=True, no_value=[] ) | 894 MetadataElement( name="groups", default=[], desc="Group Names", readonly=True, visible=True, no_value=[] ) |
855 file_ext = 'count_table' | 895 file_ext = 'count_table' |
856 | 896 |
857 def __init__(self, **kwd): | 897 def __init__(self, **kwd): |
858 """ | 898 """ |
899 # http://www.mothur.org/wiki/Count_File | |
859 A table with first column names and following columns integer counts | 900 A table with first column names and following columns integer counts |
860 # Example 1: | 901 # Example 1: |
861 Representative_Sequence total | 902 Representative_Sequence total |
862 U68630 1 | 903 U68630 1 |
863 U68595 1 | 904 U68595 1 |
899 close(fh) | 940 close(fh) |
900 | 941 |
901 class RefTaxonomy(Tabular): | 942 class RefTaxonomy(Tabular): |
902 file_ext = 'ref.taxonomy' | 943 file_ext = 'ref.taxonomy' |
903 """ | 944 """ |
945 # http://www.mothur.org/wiki/Taxonomy_outline | |
904 A table with 2 or 3 columns: | 946 A table with 2 or 3 columns: |
905 - SequenceName | 947 - SequenceName |
906 - Taxonomy (semicolon-separated taxonomy in descending order) | 948 - Taxonomy (semicolon-separated taxonomy in descending order) |
907 - integer ? | 949 - integer ? |
908 Example: 2-column ( http://www.mothur.org/wiki/Taxonomy_outline ) | 950 Example: 2-column ( http://www.mothur.org/wiki/Taxonomy_outline ) |
918 Tabular.__init__( self, **kwd ) | 960 Tabular.__init__( self, **kwd ) |
919 self.column_names = ['name','taxonomy'] | 961 self.column_names = ['name','taxonomy'] |
920 | 962 |
921 def sniff( self, filename ): | 963 def sniff( self, filename ): |
922 """ | 964 """ |
923 Determines whether the file is a SequenceTaxonomy | 965 Determines whether the file is a Reference Taxonomy |
924 """ | 966 """ |
925 try: | 967 try: |
926 pat = '^([^ \t\n\r\x0c\x0b;]+([(]\\d+[)])?(;[^ \t\n\r\x0c\x0b;]+([(]\\d+[)])?)*(;)?)$' | 968 pat = '^([^ \t\n\r\x0c\x0b;]+([(]\\d+[)])?(;[^ \t\n\r\x0c\x0b;]+([(]\\d+[)])?)*(;)?)$' |
927 fh = open( filename ) | 969 fh = open( filename ) |
928 count = 0 | 970 count = 0 |
971 # VAMPS taxonomy files do not require a semicolon after the last taxonomy category | |
972 # but assume the file will have some multi-level taxonomy assignments | |
973 found_semicolons = False | |
929 while True: | 974 while True: |
930 line = fh.readline() | 975 line = fh.readline() |
931 if not line: | 976 if not line: |
932 break #EOF | 977 break #EOF |
933 line = line.strip() | 978 line = line.strip() |
935 fields = line.split('\t') | 980 fields = line.split('\t') |
936 if not (2 <= len(fields) <= 3): | 981 if not (2 <= len(fields) <= 3): |
937 return False | 982 return False |
938 if not re.match(pat,fields[1]): | 983 if not re.match(pat,fields[1]): |
939 return False | 984 return False |
985 if not found_semicolons and str(fields[1]).count(';') > 0: | |
986 found_semicolons = True | |
940 if len(fields) == 3: | 987 if len(fields) == 3: |
941 check = int(fields[2]) | 988 check = int(fields[2]) |
942 count += 1 | 989 count += 1 |
943 if count > 10: | 990 if count > 100: |
944 break | 991 break |
945 if count > 0: | 992 if count > 0: |
946 return True | 993 # This will be true if at least one entry |
994 # has semicolons in the 2nd column | |
995 return found_semicolons | |
947 except: | 996 except: |
948 pass | 997 pass |
949 finally: | 998 finally: |
950 fh.close() | 999 fh.close() |
951 return False | 1000 return False |
952 | 1001 |
953 class SequenceTaxonomy(RefTaxonomy): | 1002 class SequenceTaxonomy(RefTaxonomy): |
954 file_ext = 'seq.taxonomy' | 1003 file_ext = 'seq.taxonomy' |
955 """ | 1004 """ |
1005 # http://www.mothur.org/wiki/Taxonomy_outline | |
956 A table with 2 columns: | 1006 A table with 2 columns: |
957 - SequenceName | 1007 - SequenceName |
958 - Taxonomy (semicolon-separated taxonomy in descending order) | 1008 - Taxonomy (semicolon-separated taxonomy in descending order) |
959 Example: | 1009 Example: |
960 X56533.1 Eukaryota;Alveolata;Ciliophora;Intramacronucleata;Oligohymenophorea;Hymenostomatida;Tetrahymenina;Glaucomidae;Glaucoma; | 1010 X56533.1 Eukaryota;Alveolata;Ciliophora;Intramacronucleata;Oligohymenophorea;Hymenostomatida;Tetrahymenina;Glaucomidae;Glaucoma; |
1131 fh = open( filename ) | 1181 fh = open( filename ) |
1132 count = 0 | 1182 count = 0 |
1133 line = fh.readline() | 1183 line = fh.readline() |
1134 line = line.strip() | 1184 line = line.strip() |
1135 col_cnt = None | 1185 col_cnt = None |
1186 all_integers = True | |
1136 while True: | 1187 while True: |
1137 line = fh.readline() | 1188 line = fh.readline() |
1138 line = line.strip() | 1189 line = line.strip() |
1139 if not line: | 1190 if not line: |
1140 break #EOF | 1191 break #EOF |
1149 if len(fields) != col_cnt : | 1200 if len(fields) != col_cnt : |
1150 return False | 1201 return False |
1151 try: | 1202 try: |
1152 for i in range(1, col_cnt): | 1203 for i in range(1, col_cnt): |
1153 check = float(fields[i]) | 1204 check = float(fields[i]) |
1205 # Also test for whether value is an integer | |
1206 try: | |
1207 check = int(fields[i]) | |
1208 except ValueError: | |
1209 all_integers = False | |
1154 except ValueError: | 1210 except ValueError: |
1155 return False | 1211 return False |
1156 count += 1 | 1212 count += 1 |
1157 if count > 10: | 1213 if count > 10: |
1158 return True | 1214 break |
1159 if count > 0: | 1215 if count > 0: |
1160 return True | 1216 if not all_integers: |
1217 # At least one value was a float | |
1218 return True | |
1219 else: | |
1220 return False | |
1161 except: | 1221 except: |
1162 pass | 1222 pass |
1163 finally: | 1223 finally: |
1164 fh.close() | 1224 fh.close() |
1165 return False | 1225 return False |
1167 class SffFlow(Tabular): | 1227 class SffFlow(Tabular): |
1168 MetadataElement( name="flow_values", default="", no_value="", optional=True , desc="Total number of flow values", readonly=True) | 1228 MetadataElement( name="flow_values", default="", no_value="", optional=True , desc="Total number of flow values", readonly=True) |
1169 MetadataElement( name="flow_order", default="TACG", no_value="TACG", desc="Total number of flow values", readonly=False) | 1229 MetadataElement( name="flow_order", default="TACG", no_value="TACG", desc="Total number of flow values", readonly=False) |
1170 file_ext = 'sff.flow' | 1230 file_ext = 'sff.flow' |
1171 """ | 1231 """ |
1232 # http://www.mothur.org/wiki/Flow_file | |
1172 The first line is the total number of flow values - 800 for Titanium data. For GS FLX it would be 400. | 1233 The first line is the total number of flow values - 800 for Titanium data. For GS FLX it would be 400. |
1173 Following lines contain: | 1234 Following lines contain: |
1174 - SequenceName | 1235 - SequenceName |
1175 - the number of useable flows as defined by 454's software | 1236 - the number of useable flows as defined by 454's software |
1176 - the flow intensity for each base going in the order of TACG. | 1237 - the flow intensity for each base going in the order of TACG. |