annotate src/breadcrumbs/src/SVM.py @ 0:2f4f6f08c8c4 draft

Uploaded
author george-weingart
date Tue, 13 May 2014 21:58:57 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
1 """
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
2 Author: Timothy Tickle
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
3 Description: Class to Allow Support Vector Machine analysis and to contain associated scripts
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
4 """
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
5
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
6 #####################################################################################
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
7 #Copyright (C) <2012>
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
8 #
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
9 #Permission is hereby granted, free of charge, to any person obtaining a copy of
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
10 #this software and associated documentation files (the "Software"), to deal in the
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
11 #Software without restriction, including without limitation the rights to use, copy,
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
12 #modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
13 #and to permit persons to whom the Software is furnished to do so, subject to
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
14 #the following conditions:
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
15 #
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
16 #The above copyright notice and this permission notice shall be included in all copies
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
17 #or substantial portions of the Software.
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
18 #
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
19 #THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
20 #INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
21 #PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
22 #HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
23 #OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
24 #SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
25 #####################################################################################
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
26
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
27 __author__ = "Timothy Tickle"
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
28 __copyright__ = "Copyright 2012"
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
29 __credits__ = ["Timothy Tickle"]
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
30 __license__ = "MIT"
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
31 __maintainer__ = "Timothy Tickle"
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
32 __email__ = "ttickle@sph.harvard.edu"
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
33 __status__ = "Development"
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
34
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
35 #Libraries
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
36 from AbundanceTable import AbundanceTable
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
37 from ConstantsBreadCrumbs import ConstantsBreadCrumbs
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
38 import csv
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
39 import os
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
40 from random import shuffle
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
41 from ValidateData import ValidateData
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
42
2f4f6f08c8c4 Uploaded
george-weingart
parents:
diff changeset
class SVM:
    """
    Class which holds generic methods for SVM use.

    The methods convert abundance tables to SVM input files, read labels back
    from SVM input / prediction files, scale features, weight labels and
    generate cross validation indices.
    """

    @staticmethod
    def _funcOpenStream(xFile, sMode):
        """
        Returns an open stream for a file path or a (possibly closed) file object.

        :param xFile: File path, or a file object exposing `closed` and `name`.
        :type: FileStream or string file path
        :param sMode: Mode handed to open() when a stream has to be (re)opened.
        :type: String
        :return: A path is opened, a closed file object is reopened by its name, an open file object is returned as is.
        """
        if isinstance(xFile, str):
            return open(xFile, sMode)
        if xFile.closed:
            return open(xFile.name, sMode)
        return xFile

    @staticmethod
    def _funcFormatFeatures(lxValues):
        """
        Formats one row of values as "<1-based position><c_strColon><value>" strings.

        :param lxValues: Values of one sample (or placeholders for a missing sample).
        :type: Iterable
        :return: List of strings, one per value, in the order given.
        """
        return [ConstantsBreadCrumbs.c_strColon.join([str(iPosition + 1), str(xValue)])
                for iPosition, xValue in enumerate(lxValues)]

    #1 Happy Path tested
    @staticmethod
    def funcConvertAbundanceTableToSVMFile(abndAbundanceTable, xOutputSVMFile, sMetadataLabel, lsOriginalLabels = None, lsSampleOrdering = None):
        """
        Converts abundance files to input SVM files.

        One row is written per entry of the sample ordering. Samples that are in the
        abundance table get their label followed by their features. Samples that are not
        get a placeholder row built from ConstantsBreadCrumbs.c_strSVMNoSample, which
        allows the creation of partially known files for stratification purposes.
        The output stream is always closed before returning, even on error.

        :param abndAbundanceTable: AbundanceTable object to turn to input SVM file.
        :type: AbundanceTable
        :param xOutputSVMFile: File to save SVM data to when converted from the abundance table.
        :type: FileStream or string file path
        :param sMetadataLabel: The name of the metadata used to make labels when lsOriginalLabels is not given.
        :type: String
        :param lsOriginalLabels: The original labels. If None (or empty) labels are made from the metadata.
        :type: List of strings
        :param lsSampleOrdering: Order of samples to output to output file. If none, the order in the abundance table is used.
        :type: List of strings
        :return: Set of the unique labels.
        """

        #Create data matrix (materialised so it can be indexed on Python 2 and 3)
        dataMatrix = list(zip(*abndAbundanceTable.funcGetAbundanceCopy()))

        #Add labels
        lsLabels = lsOriginalLabels if lsOriginalLabels else SVM.funcMakeLabels(abndAbundanceTable.funcGetMetadata(sMetadataLabel))

        #This allows the creation of partially known files for stratification purposes
        lsCurrentSamples = abndAbundanceTable.funcGetSampleNames()
        lsOrderingSamples = lsSampleOrdering if lsSampleOrdering else lsCurrentSamples[:]

        #The placeholder row is the same for every missing sample, build it once
        iSize = len(dataMatrix[0])
        lsBlankRow = ([ConstantsBreadCrumbs.c_strSVMNoSample] +
                      SVM._funcFormatFeatures([ConstantsBreadCrumbs.c_strSVMNoSample] * iSize))

        ostm = SVM._funcOpenStream(xOutputSVMFile, "w")
        try:
            f = csv.writer(ostm, csv.excel_tab, delimiter = ConstantsBreadCrumbs.c_strBreadCrumbsSVMSpace)
            iLabelIndex = 0
            #Starts at 1 so dataMatrix[0] is never written.
            #NOTE(review): presumably dataMatrix[0] holds the feature ids - confirm against AbundanceTable.
            iIndexSample = 1
            for sSample in lsOrderingSamples:
                if sSample in lsCurrentSamples:
                    #NOTE(review): rows are consumed sequentially, which assumes the ordering
                    #lists the known samples in the same order as the abundance table - confirm.
                    f.writerow([lsLabels[iLabelIndex]] + SVM._funcFormatFeatures(dataMatrix[iIndexSample]))
                    iLabelIndex += 1
                    iIndexSample += 1
                #Make blank entry
                else:
                    f.writerow(lsBlankRow)
                    #Given labels have one entry per ordered sample, so a missing sample consumes one.
                    #Labels made from the metadata only exist for the samples in the table.
                    if lsOriginalLabels:
                        iLabelIndex += 1
        finally:
            ostm.close()
        return set(lsLabels)

    @staticmethod
    def funcUpdateSVMFileWithAbundanceTable(abndAbundanceTable, xOutputSVMFile, lsOriginalLabels, lsSampleOrdering):
        """
        Takes a SVM input file and updates it with an abundance table.
        lsOriginalLabels and lsSampleOrdering should be consistent to the input file.
        Samples in the abundance table will be used to update the file if the sample name in the abundance table is also in the lsSampleOrdering.
        lsOriginalLabels and lsSampleOrdering should be in the same order.

        Rows for samples that are not in the abundance table are copied unchanged from the old file.
        All new rows are built before the file is reopened for writing, so an error while
        building them does not truncate the existing file.

        :param abndAbundanceTable: AbundanceTable object to turn to input SVM file.
        :type: AbundanceTable
        :param xOutputSVMFile: File to read the old SVM data from and to save the updated SVM data to.
        :type: FileStream or string file path
        :param lsOriginalLabels: The list of the original labels (as numerics 0,1,2,3,4...as should be in the file).
        :type: List of strings
        :param lsSampleOrdering: Order of samples in the output file.
        :type: List of strings
        :return: True on success. False (after printing a message) if the file and the ordering differ in length.
        """

        #Read in old file
        ostm = SVM._funcOpenStream(xOutputSVMFile, "r")
        try:
            fin = csv.reader(ostm, csv.excel_tab, delimiter = ConstantsBreadCrumbs.c_strBreadCrumbsSVMSpace)
            #Read in contents of file
            llsOldContents = [lsRow for lsRow in fin]
        finally:
            ostm.close()

        #Check to make sure this ordering covers all positions in the old file
        if len(llsOldContents) != len(lsSampleOrdering):
            print("The length of the original file ("+str(len(llsOldContents))+") does not match the length of the ordering given ("+str(len(lsSampleOrdering))+").")
            return False

        #Create data matrix from new data (materialised so it can be indexed on Python 2 and 3)
        dataMatrix = list(zip(*abndAbundanceTable.funcGetAbundanceCopy()))

        #This allows to know what position to place the new lines
        lsCurrentSamples = abndAbundanceTable.funcGetSampleNames()

        #Starts at 1 so dataMatrix[0] is never written.
        #NOTE(review): presumably dataMatrix[0] holds the feature ids - confirm against AbundanceTable.
        iIndexSample = 1
        llsNewContents = []
        for iIndexOriginalOrder, sSample in enumerate(lsSampleOrdering):
            if sSample in lsCurrentSamples:
                llsNewContents.append([lsOriginalLabels[iIndexOriginalOrder]] +
                                      SVM._funcFormatFeatures(dataMatrix[iIndexSample]))
                iIndexSample += 1
            #Keep the old entry
            else:
                llsNewContents.append(llsOldContents[iIndexOriginalOrder])

        #Write to file (the stream was closed after reading, so it is reopened for writing)
        ostm = SVM._funcOpenStream(xOutputSVMFile, "w")
        try:
            f = csv.writer(ostm, csv.excel_tab, delimiter = ConstantsBreadCrumbs.c_strBreadCrumbsSVMSpace)
            f.writerows(llsNewContents)
        finally:
            ostm.close()
        return True

    #Tested 5
    @staticmethod
    def funcMakeLabels(lsMetadata):
        """
        Given a list of metadata, labels are assigned. This function represents a central location to make labels so all are consistent.

        Each distinct metadata value (compared by its string form) gets the string of an
        integer, numbered from "0" in order of first appearance.

        :param lsMetadata: List of metadata to turn into labels based on the metadata's values.
        :type: List
        :return: List of labels (integers as strings), one per metadata entry and in the same order.
        """
        #Do not use a set to make elements unique. Need to preserve order.
        #First label should be 0
        dictLabels = dict()
        for xElement in lsMetadata:
            sKey = str(xElement)
            if sKey not in dictLabels:
                dictLabels[sKey] = str(len(dictLabels))

        #Look up by the string form, which is what the dictionary is keyed on
        #(looking up the raw value fails for metadata that are not strings)
        return [dictLabels[str(xElement)] for xElement in lsMetadata]

    #Tested
    @staticmethod
    def funcReadLabelsFromFile(xSVMFile, lsAllSampleNames, isPredictFile):
        """
        Reads in the labels from the input file or prediction output file of a LibSVM formatted file
        and associates them in order with the given sample names.

        Rows whose first entry is ConstantsBreadCrumbs.c_strSVMNoSample and empty rows are ignored.
        A file opened here from a path is closed before returning. A stream given by the caller is left open.

        Prediction file expected format: Labels declared in first line with labels keyword.
        Each following row a sample with the first entry the predicted label
        Prediction file example:
        labels 0 1
        0 0.3 0.4 0.6
        1 0.1 0.2 0.3
        1 0.2 0.2 0.2
        0 0.2 0.4 0.3

        Input file expected format:
        Each row a sample with the first entry the predicted label
        Input file example:
        0 0.3 0.4 0.6
        1 0.1 0.2 0.3
        1 0.2 0.2 0.2
        0 0.2 0.4 0.3

        :param xSVMFile: File to read the labels from.
        :type: FileStream or string file path
        :param lsAllSampleNames: List of sample ids in the order of the labels.
        :type: List of Strings
        :param isPredictFile: Indicates if the file is the input (False) or prediction (True) file
        :type: boolean
        :return: Dictionary {label:set(["sampleName1", "sampleName2"...]),...} or False on error
        """
        #Open the file and read the first entry of each row
        fOpenedHere = isinstance(xSVMFile, str)
        istm = open(xSVMFile, 'r') if fOpenedHere else xSVMFile
        try:
            g = csv.reader( istm, csv.excel_tab, delimiter = ConstantsBreadCrumbs.c_strBreadCrumbsSVMSpace )
            lsOriginalLabels = [lsLineElements[0] for lsLineElements in g
                                if lsLineElements and not lsLineElements[0] == ConstantsBreadCrumbs.c_strSVMNoSample]
        finally:
            if fOpenedHere:
                istm.close()

        #The first line of a prediction file declares the labels, it is not a sample
        if isPredictFile:
            lsOriginalLabels = lsOriginalLabels[1:]

        #Check sample name length
        if not len(lsAllSampleNames) == len(lsOriginalLabels):
            print("SVM::funcReadLabelsFromFile. Error, the length of sample names did not match the original labels length. Samples ("+str(len(lsAllSampleNames))+"):"+str(lsAllSampleNames)+" Labels ("+str(len(lsOriginalLabels))+"):"+str(lsOriginalLabels))
            return False

        #Change to {label:set(["sampleName1", "sampleName2"...]),...}
        dictSampleLabelsRet = dict()
        for sSampleName, sLabel in zip(lsAllSampleNames, lsOriginalLabels):
            dictSampleLabelsRet.setdefault(sLabel, set()).add(sSampleName)
        return dictSampleLabelsRet

    #Tested
    @staticmethod
    def funcScaleFeature(npdData):
        """
        Scale a feature between 0 and 1. The original author's note says the 0 to 1 range
        was chosen because it keeps the sparsity of the data and may save time.

        Empty data and data where every value is the same (including all zeros) cannot be
        scaled and are returned unchanged.

        :param npdData: Feature data to scale.
        :type: Numpy Array
        :return npaFloat: A numpy array of floats (the input itself when it can not be scaled).
        """
        if len(npdData) == 0:
            return npdData
        dMin = min(npdData)
        npdShifted = npdData - dMin
        dRange = max(npdShifted)
        #A zero range means all values are equal, dividing would be a division by zero
        if dRange == 0:
            return npdData
        return npdShifted / float(dRange)

    #Tested
    @staticmethod
    def funcWeightLabels(lLabels):
        """
        Returns weights for labels based on how balanced the labels are. Weights try to balance unbalanced results.

        The weight of a label is the occurrence count of the most frequent label divided by
        the occurrence count of that label, so the most frequent label gets 1.0.

        :param lLabels: List of labels to use for measure how balanced the comparison is.
        :type: List
        :return List: [dictWeights ({index of the label in lUniqueLabels:weight}),lUniqueLabels (unique occurrences of original labels in order of first appearance)].
                      [{}, []] if there are no labels.
        """
        #Do not use set to make elements unique. Need to preserve order.
        #First label should be 0
        lUniqueLabels = []
        dictOccurrences = dict()
        for sElement in lLabels:
            if sElement not in dictOccurrences:
                lUniqueLabels.append(sElement)
                dictOccurrences[sElement] = 0
            dictOccurrences[sElement] += 1

        #Nothing to weight
        if not lUniqueLabels:
            return [dict(), lUniqueLabels]

        #Divide the highest occurrence by each occurrence
        #Build a dict of weights per label index {index:weight, index:weight}
        iMaxOccurence = max(dictOccurrences.values())
        dictWeights = dict()
        for iLabelIndex, sLabel in enumerate(lUniqueLabels):
            dictWeights[iLabelIndex] = iMaxOccurence / float(dictOccurrences[sLabel])

        return [dictWeights,lUniqueLabels]

    #Tested 3/4 cases could add in test 12 with randomize True
    def func10FoldCrossvalidation(self, iTotalSampleCount, fRandomise = False):
        """
        Generator.
        Generates the indexes for a 10 fold cross validation given a sample count.
        If there are less than 10 samples, it uses the sample count as the K-fold cross validation
        as a leave one out method.

        Each item yielded is a pair of boolean lists, one entry per sample. The first list is
        True for the samples to train on and the second list is its element-wise negation.

        :param iTotalSampleCount: Total Sample Count
        :type: Integer Sample Count
        :param fRandomise: Random sample indices
        :type: Boolean True indicates randomise (Default False)
        :return: Yields (lfTraining, lfValidation) once per fold.
        """
        #Make indices and shuffle if needed
        #(a real list is needed, shuffle can not work in place on a Python 3 range)
        liindices = list(range(iTotalSampleCount))
        if fRandomise:
            shuffle(liindices)

        #For 10 times (or once per sample if there are less than 10 samples)
        iKFold = min(10, iTotalSampleCount)
        for iiteration in range(iKFold):
            lfTraining = [iindex % iKFold != iiteration for iindex in liindices]
            lfValidation = [not fTraining for fTraining in lfTraining]
            yield lfTraining, lfValidation