view src/breadcrumbs/src/SVM.py @ 0:2f4f6f08c8c4 draft

Uploaded
author george-weingart
date Tue, 13 May 2014 21:58:57 -0400
parents
children
line wrap: on
line source

"""
Author: Timothy Tickle
Description: Class to Allow Support Vector Machine analysis and to contain associated scripts
"""

#####################################################################################
#Copyright (C) <2012>
#
#Permission is hereby granted, free of charge, to any person obtaining a copy of
#this software and associated documentation files (the "Software"), to deal in the
#Software without restriction, including without limitation the rights to use, copy,
#modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
#and to permit persons to whom the Software is furnished to do so, subject to
#the following conditions:
#
#The above copyright notice and this permission notice shall be included in all copies
#or substantial portions of the Software.
#
#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
#INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
#PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
#HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
#OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
#SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#####################################################################################

__author__ = "Timothy Tickle"
__copyright__ = "Copyright 2012"
__credits__ = ["Timothy Tickle"]
__license__ = "MIT"
__maintainer__ = "Timothy Tickle"
__email__ = "ttickle@sph.harvard.edu"
__status__ = "Development"

#Libraries
from AbundanceTable import AbundanceTable
from ConstantsBreadCrumbs import ConstantsBreadCrumbs
import csv
import os
from random import shuffle
from ValidateData import ValidateData

class SVM:
    """
    Class which holds generic methods for SVM use.

    The file based methods read and write a LibSVM style text format: one sample
    per row, the label in the first column followed by "index:value" entries
    (indices start at 1). All methods are written to run on Python 2 and Python 3.
    """

    @staticmethod
    def _funcOpenStream(xFile, sMode):
        """
        Returns an open stream for a file path or a file object.

        :param xFile: File to open. A string is opened as a path, a closed file object
                      is reopened by its name and an open file object is returned as is.
        :type:	FileStream or string file path
        :param sMode: Mode used when the file has to be (re)opened.
        :type:	String
        :return ostm: Open stream.
        """
        if isinstance(xFile, str):
            return open(xFile, sMode)
        #A closed handle is reopened by name so a previously used stream can be handed in
        if xFile.closed:
            return open(xFile.name, sMode)
        return xFile

    @staticmethod
    def _funcMakeFeatureEntries(lxValues):
        """
        Formats the values of one sample as "index:value" strings, indices starting at 1.

        :param lxValues: Values of one sample in feature order.
        :type:	Iterable
        :return lsEntries: List of strings, one per value.
        """
        return [ConstantsBreadCrumbs.c_strColon.join([str(iIndex + 1), str(xValue)])
                for iIndex, xValue in enumerate(lxValues)]

    @staticmethod
    def _funcWriteRows(xOutputSVMFile, llsRows):
        """
        Writes already formatted rows to the SVM file and closes the stream.

        :param xOutputSVMFile: File to write to (see _funcOpenStream for accepted values).
        :type:	FileStream or string file path
        :param llsRows: Rows to write, each a list of strings.
        :type:	List of lists of strings
        """
        ostm = SVM._funcOpenStream(xOutputSVMFile, "w")
        try:
            f = csv.writer(ostm, csv.excel_tab, delimiter = ConstantsBreadCrumbs.c_strBreadCrumbsSVMSpace)
            f.writerows(llsRows)
        finally:
            #The stream is closed even if writing fails
            ostm.close()

    #1 Happy Path tested
    @staticmethod
    def funcConvertAbundanceTableToSVMFile(abndAbundanceTable, xOutputSVMFile, sMetadataLabel, lsOriginalLabels = None, lsSampleOrdering = None):
        """
        Converts abundance files to input SVM files.

        Samples named in the ordering but missing from the abundance table are written
        as placeholder rows (label and values set to ConstantsBreadCrumbs.c_strSVMNoSample).
        All rows are built before the output file is opened, so a failure while building
        them does not truncate an existing file. The output stream is closed on return.

        :param abndAbundanceTable:    AbudanceTable object to turn to input SVM file.
        :type:	AbundanceTable
        :param xOutputSVMFile: File to save SVM data to when converted from the abundance table.
        :type:	FileStream or string file path
        :param	sMetadataLabel: The name of the metadata in the abundance table used to make the labels.
                                Only read when lsOriginalLabels is not given.
        :type:	String
        :param	lsOriginalLabels: The original labels. When given, one label is expected per entry of the
                                  ordering (labels of missing samples are skipped over).
        :type:	List of strings
        :param	lsSampleOrdering: Order of samples to output to output file. If none, the order in the abundance table is used.
        :type:	List of strings
        :return	setUniqueLabels:	Set of unique labels.
        """

        #Create data matrix (transposed so each row is one column of the abundance data)
        #Made a list because it is indexed and measured below
        dataMatrix = list(zip(*abndAbundanceTable.funcGetAbundanceCopy()))

        #Add labels
        lsLabels = lsOriginalLabels if lsOriginalLabels else SVM.funcMakeLabels(abndAbundanceTable.funcGetMetadata(sMetadataLabel))

        #This allows the creation of partially known files for stratification purposes
        lsCurrentSamples = abndAbundanceTable.funcGetSampleNames()
        lsOrderingSamples = lsSampleOrdering if lsSampleOrdering else lsCurrentSamples[:]
        setCurrentSamples = set(lsCurrentSamples)

        #Placeholder row used for samples which are not in the abundance table
        iSize = len(dataMatrix[0]) if dataMatrix else 0
        lsBlankRow = ([ConstantsBreadCrumbs.c_strSVMNoSample] +
                      SVM._funcMakeFeatureEntries([ConstantsBreadCrumbs.c_strSVMNoSample] * iSize))

        llsRows = []
        iLabelIndex = 0
        #Sample rows start at 1; presumably row 0 of the transposed data holds the feature ids - TODO confirm
        #NOTE(review): data rows are taken sequentially, not looked up by sample name, so the
        #ordering is assumed to list the samples of the table in the order of the table.
        iIndexSample = 1
        for sSample in lsOrderingSamples:
            if sSample in setCurrentSamples:
                llsRows.append([lsLabels[iLabelIndex]] + SVM._funcMakeFeatureEntries(dataMatrix[iIndexSample]))
                iLabelIndex += 1
                iIndexSample += 1
            #Make blank entry
            else:
                llsRows.append(lsBlankRow)
                #Given labels cover every entry of the ordering, generated labels only the known samples
                if lsOriginalLabels:
                    iLabelIndex += 1

        SVM._funcWriteRows(xOutputSVMFile, llsRows)
        return set(lsLabels)

    @staticmethod
    def funcUpdateSVMFileWithAbundanceTable(abndAbundanceTable, xOutputSVMFile, lsOriginalLabels, lsSampleOrdering):
        """
        Takes a SVM input file and updates it with an abundance table.
        lsOriginalLabels and lsSampleOrdering should be consistent to the input file.
        Samples in the abundance table will be used to update the file if the sample name in the abundance table is also in the lsSampleOrdering.
        lsOriginalLabels and lsSampleOrdering should be in the same order.
        Rows of samples which are not in the abundance table are kept as they were in the file.

        :param abndAbundanceTable:   AbudanceTable object used to update the SVM file.
        :type:    AbundanceTable
        :param xOutputSVMFile: Existing SVM file which is read and then rewritten.
        :type:	FileStream or string file path
        :param	lsOriginalLabels: The list of the original labels (as numerics 0,1,2,3,4...as should be in the file).
        :type:	List of strings
        :param	lsSampleOrdering: Order of samples in the output file.
        :type:	List of strings
        :return	fSuccess:	True when the file was rewritten, False when the number of rows in the
                                file does not match the length of lsSampleOrdering (the file is left untouched).
        """

        #Read in contents of old file
        ostm = SVM._funcOpenStream(xOutputSVMFile, "r")
        try:
            llsOldContents = list(csv.reader(ostm, csv.excel_tab, delimiter = ConstantsBreadCrumbs.c_strBreadCrumbsSVMSpace))
        finally:
            ostm.close()

        #Check to make sure this ordering covers all positions in the old file
        if not len(llsOldContents) == len(lsSampleOrdering):
            print("The length of the original file ("+str(len(llsOldContents))+") does not match the length of the ordering given ("+str(len(lsSampleOrdering))+").")
            return False

        #Create data matrix from new data (a list because it is indexed below)
        dataMatrix = list(zip(*abndAbundanceTable.funcGetAbundanceCopy()))

        #This allows to know what position to place the new lines
        setCurrentSamples = set(abndAbundanceTable.funcGetSampleNames())

        #Build all rows before the file is opened for writing so a failure does not lose the old contents
        llsRows = []
        #Sample rows start at 1; presumably row 0 of the transposed data holds the feature ids - TODO confirm
        #NOTE(review): data rows are taken sequentially, so samples are assumed to appear in the
        #ordering in the same relative order as in the abundance table.
        iIndexSample = 1
        for iIndexOriginalOrder, sSample in enumerate(lsSampleOrdering):
            if sSample in setCurrentSamples:
                llsRows.append([lsOriginalLabels[iIndexOriginalOrder]] + SVM._funcMakeFeatureEntries(dataMatrix[iIndexSample]))
                iIndexSample += 1
            #Keep the old entry
            else:
                llsRows.append(llsOldContents[iIndexOriginalOrder])

        #Write to file
        SVM._funcWriteRows(xOutputSVMFile, llsRows)
        return True

    #Tested 5
    @staticmethod
    def funcMakeLabels(lsMetadata):
        """
        Given a list of metadata, labels are assigned. This function represents a central location to make labels so all are consistent.
        Labels are the strings "0", "1", "2"... given in order of first occurrence of each metadata value.
        Values are compared by their string form, so non-string metadata can be used as well.

        :param	lsMetadata:    List of metadata to turn into labels based on the metadata's values.
        :type:	List
        :return	lsLabels:	List of string labels, one per metadata entry.
        """
        #Do not use a set to make elements unique. Need to preserve order.
        #First label should be 0
        dictLabels = dict()
        lsLabels = []
        for xElement in lsMetadata:
            sKey = str(xElement)
            if sKey not in dictLabels:
                dictLabels[sKey] = str(len(dictLabels))
            lsLabels.append(dictLabels[sKey])
        return lsLabels

    #Tested
    @staticmethod
    def funcReadLabelsFromFile(xSVMFile, lsAllSampleNames, isPredictFile):
        """
        Reads in the labels from the input file or prediction output file of a LibSVM formatted file
        and associates them in order with the given sample names.
        Rows marked as missing samples (ConstantsBreadCrumbs.c_strSVMNoSample) and blank lines are ignored.

        Prediction file expected format: Labels declared in first line with labels keyword.
        Each following row a sample with the first entry the predicted label
        Prediction file example:
        labels 0 1
        0	0.3	0.4	0.6
        1	0.1	0.2	0.3
        1	0.2	0.2	0.2
        0	0.2	0.4	0.3

        Input file expected format:
        Each row a sample with the first entry the predicted label
        Input file example:
        0	0.3	0.4	0.6
        1	0.1	0.2	0.3
        1	0.2	0.2	0.2
        0	0.2	0.4	0.3

        :param xSVMFile:  File to read in labels. A file opened by this method is closed again,
                          an open stream handed in is left open.
        :type FileStream or string file path
        :param lsAllSampleNames List of sample ids in the order of the labels.
        :type List of Strings
        :param isPredictFile: Indicates if the file is the input (False) or prediction (True) file
        :type boolean
        :return: Dictionary {label:set(["sampleName1", "sampleName2"...]),...} or False on error
        """
        #Open the file and get the label of each row
        ostm = SVM._funcOpenStream(xSVMFile, 'r')
        try:
            g = csv.reader(ostm, csv.excel_tab, delimiter = ConstantsBreadCrumbs.c_strBreadCrumbsSVMSpace)
            #Blank lines are read as empty rows and have no label
            lsOriginalLabels = [lsLineElements[0] for lsLineElements in g
                                if lsLineElements and not lsLineElements[0] == ConstantsBreadCrumbs.c_strSVMNoSample]
        finally:
            #Only close what was opened here, a stream handed in open belongs to the caller
            if ostm is not xSVMFile:
                ostm.close()

        #The first row of a prediction file declares the labels and is not a sample
        if isPredictFile:
            lsOriginalLabels = lsOriginalLabels[1:]

        #Check sample name length
        if not len(lsAllSampleNames) == len(lsOriginalLabels):
            print("SVM::funcReadLabelsFromFile. Error, the length of sample names did not match the original labels length. Samples ("+str(len(lsAllSampleNames))+"):"+str(lsAllSampleNames)+" Labels ("+str(len(lsOriginalLabels))+"):"+str(lsOriginalLabels))
            return False

        #Change to {label:set(["sampleName1", "sampleName2"...]),...}
        dictSampleLabelsRet = dict()
        for sSampleName, sLabel in zip(lsAllSampleNames, lsOriginalLabels):
            dictSampleLabelsRet.setdefault(sLabel, set()).add(sSampleName)
        return dictSampleLabelsRet

    #Tested
    @staticmethod
    def funcScaleFeature(npdData):
        """
        Scale a feature between 0 and 1. Using 0,1 and not -1,1 because it keeps the sparsity of the data and may save time.
        A feature which is empty or holds one constant value can not be scaled and is returned unchanged.

        :param	npdData:	Feature data to scale (one dimensional).
        :type	Numpy Array
        :return npaFloat:    A numpy array of floats (or npdData itself when it can not be scaled).
        """
        if len(npdData) == 0:
            return npdData
        dMin = min(npdData)
        dMax = max(npdData)
        #No range to scale over (this includes the all zero feature)
        if dMin == dMax:
            return npdData
        return (npdData-dMin)/float(dMax-dMin)

    #Tested
    @staticmethod
    def funcWeightLabels(lLabels):
        """
        Returns weights for labels based on how balanced the labels are. Weights try to balance unbalanced results.
        The weight of a label is the count of the most frequent label divided by the count of the label.

        :params	lLabels:	List of labels to use for measure how balanced the comparison is.
        :type	List
        :return	List:		[dictWeights ({label index:weight}, the index is the position of the label in lUniqueLabels),
                                 lUniqueLabels (unique occurences of original labels in order of first occurrence)]
                                Empty input returns [{}, []].
        """
        #Do not use set to make elements unique. Need to preserve order.
        #First label should be 0
        lUniqueLabels = []
        dictCounts = dict()
        for sElement in lLabels:
            if sElement not in dictCounts:
                lUniqueLabels.append(sElement)
                dictCounts[sElement] = 0
            dictCounts[sElement] += 1

        #Nothing to weight
        if not lUniqueLabels:
            return [dict(), lUniqueLabels]

        #Build a dict of weights per label index {index:weight, index:weight}
        #Divide the highest occurrence by each occurrence
        iMaxOccurence = max(dictCounts.values())
        dictWeights = dict()
        for iLabelIndex, sLabel in enumerate(lUniqueLabels):
            dictWeights[iLabelIndex] = iMaxOccurence/float(dictCounts[sLabel])

        return [dictWeights,lUniqueLabels]

    #Tested 3/4 cases could add in test 12 with randomize True
    def func10FoldCrossvalidation(self, iTotalSampleCount, fRandomise = False):
        """
        Generator.
        Generates the indexes for a 10 fold cross validation given a sample count.
        If there are less than 10 samples, it uses the sample count as the K-fold cross validation
        as a leave one out method.
        Each iteration yields two lists of booleans with one entry per sample: the first
        marks the training samples, the second the validation samples of the fold.

        :param	iTotalSampleCount:	Total Sample Count
        :type	Integer	Sample Count
        :param	fRandomise:	Random sample indices
        :type	Boolean	True indicates randomise (Default False)
        """
        #Make indices and shuffle if needed
        #A real list is needed, shuffle works in place
        liindices = list(range(iTotalSampleCount))
        if fRandomise:
            shuffle(liindices)

        #For 10 times
        iKFold = 10
        if iTotalSampleCount < iKFold:
            iKFold = iTotalSampleCount
        for iiteration in range(iKFold):
            lfTraining = [iindex % iKFold != iiteration for iindex in liindices]
            lfValidation = [not fTraining for fTraining in lfTraining]
            yield lfTraining, lfValidation