view TEisotools-1.1.a/commons/core/utils/Classif.py @ 15:255c852351c5 draft

Uploaded
author urgi-team
date Thu, 21 Jul 2016 07:36:44 -0400
parents feef9a0db09d
children
line wrap: on
line source

import re
import os
from collections import OrderedDict

# Three-letter Wicker codes, looked up by order name first and by class name
# second (see Classif._decisionRuleForWickerCode).
DWICKERCODE = {
    # Classes
    "ClassI": "RXX",
    "ClassII": "DXX",
    # Orders whose code starts with "R"
    "LTR": "RLX",
    "DIRS": "RYX",
    "PLE": "RPX",
    "LINE": "RIX",
    "SINE": "RSX",
    # Orders whose code starts with "D"
    "TIR": "DTX",
    "Crypton": "DYX",
    "Helitron": "DHX",
    "Maverick": "DMX",
    # Composite order names, as produced by Classif.renameLARDTRIMAndMITE;
    # each shares the code of the order named in its prefix.
    "TIR-MITE": "DTX",
    "LTR-LARD": "RLX",
    "LTR-TRIM": "RLX",
}
    
class Classif(object):
    """Classification record of one consensus, i.e. one line of a classif file.

    A record holds the consensus identity (name, project, length, strand), its
    classification (class, order, super-family, Wicker code, confusness,
    completeness, CI) and an evidence dictionary from which the ``coding=``,
    ``struct=`` and ``other=`` columns are rendered.

    Shape of the evidence dictionary, as read by the getters:
      * confusness == 'ok': a flat dictionary of features;
      * otherwise: one feature dictionary per order name, the order names
        being the "|"-separated parts of the consensus order.
    """

    # (old name, new name, pattern matching only the occurrences of the old
    # name that do not already carry the new prefix)
    _ORDER_RENAMES = (
        ("LARD", "LTR-LARD", re.compile(r"(?<!LTR-)LARD")),
        ("TRIM", "LTR-TRIM", re.compile(r"(?<!LTR-)TRIM")),
        ("MITE", "TIR-MITE", re.compile(r"(?<!TIR-)MITE")),
    )

    def __init__(self, consensusName = "", code = "NA", outConfuseness = "", outCompleteness = "",
                 projectName = "", isShorten = False, consensusLength = "NA", consensusStrand = "NA",
                 consensusClass = "NA", consensusOrder = "NA", consensusSuperFam = "NA", consensusCI = "NA"):
        """Store the given fields; evidence starts empty and the rendered
        coding/struct/other columns start as empty strings.
        """
        self._consensusName = consensusName
        self._confusness = outConfuseness
        self._completeness = outCompleteness
        self._projectName = projectName
        self._isShorten = isShorten
        self._consensusLength = consensusLength
        self._consensusStrand = consensusStrand
        self._consensusClass = consensusClass
        self._consensusOrder = consensusOrder
        self._consensusSuperFam = consensusSuperFam
        self._consensusCI = consensusCI
        self._consensusCoding = ""
        self._consensusStruct = ""
        self._consensusOther = ""
        self._isNoChim = ""
        self._hasCodingPart = False
        self._hasStructPart = False
        self._hasOtherPart = False
        self._code = code
        self._evidence = {}

    def __eq__(self, o):
        """Two records are equal when they have exactly the same type and the
        same name, code, confusness and completeness.
        """
        if type(o) is type(self):
            return self._consensusName == o._consensusName and self._code == o._code \
                and self._confusness == o._confusness and self._completeness == o._completeness
        return False

    def __ne__(self, o):
        return not self.__eq__(o)

    # ------------------------------------------------------------------
    # Getters
    # ------------------------------------------------------------------

    def getConsensusName(self):
        return self._consensusName

    def getCode(self):
        return self._code

    def getconfusness(self):
        return self._confusness

    def getcompleteness(self):
        return self._completeness

    def getprojectName(self):
        return self._projectName

    def getConsensusLength(self):
        return self._consensusLength

    def getConsensusStrand(self):
        return self._consensusStrand

    def getConsensusClass(self):
        return self._consensusClass

    def getConsensusOrder(self):
        return self._consensusOrder

    def getConsensusSuperFamily(self):
        return self._consensusSuperFam

    def getConsensusCI(self):
        """Return the CI as a string, whatever type it was stored as."""
        return str(self._consensusCI)

    def getInfoEvidence(self):
        """Return the evidence dictionary itself (not a copy)."""
        return self._evidence

    def _joinFeaturesPerOrder(self, writeLine, skipOtherOnly):
        """Render the evidence with writeLine, one "|"-separated part per order.

        When confusness is 'ok' the evidence is rendered as a whole. Otherwise
        each order of the consensus order is rendered from its own entry of
        the evidence; with skipOtherOnly, orders other than the first whose
        only key is 'other' are left out.
        Raises KeyError if an order has no entry in the evidence.
        """
        if self._confusness == 'ok':
            return writeLine(self._evidence)
        lOrder = self.getConsensusOrder().split("|")
        lParts = [writeLine(self._evidence[lOrder[0]])]
        for order in lOrder[1:]:
            dOrderEvidence = self._evidence[order]
            # list(): on Python 3 a keys view never compares equal to a list
            if skipOtherOnly and list(dOrderEvidence.keys()) == ['other']:
                continue
            lParts.append(writeLine(dOrderEvidence))
        return "|".join(lParts)

    def getConsensusCoding(self):
        """Return the "coding=..." column computed from the evidence."""
        return "coding=" + self._joinFeaturesPerOrder(self.writeCodingFeaturesLine, True)

    def getConsensusStructure(self):
        """Return the "struct=..." column computed from the evidence."""
        return "struct=" + self._joinFeaturesPerOrder(self.writeStructFeaturesLine, True)

    def getConsensusOther(self):
        """Return the "other=..." column computed from the evidence; every
        order contributes a part, "(NA)" when it has no 'other' features.
        """
        return "other=" + self._joinFeaturesPerOrder(self.writeOtherFeaturesLine, False)

    # ------------------------------------------------------------------
    # Setters
    # ------------------------------------------------------------------

    def setConsensusName(self, consensusName):
        self._consensusName = consensusName

    def setInfoEvidence(self, evidence):
        self._evidence = evidence

    def setCode(self):
        """Derive the Wicker code from the current class and order."""
        self._code = self._decisionRuleForWickerCode(self.getConsensusClass(), self.getConsensusOrder())

    def setConfusness(self, Confusness):
        self._confusness = Confusness

    def setCompleteness(self, completeness):
        self._completeness = completeness

    def setProjectName(self, projectName):
        self._projectName = projectName

    def setConsensusLength(self, cLength):
        self._consensusLength = cLength

    def setConsensusStrand(self, cStrand):
        self._consensusStrand = cStrand

    def setConsensusClass(self, cClass):
        self._consensusClass = cClass

    def setConsensusOrder(self, cOrder):
        self._consensusOrder = cOrder

    def setConsensusSuperFamily(self, cSuperFamily):
        # Must be the attribute read by getConsensusSuperFamily() and set by
        # the constructor, otherwise the setter has no visible effect.
        self._consensusSuperFam = cSuperFamily

    def setConsensusCI(self, CI):
        self._consensusCI = CI

    def setConsensusCoding(self, coding):
        self._consensusCoding = coding

    def setConsensusStructure(self, structure):
        self._consensusStruct = structure

    def setConsensusOther(self, other):
        self._consensusOther = other

    def setCodStrOthFromMessage(self, dico):
        """Fill the stored coding/struct/other columns from a flat feature
        dictionary.
        """
        self._consensusCoding = "coding=" + self.writeCodingFeaturesLine(dico)
        self._consensusStruct = "struct=" + self.writeStructFeaturesLine(dico)
        self._consensusOther = "other=" + self.writeOtherFeaturesLine(dico)

    def setCodStrOthFromMessage2(self, dico, cOrder):
        """Fill the stored coding/struct/other columns from dico.

        If the first order of cOrder ('rDNA' being read as 'RDNA') is not a
        key of dico, dico is rendered as one flat feature dictionary.
        Otherwise dico is taken as one feature dictionary per key and the
        parts are joined with "|" in the iteration order of dico; an entry
        whose only key is 'other' contributes to the other column only.
        """
        if 'rDNA' in cOrder:
            cOrder = cOrder.replace('rDNA', 'RDNA')
        lOrder = cOrder.split("|")
        # list(): a Python 3 keys view cannot be indexed or sliced
        lDicoKeys = list(dico.keys())
        if lOrder[0] not in lDicoKeys:
            self.setCodStrOthFromMessage(dico)
            return

        lCoding = []
        lStruct = []
        lOther = []
        for index, order in enumerate(lDicoKeys):
            dOrderEvidence = dico[order]
            # the first entry always contributes to the three columns
            if index == 0 or list(dOrderEvidence.keys()) != ['other']:
                lCoding.append(self.writeCodingFeaturesLine(dOrderEvidence))
                lStruct.append(self.writeStructFeaturesLine(dOrderEvidence))
            lOther.append(self.writeOtherFeaturesLine(dOrderEvidence))
        self._consensusCoding = "coding=" + "|".join(lCoding)
        self._consensusStruct = "struct=" + "|".join(lStruct)
        self._consensusOther = "other=" + "|".join(lOther)

    # ------------------------------------------------------------------
    # Consensus naming
    # ------------------------------------------------------------------

    def createNewConsensusName(self):
        """Return "<code>[-<completeness>][-<confusness>]_<name>".

        <name> is the shortened consensus name when shortening was requested
        and the name looks like "<project>_<x>_<y>_<z...>" without containing
        "<project>_RS_"; otherwise it is the consensus name unchanged.
        """
        pastecClassif = "%s" % self._code
        if self._completeness != "":
            pastecClassif += "-%s" % self._completeness
        if self._confusness != "":
            pastecClassif += "-%s" % self._confusness

        name = self._consensusName
        if self._isShorten:
            # The project name is literal text, not a regular expression.
            pattern = "%s_[a-zA-Z0-9]+_[a-zA-Z0-9]+_[a-zA-Z0-9_]+" % re.escape(self._projectName)
            if re.match(pattern, name) and ("%s_RS_" % self._projectName) not in name:
                name = self.shortenConsensusName()
        return "%s_%s" % (pastecClassif, name)

    def shortenConsensusName(self):
        """Return the short form of the consensus name.

        With the text following the project name split on "_" into
        "", p, c, id, m1, m2, ..., the result is
        "<project>-<first letter of p>-<first letter of c><id>-<m>", where
        <m> is "m1m2_m3" when there are more than two m parts and the plain
        concatenation of the m parts otherwise.
        Raises IndexError when the name does not have that many parts.
        """
        desc = self._consensusName.split(self._projectName)[1]
        lFields = desc.split("_")
        palignMeth = lFields[1]
        clustMeth = lFields[2]
        clustID = lFields[3]
        lmalignMeth = lFields[4:]
        if len(lmalignMeth) > 2:
            malignMeth = "%s%s_%s" % (lmalignMeth[0], lmalignMeth[1], lmalignMeth[2])
        else:
            malignMeth = "".join(lmalignMeth)
        return "%s-%s-%s%s-%s" % (self._projectName, palignMeth[0], clustMeth[0], clustID, malignMeth)

    def renameHeaderInConsensusFastaFile(self, fileName = ""):
        """Shorten every header of a fasta file.

        The result is written to "<base>New.fa" and then moved to
        "<base>.fa", <base> being fileName without its extension; lines
        without ">" are copied unchanged.
        Side effect: the consensus name of this record is overwritten with
        each header met, so it ends up being the last header of the file.
        """
        # splitext keeps directory parts that contain dots (e.g. "./x.fa")
        baseName = os.path.splitext(fileName)[0]
        newFileName = baseName + "New.fa"
        finalFileName = baseName + ".fa"

        with open(fileName, "r") as oldFile:
            with open(newFileName, "w") as newFile:
                for inputLine in oldFile:
                    if ">" in inputLine:
                        # Drop the line ending before shortening: the short
                        # name keeps only some parts of the header and could
                        # otherwise lose the newline.
                        self.setConsensusName(inputLine.rstrip("\r\n"))
                        newFile.write(">%s\n" % self.shortenConsensusName())
                    else:
                        newFile.write(inputLine)

        # No shell involved, so file names need no quoting.
        if os.path.exists(finalFileName):
            os.remove(finalFileName)
        os.rename(newFileName, finalFileName)

    # ------------------------------------------------------------------
    # Rendering of the evidence
    # ------------------------------------------------------------------

    def _wrapFeatures(self, lResults):
        """Return "(f1; f2; ...)", or "(NA)" when there is no feature."""
        if not lResults:
            return "(NA)"
        return '(%s)' % "; ".join(lResults)

    def writeOtherFeaturesLine(self, dEvidence):
        """Render the coding then structural features found under
        dEvidence['other']; "(NA)" when the key is missing or holds nothing.
        """
        if 'other' not in dEvidence:
            return "(NA)"
        dOtherResults = dEvidence['other']
        lResults = self.formatCodingFeatures(dOtherResults, [])
        lResults = self.formatStructFeatures(dOtherResults, lResults)
        if lResults:
            self._hasOtherPart = True
        return self._wrapFeatures(lResults)

    def writeCodingFeaturesLine(self, dEvidence):
        """Render the coding features of dEvidence, "(NA)" when none."""
        return self._wrapFeatures(self.formatCodingFeatures(dEvidence, []))

    def writeStructFeaturesLine(self, dEvidence):
        """Render the structural features of dEvidence, "(NA)" when none."""
        return self._wrapFeatures(self.formatStructFeatures(dEvidence, []))

    def formatCodingFeatures(self, dEvidence, lResults):
        """Append the coding features of dEvidence to lResults and return it.

        lResults is modified in place. The coding flag of the record is set
        as soon as lResults is not empty, including when it was not empty on
        entry.
        """
        if 'Repbase_tbx' in dEvidence and dEvidence['Repbase_tbx'] != []:
            lResults.append("TE_BLRtx: %s" % ", ".join(map(str, dEvidence['Repbase_tbx'])))

        if 'Repbase_bx' in dEvidence and dEvidence['Repbase_bx'] != []:
            lResults.append("TE_BLRx: %s" % ", ".join(map(str, dEvidence['Repbase_bx'])))

        if dEvidence.get('te_hmmer') is not None:
            lResults.append('profiles: %s' % self.formatProfilesResults(dEvidence['te_hmmer']))

        # None is skipped here as it is for 'te_hmmer'
        if dEvidence.get('Other_profiles') is not None:
            lResults.append('Other_profiles: %s' % self.formatProfilesResults(dEvidence['Other_profiles']))

        if dEvidence.get("rDNA") is not None:
            lResults.append("rDNA_BLRn: %s" % dEvidence["rDNA"])

        if dEvidence.get("HG") is not None:
            lResults.append("HG_BLRn: %s" % dEvidence["HG"])

        if lResults:
            self._hasCodingPart = True
        return lResults

    def formatProfilesResults(self, dProfilesResults):
        """Return "name: xx.xx%, ..." for a dictionary of profile matches.

        Each value must provide getCoverageOnSubject(); an empty dictionary
        gives an empty string.
        """
        return ", ".join(['%s: %s' % (key, "%.2f%%" % dProfilesResults[key].getCoverageOnSubject())
                          for key in dProfilesResults])

    def formatStructFeatures(self, dEvidence, lResults):
        """Append the structural features of dEvidence to lResults and return it.

        lResults is modified in place. The struct flag of the record is set
        as soon as lResults is not empty, including when it was not empty on
        entry.
        """
        if dEvidence.get('length') is not None:
            lResults.append('TElength: %s' % dEvidence['length'])

        if dEvidence.get('TR') is not None:
            lResults.append('TermRepeats: %s' % ", ".join(map(str, dEvidence['TR'])))

        if dEvidence.get('ORF') is not None:
            lResults.append('ORF: %s' % ", ".join(dEvidence['ORF']))

        if dEvidence.get('SSR') is not None:
            lResults.append('SSR: %s' % ", ".join(dEvidence['SSR']))

        if dEvidence.get('SSRCoverage') is not None:
            lResults.append('SSRCoverage=%s' % dEvidence['SSRCoverage'])

        # only the presence of the key matters, its value is not read
        if 'polyAtail' in dEvidence:
            lResults.append('polyAtail')

        if dEvidence.get('helitronExtremities') is not None:
            lResults.append('helitronExtremities: %s' % ", ".join(map(str, dEvidence['helitronExtremities'])))

        if lResults:
            self._hasStructPart = True
        return lResults

    def _decisionRuleForWickerCode(self, teClass, order):
        """Return the Wicker code for a class and an order.

        The order is looked up in DWICKERCODE first, then the class. Failing
        both, a "|"-separated order gives "XXX" when the class is
        "Unclassified" or is a "|"-separated list of differing classes, and
        the code of the class when all listed classes are the same (KeyError
        if that class is not in DWICKERCODE). Everything else gives 'NA'.
        """
        if order in DWICKERCODE:
            return DWICKERCODE[order]
        if teClass in DWICKERCODE:
            return DWICKERCODE[teClass]
        if "|" not in order:
            return 'NA'
        if teClass == "Unclassified":
            return "XXX"
        if "|" in teClass:
            lClass = teClass.split("|")
            for iC in lClass[1:]:
                if lClass[0] != iC:
                    return "XXX"
            return DWICKERCODE[lClass[0]]
        return 'NA'

    def renameLARDTRIMAndMITE(self):
        """Rename LARD, TRIM and MITE to LTR-LARD, LTR-TRIM and TIR-MITE, in
        the consensus order and in the keys of the evidence.

        Names that already carry their prefix are left alone, so calling this
        twice does not stack prefixes. Renamed evidence entries are re-added,
        which moves them to the end of an ordered dictionary.
        """
        order = self.getConsensusOrder()
        for oldName, newName, unprefixed in self._ORDER_RENAMES:
            order = unprefixed.sub(newName, order)
        self.setConsensusOrder(order)

        dEvidence = self.getInfoEvidence()
        for oldName, newName, unprefixed in self._ORDER_RENAMES:
            if oldName in dEvidence:
                dEvidence[newName] = dEvidence.pop(oldName)
        self.setInfoEvidence(dEvidence)