Mercurial > repos > urgi-team > teiso
diff TEisotools-1.0/commons/core/utils/Classif.py @ 6:20ec0d14798e draft
Uploaded
author | urgi-team |
---|---|
date | Wed, 20 Jul 2016 05:00:24 -0400 |
parents | |
children |
line wrap: on
line diff
# Recovered from the diff view of
# TEisotools-1.0/commons/core/utils/Classif.py (Wed Jul 20 05:00:24 2016 -0400).
import re
import os
from collections import OrderedDict

# Three-letter Wicker code associated with each class / order name.
DWICKERCODE = {
    "ClassI": "RXX",
    "ClassII": "DXX",
    "LTR": "RLX",
    "DIRS": "RYX",
    "PLE": "RPX",
    "LINE": "RIX",
    "SINE": "RSX",
    "TIR": "DTX",
    "Crypton": "DYX",
    "Helitron": "DHX",
    "Maverick": "DMX",

    "TIR-MITE": "DTX",
    "LTR-LARD": "RLX",
    "LTR-TRIM": "RLX",
}


class Classif(object):
    """One line of a classif file.

    The evidence dictionary (see setInfoEvidence) has two layouts:
      * confusness == 'ok': a flat dict of feature keys ('length', 'TR',
        'Repbase_tbx', 'other', ...);
      * any other confusness: a dict keyed by order name (the "|"-separated
        parts of the consensus order), each value being such a feature dict.
    """

    def __init__(self, consensusName="", code="NA", outConfuseness="",
                 outCompleteness="", projectName="", isShorten=False,
                 consensusLength="NA", consensusStrand="NA",
                 consensusClass="NA", consensusOrder="NA",
                 consensusSuperFam="NA", consensusCI="NA"):
        self._consensusName = consensusName
        self._confusness = outConfuseness
        self._completeness = outCompleteness
        self._projectName = projectName
        self._isShorten = isShorten
        self._consensusLength = consensusLength
        self._consensusStrand = consensusStrand
        self._consensusClass = consensusClass
        self._consensusOrder = consensusOrder
        self._consensusSuperFam = consensusSuperFam
        self._consensusCI = consensusCI
        self._consensusCoding = ""
        self._consensusStruct = ""
        self._consensusOther = ""
        self._isNoChim = ""
        # Set to True by the format*/write* methods once they produce output.
        self._hasCodingPart = False
        self._hasStructPart = False
        self._hasOtherPart = False
        self._code = code
        self._evidence = {}

    def __eq__(self, o):
        """Equal when name, code, confusness and completeness all match."""
        if type(o) is type(self):
            return (self._consensusName == o._consensusName
                    and self._code == o._code
                    and self._confusness == o._confusness
                    and self._completeness == o._completeness)
        return False

    def __ne__(self, o):
        return not self.__eq__(o)

    # ------------------------------------------------------------ getters

    def getConsensusName(self):
        return self._consensusName

    def getCode(self):
        return self._code

    def getconfusness(self):
        return self._confusness

    def getcompleteness(self):
        return self._completeness

    def getprojectName(self):
        return self._projectName

    def getConsensusLength(self):
        return self._consensusLength

    def getConsensusStrand(self):
        return self._consensusStrand

    def getConsensusClass(self):
        return self._consensusClass

    def getConsensusOrder(self):
        return self._consensusOrder

    def getConsensusSuperFamily(self):
        return self._consensusSuperFam

    def getConsensusCI(self):
        """Return the CI converted to a string."""
        return str(self._consensusCI)

    def getInfoEvidence(self):
        return self._evidence

    def _featuresPerOrder(self, writeLine, skipOtherOnly):
        """Build the "(...)" or "(...)|(...)" text shared by the three
        getConsensus* methods.

        writeLine     -- one of the write*FeaturesLine bound methods.
        skipOtherOnly -- when True, orders after the first whose evidence
                         holds nothing but an 'other' entry are left out.
        """
        if self._confusness == 'ok':
            return writeLine(self._evidence)
        lOrder = self.getConsensusOrder().split("|")
        lParts = [writeLine(self._evidence[lOrder[0]])]
        for order in lOrder[1:]:
            dOrderEvidence = self._evidence[order]
            # list(d) works on Python 2 lists and Python 3 key views alike.
            if skipOtherOnly and list(dOrderEvidence) == ['other']:
                continue
            lParts.append(writeLine(dOrderEvidence))
        return "|".join(lParts)

    def getConsensusCoding(self):
        """Return the "coding=..." field computed from the evidence."""
        return "coding=" + self._featuresPerOrder(self.writeCodingFeaturesLine, True)

    def getConsensusStructure(self):
        """Return the "struct=..." field computed from the evidence."""
        return "struct=" + self._featuresPerOrder(self.writeStructFeaturesLine, True)

    def getConsensusOther(self):
        """Return the "other=..." field computed from the evidence.

        Unlike coding/struct, every order contributes a part here.
        """
        return "other=" + self._featuresPerOrder(self.writeOtherFeaturesLine, False)

    # ------------------------------------------------------------ setters

    def setConsensusName(self, consensusName):
        self._consensusName = consensusName

    def setInfoEvidence(self, evidence):
        self._evidence = evidence

    def setCode(self):
        """Derive the code from the current class and order."""
        self._code = self._decisionRuleForWickerCode(self.getConsensusClass(), self.getConsensusOrder())

    def setConfusness(self, Confusness):
        self._confusness = Confusness

    def setCompleteness(self, completeness):
        self._completeness = completeness

    def setProjectName(self, projectName):
        self._projectName = projectName

    def setConsensusLength(self, cLength):
        self._consensusLength = cLength

    def setConsensusStrand(self, cStrand):
        self._consensusStrand = cStrand

    def setConsensusClass(self, cClass):
        self._consensusClass = cClass

    def setConsensusOrder(self, cOrder):
        self._consensusOrder = cOrder

    def setConsensusSuperFamily(self, cSuperFamily):
        # Must be the attribute read by getConsensusSuperFamily(); the
        # previous code assigned to a differently named attribute, so the
        # getter never saw the new value.
        self._consensusSuperFam = cSuperFamily

    def setConsensusCI(self, CI):
        self._consensusCI = CI

    def setConsensusCoding(self, coding):
        self._consensusCoding = coding

    def setConsensusStructure(self, structure):
        self._consensusStruct = structure

    def setConsensusOther(self, other):
        self._consensusOther = other

    def setCodStrOthFromMessage(self, dico):
        """Fill the stored coding/struct/other fields from a flat feature dict."""
        self._consensusCoding = "coding=" + self.writeCodingFeaturesLine(dico)
        self._consensusStruct = "struct=" + self.writeStructFeaturesLine(dico)
        self._consensusOther = "other=" + self.writeOtherFeaturesLine(dico)

    def setCodStrOthFromMessage2(self, dico, cOrder):
        """Fill the stored coding/struct/other fields from dico.

        If the first order of cOrder (with 'rDNA' spelled 'RDNA') is not a
        key of dico, dico is treated as a flat feature dict.  Otherwise dico
        is treated as {order: feature dict} and one "|"-separated part is
        produced per key, in dico's own iteration order.
        """
        # NOTE(review): the parts follow dico's key order, not cOrder's;
        # callers presumably pass an OrderedDict -- confirm.
        if 'rDNA' in cOrder:
            cOrder = cOrder.replace('rDNA', 'RDNA')
        lOrder = cOrder.split("|")
        lDicoKeys = list(dico.keys())  # indexable on Python 2 and 3
        if lOrder[0] not in lDicoKeys:
            self.setCodStrOthFromMessage(dico)
            return

        self.setCodStrOthFromMessage(dico[lDicoKeys[0]])
        for order in lDicoKeys[1:]:
            dOrderEvidence = dico[order]
            if list(dOrderEvidence) != ['other']:
                self._consensusCoding += "|" + self.writeCodingFeaturesLine(dOrderEvidence)
                self._consensusStruct += "|" + self.writeStructFeaturesLine(dOrderEvidence)
            self._consensusOther += "|" + self.writeOtherFeaturesLine(dOrderEvidence)

    # ------------------------------------------------------- name handling

    def createNewConsensusName(self):
        """Return "<code>[-<completeness>][-<confusness>]_<name>".

        <name> is the shortened consensus name when shortening is enabled,
        the current name looks like "<project>_x_y_z..." and does not
        contain "<project>_RS_"; otherwise it is the name unchanged.
        """
        pastecClassif = "%s" % self._code
        if self._completeness != "":
            pastecClassif += "-%s" % self._completeness
        if self._confusness != "":
            pastecClassif += "-%s" % self._confusness

        name = self._consensusName
        if self._isShorten:
            # The project name is literal text, not a regular expression.
            pattern = "%s_[a-zA-Z0-9]+_[a-zA-Z0-9]+_[a-zA-Z0-9_]+" % re.escape(self._projectName)
            if re.match(pattern, name) and "%s_RS_" % self._projectName not in name:
                name = self.shortenConsensusName()
        return "%s_%s" % (pastecClassif, name)

    def shortenConsensusName(self):
        """Return the compact form of the consensus name.

        The text following the project name is split on "_" into
        <palign>_<clust>_<id>_<malign...>; the result is
        "<project>-<P>-<C><id>-<malign>" where P and C are the first letters
        of the first two fields.

        Raises IndexError when the name does not have that shape.
        """
        desc = self._consensusName.split(self._projectName)[1]
        lFields = desc.split("_")
        palignMeth = lFields[1]
        clustMeth = lFields[2]
        clustID = lFields[3]
        lmalignMeth = lFields[4:]
        if len(lmalignMeth) > 2:
            malignMeth = "%s%s_%s" % (lmalignMeth[0], lmalignMeth[1], lmalignMeth[2])
        else:
            malignMeth = "".join(lmalignMeth)
        return "%s-%s-%s%s-%s" % (self._projectName, palignMeth[0], clustMeth[0], clustID, malignMeth)

    def renameHeaderInConsensusFastaFile(self, fileName=""):
        """Rewrite a fasta file with every header line shortened.

        Lines containing ">" are replaced by ">" + shortenConsensusName();
        all other lines are copied.  The result ends up in "<base>.fa",
        where <base> is fileName without its extension.

        Side effect: the consensus name of this object is left set to the
        last header line read.
        """
        # splitext only strips the final extension, so dots elsewhere in
        # the path (e.g. "./dir.v2/x.fa") no longer truncate the base.
        base = os.path.splitext(fileName)[0]
        newFileName = base + "New.fa"
        targetName = base + ".fa"

        with open(fileName, "r") as oldFile, open(newFileName, "w") as newFile:
            for inputLine in oldFile:
                if ">" in inputLine:
                    # Strip the line ending before parsing so it cannot be
                    # lost when trailing name fields are dropped.
                    self.setConsensusName(inputLine.rstrip("\r\n"))
                    newFile.write(">%s\n" % self.shortenConsensusName())
                else:
                    newFile.write(inputLine)

        # Explicit removal first: os.rename does not overwrite on Windows.
        if os.path.exists(targetName):
            os.remove(targetName)
        os.rename(newFileName, targetName)

    # ---------------------------------------------------- feature writers

    def writeOtherFeaturesLine(self, dEvidence):
        """Return "(f1; f2; ...)" for dEvidence['other'], or "(NA)"."""
        other = "(NA)"
        if 'other' in dEvidence:
            dOtherResults = dEvidence['other']
            lResultsWithCoding = self.formatCodingFeatures(dOtherResults, [])
            lResultsFilled = self.formatStructFeatures(dOtherResults, lResultsWithCoding)
            if len(lResultsFilled) != 0:
                other = '(%s)' % "; ".join(lResultsFilled)
                self._hasOtherPart = True
        return other

    def writeCodingFeaturesLine(self, dEvidence):
        """Return "(f1; f2; ...)" for the coding features, or "(NA)"."""
        lResultsFilled = self.formatCodingFeatures(dEvidence, [])
        if len(lResultsFilled) != 0:
            return '(%s)' % "; ".join(lResultsFilled)
        return "(NA)"

    def writeStructFeaturesLine(self, dEvidence):
        """Return "(f1; f2; ...)" for the structural features, or "(NA)"."""
        lResultsFilled = self.formatStructFeatures(dEvidence, [])
        if len(lResultsFilled) != 0:
            return '(%s)' % "; ".join(lResultsFilled)
        return "(NA)"

    def formatCodingFeatures(self, dEvidence, lResults):
        """Append one text item per coding feature found in dEvidence.

        lResults is extended in place and returned.  _hasCodingPart is set
        when lResults is non-empty on exit.
        """
        if 'Repbase_tbx' in dEvidence and dEvidence['Repbase_tbx'] != []:
            lResults.append("TE_BLRtx: %s" % ", ".join(map(str, dEvidence['Repbase_tbx'])))

        if 'Repbase_bx' in dEvidence and dEvidence['Repbase_bx'] != []:
            lResults.append("TE_BLRx: %s" % ", ".join(map(str, dEvidence['Repbase_bx'])))

        if dEvidence.get('te_hmmer') is not None:
            lResults.append('profiles: %s' % self.formatProfilesResults(dEvidence['te_hmmer']))

        if 'Other_profiles' in dEvidence:
            lResults.append('Other_profiles: %s' % self.formatProfilesResults(dEvidence['Other_profiles']))

        if dEvidence.get("rDNA") is not None:
            lResults.append("rDNA_BLRn: %s" % dEvidence["rDNA"])

        if dEvidence.get("HG") is not None:
            lResults.append("HG_BLRn: %s" % dEvidence["HG"])

        if len(lResults) != 0:
            self._hasCodingPart = True
        return lResults

    def formatProfilesResults(self, dProfilesResults):
        """Return "name: xx.xx%, ..." for a {name: profile match} dict.

        Each value must provide getCoverageOnSubject().  An empty dict
        gives "".
        """
        if len(dProfilesResults) == 0:
            return ""
        lResults = ['%s: %s' % (key, "%.2f%%" % iPDM.getCoverageOnSubject())
                    for key, iPDM in dProfilesResults.items()]
        return ", ".join(lResults)

    def formatStructFeatures(self, dEvidence, lResults):
        """Append one text item per structural feature found in dEvidence.

        lResults is extended in place and returned.  _hasStructPart is set
        when lResults is non-empty on exit.
        """
        if dEvidence.get('length') is not None:
            lResults.append('TElength: %s' % dEvidence['length'])

        if dEvidence.get('TR') is not None:
            lResults.append('TermRepeats: %s' % ", ".join(map(str, dEvidence['TR'])))

        if dEvidence.get('ORF') is not None:
            lResults.append('ORF: %s' % ", ".join(dEvidence['ORF']))

        if dEvidence.get('SSR') is not None:
            lResults.append('SSR: %s' % ", ".join(dEvidence['SSR']))

        if dEvidence.get('SSRCoverage') is not None:
            lResults.append('SSRCoverage=%s' % dEvidence['SSRCoverage'])

        # Presence of the key is enough; its value is not used.
        if 'polyAtail' in dEvidence:
            lResults.append('polyAtail')

        if dEvidence.get('helitronExtremities') is not None:
            lResults.append('helitronExtremities: %s' % ", ".join(map(str, dEvidence['helitronExtremities'])))

        if len(lResults) != 0:
            self._hasStructPart = True
        return lResults

    # -------------------------------------------------------- code / order

    def _decisionRuleForWickerCode(self, teClass, order):
        """Return the code for a (class, order) pair.

        Rules, in order:
          1. order is a DWICKERCODE key            -> its code
          2. teClass is a DWICKERCODE key          -> its code
          3. order has no "|"                      -> "NA"
          4. teClass == "Unclassified"             -> "XXX"
          5. teClass has no "|"                    -> "NA"
          6. the "|"-separated classes differ      -> "XXX"
          7. otherwise                             -> code of that class

        Raises KeyError in case 7 when the class is not a DWICKERCODE key.
        """
        if order in DWICKERCODE:
            return DWICKERCODE[order]
        if teClass in DWICKERCODE:
            return DWICKERCODE[teClass]
        if order == "Unclassified" and teClass == "Unclassified":
            return "NA"
        if "|" not in order:
            return "NA"
        if teClass == "Unclassified":
            return "XXX"
        if "|" not in teClass:
            return "NA"
        lClass = teClass.split("|")
        for iC in lClass[1:]:
            if lClass[0] != iC:
                return "XXX"
        return DWICKERCODE[lClass[0]]

    def renameLARDTRIMAndMITE(self):
        """Rename MITE/LARD/TRIM to TIR-MITE/LTR-LARD/LTR-TRIM.

        Applies to the consensus order string and to the top-level keys of
        the evidence dict.  Names already carrying their prefix are left
        alone, so calling this twice is harmless.
        """
        order = self.getConsensusOrder()
        # The lookbehind keeps an existing prefix from being doubled.
        order = re.sub(r"(?<!TIR-)MITE", "TIR-MITE", order)
        order = re.sub(r"(?<!LTR-)LARD", "LTR-LARD", order)
        order = re.sub(r"(?<!LTR-)TRIM", "LTR-TRIM", order)
        self.setConsensusOrder(order)

        dEvidence = self.getInfoEvidence()
        for oldKey, newKey in (("LARD", "LTR-LARD"), ("TRIM", "LTR-TRIM"), ("MITE", "TIR-MITE")):
            if oldKey in dEvidence:
                dEvidence[newKey] = dEvidence.pop(oldKey)
        self.setInfoEvidence(dEvidence)