Mercurial > repos > urgi-team > teiso
view TEisotools-1.0/commons/core/utils/Classif.py @ 6:20ec0d14798e draft
Uploaded
author | urgi-team |
---|---|
date | Wed, 20 Jul 2016 05:00:24 -0400 |
parents | |
children |
line wrap: on
line source
import re
import os
from collections import OrderedDict

# Maps a class name or an order name to its three-letter classification code.
# Looked up by Classif._decisionRuleForWickerCode.
DWICKERCODE = {
    "ClassI": "RXX",
    "ClassII": "DXX",
    "LTR": "RLX",
    "DIRS": "RYX",
    "PLE": "RPX",
    "LINE": "RIX",
    "SINE": "RSX",
    "TIR": "DTX",
    "Crypton": "DYX",
    "Helitron": "DHX",
    "Maverick": "DMX",
    "TIR-MITE": "DTX",
    "LTR-LARD": "RLX",
    "LTR-TRIM": "RLX",
}


class Classif(object):
    """
    Represents one line of a classif file: a consensus name together with its
    classification (code, class, order, super-family, confusedness,
    completeness) and the evidence dictionary used to render the
    "coding=", "struct=" and "other=" feature columns.
    """

    def __init__(self, consensusName="", code="NA", outConfuseness="", outCompleteness="", projectName="",
                 isShorten=False, consensusLength="NA", consensusStrand="NA", consensusClass="NA",
                 consensusOrder="NA", consensusSuperFam="NA", consensusCI="NA"):
        self._consensusName = consensusName
        self._confusness = outConfuseness
        self._completeness = outCompleteness
        self._projectName = projectName
        self._isShorten = isShorten
        self._consensusLength = consensusLength
        self._consensusStrand = consensusStrand
        self._consensusClass = consensusClass
        self._consensusOrder = consensusOrder
        self._consensusSuperFam = consensusSuperFam
        self._consensusCI = consensusCI
        self._consensusCoding = ""
        self._consensusStruct = ""
        self._consensusOther = ""
        self._isNoChim = ""
        # These flags are switched on as a side effect of the format*/write*
        # methods whenever the corresponding feature list is non-empty.
        self._hasCodingPart = False
        self._hasStructPart = False
        self._hasOtherPart = False
        self._code = code
        self._evidence = {}

    def __eq__(self, o):
        """Two Classif are equal when name, code, confusedness and completeness match."""
        if type(o) is type(self):
            return self._consensusName == o._consensusName and self._code == o._code \
                and self._confusness == o._confusness and self._completeness == o._completeness
        return False

    def __ne__(self, o):
        return not self.__eq__(o)

    # ------------------------------------------------------------------
    # Getters
    # ------------------------------------------------------------------

    def getConsensusName(self):
        return self._consensusName

    def getCode(self):
        return self._code

    def getconfusness(self):
        return self._confusness

    def getcompleteness(self):
        return self._completeness

    def getprojectName(self):
        return self._projectName

    def getConsensusLength(self):
        return self._consensusLength

    def getConsensusStrand(self):
        return self._consensusStrand

    def getConsensusClass(self):
        return self._consensusClass

    def getConsensusOrder(self):
        return self._consensusOrder

    def getConsensusSuperFamily(self):
        return self._consensusSuperFam

    def getConsensusCI(self):
        """Return the confidence index converted to a string."""
        return str(self._consensusCI)

    def getInfoEvidence(self):
        return self._evidence

    def _joinFeaturesPerOrder(self, writeLine, skipOtherOnly):
        """
        Render the evidence with ``writeLine``.

        When the confusedness is 'ok' the evidence dictionary is rendered
        directly. Otherwise the evidence is expected to be keyed by order name:
        one rendering is produced per order listed in the '|'-separated
        consensus order and the renderings are joined with '|'.

        @param writeLine: one of the write*FeaturesLine bound methods
        @param skipOtherOnly: when True, orders after the first one whose
            evidence holds only the 'other' key are left out
        """
        if self._confusness == 'ok':
            return writeLine(self._evidence)
        lOrder = self.getConsensusOrder().split("|")
        lParts = [writeLine(self._evidence[lOrder[0]])]
        for order in lOrder[1:]:
            # list(...) so the comparison also works with Python 3 key views
            if skipOtherOnly and list(self._evidence[order].keys()) == ['other']:
                continue
            lParts.append(writeLine(self._evidence[order]))
        return "|".join(lParts)

    def getConsensusCoding(self):
        """Return the 'coding=...' column built from the stored evidence."""
        return "coding=" + self._joinFeaturesPerOrder(self.writeCodingFeaturesLine, True)

    def getConsensusStructure(self):
        """Return the 'struct=...' column built from the stored evidence."""
        return "struct=" + self._joinFeaturesPerOrder(self.writeStructFeaturesLine, True)

    def getConsensusOther(self):
        """Return the 'other=...' column built from the stored evidence (no order is skipped)."""
        return "other=" + self._joinFeaturesPerOrder(self.writeOtherFeaturesLine, False)

    # ------------------------------------------------------------------
    # Setters
    # ------------------------------------------------------------------

    def setConsensusName(self, consensusName):
        self._consensusName = consensusName

    def setInfoEvidence(self, evidence):
        self._evidence = evidence

    def setCode(self):
        """Recompute the code from the current class and order."""
        self._code = self._decisionRuleForWickerCode(self.getConsensusClass(), self.getConsensusOrder())

    def setConfusness(self, Confusness):
        self._confusness = Confusness

    def setCompleteness(self, completeness):
        self._completeness = completeness

    def setProjectName(self, projectName):
        self._projectName = projectName

    def setConsensusLength(self, cLength):
        self._consensusLength = cLength

    def setConsensusStrand(self, cStrand):
        self._consensusStrand = cStrand

    def setConsensusClass(self, cClass):
        self._consensusClass = cClass

    def setConsensusOrder(self, cOrder):
        self._consensusOrder = cOrder

    def setConsensusSuperFamily(self, cSuperFamily):
        # Must be the attribute read by getConsensusSuperFamily(); the former
        # code wrote to "_consensusSuperFamily", so the getter never saw it.
        self._consensusSuperFam = cSuperFamily

    def setConsensusCI(self, CI):
        self._consensusCI = CI

    def setConsensusCoding(self, coding):
        self._consensusCoding = coding

    def setConsensusStructure(self, structure):
        self._consensusStruct = structure

    def setConsensusOther(self, other):
        self._consensusOther = other

    def setCodStrOthFromMessage(self, dico):
        """Fill the coding/struct/other columns from a flat evidence dictionary."""
        self._consensusCoding = "coding=" + self.writeCodingFeaturesLine(dico)
        self._consensusStruct = "struct=" + self.writeStructFeaturesLine(dico)
        self._consensusOther = "other=" + self.writeOtherFeaturesLine(dico)

    def setCodStrOthFromMessage2(self, dico, cOrder):
        """
        Fill the coding/struct/other columns from ``dico``.

        If the first order of ``cOrder`` ('|'-separated, 'rDNA' spelled 'RDNA')
        is not a key of ``dico``, ``dico`` is rendered as a flat evidence
        dictionary. Otherwise ``dico`` is treated as keyed by order and each
        entry is rendered in the dictionary's own key order, joined with '|'.
        """
        cOrder = cOrder.replace('rDNA', 'RDNA')
        lOrder = cOrder.split("|")
        # A real list is required: the keys are indexed and sliced below.
        lDicoKeys = list(dico.keys())

        if lOrder[0] not in lDicoKeys:
            self._consensusCoding = "coding=" + self.writeCodingFeaturesLine(dico)
            self._consensusStruct = "struct=" + self.writeStructFeaturesLine(dico)
            self._consensusOther = "other=" + self.writeOtherFeaturesLine(dico)
            return

        # NOTE(review): entries are walked in dictionary key order, not in the
        # order given by cOrder - kept as in the original implementation.
        firstEvidence = dico[lDicoKeys[0]]
        self._consensusCoding = "coding=" + self.writeCodingFeaturesLine(firstEvidence)
        self._consensusStruct = "struct=" + self.writeStructFeaturesLine(firstEvidence)
        self._consensusOther = "other=" + self.writeOtherFeaturesLine(firstEvidence)
        for order in lDicoKeys[1:]:
            dOrderEvidence = dico[order]
            if list(dOrderEvidence.keys()) == ['other']:
                self._consensusOther = self._consensusOther + "|" + self.writeOtherFeaturesLine(dOrderEvidence)
            else:
                self._consensusCoding = self._consensusCoding + "|" + self.writeCodingFeaturesLine(dOrderEvidence)
                self._consensusStruct = self._consensusStruct + "|" + self.writeStructFeaturesLine(dOrderEvidence)
                self._consensusOther = self._consensusOther + "|" + self.writeOtherFeaturesLine(dOrderEvidence)

    # ------------------------------------------------------------------
    # Consensus naming
    # ------------------------------------------------------------------

    def createNewConsensusName(self):
        """
        Return "<code>[-<completeness>][-<confusedness>]_<name>".

        <name> is the shortened consensus name when shortening is enabled, the
        name looks like "<project>_<x>_<y>_<z...>" and it does not contain
        "<project>_RS_"; otherwise it is the consensus name unchanged.
        """
        pastecClassif = "%s" % self._code
        for suffix in (self._completeness, self._confusness):
            if suffix != "":
                pastecClassif += "-%s" % suffix

        name = self._consensusName
        if self._isShorten:
            # The project name is literal text, not a regular expression.
            pattern = "%s_[a-zA-Z0-9]+_[a-zA-Z0-9]+_[a-zA-Z0-9_]+" % re.escape(self._projectName)
            if re.match(pattern, name) and ("%s_RS_" % self._projectName) not in name:
                name = self.shortenConsensusName()
        return "%s_%s" % (pastecClassif, name)

    def shortenConsensusName(self):
        """
        Build "<project>-<P>-<C><id>-<malign>" from the consensus name.

        The text following the project name is split on '_': P and C are the
        first letters of fields 1 and 2, <id> is field 3 and <malign> is the
        concatenation of the remaining fields (an '_' is kept before the third
        one, and any field after the third is dropped).
        """
        desc = self._consensusName.split(self._projectName)[1]
        lFields = desc.split("_")
        palignMeth = lFields[1]
        clustMeth = lFields[2]
        clustID = lFields[3]
        lmalignMeth = lFields[4:]
        if len(lmalignMeth) > 2:
            malignMeth = "%s%s_%s" % (lmalignMeth[0], lmalignMeth[1], lmalignMeth[2])
        else:
            malignMeth = "".join(lmalignMeth)
        return "%s-%s-%s%s-%s" % (self._projectName, palignMeth[0], clustMeth[0], clustID, malignMeth)

    def renameHeaderInConsensusFastaFile(self, fileName=""):
        """
        Rewrite every header line (line containing '>') of a fasta file with
        its shortened consensus name; other lines are copied unchanged.

        The result is written to "<base>New.fa" and then moved to "<base>.fa",
        where <base> is ``fileName`` without its extension. As a side effect
        the consensus name of this object is left set to the last header read.
        """
        # splitext only strips the final extension, so directories or base
        # names containing dots no longer truncate the path.
        baseName = os.path.splitext(fileName)[0]
        newFileName = baseName + "New.fa"
        targetFileName = baseName + ".fa"

        with open(fileName, "r") as oldFile, open(newFileName, "w") as newFile:
            for inputLine in oldFile:
                if ">" in inputLine:
                    # The header keeps its trailing newline through the
                    # shortening, so nothing has to be appended here.
                    self.setConsensusName(inputLine)
                    newFile.write(">%s" % self.shortenConsensusName())
                else:
                    newFile.write(inputLine)

        # Replace the target without going through a shell (no quoting issues
        # with unusual file names).
        if os.path.exists(targetFileName):
            os.remove(targetFileName)
        os.rename(newFileName, targetFileName)

    # ------------------------------------------------------------------
    # Feature formatting
    # ------------------------------------------------------------------

    def writeOtherFeaturesLine(self, dEvidence):
        """
        Render the content of dEvidence['other'] (coding features followed by
        structural features) as "(a; b; ...)", or "(NA)" when there is none.
        """
        other = "(NA)"
        if 'other' in dEvidence:
            dOtherResults = dEvidence['other']
            lResultsWithCoding = self.formatCodingFeatures(dOtherResults, [])
            lResultsFilled = self.formatStructFeatures(dOtherResults, lResultsWithCoding)
            if len(lResultsFilled) != 0:
                other = '(%s)' % "; ".join(lResultsFilled)
                self._hasOtherPart = True
        return other

    def writeCodingFeaturesLine(self, dEvidence):
        """Render the coding features of dEvidence as "(a; b; ...)" or "(NA)"."""
        lResultsFilled = self.formatCodingFeatures(dEvidence, [])
        if len(lResultsFilled) != 0:
            return '(%s)' % "; ".join(lResultsFilled)
        return "(NA)"

    def writeStructFeaturesLine(self, dEvidence):
        """Render the structural features of dEvidence as "(a; b; ...)" or "(NA)"."""
        lResultsFilled = self.formatStructFeatures(dEvidence, [])
        if len(lResultsFilled) != 0:
            return '(%s)' % "; ".join(lResultsFilled)
        return "(NA)"

    def formatCodingFeatures(self, dEvidence, lResults):
        """
        Append one formatted string per coding feature found in dEvidence to
        lResults (modified in place) and return it. Sets the coding-part flag
        when the resulting list is not empty.
        """
        if 'Repbase_tbx' in dEvidence and dEvidence['Repbase_tbx'] != []:
            lResults.append("TE_BLRtx: %s" % ", ".join(map(str, dEvidence['Repbase_tbx'])))

        if 'Repbase_bx' in dEvidence and dEvidence['Repbase_bx'] != []:
            lResults.append("TE_BLRx: %s" % ", ".join(map(str, dEvidence['Repbase_bx'])))

        if 'te_hmmer' in dEvidence and dEvidence['te_hmmer'] is not None:
            lResults.append('profiles: %s' % self.formatProfilesResults(dEvidence['te_hmmer']))

        if 'Other_profiles' in dEvidence:
            lResults.append('Other_profiles: %s' % self.formatProfilesResults(dEvidence['Other_profiles']))

        if "rDNA" in dEvidence and dEvidence["rDNA"] is not None:
            lResults.append("rDNA_BLRn: %s" % dEvidence["rDNA"])

        if "HG" in dEvidence and dEvidence["HG"] is not None:
            lResults.append("HG_BLRn: %s" % dEvidence["HG"])

        if len(lResults) != 0:
            self._hasCodingPart = True
        return lResults

    def formatProfilesResults(self, dProfilesResults):
        """
        Return "name: xx.xx%, ..." for each entry of dProfilesResults, whose
        values must provide getCoverageOnSubject(); "" when there is no entry.
        """
        if not dProfilesResults:
            return ""
        lResults = []
        for key in dProfilesResults.keys():
            iPDM = dProfilesResults[key]
            lResults.append('%s: %s' % (key, "%.2f%%" % iPDM.getCoverageOnSubject()))
        return ", ".join(lResults)

    def formatStructFeatures(self, dEvidence, lResults):
        """
        Append one formatted string per structural feature found in dEvidence
        to lResults (modified in place) and return it. Sets the struct-part
        flag when the resulting list is not empty.
        """
        if 'length' in dEvidence and dEvidence['length'] is not None:
            lResults.append('TElength: %s' % dEvidence['length'])

        if 'TR' in dEvidence and dEvidence['TR'] is not None:
            lResults.append('TermRepeats: %s' % ", ".join(map(str, dEvidence['TR'])))

        if 'ORF' in dEvidence and dEvidence['ORF'] is not None:
            lResults.append('ORF: %s' % ", ".join(dEvidence['ORF']))

        if 'SSR' in dEvidence and dEvidence['SSR'] is not None:
            lResults.append('SSR: %s' % ", ".join(dEvidence['SSR']))

        if 'SSRCoverage' in dEvidence and dEvidence['SSRCoverage'] is not None:
            lResults.append('SSRCoverage=%s' % dEvidence['SSRCoverage'])

        # Presence of the key is enough; its value is not inspected.
        if 'polyAtail' in dEvidence:
            lResults.append('polyAtail')

        if 'helitronExtremities' in dEvidence and dEvidence['helitronExtremities'] is not None:
            lResults.append('helitronExtremities: %s' % ", ".join(map(str, dEvidence['helitronExtremities'])))

        if len(lResults) != 0:
            self._hasStructPart = True
        return lResults

    # ------------------------------------------------------------------
    # Classification code
    # ------------------------------------------------------------------

    def _decisionRuleForWickerCode(self, teClass, order):
        """
        Return the three-letter code for a (class, order) pair.

        The order is looked up first, then the class. "NA" is returned when
        nothing matches, "XXX" when several orders are given ('|'-separated)
        and the class is "Unclassified" or the listed classes differ. When all
        listed classes are identical, the code of that class is returned
        (KeyError if that class is not in DWICKERCODE).
        """
        if order in DWICKERCODE:
            return DWICKERCODE[order]
        if teClass in DWICKERCODE:
            return DWICKERCODE[teClass]
        if order == "Unclassified" and teClass == "Unclassified":
            return "NA"
        if "|" in order and teClass == "Unclassified":
            return "XXX"
        if "|" in order and "|" in teClass:
            lClass = teClass.split("|")
            for iC in lClass[1:]:
                if lClass[0] != iC:
                    return "XXX"
            return DWICKERCODE[lClass[0]]
        return "NA"

    def renameLARDTRIMAndMITE(self):
        """
        Rename MITE, LARD and TRIM to TIR-MITE, LTR-LARD and LTR-TRIM, both in
        the consensus order and in the keys of the evidence dictionary.
        """
        order = self.getConsensusOrder()
        # The negative look-behind leaves already-prefixed names untouched, so
        # calling this method twice does not yield e.g. "TIR-TIR-MITE".
        order = re.sub(r"(?<!TIR-)MITE", "TIR-MITE", order)
        order = re.sub(r"(?<!LTR-)LARD", "LTR-LARD", order)
        order = re.sub(r"(?<!LTR-)TRIM", "LTR-TRIM", order)
        self.setConsensusOrder(order)

        dEvidence = self.getInfoEvidence()
        for oldKey, newKey in (("LARD", "LTR-LARD"), ("TRIM", "LTR-TRIM"), ("MITE", "TIR-MITE")):
            if oldKey in dEvidence:
                dEvidence[newKey] = dEvidence.pop(oldKey)
        self.setInfoEvidence(dEvidence)