Mercurial > repos > urgi-team > teiso
comparison TEisotools-1.0/commons/core/utils/Classif.py @ 6:20ec0d14798e draft
Uploaded
| author | urgi-team |
|---|---|
| date | Wed, 20 Jul 2016 05:00:24 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 5:4093a2fb58be | 6:20ec0d14798e |
|---|---|
| 1 import re | |
| 2 import os | |
| 3 from collections import OrderedDict | |
| 4 | |
# Three-letter code attached to each TE class / order name.
# Keys are the class or order labels looked up by
# Classif._decisionRuleForWickerCode; values are the codes it returns.
DWICKERCODE = {
    # Class-level fallbacks (used when the order itself is not listed).
    "ClassI": "RXX",
    "ClassII": "DXX",

    # Orders whose code starts with "R".
    "LTR": "RLX",
    "LTR-LARD": "RLX",
    "LTR-TRIM": "RLX",
    "DIRS": "RYX",
    "PLE": "RPX",
    "LINE": "RIX",
    "SINE": "RSX",

    # Orders whose code starts with "D".
    "TIR": "DTX",
    "TIR-MITE": "DTX",
    "Crypton": "DYX",
    "Helitron": "DHX",
    "Maverick": "DMX",
}
| 22 | |
class Classif(object):
    """One line of a classif file: the classification of a single consensus.

    The instance stores the consensus name, its classification code, the
    confusedness / completeness tags, the project name, a few descriptive
    fields (length, strand, class, order, super-family, CI) and an
    ``evidence`` dictionary from which the ``coding=``, ``struct=`` and
    ``other=`` text fields are rendered.

    The code runs unchanged on Python 2.7 and Python 3.
    """

    def __init__(self, consensusName = "", code = "NA", outConfuseness = "",
                 outCompleteness = "", projectName = "", isShorten = False,
                 consensusLength = "NA", consensusStrand = "NA",
                 consensusClass = "NA", consensusOrder = "NA",
                 consensusSuperFam = "NA", consensusCI = "NA"):
        """Store the given fields; derived text fields start empty."""
        self._consensusName = consensusName
        self._confusness = outConfuseness
        self._completeness = outCompleteness
        self._projectName = projectName
        self._isShorten = isShorten
        self._consensusLength = consensusLength
        self._consensusStrand = consensusStrand
        self._consensusClass = consensusClass
        self._consensusOrder = consensusOrder
        self._consensusSuperFam = consensusSuperFam
        self._consensusCI = consensusCI
        self._consensusCoding = ""
        self._consensusStruct = ""
        self._consensusOther = ""
        self._isNoChim = ""
        # Set to True by the format*/write* methods as soon as they render
        # at least one feature.
        self._hasCodingPart = False
        self._hasStructPart = False
        self._hasOtherPart = False
        self._code = code
        self._evidence = {}

    def __eq__(self, o):
        """Two instances of the same type are equal when name, code,
        confusedness and completeness all match."""
        if type(o) is type(self):
            return self._consensusName == o._consensusName and self._code == o._code \
                and self._confusness == o._confusness and self._completeness == o._completeness
        return False

    def __ne__(self, o):
        return not self.__eq__(o)

    # ------------------------------------------------------------------
    # Getters
    # ------------------------------------------------------------------

    def getConsensusName(self):
        return self._consensusName

    def getCode(self):
        return self._code

    def getconfusness(self):
        return self._confusness

    def getcompleteness(self):
        return self._completeness

    def getprojectName(self):
        return self._projectName

    def getConsensusLength(self):
        return self._consensusLength

    def getConsensusStrand(self):
        return self._consensusStrand

    def getConsensusClass(self):
        return self._consensusClass

    def getConsensusOrder(self):
        return self._consensusOrder

    def getConsensusSuperFamily(self):
        return self._consensusSuperFam

    def getConsensusCI(self):
        """Return the CI value converted to ``str``."""
        return str(self._consensusCI)

    def getInfoEvidence(self):
        return self._evidence

    def _writeFeaturesPerOrder(self, writeLine, skipOtherOnly):
        """Render the evidence with ``writeLine``, once per order if needed.

        When the confusedness tag is ``'ok'`` the evidence dictionary is
        rendered directly.  Otherwise the evidence is expected to be keyed
        by order name: each order listed in the "|"-separated consensus
        order is rendered and the pieces are joined with "|".

        ``skipOtherOnly`` drops, among the orders after the first one, those
        whose evidence holds nothing but an ``'other'`` entry.
        """
        if self._confusness == 'ok':
            return writeLine(self._evidence)
        lOrder = self.getConsensusOrder().split("|")
        lParts = [writeLine(self._evidence[lOrder[0]])]
        for order in lOrder[1:]:
            dOrderEvidence = self._evidence[order]
            # list(...) keeps the comparison meaningful on Python 3, where
            # dict.keys() is a view that never equals a list.
            if skipOtherOnly and list(dOrderEvidence.keys()) == ['other']:
                continue
            lParts.append(writeLine(dOrderEvidence))
        return "|".join(lParts)

    def getConsensusCoding(self):
        """Return the ``coding=...`` field built from the stored evidence."""
        return "coding=" + self._writeFeaturesPerOrder(self.writeCodingFeaturesLine, True)

    def getConsensusStructure(self):
        """Return the ``struct=...`` field built from the stored evidence."""
        return "struct=" + self._writeFeaturesPerOrder(self.writeStructFeaturesLine, True)

    def getConsensusOther(self):
        """Return the ``other=...`` field built from the stored evidence.

        Unlike coding/struct, every listed order contributes a piece.
        """
        return "other=" + self._writeFeaturesPerOrder(self.writeOtherFeaturesLine, False)

    # ------------------------------------------------------------------
    # Setters
    # ------------------------------------------------------------------

    def setConsensusName(self, consensusName):
        self._consensusName = consensusName

    def setInfoEvidence(self, evidence):
        self._evidence = evidence

    def setCode(self):
        """Recompute the code from the current class and order."""
        self._code = self._decisionRuleForWickerCode(self.getConsensusClass(), self.getConsensusOrder())

    def setConfusness(self, Confusness):
        self._confusness = Confusness

    def setCompleteness(self, completeness):
        self._completeness = completeness

    def setProjectName(self, projectName):
        self._projectName = projectName

    def setConsensusLength(self, cLength):
        self._consensusLength = cLength

    def setConsensusStrand(self, cStrand):
        self._consensusStrand = cStrand

    def setConsensusClass(self, cClass):
        self._consensusClass = cClass

    def setConsensusOrder(self, cOrder):
        self._consensusOrder = cOrder

    def setConsensusSuperFamily(self, cSuperFamily):
        # Must write the attribute that __init__ creates and that
        # getConsensusSuperFamily() reads (_consensusSuperFam).
        self._consensusSuperFam = cSuperFamily

    def setConsensusCI(self, CI):
        self._consensusCI = CI

    def setConsensusCoding(self, coding):
        self._consensusCoding = coding

    def setConsensusStructure(self, structure):
        self._consensusStruct = structure

    def setConsensusOther(self, other):
        self._consensusOther = other

    def setCodStrOthFromMessage(self, dico):
        """Fill the stored coding/struct/other fields from one evidence dict."""
        self._consensusCoding = "coding=" + self.writeCodingFeaturesLine(dico)
        self._consensusStruct = "struct=" + self.writeStructFeaturesLine(dico)
        self._consensusOther = "other=" + self.writeOtherFeaturesLine(dico)

    def setCodStrOthFromMessage2(self, dico, cOrder):
        """Fill the stored coding/struct/other fields, one piece per order.

        ``cOrder`` is a "|"-separated list of order names ('rDNA' is looked
        up as 'RDNA').  If its first order is not a key of ``dico``, ``dico``
        is treated as a flat evidence dictionary.  Otherwise ``dico`` is
        treated as keyed by order and its entries are rendered in the
        dictionary's own key order, joined with "|"; an entry holding only
        ``'other'`` contributes to the ``other=`` field alone.
        """
        if 'rDNA' in cOrder:
            cOrder = cOrder.replace('rDNA', 'RDNA')
        lOrder = cOrder.split("|")
        # Materialise the keys: dict views cannot be indexed on Python 3.
        lDicoKeys = list(dico.keys())
        if lOrder[0] not in lDicoKeys:
            self.setCodStrOthFromMessage(dico)
            return

        self.setCodStrOthFromMessage(dico[lDicoKeys[0]])
        for order in lDicoKeys[1:]:
            dOrderEvidence = dico[order]
            if list(dOrderEvidence.keys()) != ['other']:
                self._consensusCoding = self._consensusCoding + "|" + self.writeCodingFeaturesLine(dOrderEvidence)
                self._consensusStruct = self._consensusStruct + "|" + self.writeStructFeaturesLine(dOrderEvidence)
            self._consensusOther = self._consensusOther + "|" + self.writeOtherFeaturesLine(dOrderEvidence)

    # ------------------------------------------------------------------
    # Consensus naming
    # ------------------------------------------------------------------

    def createNewConsensusName(self):
        """Return ``<code>[-<completeness>][-<confusness>]_<name>``.

        When shortening is enabled and the consensus name looks like
        ``<project>_<x>_<y>_<rest>`` without containing ``<project>_RS_``,
        ``<name>`` is the shortened form; otherwise it is the stored name.
        """
        pastecClassif = "%s" % self._code
        if self._completeness != "":
            pastecClassif += "-%s" % self._completeness
        if self._confusness != "":
            pastecClassif += "-%s" % self._confusness

        name = self._consensusName
        if self._isShorten:
            # The project name is matched literally, as in
            # shortenConsensusName(), hence re.escape.
            pattern = "%s_[a-zA-Z0-9]+_[a-zA-Z0-9]+_[a-zA-Z0-9_]+" % re.escape(self._projectName)
            isRS = ("%s_RS_" % self._projectName) in self._consensusName
            if re.match(pattern, self._consensusName) and not isRS:
                name = self.shortenConsensusName()
        return "%s_%s" % (pastecClassif, name)

    def shortenConsensusName(self):
        """Return the compact form of the consensus name.

        The text following the project name is split on "_" into
        ``<palign>_<clust>_<id>_<malign...>`` and rendered as
        ``<project>-<P><C><id>-<malign>``, where ``P`` and ``C`` are the
        first letters of the first two fields.

        Raises:
            IndexError: if the name does not contain the project name
                followed by at least three "_"-separated fields.
        """
        desc = self._consensusName.split(self._projectName)[1]
        lFields = desc.split("_")
        palignMeth = lFields[1]
        clustMeth = lFields[2]
        clustID = lFields[3]
        lmalignMeth = lFields[4:]
        if len(lmalignMeth) > 2:
            malignMeth = "%s%s_%s" % (lmalignMeth[0], lmalignMeth[1], lmalignMeth[2])
        else:
            malignMeth = "".join(lmalignMeth)
        return "%s-%s-%s%s-%s" % (self._projectName, palignMeth[0], clustMeth[0], clustID, malignMeth)

    def renameHeaderInConsensusFastaFile(self, fileName = ""):
        """Rewrite a FASTA file with every header shortened.

        Each line containing ">" is stored as the consensus name (this
        overwrites the instance's current name) and replaced by
        ">" + shortenConsensusName(); other lines are copied as-is.  The
        result is written to ``<base>New.fa`` and then moved to
        ``<base>.fa``, where ``<base>`` is ``fileName`` without its
        extension.
        """
        # splitext only strips the final extension, so dots in directory
        # names or in the file stem no longer truncate the path.
        baseName = os.path.splitext(fileName)[0]
        newFileName = baseName + "New.fa"
        oldFileName = baseName + "Old.fa"
        targetFileName = baseName + ".fa"

        with open(fileName, "r") as oldFile:
            with open(newFileName, "w") as newFile:
                for inputLine in oldFile:
                    if ">" in inputLine:
                        # The trailing newline travels through the shortened
                        # name, so the header line stays terminated.
                        self.setConsensusName(inputLine)
                        newFile.write(">%s" % self.shortenConsensusName())
                    else:
                        newFile.write(inputLine)

        # Same three steps as the former "mv / mv / rm -f" shell commands,
        # without going through a shell.
        if os.path.exists(targetFileName):
            os.rename(targetFileName, oldFileName)
        os.rename(newFileName, targetFileName)
        if os.path.exists(oldFileName):
            os.remove(oldFileName)

    # ------------------------------------------------------------------
    # Evidence rendering
    # ------------------------------------------------------------------

    def writeOtherFeaturesLine(self, dEvidence):
        """Render ``dEvidence['other']`` as ``(a; b; ...)`` or ``(NA)``.

        Both coding and structural features found under ``'other'`` are
        listed, coding first.
        """
        if 'other' not in dEvidence:
            return "(NA)"
        dOtherResults = dEvidence['other']
        lResultsWithCoding = self.formatCodingFeatures(dOtherResults, [])
        lResultsFilled = self.formatStructFeatures(dOtherResults, lResultsWithCoding)
        if len(lResultsFilled) == 0:
            return "(NA)"
        self._hasOtherPart = True
        return '(%s)' % "; ".join(lResultsFilled)

    def writeCodingFeaturesLine(self, dEvidence):
        """Render the coding features of ``dEvidence`` as ``(a; b)`` or ``(NA)``."""
        lResultsFilled = self.formatCodingFeatures(dEvidence, [])
        if len(lResultsFilled) == 0:
            return "(NA)"
        return '(%s)' % "; ".join(lResultsFilled)

    def writeStructFeaturesLine(self, dEvidence):
        """Render the structural features of ``dEvidence`` as ``(a; b)`` or ``(NA)``."""
        lResultsFilled = self.formatStructFeatures(dEvidence, [])
        if len(lResultsFilled) == 0:
            return "(NA)"
        return '(%s)' % "; ".join(lResultsFilled)

    def formatCodingFeatures(self, dEvidence, lResults):
        """Append one text item per coding feature present in ``dEvidence``.

        ``lResults`` is extended in place and returned.  ``_hasCodingPart``
        is set when the list is non-empty on exit.
        """
        if 'Repbase_tbx' in dEvidence and dEvidence['Repbase_tbx'] != []:
            lResults.append("TE_BLRtx: %s" % ", ".join(map(str, dEvidence['Repbase_tbx'])))

        if 'Repbase_bx' in dEvidence and dEvidence['Repbase_bx'] != []:
            lResults.append("TE_BLRx: %s" % ", ".join(map(str, dEvidence['Repbase_bx'])))

        if 'te_hmmer' in dEvidence and dEvidence['te_hmmer'] is not None:
            lResults.append('profiles: %s' % self.formatProfilesResults(dEvidence['te_hmmer']))

        if 'Other_profiles' in dEvidence:
            lResults.append('Other_profiles: %s' % self.formatProfilesResults(dEvidence['Other_profiles']))

        if "rDNA" in dEvidence and dEvidence["rDNA"] is not None:
            lResults.append("rDNA_BLRn: %s" % dEvidence["rDNA"])

        if "HG" in dEvidence and dEvidence["HG"] is not None:
            lResults.append("HG_BLRn: %s" % dEvidence["HG"])

        if len(lResults) != 0:
            self._hasCodingPart = True
        return lResults

    def formatProfilesResults(self, dProfilesResults):
        """Return ``"name: xx.xx%, ..."`` for a dict of profile matches.

        Each value must provide ``getCoverageOnSubject()`` returning a
        number.  An empty dictionary gives an empty string.
        """
        lResults = []
        for key in dProfilesResults.keys():
            iPDM = dProfilesResults[key]
            cov = "%.2f%%" % iPDM.getCoverageOnSubject()
            lResults.append('%s: %s' % (key, cov))
        return ", ".join(lResults)

    def formatStructFeatures(self, dEvidence, lResults):
        """Append one text item per structural feature present in ``dEvidence``.

        ``lResults`` is extended in place and returned.  ``_hasStructPart``
        is set when the list is non-empty on exit, including when it was
        already non-empty on entry.
        """
        if 'length' in dEvidence and dEvidence['length'] is not None:
            lResults.append('TElength: %s' % dEvidence['length'])

        if 'TR' in dEvidence and dEvidence['TR'] is not None:
            lResults.append('TermRepeats: %s' % ", ".join(map(str, dEvidence['TR'])))

        if 'ORF' in dEvidence and dEvidence['ORF'] is not None:
            lResults.append('ORF: %s' % ", ".join(dEvidence['ORF']))

        if 'SSR' in dEvidence and dEvidence['SSR'] is not None:
            lResults.append('SSR: %s' % ", ".join(dEvidence['SSR']))

        if 'SSRCoverage' in dEvidence and dEvidence['SSRCoverage'] is not None:
            lResults.append('SSRCoverage=%s' % dEvidence['SSRCoverage'])

        # Presence of the key is enough; its value is not looked at.
        if 'polyAtail' in dEvidence:
            lResults.append('polyAtail')

        if 'helitronExtremities' in dEvidence and dEvidence['helitronExtremities'] is not None:
            lResults.append('helitronExtremities: %s' % ", ".join(map(str, dEvidence['helitronExtremities'])))

        if len(lResults) != 0:
            self._hasStructPart = True
        return lResults

    # ------------------------------------------------------------------
    # Code decision and order renaming
    # ------------------------------------------------------------------

    def _decisionRuleForWickerCode(self, teClass, order):
        """Return the three-letter code for a class / order pair.

        Rules, first match wins:
          1. ``order`` is a key of DWICKERCODE -> its code;
          2. ``teClass`` is a key of DWICKERCODE -> its code;
          3. several orders ("|" in ``order``) and class "Unclassified"
             -> "XXX";
          4. several orders and several classes: "XXX" if the classes
             differ, otherwise the code of that single repeated class;
          5. anything else -> "NA".

        Raises:
            KeyError: in rule 4, when the repeated class is not a key of
                DWICKERCODE.
        """
        if order in DWICKERCODE:
            return DWICKERCODE[order]
        if teClass in DWICKERCODE:
            return DWICKERCODE[teClass]
        if order == "Unclassified" and teClass == "Unclassified":
            return "NA"

        hasSeveralOrders = "|" in order
        if hasSeveralOrders and teClass == "Unclassified":
            return "XXX"
        if hasSeveralOrders and "|" in teClass:
            lClass = teClass.split("|")
            for iC in lClass[1:]:
                if iC != lClass[0]:
                    return "XXX"
            return DWICKERCODE[lClass[0]]
        return "NA"

    def renameLARDTRIMAndMITE(self):
        """Prefix LARD/TRIM/MITE with their parent order name.

        In the consensus order string, "MITE", "LARD" and "TRIM" become
        "TIR-MITE", "LTR-LARD" and "LTR-TRIM"; the matching top-level keys
        of the evidence dictionary are renamed the same way.  The
        replacement is a plain substring substitution, so calling the
        method twice prefixes the names twice.
        """
        order = self.getConsensusOrder()
        order = order.replace("MITE", "TIR-MITE")
        order = order.replace("LARD", "LTR-LARD")
        order = order.replace("TRIM", "LTR-TRIM")
        self.setConsensusOrder(order)

        dEvidence = self.getInfoEvidence()
        for oldKey, newKey in (("LARD", "LTR-LARD"), ("TRIM", "LTR-TRIM"), ("MITE", "TIR-MITE")):
            if oldKey in dEvidence:
                dEvidence[newKey] = dEvidence.pop(oldKey)
        self.setInfoEvidence(dEvidence)
| 382 | |
| 383 | |
| 384 | |
| 385 |
