comparison TEisotools-1.1.a/commons/core/utils/Classif.py @ 13:feef9a0db09d draft

Uploaded
author urgi-team
date Wed, 20 Jul 2016 09:04:42 -0400
parents
children
comparison
equal deleted inserted replaced
12:22b0494ec883 13:feef9a0db09d
1 import re
2 import os
3 from collections import OrderedDict
4
# Lookup table from a TE class or order name to its three-letter Wicker code.
# Classif._decisionRuleForWickerCode tries the order first, then the class.
DWICKERCODE = {
    # Classes
    "ClassI": "RXX",
    "ClassII": "DXX",
    # Orders
    "LTR": "RLX",
    "DIRS": "RYX",
    "PLE": "RPX",
    "LINE": "RIX",
    "SINE": "RSX",
    "TIR": "DTX",
    "Crypton": "DYX",
    "Helitron": "DHX",
    "Maverick": "DMX",
    # Prefixed names produced by Classif.renameLARDTRIMAndMITE; each one
    # carries the same code as the order named in its prefix.
    "TIR-MITE": "DTX",
    "LTR-LARD": "RLX",
    "LTR-TRIM": "RLX",
}
22
class Classif(object):
    """One record (one line) of a classif file, describing a consensus.

    The object stores the consensus name, its Wicker code, the confusedness
    and completeness labels, the class/order/super-family assignment and a
    dictionary of evidence.  It can render that evidence as the
    ``coding=(...)``, ``struct=(...)`` and ``other=(...)`` text fields.

    The evidence dictionary has one of two shapes, selected by the
    confusedness label:

    * confusedness == 'ok': a flat dictionary of features;
    * anything else: a dictionary keyed by order name, each value being a
      flat dictionary of features.  The orders are the "|"-separated parts
      of the consensus order.
    """

    # (short name, prefixed name, pattern matching only the un-prefixed
    # short name).  The negative look-behind keeps the rename idempotent.
    _SUBORDER_RENAMES = (
        ("LARD", "LTR-LARD", r"(?<!LTR-)LARD"),
        ("TRIM", "LTR-TRIM", r"(?<!LTR-)TRIM"),
        ("MITE", "TIR-MITE", r"(?<!TIR-)MITE"),
    )

    def __init__(self, consensusName = "", code = "NA", outConfuseness = "", outCompleteness = "", projectName = "", isShorten = False, consensusLength = "NA", consensusStrand = "NA", consensusClass = "NA", consensusOrder = "NA", consensusSuperFam = "NA", consensusCI = "NA"):
        """Store the given fields; the evidence dictionary starts empty."""
        self._consensusName = consensusName
        self._confusness = outConfuseness
        self._completeness = outCompleteness
        self._projectName = projectName
        self._isShorten = isShorten
        self._consensusLength = consensusLength
        self._consensusStrand = consensusStrand
        self._consensusClass = consensusClass
        self._consensusOrder = consensusOrder
        self._consensusSuperFam = consensusSuperFam
        self._consensusCI = consensusCI
        self._consensusCoding = ""
        self._consensusStruct = ""
        self._consensusOther = ""
        self._isNoChim = ""
        # Set to True by the format*/write* methods once they emit something.
        self._hasCodingPart = False
        self._hasStructPart = False
        self._hasOtherPart = False
        self._code = code
        self._evidence = {}

    def __eq__(self, o):
        """Equal when name, code, confusedness and completeness all match.

        Objects of a different (even derived) type never compare equal.
        """
        if type(o) is not type(self):
            return False
        return (self._consensusName == o._consensusName
                and self._code == o._code
                and self._confusness == o._confusness
                and self._completeness == o._completeness)

    def __ne__(self, o):
        return not self.__eq__(o)

    # ------------------------------------------------------------------
    # Getters
    # ------------------------------------------------------------------

    def getConsensusName(self):
        return self._consensusName

    def getCode(self):
        return self._code

    def getconfusness(self):
        return self._confusness

    def getcompleteness(self):
        return self._completeness

    def getprojectName(self):
        return self._projectName

    def getConsensusLength(self):
        return self._consensusLength

    def getConsensusStrand(self):
        return self._consensusStrand

    def getConsensusClass(self):
        return self._consensusClass

    def getConsensusOrder(self):
        return self._consensusOrder

    def getConsensusSuperFamily(self):
        return self._consensusSuperFam

    def getConsensusCI(self):
        """Return the confidence index, always converted to a string."""
        return str(self._consensusCI)

    def getInfoEvidence(self):
        """Return the evidence dictionary itself (not a copy)."""
        return self._evidence

    def _joinFeaturesPerOrder(self, writeLine, skipOtherOnly):
        """Render the stored evidence with ``writeLine``.

        When the confusedness is 'ok' the evidence is rendered as a whole.
        Otherwise it is rendered once per order of the consensus order and
        the pieces are joined with "|".  The first order is always rendered;
        with ``skipOtherOnly`` the following orders whose evidence holds
        nothing but an 'other' entry are left out.
        """
        if self._confusness == 'ok':
            return writeLine(self._evidence)
        lOrder = self.getConsensusOrder().split("|")
        lParts = [writeLine(self._evidence[lOrder[0]])]
        for order in lOrder[1:]:
            dOrderEvidence = self._evidence[order]
            # list(...) keeps the comparison meaningful on Python 3, where
            # keys() is a view that never equals a list.
            if skipOtherOnly and list(dOrderEvidence.keys()) == ['other']:
                continue
            lParts.append(writeLine(dOrderEvidence))
        return "|".join(lParts)

    def getConsensusCoding(self):
        """Return the ``coding=...`` field built from the stored evidence."""
        return "coding=" + self._joinFeaturesPerOrder(self.writeCodingFeaturesLine, True)

    def getConsensusStructure(self):
        """Return the ``struct=...`` field built from the stored evidence."""
        return "struct=" + self._joinFeaturesPerOrder(self.writeStructFeaturesLine, True)

    def getConsensusOther(self):
        """Return the ``other=...`` field built from the stored evidence."""
        return "other=" + self._joinFeaturesPerOrder(self.writeOtherFeaturesLine, False)

    # ------------------------------------------------------------------
    # Setters
    # ------------------------------------------------------------------

    def setConsensusName(self, consensusName):
        self._consensusName = consensusName

    def setInfoEvidence(self, evidence):
        self._evidence = evidence

    def setCode(self):
        """Recompute the Wicker code from the current class and order."""
        self._code = self._decisionRuleForWickerCode(self.getConsensusClass(), self.getConsensusOrder())

    def setConfusness(self, Confusness):
        self._confusness = Confusness

    def setCompleteness(self, completeness):
        self._completeness = completeness

    def setProjectName(self, projectName):
        self._projectName = projectName

    def setConsensusLength(self, cLength):
        self._consensusLength = cLength

    def setConsensusStrand(self, cStrand):
        self._consensusStrand = cStrand

    def setConsensusClass(self, cClass):
        self._consensusClass = cClass

    def setConsensusOrder(self, cOrder):
        self._consensusOrder = cOrder

    def setConsensusSuperFamily(self, cSuperFamily):
        # The getter and __init__ use _consensusSuperFam; writing only to
        # _consensusSuperFamily (as before) made this setter a no-op.
        self._consensusSuperFam = cSuperFamily
        # NOTE(review): old attribute name kept in case outside code reads
        # it directly - drop once confirmed unused.
        self._consensusSuperFamily = cSuperFamily

    def setConsensusCI(self, CI):
        self._consensusCI = CI

    def setConsensusCoding(self, coding):
        self._consensusCoding = coding

    def setConsensusStructure(self, structure):
        self._consensusStruct = structure

    def setConsensusOther(self, other):
        self._consensusOther = other

    def setCodStrOthFromMessage(self, dico):
        """Fill the three text fields from a flat feature dictionary."""
        self._consensusCoding = "coding=" + self.writeCodingFeaturesLine(dico)
        self._consensusStruct = "struct=" + self.writeStructFeaturesLine(dico)
        self._consensusOther = "other=" + self.writeOtherFeaturesLine(dico)

    def setCodStrOthFromMessage2(self, dico, cOrder):
        """Fill the three text fields from ``dico``, flat or keyed by order.

        ``dico`` is treated as keyed by order when the first order of
        ``cOrder`` (with 'rDNA' spelled 'RDNA') is one of its keys;
        otherwise it is rendered as a flat feature dictionary.  In the keyed
        case the entries are rendered in the dictionary's own key order and
        joined with "|"; an entry holding nothing but 'other' only
        contributes to the ``other=`` field.
        """
        lOrder = cOrder.replace('rDNA', 'RDNA').split("|")
        # A real list is needed for indexing/slicing on Python 3.
        lDicoKeys = list(dico.keys())
        if lOrder[0] not in lDicoKeys:
            self.setCodStrOthFromMessage(dico)
            return

        dFirst = dico[lDicoKeys[0]]
        lCoding = [self.writeCodingFeaturesLine(dFirst)]
        lStruct = [self.writeStructFeaturesLine(dFirst)]
        lOther = [self.writeOtherFeaturesLine(dFirst)]
        for order in lDicoKeys[1:]:
            dOrderEvidence = dico[order]
            if list(dOrderEvidence.keys()) != ['other']:
                lCoding.append(self.writeCodingFeaturesLine(dOrderEvidence))
                lStruct.append(self.writeStructFeaturesLine(dOrderEvidence))
            lOther.append(self.writeOtherFeaturesLine(dOrderEvidence))
        self._consensusCoding = "coding=" + "|".join(lCoding)
        self._consensusStruct = "struct=" + "|".join(lStruct)
        self._consensusOther = "other=" + "|".join(lOther)

    # ------------------------------------------------------------------
    # Consensus naming
    # ------------------------------------------------------------------

    def createNewConsensusName(self):
        """Return ``<code>[-<completeness>][-<confusedness>]_<name>``.

        The name part is the shortened consensus name when shortening is
        enabled, the current name looks like
        ``<project>_<word>_<word>_<rest>`` and it does not contain
        ``<project>_RS_``; otherwise it is the consensus name unchanged.
        """
        pastecClassif = "%s" % self._code
        if self._completeness != "":
            pastecClassif += "-%s" % self._completeness
        if self._confusness != "":
            pastecClassif += "-%s" % self._confusness

        name = self._consensusName
        if self._isShorten:
            # Escape the project name so regex metacharacters in it are
            # matched literally.
            pattern = "%s_[a-zA-Z0-9]+_[a-zA-Z0-9]+_[a-zA-Z0-9_]+" % re.escape(self._projectName)
            rsTag = "%s_RS_" % self._projectName
            if re.match(pattern, self._consensusName) and rsTag not in self._consensusName:
                name = self.shortenConsensusName()
        return "%s_%s" % (pastecClassif, name)

    def shortenConsensusName(self):
        """Return a compact form of the consensus name.

        The text following the project name is split on "_"; fields 1 and 2
        are reduced to their first letter, field 3 is kept, and the
        remaining fields are concatenated (only the first three of them are
        used, as ``ab_c``, when there are more than two).

        Raises IndexError when the name does not contain the project name
        followed by at least three "_"-separated fields.
        """
        desc = self._consensusName.split(self._projectName)[1]
        lFields = desc.split("_")
        palignMeth = lFields[1]
        clustMeth = lFields[2]
        clustID = lFields[3]
        lmalignMeth = lFields[4:]
        if len(lmalignMeth) > 2:
            malignMeth = "%s%s_%s" % (lmalignMeth[0], lmalignMeth[1], lmalignMeth[2])
        else:
            malignMeth = "".join(lmalignMeth)
        return "%s-%s-%s%s-%s" % (self._projectName, palignMeth[0], clustMeth[0], clustID, malignMeth)

    def renameHeaderInConsensusFastaFile(self, fileName = ""):
        """Rewrite every header of a fasta file with its shortened name.

        The result is written to ``<base>New.fa`` and then moved to
        ``<base>.fa``, replacing that file if present; ``<base>`` is
        ``fileName`` without its extension.  As a side effect the consensus
        name of this object is left set to the last header line read.
        """
        # splitext (rather than split(".")[0]) keeps directories and dots
        # inside the path intact.
        baseName = os.path.splitext(fileName)[0]
        newFileName = baseName + "New.fa"
        finalFileName = baseName + ".fa"

        with open(fileName, "r") as oldFile:
            with open(newFileName, "w") as newFile:
                for inputLine in oldFile:
                    if ">" in inputLine:
                        # Strip the line ending first and add it back
                        # explicitly: shortenConsensusName may drop the last
                        # field, which used to carry the newline.
                        self.setConsensusName(inputLine.rstrip("\r\n"))
                        newFile.write(">%s\n" % self.shortenConsensusName())
                    else:
                        newFile.write(inputLine)

        # Same net effect as the former shell "mv"/"rm" sequence, without
        # passing file names through a shell.
        if os.path.exists(finalFileName):
            os.remove(finalFileName)
        os.rename(newFileName, finalFileName)

    # ------------------------------------------------------------------
    # Evidence formatting
    # ------------------------------------------------------------------

    def _wrapFeatures(self, lResults):
        """Return ``(a; b; ...)`` for a list of items, or ``(NA)`` if empty."""
        if len(lResults) == 0:
            return "(NA)"
        return '(%s)' % "; ".join(lResults)

    def writeOtherFeaturesLine(self, dEvidence):
        """Render the coding then structural features found under 'other'.

        Returns "(NA)" when there is no 'other' entry or it yields nothing.
        """
        if 'other' not in dEvidence:
            return "(NA)"
        dOtherResults = dEvidence['other']
        lResults = self.formatCodingFeatures(dOtherResults, [])
        lResults = self.formatStructFeatures(dOtherResults, lResults)
        if len(lResults) != 0:
            self._hasOtherPart = True
        return self._wrapFeatures(lResults)

    def writeCodingFeaturesLine(self, dEvidence):
        """Render the coding features of ``dEvidence``, or "(NA)"."""
        return self._wrapFeatures(self.formatCodingFeatures(dEvidence, []))

    def writeStructFeaturesLine(self, dEvidence):
        """Render the structural features of ``dEvidence``, or "(NA)"."""
        return self._wrapFeatures(self.formatStructFeatures(dEvidence, []))

    def formatCodingFeatures(self, dEvidence, lResults):
        """Append one text item per coding feature present in ``dEvidence``.

        ``lResults`` is extended in place and returned.  The coding flag of
        this object is raised when the list ends up non-empty.
        """
        if 'Repbase_tbx' in dEvidence and dEvidence['Repbase_tbx'] != []:
            lResults.append("TE_BLRtx: %s" % ", ".join(map(str, dEvidence['Repbase_tbx'])))

        if 'Repbase_bx' in dEvidence and dEvidence['Repbase_bx'] != []:
            lResults.append("TE_BLRx: %s" % ", ".join(map(str, dEvidence['Repbase_bx'])))

        if 'te_hmmer' in dEvidence and dEvidence['te_hmmer'] is not None:
            lResults.append('profiles: %s' % self.formatProfilesResults(dEvidence['te_hmmer']))

        if 'Other_profiles' in dEvidence:
            lResults.append('Other_profiles: %s' % self.formatProfilesResults(dEvidence['Other_profiles']))

        if "rDNA" in dEvidence and dEvidence["rDNA"] is not None:
            lResults.append("rDNA_BLRn: %s" % dEvidence["rDNA"])

        if "HG" in dEvidence and dEvidence["HG"] is not None:
            lResults.append("HG_BLRn: %s" % dEvidence["HG"])

        if len(lResults) != 0:
            self._hasCodingPart = True
        return lResults

    def formatProfilesResults(self, dProfilesResults):
        """Return ``name: coverage%`` items joined by ", ".

        Each value must provide ``getCoverageOnSubject()``; its result is
        formatted with two decimals.  An empty dictionary gives "".
        """
        lResults = []
        for key in dProfilesResults.keys():
            cov = "%.2f%%" % dProfilesResults[key].getCoverageOnSubject()
            lResults.append('%s: %s' % (key, cov))
        return ", ".join(lResults)

    def formatStructFeatures(self, dEvidence, lResults):
        """Append one text item per structural feature in ``dEvidence``.

        ``lResults`` is extended in place and returned.  The structure flag
        of this object is raised when the list ends up non-empty.
        """
        if 'length' in dEvidence and dEvidence['length'] is not None:
            lResults.append('TElength: %s' % dEvidence['length'])

        if 'TR' in dEvidence and dEvidence['TR'] is not None:
            lResults.append('TermRepeats: %s' % ", ".join(map(str, dEvidence['TR'])))

        if 'ORF' in dEvidence and dEvidence['ORF'] is not None:
            lResults.append('ORF: %s' % ", ".join(dEvidence['ORF']))

        if 'SSR' in dEvidence and dEvidence['SSR'] is not None:
            lResults.append('SSR: %s' % ", ".join(dEvidence['SSR']))

        if 'SSRCoverage' in dEvidence and dEvidence['SSRCoverage'] is not None:
            lResults.append('SSRCoverage=%s' % dEvidence['SSRCoverage'])

        # Presence of the key is enough; its value is not looked at.
        if 'polyAtail' in dEvidence:
            lResults.append('polyAtail')

        if 'helitronExtremities' in dEvidence and dEvidence['helitronExtremities'] is not None:
            lResults.append('helitronExtremities: %s' % ", ".join(map(str, dEvidence['helitronExtremities'])))

        if len(lResults) != 0:
            self._hasStructPart = True
        return lResults

    # ------------------------------------------------------------------
    # Wicker code
    # ------------------------------------------------------------------

    def _decisionRuleForWickerCode(self, teClass, order):
        """Return the Wicker code for a class/order pair.

        Rules, in priority order:

        1. the order is a key of DWICKERCODE: its code;
        2. the class is a key of DWICKERCODE: its code;
        3. several orders ("|"-separated) and class "Unclassified": "XXX";
        4. several orders and several classes: "XXX" if the classes differ,
           otherwise the code of that single class;
        5. anything else: "NA".
        """
        if order in DWICKERCODE:
            return DWICKERCODE[order]
        if teClass in DWICKERCODE:
            return DWICKERCODE[teClass]
        if "|" not in order:
            return 'NA'
        if teClass == "Unclassified":
            return "XXX"
        if "|" in teClass:
            lClass = teClass.split("|")
            if any(iC != lClass[0] for iC in lClass[1:]):
                return "XXX"
            # Identical but unknown classes fall back to "NA" instead of
            # raising KeyError.
            return DWICKERCODE.get(lClass[0], 'NA')
        return 'NA'

    def renameLARDTRIMAndMITE(self):
        """Prefix LARD/TRIM/MITE with their order in order and evidence.

        "LARD" and "TRIM" become "LTR-LARD" and "LTR-TRIM", "MITE" becomes
        "TIR-MITE", both inside the consensus order string and as keys of
        the evidence dictionary (which is modified in place).  Names that
        already carry the prefix are left alone, so calling this twice is
        harmless.
        """
        order = self.getConsensusOrder()
        dEvidence = self.getInfoEvidence()
        for shortName, fullName, pattern in self._SUBORDER_RENAMES:
            order = re.sub(pattern, fullName, order)
            if shortName in dEvidence:
                dEvidence[fullName] = dEvidence.pop(shortName)
        self.setConsensusOrder(order)
        self.setInfoEvidence(dEvidence)
382
383
384
385