comparison TEisotools-1.1.a/commons/core/utils/ClassifUtils.py @ 16:836ce3d9d47a draft default tip

Uploaded
author urgi-team
date Thu, 21 Jul 2016 07:42:47 -0400
parents 255c852351c5
children
comparison
equal deleted inserted replaced
15:255c852351c5 16:836ce3d9d47a
1 # Copyright INRA (Institut National de la Recherche Agronomique)
2 # http://www.inra.fr
3 # http://urgi.versailles.inra.fr
4 #
5 # This software is governed by the CeCILL license under French law and
6 # abiding by the rules of distribution of free software. You can use,
7 # modify and/ or redistribute the software under the terms of the CeCILL
8 # license as circulated by CEA, CNRS and INRIA at the following URL
9 # "http://www.cecill.info".
10 #
11 # As a counterpart to the access to the source code and rights to copy,
12 # modify and redistribute granted by the license, users are provided only
13 # with a limited warranty and the software's author, the holder of the
14 # economic rights, and the successive licensors have only limited
15 # liability.
16 #
17 # In this respect, the user's attention is drawn to the risks associated
18 # with loading, using, modifying and/or developing or reproducing the
19 # software by the user in light of its specific status of free software,
20 # that may mean that it is complicated to manipulate, and that also
21 # therefore means that it is reserved for developers and experienced
22 # professionals having in-depth computer knowledge. Users are therefore
23 # encouraged to load and test the software's suitability as regards their
24 # requirements in conditions enabling the security of their systems and/or
25 # data to be ensured and, more generally, to use and operate it in the
26 # same conditions as regards security.
27 #
28 # The fact that you are presently reading this means that you have had
29 # knowledge of the CeCILL license and that you accept its terms.
30
31 import os
32 import json
33 from collections import OrderedDict
34 from commons.tools.RenameHeaderClassif import RenameHeaderClassif
35
## Helpers turning the lines of a classif file into nested dictionaries and
#  exporting the result as JSON.
#
#  Only constructs valid on both Python 2.7 and Python 3 are used
#  ("in" instead of dict.has_key(), print() called with a single argument).
#
class ClassifUtils(object):

    ## Split a "<name>: <coverage>%" hit into its name and its coverage
    #
    # @param refNameAndCoverage string such as "someRef: 15.3%"
    # @return tuple (name, coverage as float)
    # @note raises ValueError when ": " is missing or the coverage is not a number
    #
    @staticmethod
    def _splitNameAndCoverage(refNameAndCoverage):
        # Only the last ": " is a separator: the name itself may contain ": ".
        refName, coverage = refNameAndCoverage.rsplit(": ", 1)
        return refName, float(coverage.replace("%", ""))

    ## Return the text enclosed by "<fieldName>=(" and its matching ")"
    #
    # @param allFields string holding the "name=(...)" groups
    # @param fieldName name of the group to extract ("coding", "struct" or "other")
    # @return the enclosed text, or "" when the group is absent or its parentheses are unbalanced
    #
    @staticmethod
    def _extractParenthesizedField(allFields, fieldName):
        opening = "%s=(" % fieldName
        start = allFields.find(opening)
        if start == -1:
            return ""
        start += len(opening)

        # Nested parentheses are legal inside a group, hence the depth counter.
        depth = 1
        for offset, char in enumerate(allFields[start:]):
            if char == "(":
                depth += 1
            elif char == ")":
                depth -= 1
                if depth == 0:
                    return allFields[start:start + offset]
        return ""

    ## Parse profile hits formatted as "<name>: <cov>%(<covOnSubject>%)"
    #
    # @param lProfilesResults list of strings, one per profile hit
    # @return OrderedDict mapping each name to an OrderedDict holding the floats "cov" and "covOnSubject"
    #
    @staticmethod
    def _formatProfilesResultsAsDict(lProfilesResults):
        dResults = OrderedDict()
        for refNameAndCoverage in lProfilesResults:
            # rsplit keeps this parser consistent with the other "<name>: <value>" parsers.
            refName, coverage = refNameAndCoverage.rsplit(": ", 1)

            lCoverage = coverage.split("%(")
            coverageOnSubject = float(lCoverage[1].replace("%)", ""))
            coverage = float(lCoverage[0])

            profilesResult = OrderedDict()
            profilesResult["cov"] = coverage
            profilesResult["covOnSubject"] = coverageOnSubject
            dResults[refName] = profilesResult
        return dResults

    ## Parse the content of a "coding=(...)" group and store it in dCoding
    #
    # @param lineOfEvidence string of "; "-separated evidences, each one being "<type>: <data>"
    # @param dCoding dict updated in place
    #
    @staticmethod
    def _formatCodingFeaturesAsDict(lineOfEvidence, dCoding):
        for codingEvidence in lineOfEvidence.split("; "):
            # The evidence type is everything before the first ": ".
            codingType, _, codingData = codingEvidence.partition(": ")
            lCodingData = codingData.split(", ")

            if codingType in ("TE_BLRtx", "TE_BLRx"):
                # Hits of repeated evidences are accumulated in the same dict.
                if codingType not in dCoding:
                    dCoding[codingType] = OrderedDict()
                for refNameAndCoverage in lCodingData:
                    refName, coverage = ClassifUtils._splitNameAndCoverage(refNameAndCoverage)
                    blrResult = OrderedDict()
                    blrResult["cov"] = coverage
                    dCoding[codingType][refName] = blrResult

            elif codingType in ("profiles", "Other_profiles"):
                dCoding[codingType] = ClassifUtils._formatProfilesResultsAsDict(lCodingData)

            elif codingType == "rDNA_BLRn":
                dCoding["rDNA_BLRn"] = OrderedDict()
                try:
                    # The whole data is a single hit, even when it contains ", ".
                    refName, coverage = ClassifUtils._splitNameAndCoverage(codingData)
                except ValueError:
                    # No usable coverage: keep the raw text as name, -1.0 as coverage.
                    refName = codingData
                    coverage = -1.0
                dCoding["rDNA_BLRn"]["name"] = refName
                dCoding["rDNA_BLRn"]["cov"] = coverage

            elif codingType == "HG_BLRn":
                dCoding["HG_BLRn"] = OrderedDict()
                # Only the first hit is kept.
                refName, coverage = ClassifUtils._splitNameAndCoverage(lCodingData[0])
                dCoding["HG_BLRn"]["name"] = refName
                dCoding["HG_BLRn"]["cov"] = coverage

    ## Parse the content of a "struct=(...)" group and store it in dStruct
    #
    # @param lineOfEvidence string of "; "-separated evidences
    # @param dStruct dict updated in place
    #
    @staticmethod
    def _formatStructFeaturesAsDict(lineOfEvidence, dStruct):
        for structEvidence in lineOfEvidence.split("; "):
            structType, _, structData = structEvidence.partition(": ")
            lStructData = structData.split(", ")

            if structType == "TElength":
                dStruct["TElength"] = lStructData[-1]

            elif structType == "TermRepeats":
                dStruct["TermRepeats"] = OrderedDict()
                for refNameAndLength in lStructData:
                    refName, length = refNameAndLength.rsplit(": ", 1)
                    dStruct["TermRepeats"][refName] = int(length)

            elif structType in ("ORF", "SSR", "SSRtrf"):
                # Only the first occurrence of these evidences is kept.
                if structType not in dStruct:
                    dStruct[structType] = lStructData

            elif "SSRCoverage" in structType:
                # Written "SSRCoverage=<value>": there is no ": ", so the value is part of the type.
                _, cov = structType.split("=")
                dStruct["SSRCoverage"] = float(cov)

            elif structType == "polyAtail":
                dStruct["polyAtail"] = True

            elif structType == "helitronExtremities":
                # Items look like "<name>: (<eValue>, <start>, <end>)"; they hold ", "
                # themselves, so they are split on "), " instead.
                dStruct["helitronExtremities"] = OrderedDict()
                for helitronData in structData.split("), "):
                    helName, helData = helitronData.split(": (")
                    eValue, start, end = helData.replace(")", "").split(", ")

                    helitronExtResult = OrderedDict()
                    helitronExtResult["start"] = int(start)
                    helitronExtResult["end"] = int(end)
                    helitronExtResult["eValue"] = float(eValue)
                    dStruct["helitronExtremities"][helName] = helitronExtResult

    ## Parse the content of an "other=(...)" group and store it in dOther
    #
    # The group may hold coding as well as structural evidences, so both parsers are applied.
    #
    # @param lineOfEvidence string of "; "-separated evidences
    # @param dOther dict updated in place
    #
    @staticmethod
    def _formatOtherFeaturesAsDict(lineOfEvidence, dOther):
        if lineOfEvidence != "":
            ClassifUtils._formatCodingFeaturesAsDict(lineOfEvidence, dOther)
            ClassifUtils._formatStructFeaturesAsDict(lineOfEvidence, dOther)

    ## Convert one line of a classif file into a dictionary
    #
    # @param line string made of 8 tab-separated columns (a trailing end of line is accepted)
    # @return OrderedDict with the keys "name", "wCode", "length", "strand", "chimeric", "class",
    #         "order", "complete", "CI", "coding", "struct" and "other"; an empty OrderedDict
    #         (after printing a warning) when the line does not have 8 columns
    #
    @staticmethod
    def getClassifLineAsDict(line):
        dClassif = OrderedDict()
        # Drop the end of line first, otherwise it stays glued to the last column
        # (e.g. "CI=NA\n" would not be recognised as "NA").
        lClassifItem = line.rstrip("\r\n").split("\t")
        if len(lClassifItem) != 8:
            msg = "Can't parse line: \"%s\"\n" % line.strip()
            print("WARNING - ClassifUtils - %s" % msg)
            return dClassif

        teClass = lClassifItem[4]
        teOrder = lClassifItem[5]
        # TODO: recompute wicker code like this or force the user to provide a classif file as input with the wicker code already added
        # The helper is only instantiated once the line is known to be parsable.
        wCode = RenameHeaderClassif()._decisionRuleForWickerCode(teClass, teOrder)

        dClassif["name"] = lClassifItem[0]
        dClassif["wCode"] = wCode
        dClassif["length"] = int(lClassifItem[1])
        dClassif["strand"] = lClassifItem[2]
        dClassif["chimeric"] = lClassifItem[3] != "ok"

        dClassif["class"] = teClass
        dClassif["order"] = teOrder

        # Anything other than "complete" / "incomplete" is reported as None.
        dClassif["complete"] = {"complete": True, "incomplete": False}.get(lClassifItem[6])

        lEvidences = lClassifItem[7].split("; ")

        # The first item is "CI=<int>" or "CI=NA"; a value that cannot be converted is kept as text.
        CI = lEvidences.pop(0).split("=")[-1]
        if CI != "NA":
            try:
                CI = int(CI)
            except ValueError as e:
                print("Couldn't convert %s to int : %s" % (CI, e))
        dClassif["CI"] = CI

        lFormatters = [("coding", ClassifUtils._formatCodingFeaturesAsDict),
                       ("struct", ClassifUtils._formatStructFeaturesAsDict),
                       ("other", ClassifUtils._formatOtherFeaturesAsDict)]
        for fieldName, _ in lFormatters:
            dClassif[fieldName] = OrderedDict()

        allFields = "; ".join(lEvidences)
        for fieldName, formatter in lFormatters:
            # A group whose parentheses are unbalanced is ignored, whatever its name.
            field = ClassifUtils._extractParenthesizedField(allFields, fieldName)
            if field != "":
                formatter(field, dClassif[fieldName])

        return dClassif

    ## Retrieve the classification informations of a classif file
    #
    # @param fileName Name of the classif file (its extension must be ".classif")
    # @return An OrderedDict mapping the first column of each line to the dict built by getClassifLineAsDict
    # @note prints an error and raises SystemExit(1) when the extension is not ".classif"
    #
    @staticmethod
    def getClassifInfosAsDict(fileName):
        if os.path.splitext(fileName)[1] != ".classif":
            msg = "Input file must be a classif file from TEdenovo\n"
            print("ERROR - ClassifUtils - %s" % msg)
            # Same effect as exit(1), without depending on the builtin injected by the site module.
            raise SystemExit(1)

        dConsensusInfo = OrderedDict()
        with open(fileName, "r") as classifFile:
            for line in classifFile:
                # A blank line is not a record: do not register it under a bogus name.
                if not line.strip():
                    continue
                seqName = line.split("\t")[0]
                dConsensusInfo[seqName] = ClassifUtils.getClassifLineAsDict(line)

        return dConsensusInfo

    ## Convert a classif file to JSON format
    #
    # @param fileName Name of the classif file
    # @param outFileName Name of the output JSON file (optional, default: "<base name of fileName>_classif.json" in the current directory)
    #
    @staticmethod
    def convertClassifToJson(fileName, outFileName = ""):
        dConsensusInfo = ClassifUtils.getClassifInfosAsDict(fileName)
        if outFileName == "":
            outFileName = "%s_classif.json" % (os.path.basename(fileName).rsplit(".", 1)[0])
        with open(outFileName, 'w') as outFile:
            json.dump(dConsensusInfo, outFile)