Mercurial > repos > urgi-team > teiso
comparison TEisotools-1.0/commons/core/utils/ClassifUtils.py @ 6:20ec0d14798e draft
Uploaded
author | urgi-team |
---|---|
date | Wed, 20 Jul 2016 05:00:24 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
5:4093a2fb58be | 6:20ec0d14798e |
---|---|
1 # Copyright INRA (Institut National de la Recherche Agronomique) | |
2 # http://www.inra.fr | |
3 # http://urgi.versailles.inra.fr | |
4 # | |
5 # This software is governed by the CeCILL license under French law and | |
6 # abiding by the rules of distribution of free software. You can use, | |
7 # modify and/ or redistribute the software under the terms of the CeCILL | |
8 # license as circulated by CEA, CNRS and INRIA at the following URL | |
9 # "http://www.cecill.info". | |
10 # | |
11 # As a counterpart to the access to the source code and rights to copy, | |
12 # modify and redistribute granted by the license, users are provided only | |
13 # with a limited warranty and the software's author, the holder of the | |
14 # economic rights, and the successive licensors have only limited | |
15 # liability. | |
16 # | |
17 # In this respect, the user's attention is drawn to the risks associated | |
18 # with loading, using, modifying and/or developing or reproducing the | |
19 # software by the user in light of its specific status of free software, | |
20 # that may mean that it is complicated to manipulate, and that also | |
21 # therefore means that it is reserved for developers and experienced | |
22 # professionals having in-depth computer knowledge. Users are therefore | |
23 # encouraged to load and test the software's suitability as regards their | |
24 # requirements in conditions enabling the security of their systems and/or | |
25 # data to be ensured and, more generally, to use and operate it in the | |
26 # same conditions as regards security. | |
27 # | |
28 # The fact that you are presently reading this means that you have had | |
29 # knowledge of the CeCILL license and that you accept its terms. | |
30 | |
31 import os | |
32 import json | |
33 from collections import OrderedDict | |
34 from commons.tools.RenameHeaderClassif import RenameHeaderClassif | |
35 | |
class ClassifUtils(object):
    # Static helpers converting TEdenovo ".classif" lines into nested
    # OrderedDicts (and then JSON). All methods are static; the class only
    # serves as a namespace.

    ## Convert a list of profile hits into a dict
    #
    # @param lProfilesResults list of strings shaped like "name: cov%(covOnSubject%)"
    # @return OrderedDict mapping each name to {"cov": float, "covOnSubject": float}
    #
    @staticmethod
    def _formatProfilesResultsAsDict(lProfilesResults):
        dResults = OrderedDict()

        for refNameAndCoverage in lProfilesResults:
            # Split on the LAST ": " so that a reference name which itself
            # contains ": " is kept whole (same rule as the BLR* parsers).
            refName, coverage = refNameAndCoverage.rsplit(": ", 1)

            # "12.5%(30.0%)" -> ["12.5", "30.0%)"]
            lCoverages = coverage.split("%(")
            coverageOnSubject = float(lCoverages[1].replace("%)", ""))
            coverage = float(lCoverages[0])

            profilesResult = OrderedDict()
            profilesResult["cov"] = coverage
            profilesResult["covOnSubject"] = coverageOnSubject
            dResults[refName] = profilesResult
        return dResults

    ## Parse the content of a "coding=(...)" evidence group
    #
    # @param lineOfEvidence text found between the parentheses, evidences separated by "; "
    # @param dCoding dict filled in place with one entry per recognised evidence type
    #
    @staticmethod
    def _formatCodingFeaturesAsDict(lineOfEvidence, dCoding):
        for codingEvidence in lineOfEvidence.split("; "):
            # "type: data" -> type, data (data is "" when there is no ": ")
            codingType, _sep, codingData = codingEvidence.partition(": ")
            lCodingData = codingData.split(", ")

            if codingType in ("TE_BLRtx", "TE_BLRx"):
                # Results are accumulated: an already existing entry is kept
                # and completed rather than overwritten.
                if codingType not in dCoding:
                    dCoding[codingType] = OrderedDict()
                for refNameAndCoverage in lCodingData:
                    refName, coverage = refNameAndCoverage.rsplit(": ", 1)
                    blrResult = OrderedDict()
                    blrResult["cov"] = float(coverage.replace("%", ""))
                    dCoding[codingType][refName] = blrResult

            elif codingType in ("profiles", "Other_profiles"):
                dCoding[codingType] = ClassifUtils._formatProfilesResultsAsDict(lCodingData)

            elif codingType == "rDNA_BLRn":
                dCoding["rDNA_BLRn"] = OrderedDict()
                try:
                    refName, coverage = codingData.rsplit(": ", 1)
                    coverage = float(coverage.replace("%", ""))
                except ValueError:
                    # No (parsable) coverage: keep the whole text as the name
                    # and flag the coverage as unknown with -1.0.
                    refName = codingData
                    coverage = -1.0

                dCoding["rDNA_BLRn"]["name"] = refName
                dCoding["rDNA_BLRn"]["cov"] = coverage

            elif codingType == "HG_BLRn":
                dCoding["HG_BLRn"] = OrderedDict()
                # Only the first hit is used for host genes.
                refName, coverage = lCodingData[0].rsplit(": ", 1)
                coverage = float(coverage.replace("%", ""))

                dCoding["HG_BLRn"]["name"] = refName
                dCoding["HG_BLRn"]["cov"] = coverage

    ## Parse the content of a "struct=(...)" evidence group
    #
    # @param lineOfEvidence text found between the parentheses, evidences separated by "; "
    # @param dStruct dict filled in place with one entry per recognised evidence type
    #
    @staticmethod
    def _formatStructFeaturesAsDict(lineOfEvidence, dStruct):
        for structEvidence in lineOfEvidence.split("; "):
            # "type: data" -> type, data (data is "" when there is no ": ")
            structType, _sep, structData = structEvidence.partition(": ")
            lStructData = structData.split(", ")

            if structType == "TElength":
                # Stored as text: the last comma-separated item, unconverted.
                dStruct["TElength"] = lStructData[-1]

            elif structType == "TermRepeats":
                dStruct["TermRepeats"] = OrderedDict()
                for refNameAndLength in lStructData:
                    refName, length = refNameAndLength.rsplit(": ", 1)
                    dStruct["TermRepeats"][refName] = int(length)

            elif structType in ("ORF", "SSR", "SSRtrf"):
                # First occurrence wins; later ones are ignored.
                if structType not in dStruct:
                    dStruct[structType] = lStructData

            elif "SSRCoverage" in structType:
                # Written as "SSRCoverage=0.12" (no ": "), so the value is
                # carried by the type token itself.
                _name, cov = structType.split("=")
                dStruct["SSRCoverage"] = float(cov)

            elif structType == "polyAtail":
                dStruct["polyAtail"] = True

            elif structType == "helitronExtremities":
                # "name: (eValue, start, end), name: (eValue, start, end)"
                dStruct["helitronExtremities"] = OrderedDict()
                for helitronData in structData.split("), "):
                    helName, helData = helitronData.split(": (")
                    helData = helData.replace(")", "")
                    eValue, start, end = helData.split(", ")

                    helitronExtResult = OrderedDict()
                    helitronExtResult["start"] = int(start)
                    helitronExtResult["end"] = int(end)
                    helitronExtResult["eValue"] = float(eValue)
                    dStruct["helitronExtremities"][helName] = helitronExtResult

    ## Parse the content of an "other=(...)" evidence group
    #
    # The group may hold both coding and structural evidences, so both
    # parsers are run on it and write into the same dict.
    #
    # @param lineOfEvidence text found between the parentheses
    # @param dOther dict filled in place
    #
    @staticmethod
    def _formatOtherFeaturesAsDict(lineOfEvidence, dOther):
        if lineOfEvidence != "":
            ClassifUtils._formatCodingFeaturesAsDict(lineOfEvidence, dOther)
            ClassifUtils._formatStructFeaturesAsDict(lineOfEvidence, dOther)

    ## Extract the text enclosed by the parenthesis opened by a marker
    #
    # @param allFields text to search
    # @param marker opening marker ending with "(", e.g. "coding=("
    # @param requireBalanced if True, return "" when the parenthesis is never closed;
    #        if False, return everything after the marker in that case
    # @return the enclosed text (nested parentheses included), or "" if marker is absent
    #
    @staticmethod
    def _extractParenthesizedField(allFields, marker, requireBalanced = False):
        markerStart = allFields.find(marker)
        if markerStart == -1:
            return ""

        trueStart = markerStart + len(marker)
        end = trueStart
        pCount = 1  # the marker itself opens the first parenthesis
        for char in allFields[trueStart:]:
            if char == "(":
                pCount += 1
            elif char == ")":
                pCount -= 1
                if pCount == 0:
                    break
            end += 1

        if requireBalanced and pCount != 0:
            return ""
        return allFields[trueStart:end]

    ## Convert one line of a classif file into a dict
    #
    # @param line tab-separated classif line (8 columns expected)
    # @return OrderedDict describing the consensus; empty if the line can't be parsed
    #
    @staticmethod
    def getClassifLineAsDict(line):
        dClassif = OrderedDict()
        lClassifItem = line.split("\t")
        if len(lClassifItem) != 8:
            msg = "Can't parse line: \"%s\"\n" % line.strip()
            print("WARNING - ClassifUtils - %s" % msg)
            return dClassif

        teClass = lClassifItem[4]
        teOrder = lClassifItem[5]
        # TODO: recompute wicker code like this or force the user to provide a classif file as input with the wicker code already added
        iRenameHeaderClassif = RenameHeaderClassif()
        wCode = iRenameHeaderClassif._decisionRuleForWickerCode(teClass, teOrder)

        dClassif["name"] = lClassifItem[0]
        dClassif["wCode"] = wCode
        dClassif["length"] = int(lClassifItem[1])
        dClassif["strand"] = lClassifItem[2]
        dClassif["chimeric"] = lClassifItem[3] != "ok"

        dClassif["class"] = teClass
        dClassif["order"] = teOrder

        # True / False for the two known statuses, None for anything else.
        dClassif["complete"] = {"complete": True, "incomplete": False}.get(lClassifItem[6])

        # Lines read from a file still carry their end-of-line, which would
        # otherwise stick to the last value (e.g. "CI=NA\n" is not "NA").
        evidence = lClassifItem[7].rstrip("\r\n")

        # The first "; "-separated item is the confidence index ("CI=...").
        CI, _sep, allFields = evidence.partition("; ")
        CI = CI.split("=")[-1]
        if CI != "NA":
            try:
                CI = int(CI)
            except ValueError as e:
                # Best effort: report and keep the raw text.
                print("Couldn't convert %s to int : %s" % (CI, e))
        dClassif["CI"] = CI

        dClassif["coding"] = OrderedDict()
        dClassif["struct"] = OrderedDict()
        dClassif["other"] = OrderedDict()

        # Historical behaviour kept as is: an unterminated "coding=(" group is
        # ignored, whereas unterminated "struct=(" / "other=(" groups are
        # parsed up to the end of the line.
        codingField = ClassifUtils._extractParenthesizedField(allFields, "coding=(", True)
        structField = ClassifUtils._extractParenthesizedField(allFields, "struct=(")
        otherField = ClassifUtils._extractParenthesizedField(allFields, "other=(")

        if codingField != "":
            ClassifUtils._formatCodingFeaturesAsDict(codingField, dClassif["coding"])
        if structField != "":
            ClassifUtils._formatStructFeaturesAsDict(structField, dClassif["struct"])
        if otherField != "":
            ClassifUtils._formatOtherFeaturesAsDict(otherField, dClassif["other"])

        return dClassif

    ## Retrieve the classification informations of a classif file
    #
    # Exits the program with status 1 if the file extension is not ".classif".
    # Blank lines are skipped.
    #
    # @param fileName Name of the classif file
    # @return A dict containing the classification infos, keyed by sequence name
    #
    @staticmethod
    def getClassifInfosAsDict(fileName):
        import sys

        dConsensusInfo = OrderedDict()

        ext = os.path.splitext(fileName)[1]
        if ext != ".classif":
            msg = "Input file must be a classif file from TEdenovo\n"
            print("ERROR - ClassifUtils - %s" % msg)
            sys.exit(1)

        with open(fileName, "r") as classifFile:
            for line in classifFile:
                if not line.strip():
                    # A blank line would otherwise be recorded under "\n".
                    continue
                seqName = line.split("\t")[0]
                dConsensusInfo[seqName] = ClassifUtils.getClassifLineAsDict(line)

        return dConsensusInfo

    ## Convert a classif file to JSON format
    #
    # @param fileName Name of the classif file
    # @param outFileName Name of the output JSON file (optional);
    #        defaults to "<input base name>_classif.json" in the current directory
    #
    @staticmethod
    def convertClassifToJson(fileName, outFileName = ""):
        dConsensusInfo = ClassifUtils.getClassifInfosAsDict(fileName)
        if outFileName == "":
            outFileName = "%s_classif.json" % (os.path.basename(fileName).rsplit(".", 1)[0])
        with open(outFileName, 'w') as outFile:
            json.dump(dConsensusInfo, outFile)