Mercurial > repos > urgi-team > teiso
comparison TEisotools-1.1.a/commons/core/utils/ClassifUtils.py @ 16:836ce3d9d47a draft default tip
Uploaded
| author | urgi-team |
|---|---|
| date | Thu, 21 Jul 2016 07:42:47 -0400 |
| parents | 255c852351c5 |
| children |
comparison
equal
deleted
inserted
replaced
| 15:255c852351c5 | 16:836ce3d9d47a |
|---|---|
| 1 # Copyright INRA (Institut National de la Recherche Agronomique) | |
| 2 # http://www.inra.fr | |
| 3 # http://urgi.versailles.inra.fr | |
| 4 # | |
| 5 # This software is governed by the CeCILL license under French law and | |
| 6 # abiding by the rules of distribution of free software. You can use, | |
| 7 # modify and/ or redistribute the software under the terms of the CeCILL | |
| 8 # license as circulated by CEA, CNRS and INRIA at the following URL | |
| 9 # "http://www.cecill.info". | |
| 10 # | |
| 11 # As a counterpart to the access to the source code and rights to copy, | |
| 12 # modify and redistribute granted by the license, users are provided only | |
| 13 # with a limited warranty and the software's author, the holder of the | |
| 14 # economic rights, and the successive licensors have only limited | |
| 15 # liability. | |
| 16 # | |
| 17 # In this respect, the user's attention is drawn to the risks associated | |
| 18 # with loading, using, modifying and/or developing or reproducing the | |
| 19 # software by the user in light of its specific status of free software, | |
| 20 # that may mean that it is complicated to manipulate, and that also | |
| 21 # therefore means that it is reserved for developers and experienced | |
| 22 # professionals having in-depth computer knowledge. Users are therefore | |
| 23 # encouraged to load and test the software's suitability as regards their | |
| 24 # requirements in conditions enabling the security of their systems and/or | |
| 25 # data to be ensured and, more generally, to use and operate it in the | |
| 26 # same conditions as regards security. | |
| 27 # | |
| 28 # The fact that you are presently reading this means that you have had | |
| 29 # knowledge of the CeCILL license and that you accept its terms. | |
| 30 | |
| 31 import os | |
| 32 import json | |
| 33 from collections import OrderedDict | |
| 34 from commons.tools.RenameHeaderClassif import RenameHeaderClassif | |
| 35 | |
## Helpers to parse TEdenovo ".classif" files into nested ordered dicts
#  and to export them as JSON.
#
#  All methods are static; the class is only used as a namespace.
#
class ClassifUtils(object):

    ## Split one "<name>: <coverage>%" evidence entry
    #
    # The split is done on the LAST ": " so that names containing ": " are kept whole.
    #
    # @param refNameAndCoverage string such as "Copia-1:ClassI:LTR: 12.5%"
    # @return a tuple (name, coverage) where coverage is a float
    # @note raises ValueError if the entry has no ": " or the coverage is not a number
    #
    @staticmethod
    def _parseNameAndCoverage(refNameAndCoverage):
        refName, coverage = refNameAndCoverage.rsplit(": ", 1)
        return refName, float(coverage.replace("%", ""))

    ## Extract the content of a "<fieldName>=( ... )" group, honouring nested parentheses
    #
    # @param allFields string holding the evidence groups of a classif line
    # @param fieldName name of the group to extract ("coding", "struct" or "other")
    # @return the text between the opening parenthesis and its matching closing one,
    #         or "" if the group is absent or its parentheses are unbalanced
    #
    @staticmethod
    def _extractParenthesizedField(allFields, fieldName):
        prefix = "%s=(" % fieldName
        groupStart = allFields.find(prefix)
        if groupStart == -1:
            return ""

        trueStart = groupStart + len(prefix)
        pCount = 1
        for offset, char in enumerate(allFields[trueStart:]):
            if char == "(":
                pCount += 1
            elif char == ")":
                pCount -= 1
                if pCount == 0:
                    return allFields[trueStart:trueStart + offset]

        # No matching ")" was found: the group is truncated, do not guess its content
        print("WARNING - ClassifUtils - Unbalanced parentheses in \"%s\" field" % fieldName)
        return ""

    ## Convert a list of profile hits into a dict
    #
    # @param lProfilesResults list of strings formatted as "<name>: <cov>%(<covOnSubject>%)"
    # @return an OrderedDict {name: {"cov": float, "covOnSubject": float}}, empty for an empty list
    #
    @staticmethod
    def _formatProfilesResultsAsDict(lProfilesResults):
        dResults = OrderedDict()

        for refNameAndCoverage in lProfilesResults:
            # rsplit, like the other parsers, so that ": " inside a name is tolerated
            refName, coverage = refNameAndCoverage.rsplit(": ", 1)

            lCoverages = coverage.split("%(")
            profilesResult = OrderedDict()
            profilesResult["cov"] = float(lCoverages[0])
            profilesResult["covOnSubject"] = float(lCoverages[1].replace("%)", ""))
            dResults[refName] = profilesResult
        return dResults

    ## Parse the coding evidences of a classif line and store them in a dict
    #
    # Recognised evidence types: TE_BLRtx, TE_BLRx, profiles, Other_profiles,
    # rDNA_BLRn and HG_BLRn. Any other type is ignored.
    #
    # @param lineOfEvidence string of evidences separated by "; "
    # @param dCoding dict updated in place with the parsed evidences
    #
    @staticmethod
    def _formatCodingFeaturesAsDict(lineOfEvidence, dCoding):
        for codingEvidence in lineOfEvidence.split("; "):
            # "<type>: <data>"; only the first ": " separates the type from its data
            codingType, _, codingData = codingEvidence.partition(": ")
            lCodingData = codingData.split(", ")

            if codingType in ("TE_BLRtx", "TE_BLRx"):
                # Hits are accumulated if the same type has already been seen
                if codingType not in dCoding:
                    dCoding[codingType] = OrderedDict()
                for refNameAndCoverage in lCodingData:
                    refName, coverage = ClassifUtils._parseNameAndCoverage(refNameAndCoverage)
                    blrResult = OrderedDict()
                    blrResult["cov"] = coverage
                    dCoding[codingType][refName] = blrResult

            elif codingType in ("profiles", "Other_profiles"):
                dCoding[codingType] = ClassifUtils._formatProfilesResultsAsDict(lCodingData)

            elif codingType == "rDNA_BLRn":
                # The whole data is a single hit, whose name may contain ", "
                try:
                    refName, coverage = ClassifUtils._parseNameAndCoverage(codingData)
                except ValueError:
                    # No usable coverage: keep the raw text as name, -1.0 marks "unknown"
                    refName = codingData
                    coverage = -1.0

                dCoding["rDNA_BLRn"] = OrderedDict()
                dCoding["rDNA_BLRn"]["name"] = refName
                dCoding["rDNA_BLRn"]["cov"] = coverage

            elif codingType == "HG_BLRn":
                # Only the first hit is kept
                refName, coverage = ClassifUtils._parseNameAndCoverage(lCodingData[0])

                dCoding["HG_BLRn"] = OrderedDict()
                dCoding["HG_BLRn"]["name"] = refName
                dCoding["HG_BLRn"]["cov"] = coverage

    ## Parse the structural evidences of a classif line and store them in a dict
    #
    # Recognised evidence types: TElength, TermRepeats, ORF, SSR, SSRtrf,
    # SSRCoverage=<float>, polyAtail and helitronExtremities. Any other type is ignored.
    #
    # @param lineOfEvidence string of evidences separated by "; "
    # @param dStruct dict updated in place with the parsed evidences
    #
    @staticmethod
    def _formatStructFeaturesAsDict(lineOfEvidence, dStruct):
        for structEvidence in lineOfEvidence.split("; "):
            # "<type>: <data>"; only the first ": " separates the type from its data
            structType, _, structData = structEvidence.partition(": ")
            lStructData = structData.split(", ")

            if structType == "TElength":
                dStruct["TElength"] = lStructData[-1]

            elif structType == "TermRepeats":
                dStruct["TermRepeats"] = OrderedDict()
                for refNameAndLength in lStructData:
                    refName, length = refNameAndLength.rsplit(": ", 1)
                    dStruct["TermRepeats"][refName] = int(length)

            elif structType in ("ORF", "SSR", "SSRtrf"):
                # The first occurrence wins, later ones are ignored
                if structType not in dStruct:
                    dStruct[structType] = lStructData

            elif "SSRCoverage" in structType:
                # Written as "SSRCoverage=<value>", hence no ": " separator
                _, cov = structType.split("=")
                dStruct["SSRCoverage"] = float(cov)

            elif structType == "polyAtail":
                dStruct["polyAtail"] = True

            elif structType == "helitronExtremities":
                # Data looks like "<name>: (<eValue>, <start>, <end>), <name>: (...)"
                dStruct["helitronExtremities"] = OrderedDict()
                for helitronData in structData.split("), "):
                    helName, helData = helitronData.split(": (")
                    eValue, start, end = helData.replace(")", "").split(", ")

                    helitronExtResult = OrderedDict()
                    helitronExtResult["start"] = int(start)
                    helitronExtResult["end"] = int(end)
                    helitronExtResult["eValue"] = float(eValue)
                    dStruct["helitronExtremities"][helName] = helitronExtResult

    ## Parse the "other" evidences, which may mix coding and structural evidences
    #
    # @param lineOfEvidence string of evidences separated by "; " (nothing is done if empty)
    # @param dOther dict updated in place with the parsed evidences
    #
    @staticmethod
    def _formatOtherFeaturesAsDict(lineOfEvidence, dOther):
        if lineOfEvidence != "":
            # Each parser ignores the evidence types it does not know
            ClassifUtils._formatCodingFeaturesAsDict(lineOfEvidence, dOther)
            ClassifUtils._formatStructFeaturesAsDict(lineOfEvidence, dOther)

    ## Convert one line of a classif file into a dict
    #
    # @param line tab-separated classif line with 8 columns:
    #        name, length, strand, chimeric status, class, order, completeness, evidences
    # @return an OrderedDict with keys name, wCode, length, strand, chimeric, class, order,
    #         complete, CI, coding, struct and other; an empty OrderedDict (and a printed
    #         warning) if the line does not have 8 columns
    #
    @staticmethod
    def getClassifLineAsDict(line):
        dClassif = OrderedDict()
        # Drop the end-of-line so that it does not pollute the last column (e.g. "CI=NA\n")
        lClassifItem = line.rstrip("\r\n").split("\t")
        if len(lClassifItem) != 8:
            msg = "Can't parse line: \"%s\"\n" % line.strip()
            print("WARNING - ClassifUtils - %s" % msg)
            return dClassif

        teClass = lClassifItem[4]
        teOrder = lClassifItem[5]
        # TODO: recompute wicker code like this or force the user to provide a classif file as input with the wicker code already added
        # Instantiated only once the line is known to be well-formed
        iRenameHeaderClassif = RenameHeaderClassif()
        wCode = iRenameHeaderClassif._decisionRuleForWickerCode(teClass, teOrder)

        dClassif["name"] = lClassifItem[0]
        dClassif["wCode"] = wCode
        dClassif["length"] = int(lClassifItem[1])
        dClassif["strand"] = lClassifItem[2]
        dClassif["chimeric"] = lClassifItem[3] != "ok"

        dClassif["class"] = teClass
        dClassif["order"] = teOrder

        # True / False for "complete" / "incomplete", None for anything else
        dClassif["complete"] = {"complete": True, "incomplete": False}.get(lClassifItem[6])

        allFields = lClassifItem[7].split("; ")

        # The first evidence is the confidence index, "CI=<int>" or "CI=NA"
        CI = allFields.pop(0).split("=")[-1]
        if CI != "NA":
            try:
                CI = int(CI)
            except ValueError as e:
                # The raw text is kept as CI when it is not an integer
                print("Couldn't convert %s to int : %s" % (CI, e))
        dClassif["CI"] = CI

        dClassif["coding"] = OrderedDict()
        dClassif["struct"] = OrderedDict()
        dClassif["other"] = OrderedDict()

        allFields = "; ".join(allFields)
        codingField = ClassifUtils._extractParenthesizedField(allFields, "coding")
        structField = ClassifUtils._extractParenthesizedField(allFields, "struct")
        otherField = ClassifUtils._extractParenthesizedField(allFields, "other")

        if codingField != "":
            ClassifUtils._formatCodingFeaturesAsDict(codingField, dClassif["coding"])
        if structField != "":
            ClassifUtils._formatStructFeaturesAsDict(structField, dClassif["struct"])
        if otherField != "":
            ClassifUtils._formatOtherFeaturesAsDict(otherField, dClassif["other"])

        return dClassif

    ## Retrieve the classification informations of a classif file
    #
    # Blank lines are skipped. The program exits with status 1 if the file
    # does not have the ".classif" extension.
    #
    # @param fileName Name of the classif file
    # @return A dict containing the classification infos, keyed by sequence name
    #
    @staticmethod
    def getClassifInfosAsDict(fileName):
        dConsensusInfo = OrderedDict()

        ext = os.path.splitext(fileName)[1]
        if ext != ".classif":
            msg = "Input file must be a classif file from TEdenovo\n"
            print("ERROR - ClassifUtils - %s" % msg)
            # Same effect as exit(1), without relying on the helper installed by "site"
            raise SystemExit(1)

        with open(fileName, "r") as classifFile:
            for line in classifFile:
                line = line.rstrip("\r\n")
                if line.strip() == "":
                    continue
                seqName = line.split("\t")[0]
                dConsensusInfo[seqName] = ClassifUtils.getClassifLineAsDict(line)

        return dConsensusInfo

    ## Convert a classif file to JSON format
    #
    # @param fileName Name of the classif file
    # @param outFileName Name of the output JSON file (optional), by default
    #        "<input base name>_classif.json" in the current directory
    #
    @staticmethod
    def convertClassifToJson(fileName, outFileName = ""):
        dConsensusInfo = ClassifUtils.getClassifInfosAsDict(fileName)
        if outFileName == "":
            outFileName = "%s_classif.json" % (os.path.basename(fileName).rsplit(".", 1)[0])
        with open(outFileName, 'w') as outFile:
            json.dump(dConsensusInfo, outFile)
