Mercurial > repos > urgi-team > teiso
comparison TEisotools-1.0/TEiso/ClosestToStartSite.py @ 6:20ec0d14798e draft
Uploaded
| author | urgi-team |
|---|---|
| date | Wed, 20 Jul 2016 05:00:24 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 5:4093a2fb58be | 6:20ec0d14798e |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 # Copyright INRA (Institut National de la Recherche Agronomique) | |
| 4 # http://www.inra.fr | |
| 5 # http://urgi.versailles.inra.fr | |
| 6 # | |
| 7 # This software is governed by the CeCILL license under French law and | |
| 8 # abiding by the rules of distribution of free software. You can use, | |
| 9 # modify and/ or redistribute the software under the terms of the CeCILL | |
| 10 # license as circulated by CEA, CNRS and INRIA at the following URL | |
| 11 # "http://www.cecill.info". | |
| 12 # | |
| 13 # As a counterpart to the access to the source code and rights to copy, | |
| 14 # modify and redistribute granted by the license, users are provided only | |
| 15 # with a limited warranty and the software's author, the holder of the | |
| 16 # economic rights, and the successive licensors have only limited | |
| 17 # liability. | |
| 18 # | |
| 19 # In this respect, the user's attention is drawn to the risks associated | |
| 20 # with loading, using, modifying and/or developing or reproducing the | |
| 21 # software by the user in light of its specific status of free software, | |
| 22 # that may mean that it is complicated to manipulate, and that also | |
| 23 # therefore means that it is reserved for developers and experienced | |
| 24 # professionals having in-depth computer knowledge. Users are therefore | |
| 25 # encouraged to load and test the software's suitability as regards their | |
| 26 # requirements in conditions enabling the security of their systems and/or | |
| 27 # data to be ensured and, more generally, to use and operate it in the | |
| 28 # same conditions as regards security. | |
| 29 # | |
| 30 # The fact that you are presently reading this means that you have had | |
| 31 # knowledge of the CeCILL license and that you accept its terms. | |
| 32 | |
| 33 from commons.core.LoggerFactory import LoggerFactory | |
| 34 from commons.core.utils.RepetOptionParser import RepetOptionParser | |
| 35 from commons.core.checker.RepetException import RepetException | |
| 36 from commons.core.utils.FileUtils import FileUtils | |
| 37 import re,os | |
| 38 LOG_NAME = "TEiso" | |
| 39 | |
| 40 class ClosestToStartSite(object): | |
| 41 | |
| 42 def __init__(self, inputFile = "", cuffcom_tmap = "", outputFile = "", verbosity = 3): | |
| 43 self._inputFile = inputFile | |
| 44 self._cuffcom_tmap = cuffcom_tmap | |
| 45 self._outputFile = outputFile | |
| 46 self._verbosity = verbosity | |
| 47 self._log = LoggerFactory.createLogger("%s.%s" % (LOG_NAME, self.__class__.__name__), self._verbosity) | |
| 48 | |
| 49 def setAttributesFromCmdLine(self): | |
| 50 self._toolVersion = "1.0" | |
| 51 description = "ClosestToStartSite version %s" % self._toolVersion | |
| 52 epilog = "\nParser a bed file and create a bed file to create a report about positions of features A to features B. \n" | |
| 53 epilog +="it can also add the class code of features A.\n" | |
| 54 epilog += "example: ClosestToStartSite.py -i <inputFile> -c <cuff_in.tmap> -o <outputFile>\n" | |
| 55 parser = RepetOptionParser(description = description, epilog = epilog, version = self._toolVersion) | |
| 56 parser.add_option("-i", "--inputFile", dest = "inputFile", action = "store", type = "string", help = "input bed file name.", default = "") | |
| 57 parser.add_option("-c", "--cuffcom_tmap", dest = "cuffcom_tmap", action = "store", type = "string", help = "input gtf file of cuffcompare (<cuff_in>.tmap)", default = "") | |
| 58 parser.add_option("-o", "--outputFile", dest = "outputFile", action = "store", type = "string", help = "output file name", default = "") | |
| 59 parser.add_option("-v", "--verbosity", dest = "verbosity", action = "store", type = "int", help = "verbosity [optional] [default: 3]",default = 3) | |
| 60 options = parser.parse_args()[0] | |
| 61 self._setAttributesFromOptions(options) | |
| 62 | |
| 63 def _setAttributesFromOptions(self, options): | |
| 64 self._inputFile = options.inputFile | |
| 65 self._cuffcom_tmap = options.cuffcom_tmap | |
| 66 self._outputFile = options.outputFile | |
| 67 self._verbosity = options.verbosity | |
| 68 | |
| 69 def _logAndRaise(self, errorMsg): | |
| 70 self._log.error(errorMsg) | |
| 71 raise RepetException(errorMsg) | |
| 72 | |
| 73 def checkoption(self): | |
| 74 if self._inputFile == "": | |
| 75 self._log.info("Missing input file") | |
| 76 else: | |
| 77 if not FileUtils.isRessourceExists(self._inputFile): | |
| 78 self._log.info("'%s' doesn't exist!" % self._inputFile) | |
| 79 | |
| 80 if self._cuffcom_tmap != "": | |
| 81 if not FileUtils.isRessourceExists(self._cuffcom_tmap): | |
| 82 self._log.info("'%s' doesn't exist!" % self._cuffcom_tmap) | |
| 83 if self._outputFile == "": | |
| 84 self._outputFile = "%s_Close2TSS_with_classcode.bed" % os.path.splitext(self._inputFile)[0] | |
| 85 else: | |
| 86 if FileUtils.isRessourceExists(self._outputFile): | |
| 87 self._log.info("Output file '%s' already exists!" % self._outputFile) | |
| 88 else: | |
| 89 if self._outputFile == "": | |
| 90 self._outputFile = "%s_Close2TSS.bed" % os.path.splitext(self._inputFile)[0] | |
| 91 else: | |
| 92 if FileUtils.isRessourceExists(self._outputFile): | |
| 93 self._log.info("Output file '%s' already exists!" % self._outputFile) | |
| 94 | |
| 95 | |
| 96 def getClassCodeCuffcompare(self, tmap_file, listPossitions): | |
| 97 class_code_dic = {} | |
| 98 lcode_ref = [] | |
| 99 tmp = [] | |
| 100 linetowrite =[] | |
| 101 with open(tmap_file) as tmap: | |
| 102 tmapline = tmap.readlines() | |
| 103 for i in range(1,len(tmapline)): | |
| 104 cuff_id = tmapline[i].split("\t")[4].strip() | |
| 105 class_code = tmapline[i].split("\t")[2].strip() | |
| 106 ref_id = tmapline[i].split("\t")[1].strip() | |
| 107 lcode_ref.append(class_code) | |
| 108 lcode_ref.append(ref_id) | |
| 109 class_code_dic[cuff_id] = lcode_ref | |
| 110 lcode_ref = [] | |
| 111 | |
| 112 | |
| 113 for i in xrange(0,len(listPossitions)): | |
| 114 tmp.extend(listPossitions[i]) | |
| 115 transcript_bedtools = listPossitions[i][3] | |
| 116 | |
| 117 if transcript_bedtools in class_code_dic.keys(): | |
| 118 tmp.append(class_code_dic[transcript_bedtools][0]) | |
| 119 tmp.append(class_code_dic[transcript_bedtools][1]) | |
| 120 else: | |
| 121 tmp.append("NA") | |
| 122 tmp.append("NA") | |
| 123 linetowrite.append(tmp) | |
| 124 tmp=[] | |
| 125 return linetowrite | |
| 126 | |
| 127 | |
| 128 | |
| 129 def getClosestToStartSite(self, inputFile): | |
| 130 linelist = [] | |
| 131 tmplist = [] | |
| 132 with open(inputFile, "r") as bedFile: | |
| 133 for line in bedFile.readlines(): | |
| 134 m = re.search(r"^\s*(\S+)\t+(\d+)\t+(\d+)\t+([^\t]+)\t+([^\t]+)\t+([+-])\t+(\d+\.\d+)\t+([^\t]+)+\t+(\d+)\t+(\d+)\t+([^\t]+)+\t+([^\t]+)\t+([+-])\t+([^\t]+)",line) | |
| 135 if(m != None): | |
| 136 start_TR = int(m.group(2))##F[1] | |
| 137 end_TR = int(m.group(3))##F[2] | |
| 138 strand_TR= m.group(6) ##[5] | |
| 139 start_TE = int(m.group(9))##[8] | |
| 140 end_TE = int(m.group(10))##[9] | |
| 141 dist = int(m.group(14))##[13] | |
| 142 if (start_TE < start_TR) and (end_TE < start_TR) and (strand_TR =="+") and (end_TR > end_TE) and (end_TR > start_TE) and (dist != 0): | |
| 143 for i in range(0,len(line.split("\t"))-1): | |
| 144 tmplist.append(line.split("\t")[i]) | |
| 145 tmplist.append(line.split("\t")[13].strip()) | |
| 146 tmplist.append("TE_closest_TSS") | |
| 147 linelist.append(tmplist) | |
| 148 tmplist = [] | |
| 149 # F[1] gene F[2] | |
| 150 # =========================> | |
| 151 # ------------ | |
| 152 # F[8] F[9] | |
| 153 | |
| 154 if (start_TE > end_TR) and (end_TE > end_TR) and (strand_TR =="-") and (start_TR < start_TE) and (start_TR < end_TE) and (dist != 0): | |
| 155 for i in range(0,len(line.split("\t"))-1): | |
| 156 tmplist.append(line.split("\t")[i]) | |
| 157 tmplist.append(line.split("\t")[13].strip()) | |
| 158 tmplist.append("TE_closest_TSS") | |
| 159 linelist.append(tmplist) | |
| 160 tmplist = [] | |
| 161 | |
| 162 # F[1] F[2] | |
| 163 # <====================== | |
| 164 # --------------- | |
| 165 | |
| 166 if (start_TE <= start_TR) and (start_TR < end_TE) and (strand_TR =="+") and (end_TR > end_TE) and (end_TR > start_TE): | |
| 167 for i in range(0,len(line.split("\t"))-1): | |
| 168 tmplist.append(line.split("\t")[i]) | |
| 169 overlap = (end_TE-start_TR)+1 | |
| 170 tmplist.append(overlap) | |
| 171 tmplist.append("TE_overlap_TSS") | |
| 172 linelist.append(tmplist) | |
| 173 tmplist = [] | |
| 174 | |
| 175 # F[1] gene F[2] | |
| 176 # =========================> | |
| 177 # ------------- | |
| 178 # F[8] F[9] | |
| 179 | |
| 180 # gene | |
| 181 # F[1]=========================>F[2] | |
| 182 | |
| 183 # F[8]---------------F[9] | |
| 184 | |
| 185 | |
| 186 if (start_TE < start_TR) and (start_TR == end_TE) and (strand_TR =="+") and (end_TR > end_TE) and (end_TR > start_TE): | |
| 187 for i in range(0,len(line.split("\t"))-1): | |
| 188 tmplist.append(line.split("\t")[i]) | |
| 189 tmplist.append(0) | |
| 190 tmplist.append("TE_overlap_TSS") | |
| 191 linelist.append(tmplist) | |
| 192 tmplist = [] | |
| 193 | |
| 194 ## F[1]=============================>F[2] | |
| 195 ## F[8]---------------F[9] | |
| 196 | |
| 197 | |
| 198 if (start_TE < end_TR) and (end_TR <= end_TE) and (strand_TR =="-") and (start_TR < start_TE) and (start_TR < end_TE): | |
| 199 for i in range(0,len(line.split("\t"))-1): | |
| 200 tmplist.append(line.split("\t")[i]) | |
| 201 overlap = (end_TR-start_TE)+1 | |
| 202 tmplist.append(overlap) | |
| 203 tmplist.append("TE_overlap_TSS") | |
| 204 linelist.append(tmplist) | |
| 205 tmplist = [] | |
| 206 | |
| 207 | |
| 208 # F[1]<======================F[2] | |
| 209 # --------------- | |
| 210 # F[8] F[9] | |
| 211 # | |
| 212 # | |
| 213 # F[1]<=============================F[2] | |
| 214 # F[8]---------------F[9] | |
| 215 | |
| 216 if (start_TE == end_TR) and (end_TR < end_TE) and (strand_TR =="-") and (start_TR < start_TE) and (start_TR < end_TE): | |
| 217 for i in range(0,len(line.split("\t"))-1): | |
| 218 tmplist.append(line.split("\t")[i]) | |
| 219 tmplist.append(0) | |
| 220 tmplist.append("TE_overlap_TSS") | |
| 221 linelist.append(tmplist) | |
| 222 tmplist = [] | |
| 223 | |
| 224 # F[1]<=============================F[2] | |
| 225 # F[8]---------------F[9] | |
| 226 | |
| 227 if (start_TR < start_TE) and (start_TR < end_TE) and (start_TE < end_TR) and (end_TE < end_TR) and (dist == 0): | |
| 228 for i in range(0,len(line.split("\t"))-1): | |
| 229 tmplist.append(line.split("\t")[i]) | |
| 230 tmplist.append(0) | |
| 231 #tmplist.append(line.strip()) | |
| 232 tmplist.append("TE_in_gene") | |
| 233 linelist.append(tmplist) | |
| 234 tmplist = [] | |
| 235 | |
| 236 | |
| 237 # F[1] gene F[2] | |
| 238 # ============================== | |
| 239 # ----------- | |
| 240 # F[8] F[9] | |
| 241 | |
| 242 | |
| 243 if (start_TE < start_TR) and (start_TR < end_TE) and (start_TE < end_TR) and (end_TR < end_TE): | |
| 244 for i in range(0,len(line.split("\t"))-1): | |
| 245 tmplist.append(line.split("\t")[i]) | |
| 246 #lenTE = (end_TE-start_TE)+1 | |
| 247 tmplist.append(0) | |
| 248 tmplist.append("gene_in_TE") | |
| 249 linelist.append(tmplist) | |
| 250 tmplist = [] | |
| 251 | |
| 252 # F[1]======================F[2] | |
| 253 # F[8]----------------------------------------------------F[9] | |
| 254 | |
| 255 | |
| 256 if (strand_TR =="+") and (start_TR > start_TE) and (start_TR < end_TE) and (start_TE < end_TR) and (end_TE == end_TR): | |
| 257 for i in range(0,len(line.split("\t"))-1): | |
| 258 tmplist.append(line.split("\t")[i]) | |
| 259 tmplist.append(0) | |
| 260 #tmplist.append(line.strip()) | |
| 261 tmplist.append("gene_in_TE") | |
| 262 linelist.append(tmplist) | |
| 263 tmplist = [] | |
| 264 | |
| 265 # F[1]==================================>F[2] | |
| 266 # F[8]----------------------------------------------------------F[9] | |
| 267 | |
| 268 if (strand_TR =="-") and (start_TR > start_TE) and (start_TR < end_TE) and (start_TE < end_TR) and (end_TE == end_TR): | |
| 269 for i in range(0,len(line.split("\t"))-1): | |
| 270 tmplist.append(line.split("\t")[i]) | |
| 271 tmplist.append(0) | |
| 272 #tmplist.append(line.strip()) | |
| 273 tmplist.append("gene_in_TE") | |
| 274 linelist.append(tmplist) | |
| 275 tmplist = [] | |
| 276 | |
| 277 # F[1]<==================================F[2] | |
| 278 # F[8]----------------------------------------------------------F[9] | |
| 279 | |
| 280 if (strand_TR =="+") and (start_TR == start_TE) and (start_TR < end_TE) and (start_TE < end_TR) and (end_TE > end_TR): | |
| 281 for i in range(0,len(line.split("\t"))-1): | |
| 282 tmplist.append(line.split("\t")[i]) | |
| 283 tmplist.append(0) | |
| 284 #tmplist.append(line.strip()) | |
| 285 tmplist.append("gene_in_TE") | |
| 286 linelist.append(tmplist) | |
| 287 tmplist = [] | |
| 288 | |
| 289 # F[1]==================================>F[2] | |
| 290 # F[8]----------------------------------------------------------F[9] | |
| 291 | |
| 292 if (strand_TR =="-") and (start_TR == start_TE) and (start_TR < end_TE) and (start_TE < end_TR) and (end_TE > end_TR): | |
| 293 for i in range(0,len(line.split("\t"))-1): | |
| 294 tmplist.append(line.split("\t")[i]) | |
| 295 tmplist.append(0) | |
| 296 #tmplist.append(line.strip()) | |
| 297 tmplist.append("gene_in_TE") | |
| 298 linelist.append(tmplist) | |
| 299 tmplist = [] | |
| 300 | |
| 301 # F[1]<==================================F[2] | |
| 302 # F[8]----------------------------------------------------------F[9] | |
| 303 | |
| 304 return linelist | |
| 305 | |
| 306 | |
| 307 def writeOutputFromList(self, listPossitions , outputFile): | |
| 308 w = open(outputFile,'w') | |
| 309 for s in listPossitions: | |
| 310 line= "\t".join(str(item) for item in s) | |
| 311 w.write("%s\n" % line) | |
| 312 w.close() | |
| 313 | |
| 314 | |
| 315 def run(self): | |
| 316 self.checkoption() | |
| 317 listPossitions = self.getClosestToStartSite(self._inputFile) | |
| 318 if self._cuffcom_tmap == "": | |
| 319 self.writeOutputFromList(listPossitions, self._outputFile ) | |
| 320 else: | |
| 321 listclasscode = self.getClassCodeCuffcompare(self._cuffcom_tmap, listPossitions) | |
| 322 self.writeOutputFromList(listclasscode, self._outputFile) | |
| 323 | |
| 324 if __name__== "__main__": | |
| 325 iClosestToStartSite = ClosestToStartSite() | |
| 326 iClosestToStartSite.setAttributesFromCmdLine() | |
| 327 iClosestToStartSite.run() | |
| 328 | |
| 329 | |
| 330 | |
| 331 | |
| 332 | |
| 333 |
