comparison TEisotools-1.1.a/TEiso/ClosestToStartSite.py @ 16:836ce3d9d47a draft default tip

Uploaded
author urgi-team
date Thu, 21 Jul 2016 07:42:47 -0400
parents 255c852351c5
children
comparison
equal deleted inserted replaced
15:255c852351c5 16:836ce3d9d47a
1 #!/usr/bin/env python
2
3 # Copyright INRA (Institut National de la Recherche Agronomique)
4 # http://www.inra.fr
5 # http://urgi.versailles.inra.fr
6 #
7 # This software is governed by the CeCILL license under French law and
8 # abiding by the rules of distribution of free software. You can use,
9 # modify and/ or redistribute the software under the terms of the CeCILL
10 # license as circulated by CEA, CNRS and INRIA at the following URL
11 # "http://www.cecill.info".
12 #
13 # As a counterpart to the access to the source code and rights to copy,
14 # modify and redistribute granted by the license, users are provided only
15 # with a limited warranty and the software's author, the holder of the
16 # economic rights, and the successive licensors have only limited
17 # liability.
18 #
19 # In this respect, the user's attention is drawn to the risks associated
20 # with loading, using, modifying and/or developing or reproducing the
21 # software by the user in light of its specific status of free software,
22 # that may mean that it is complicated to manipulate, and that also
23 # therefore means that it is reserved for developers and experienced
24 # professionals having in-depth computer knowledge. Users are therefore
25 # encouraged to load and test the software's suitability as regards their
26 # requirements in conditions enabling the security of their systems and/or
27 # data to be ensured and, more generally, to use and operate it in the
28 # same conditions as regards security.
29 #
30 # The fact that you are presently reading this means that you have had
31 # knowledge of the CeCILL license and that you accept its terms.
32
33 from commons.core.LoggerFactory import LoggerFactory
34 from commons.core.utils.RepetOptionParser import RepetOptionParser
35 from commons.core.checker.RepetException import RepetException
36 from commons.core.utils.FileUtils import FileUtils
37 import re,os
38 LOG_NAME = "TEiso"
39
40 class ClosestToStartSite(object):
41
42 def __init__(self, inputFile = "", cuffcom_tmap = "", outputFile = "", verbosity = 3):
43 self._inputFile = inputFile
44 self._cuffcom_tmap = cuffcom_tmap
45 self._outputFile = outputFile
46 self._verbosity = verbosity
47 self._log = LoggerFactory.createLogger("%s.%s" % (LOG_NAME, self.__class__.__name__), self._verbosity)
48
49 def setAttributesFromCmdLine(self):
50 self._toolVersion = "1.1.a"
51 description = "ClosestToStartSite version %s" % self._toolVersion
52 epilog = "\nParser a bed file and create a bed file to create a report about positions of features A to features B. \n"
53 epilog +="it can also add the class code of features A.\n"
54 epilog += "example: ClosestToStartSite.py -i <inputFile> -c <cuff_in.tmap> -o <outputFile>\n"
55 parser = RepetOptionParser(description = description, epilog = epilog, version = self._toolVersion)
56 parser.add_option("-i", "--inputFile", dest = "inputFile", action = "store", type = "string", help = "input bed file name.", default = "")
57 parser.add_option("-c", "--cuffcom_tmap", dest = "cuffcom_tmap", action = "store", type = "string", help = "input gtf file of cuffcompare (<cuff_in>.tmap)", default = "")
58 parser.add_option("-o", "--outputFile", dest = "outputFile", action = "store", type = "string", help = "output file name", default = "")
59 parser.add_option("-v", "--verbosity", dest = "verbosity", action = "store", type = "int", help = "verbosity [optional] [default: 3]",default = 3)
60 options = parser.parse_args()[0]
61 self._setAttributesFromOptions(options)
62
63 def _setAttributesFromOptions(self, options):
64 self._inputFile = options.inputFile
65 self._cuffcom_tmap = options.cuffcom_tmap
66 self._outputFile = options.outputFile
67 self._verbosity = options.verbosity
68
69 def _logAndRaise(self, errorMsg):
70 self._log.error(errorMsg)
71 raise RepetException(errorMsg)
72
73 def checkoption(self):
74 if self._inputFile == "":
75 self._log.info("Missing input file")
76 else:
77 if not FileUtils.isRessourceExists(self._inputFile):
78 self._log.info("'%s' doesn't exist!" % self._inputFile)
79
80 if self._cuffcom_tmap != "":
81 if not FileUtils.isRessourceExists(self._cuffcom_tmap):
82 self._log.info("'%s' doesn't exist!" % self._cuffcom_tmap)
83 if self._outputFile == "":
84 self._outputFile = "%s_Close2TSS_with_classcode.bed" % os.path.splitext(self._inputFile)[0]
85 else:
86 if FileUtils.isRessourceExists(self._outputFile):
87 self._log.info("Output file '%s' already exists!" % self._outputFile)
88 else:
89 if self._outputFile == "":
90 self._outputFile = "%s_Close2TSS.bed" % os.path.splitext(self._inputFile)[0]
91 else:
92 if FileUtils.isRessourceExists(self._outputFile):
93 self._log.info("Output file '%s' already exists!" % self._outputFile)
94
95
96 def getClassCodeCuffcompare(self, tmap_file, listPossitions):
97 class_code_dic = {}
98 lcode_ref = []
99 tmp = []
100 linetowrite =[]
101 with open(tmap_file) as tmap:
102 tmapline = tmap.readlines()
103 for i in range(1,len(tmapline)):
104 cuff_id = tmapline[i].split("\t")[4].strip()
105 class_code = tmapline[i].split("\t")[2].strip()
106 ref_id = tmapline[i].split("\t")[1].strip()
107 lcode_ref.append(class_code)
108 lcode_ref.append(ref_id)
109 class_code_dic[cuff_id] = lcode_ref
110 lcode_ref = []
111
112
113 for i in xrange(0,len(listPossitions)):
114 tmp.extend(listPossitions[i])
115 transcript_bedtools = listPossitions[i][3]
116
117 if transcript_bedtools in class_code_dic.keys():
118 tmp.append(class_code_dic[transcript_bedtools][0])
119 tmp.append(class_code_dic[transcript_bedtools][1])
120 else:
121 tmp.append("NA")
122 tmp.append("NA")
123 linetowrite.append(tmp)
124 tmp=[]
125 return linetowrite
126
127
128
129 def getClosestToStartSite(self, inputFile):
130 linelist = []
131 tmplist = []
132 with open(inputFile, "r") as bedFile:
133 for line in bedFile.readlines():
134 m = re.search(r"^\s*(\S+)\t+(\d+)\t+(\d+)\t+([^\t]+)\t+([^\t]+)\t+([+-])\t+(\d+\.\d+)\t+([^\t]+)+\t+(\d+)\t+(\d+)\t+([^\t]+)+\t+([^\t]+)\t+([+-])\t+([^\t]+)",line)
135 if(m != None):
136 start_TR = int(m.group(2))##F[1]
137 end_TR = int(m.group(3))##F[2]
138 strand_TR= m.group(6) ##[5]
139 start_TE = int(m.group(9))##[8]
140 end_TE = int(m.group(10))##[9]
141 dist = int(m.group(14))##[13]
142 if (start_TE < start_TR) and (end_TE < start_TR) and (strand_TR =="+") and (end_TR > end_TE) and (end_TR > start_TE) and (dist != 0):
143 for i in range(0,len(line.split("\t"))-1):
144 tmplist.append(line.split("\t")[i])
145 tmplist.append(line.split("\t")[13].strip())
146 tmplist.append("TE_closest_TSS")
147 linelist.append(tmplist)
148 tmplist = []
149 # F[1] gene F[2]
150 # =========================>
151 # ------------
152 # F[8] F[9]
153
154 if (start_TE > end_TR) and (end_TE > end_TR) and (strand_TR =="-") and (start_TR < start_TE) and (start_TR < end_TE) and (dist != 0):
155 for i in range(0,len(line.split("\t"))-1):
156 tmplist.append(line.split("\t")[i])
157 tmplist.append(line.split("\t")[13].strip())
158 tmplist.append("TE_closest_TSS")
159 linelist.append(tmplist)
160 tmplist = []
161
162 # F[1] F[2]
163 # <======================
164 # ---------------
165
166 if (start_TE <= start_TR) and (start_TR < end_TE) and (strand_TR =="+") and (end_TR > end_TE) and (end_TR > start_TE):
167 for i in range(0,len(line.split("\t"))-1):
168 tmplist.append(line.split("\t")[i])
169 overlap = (end_TE-start_TR)+1
170 tmplist.append(overlap)
171 tmplist.append("TE_overlap_TSS")
172 linelist.append(tmplist)
173 tmplist = []
174
175 # F[1] gene F[2]
176 # =========================>
177 # -------------
178 # F[8] F[9]
179
180 # gene
181 # F[1]=========================>F[2]
182
183 # F[8]---------------F[9]
184
185
186 if (start_TE < start_TR) and (start_TR == end_TE) and (strand_TR =="+") and (end_TR > end_TE) and (end_TR > start_TE):
187 for i in range(0,len(line.split("\t"))-1):
188 tmplist.append(line.split("\t")[i])
189 tmplist.append(0)
190 tmplist.append("TE_overlap_TSS")
191 linelist.append(tmplist)
192 tmplist = []
193
194 ## F[1]=============================>F[2]
195 ## F[8]---------------F[9]
196
197
198 if (start_TE < end_TR) and (end_TR <= end_TE) and (strand_TR =="-") and (start_TR < start_TE) and (start_TR < end_TE):
199 for i in range(0,len(line.split("\t"))-1):
200 tmplist.append(line.split("\t")[i])
201 overlap = (end_TR-start_TE)+1
202 tmplist.append(overlap)
203 tmplist.append("TE_overlap_TSS")
204 linelist.append(tmplist)
205 tmplist = []
206
207
208 # F[1]<======================F[2]
209 # ---------------
210 # F[8] F[9]
211 #
212 #
213 # F[1]<=============================F[2]
214 # F[8]---------------F[9]
215
216 if (start_TE == end_TR) and (end_TR < end_TE) and (strand_TR =="-") and (start_TR < start_TE) and (start_TR < end_TE):
217 for i in range(0,len(line.split("\t"))-1):
218 tmplist.append(line.split("\t")[i])
219 tmplist.append(0)
220 tmplist.append("TE_overlap_TSS")
221 linelist.append(tmplist)
222 tmplist = []
223
224 # F[1]<=============================F[2]
225 # F[8]---------------F[9]
226
227 if (start_TR < start_TE) and (start_TR < end_TE) and (start_TE < end_TR) and (end_TE < end_TR) and (dist == 0):
228 for i in range(0,len(line.split("\t"))-1):
229 tmplist.append(line.split("\t")[i])
230 tmplist.append(0)
231 #tmplist.append(line.strip())
232 tmplist.append("TE_in_gene")
233 linelist.append(tmplist)
234 tmplist = []
235
236
237 # F[1] gene F[2]
238 # ==============================
239 # -----------
240 # F[8] F[9]
241
242
243 if (start_TE < start_TR) and (start_TR < end_TE) and (start_TE < end_TR) and (end_TR < end_TE):
244 for i in range(0,len(line.split("\t"))-1):
245 tmplist.append(line.split("\t")[i])
246 #lenTE = (end_TE-start_TE)+1
247 tmplist.append(0)
248 tmplist.append("gene_in_TE")
249 linelist.append(tmplist)
250 tmplist = []
251
252 # F[1]======================F[2]
253 # F[8]----------------------------------------------------F[9]
254
255
256 if (strand_TR =="+") and (start_TR > start_TE) and (start_TR < end_TE) and (start_TE < end_TR) and (end_TE == end_TR):
257 for i in range(0,len(line.split("\t"))-1):
258 tmplist.append(line.split("\t")[i])
259 tmplist.append(0)
260 #tmplist.append(line.strip())
261 tmplist.append("gene_in_TE")
262 linelist.append(tmplist)
263 tmplist = []
264
265 # F[1]==================================>F[2]
266 # F[8]----------------------------------------------------------F[9]
267
268 if (strand_TR =="-") and (start_TR > start_TE) and (start_TR < end_TE) and (start_TE < end_TR) and (end_TE == end_TR):
269 for i in range(0,len(line.split("\t"))-1):
270 tmplist.append(line.split("\t")[i])
271 tmplist.append(0)
272 #tmplist.append(line.strip())
273 tmplist.append("gene_in_TE")
274 linelist.append(tmplist)
275 tmplist = []
276
277 # F[1]<==================================F[2]
278 # F[8]----------------------------------------------------------F[9]
279
280 if (strand_TR =="+") and (start_TR == start_TE) and (start_TR < end_TE) and (start_TE < end_TR) and (end_TE > end_TR):
281 for i in range(0,len(line.split("\t"))-1):
282 tmplist.append(line.split("\t")[i])
283 tmplist.append(0)
284 #tmplist.append(line.strip())
285 tmplist.append("gene_in_TE")
286 linelist.append(tmplist)
287 tmplist = []
288
289 # F[1]==================================>F[2]
290 # F[8]----------------------------------------------------------F[9]
291
292 if (strand_TR =="-") and (start_TR == start_TE) and (start_TR < end_TE) and (start_TE < end_TR) and (end_TE > end_TR):
293 for i in range(0,len(line.split("\t"))-1):
294 tmplist.append(line.split("\t")[i])
295 tmplist.append(0)
296 #tmplist.append(line.strip())
297 tmplist.append("gene_in_TE")
298 linelist.append(tmplist)
299 tmplist = []
300
301 # F[1]<==================================F[2]
302 # F[8]----------------------------------------------------------F[9]
303
304 return linelist
305
306
307 def writeOutputFromList(self, listPossitions , outputFile):
308 w = open(outputFile,'w')
309 for s in listPossitions:
310 line= "\t".join(str(item) for item in s)
311 w.write("%s\n" % line)
312 w.close()
313
314
315 def run(self):
316 self.checkoption()
317 listPossitions = self.getClosestToStartSite(self._inputFile)
318 if self._cuffcom_tmap == "":
319 self.writeOutputFromList(listPossitions, self._outputFile )
320 else:
321 listclasscode = self.getClassCodeCuffcompare(self._cuffcom_tmap, listPossitions)
322 self.writeOutputFromList(listclasscode, self._outputFile)
323
324 if __name__== "__main__":
325 iClosestToStartSite = ClosestToStartSite()
326 iClosestToStartSite.setAttributesFromCmdLine()
327 iClosestToStartSite.run()
328
329
330
331
332
333