Mercurial > repos > urgi-team > teiso
comparison TEisotools-1.1.a/TEiso/ClosestToStartSite.py @ 16:836ce3d9d47a draft default tip
Uploaded
author | urgi-team |
---|---|
date | Thu, 21 Jul 2016 07:42:47 -0400 |
parents | 255c852351c5 |
children |
comparison
equal
deleted
inserted
replaced
15:255c852351c5 | 16:836ce3d9d47a |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 # Copyright INRA (Institut National de la Recherche Agronomique) | |
4 # http://www.inra.fr | |
5 # http://urgi.versailles.inra.fr | |
6 # | |
7 # This software is governed by the CeCILL license under French law and | |
8 # abiding by the rules of distribution of free software. You can use, | |
9 # modify and/ or redistribute the software under the terms of the CeCILL | |
10 # license as circulated by CEA, CNRS and INRIA at the following URL | |
11 # "http://www.cecill.info". | |
12 # | |
13 # As a counterpart to the access to the source code and rights to copy, | |
14 # modify and redistribute granted by the license, users are provided only | |
15 # with a limited warranty and the software's author, the holder of the | |
16 # economic rights, and the successive licensors have only limited | |
17 # liability. | |
18 # | |
19 # In this respect, the user's attention is drawn to the risks associated | |
20 # with loading, using, modifying and/or developing or reproducing the | |
21 # software by the user in light of its specific status of free software, | |
22 # that may mean that it is complicated to manipulate, and that also | |
23 # therefore means that it is reserved for developers and experienced | |
24 # professionals having in-depth computer knowledge. Users are therefore | |
25 # encouraged to load and test the software's suitability as regards their | |
26 # requirements in conditions enabling the security of their systems and/or | |
27 # data to be ensured and, more generally, to use and operate it in the | |
28 # same conditions as regards security. | |
29 # | |
30 # The fact that you are presently reading this means that you have had | |
31 # knowledge of the CeCILL license and that you accept its terms. | |
32 | |
33 from commons.core.LoggerFactory import LoggerFactory | |
34 from commons.core.utils.RepetOptionParser import RepetOptionParser | |
35 from commons.core.checker.RepetException import RepetException | |
36 from commons.core.utils.FileUtils import FileUtils | |
37 import re,os | |
38 LOG_NAME = "TEiso" | |
39 | |
40 class ClosestToStartSite(object): | |
41 | |
42 def __init__(self, inputFile = "", cuffcom_tmap = "", outputFile = "", verbosity = 3): | |
43 self._inputFile = inputFile | |
44 self._cuffcom_tmap = cuffcom_tmap | |
45 self._outputFile = outputFile | |
46 self._verbosity = verbosity | |
47 self._log = LoggerFactory.createLogger("%s.%s" % (LOG_NAME, self.__class__.__name__), self._verbosity) | |
48 | |
49 def setAttributesFromCmdLine(self): | |
50 self._toolVersion = "1.1.a" | |
51 description = "ClosestToStartSite version %s" % self._toolVersion | |
52 epilog = "\nParser a bed file and create a bed file to create a report about positions of features A to features B. \n" | |
53 epilog +="it can also add the class code of features A.\n" | |
54 epilog += "example: ClosestToStartSite.py -i <inputFile> -c <cuff_in.tmap> -o <outputFile>\n" | |
55 parser = RepetOptionParser(description = description, epilog = epilog, version = self._toolVersion) | |
56 parser.add_option("-i", "--inputFile", dest = "inputFile", action = "store", type = "string", help = "input bed file name.", default = "") | |
57 parser.add_option("-c", "--cuffcom_tmap", dest = "cuffcom_tmap", action = "store", type = "string", help = "input gtf file of cuffcompare (<cuff_in>.tmap)", default = "") | |
58 parser.add_option("-o", "--outputFile", dest = "outputFile", action = "store", type = "string", help = "output file name", default = "") | |
59 parser.add_option("-v", "--verbosity", dest = "verbosity", action = "store", type = "int", help = "verbosity [optional] [default: 3]",default = 3) | |
60 options = parser.parse_args()[0] | |
61 self._setAttributesFromOptions(options) | |
62 | |
63 def _setAttributesFromOptions(self, options): | |
64 self._inputFile = options.inputFile | |
65 self._cuffcom_tmap = options.cuffcom_tmap | |
66 self._outputFile = options.outputFile | |
67 self._verbosity = options.verbosity | |
68 | |
69 def _logAndRaise(self, errorMsg): | |
70 self._log.error(errorMsg) | |
71 raise RepetException(errorMsg) | |
72 | |
73 def checkoption(self): | |
74 if self._inputFile == "": | |
75 self._log.info("Missing input file") | |
76 else: | |
77 if not FileUtils.isRessourceExists(self._inputFile): | |
78 self._log.info("'%s' doesn't exist!" % self._inputFile) | |
79 | |
80 if self._cuffcom_tmap != "": | |
81 if not FileUtils.isRessourceExists(self._cuffcom_tmap): | |
82 self._log.info("'%s' doesn't exist!" % self._cuffcom_tmap) | |
83 if self._outputFile == "": | |
84 self._outputFile = "%s_Close2TSS_with_classcode.bed" % os.path.splitext(self._inputFile)[0] | |
85 else: | |
86 if FileUtils.isRessourceExists(self._outputFile): | |
87 self._log.info("Output file '%s' already exists!" % self._outputFile) | |
88 else: | |
89 if self._outputFile == "": | |
90 self._outputFile = "%s_Close2TSS.bed" % os.path.splitext(self._inputFile)[0] | |
91 else: | |
92 if FileUtils.isRessourceExists(self._outputFile): | |
93 self._log.info("Output file '%s' already exists!" % self._outputFile) | |
94 | |
95 | |
96 def getClassCodeCuffcompare(self, tmap_file, listPossitions): | |
97 class_code_dic = {} | |
98 lcode_ref = [] | |
99 tmp = [] | |
100 linetowrite =[] | |
101 with open(tmap_file) as tmap: | |
102 tmapline = tmap.readlines() | |
103 for i in range(1,len(tmapline)): | |
104 cuff_id = tmapline[i].split("\t")[4].strip() | |
105 class_code = tmapline[i].split("\t")[2].strip() | |
106 ref_id = tmapline[i].split("\t")[1].strip() | |
107 lcode_ref.append(class_code) | |
108 lcode_ref.append(ref_id) | |
109 class_code_dic[cuff_id] = lcode_ref | |
110 lcode_ref = [] | |
111 | |
112 | |
113 for i in xrange(0,len(listPossitions)): | |
114 tmp.extend(listPossitions[i]) | |
115 transcript_bedtools = listPossitions[i][3] | |
116 | |
117 if transcript_bedtools in class_code_dic.keys(): | |
118 tmp.append(class_code_dic[transcript_bedtools][0]) | |
119 tmp.append(class_code_dic[transcript_bedtools][1]) | |
120 else: | |
121 tmp.append("NA") | |
122 tmp.append("NA") | |
123 linetowrite.append(tmp) | |
124 tmp=[] | |
125 return linetowrite | |
126 | |
127 | |
128 | |
129 def getClosestToStartSite(self, inputFile): | |
130 linelist = [] | |
131 tmplist = [] | |
132 with open(inputFile, "r") as bedFile: | |
133 for line in bedFile.readlines(): | |
134 m = re.search(r"^\s*(\S+)\t+(\d+)\t+(\d+)\t+([^\t]+)\t+([^\t]+)\t+([+-])\t+(\d+\.\d+)\t+([^\t]+)+\t+(\d+)\t+(\d+)\t+([^\t]+)+\t+([^\t]+)\t+([+-])\t+([^\t]+)",line) | |
135 if(m != None): | |
136 start_TR = int(m.group(2))##F[1] | |
137 end_TR = int(m.group(3))##F[2] | |
138 strand_TR= m.group(6) ##[5] | |
139 start_TE = int(m.group(9))##[8] | |
140 end_TE = int(m.group(10))##[9] | |
141 dist = int(m.group(14))##[13] | |
142 if (start_TE < start_TR) and (end_TE < start_TR) and (strand_TR =="+") and (end_TR > end_TE) and (end_TR > start_TE) and (dist != 0): | |
143 for i in range(0,len(line.split("\t"))-1): | |
144 tmplist.append(line.split("\t")[i]) | |
145 tmplist.append(line.split("\t")[13].strip()) | |
146 tmplist.append("TE_closest_TSS") | |
147 linelist.append(tmplist) | |
148 tmplist = [] | |
149 # F[1] gene F[2] | |
150 # =========================> | |
151 # ------------ | |
152 # F[8] F[9] | |
153 | |
154 if (start_TE > end_TR) and (end_TE > end_TR) and (strand_TR =="-") and (start_TR < start_TE) and (start_TR < end_TE) and (dist != 0): | |
155 for i in range(0,len(line.split("\t"))-1): | |
156 tmplist.append(line.split("\t")[i]) | |
157 tmplist.append(line.split("\t")[13].strip()) | |
158 tmplist.append("TE_closest_TSS") | |
159 linelist.append(tmplist) | |
160 tmplist = [] | |
161 | |
162 # F[1] F[2] | |
163 # <====================== | |
164 # --------------- | |
165 | |
166 if (start_TE <= start_TR) and (start_TR < end_TE) and (strand_TR =="+") and (end_TR > end_TE) and (end_TR > start_TE): | |
167 for i in range(0,len(line.split("\t"))-1): | |
168 tmplist.append(line.split("\t")[i]) | |
169 overlap = (end_TE-start_TR)+1 | |
170 tmplist.append(overlap) | |
171 tmplist.append("TE_overlap_TSS") | |
172 linelist.append(tmplist) | |
173 tmplist = [] | |
174 | |
175 # F[1] gene F[2] | |
176 # =========================> | |
177 # ------------- | |
178 # F[8] F[9] | |
179 | |
180 # gene | |
181 # F[1]=========================>F[2] | |
182 | |
183 # F[8]---------------F[9] | |
184 | |
185 | |
186 if (start_TE < start_TR) and (start_TR == end_TE) and (strand_TR =="+") and (end_TR > end_TE) and (end_TR > start_TE): | |
187 for i in range(0,len(line.split("\t"))-1): | |
188 tmplist.append(line.split("\t")[i]) | |
189 tmplist.append(0) | |
190 tmplist.append("TE_overlap_TSS") | |
191 linelist.append(tmplist) | |
192 tmplist = [] | |
193 | |
194 ## F[1]=============================>F[2] | |
195 ## F[8]---------------F[9] | |
196 | |
197 | |
198 if (start_TE < end_TR) and (end_TR <= end_TE) and (strand_TR =="-") and (start_TR < start_TE) and (start_TR < end_TE): | |
199 for i in range(0,len(line.split("\t"))-1): | |
200 tmplist.append(line.split("\t")[i]) | |
201 overlap = (end_TR-start_TE)+1 | |
202 tmplist.append(overlap) | |
203 tmplist.append("TE_overlap_TSS") | |
204 linelist.append(tmplist) | |
205 tmplist = [] | |
206 | |
207 | |
208 # F[1]<======================F[2] | |
209 # --------------- | |
210 # F[8] F[9] | |
211 # | |
212 # | |
213 # F[1]<=============================F[2] | |
214 # F[8]---------------F[9] | |
215 | |
216 if (start_TE == end_TR) and (end_TR < end_TE) and (strand_TR =="-") and (start_TR < start_TE) and (start_TR < end_TE): | |
217 for i in range(0,len(line.split("\t"))-1): | |
218 tmplist.append(line.split("\t")[i]) | |
219 tmplist.append(0) | |
220 tmplist.append("TE_overlap_TSS") | |
221 linelist.append(tmplist) | |
222 tmplist = [] | |
223 | |
224 # F[1]<=============================F[2] | |
225 # F[8]---------------F[9] | |
226 | |
227 if (start_TR < start_TE) and (start_TR < end_TE) and (start_TE < end_TR) and (end_TE < end_TR) and (dist == 0): | |
228 for i in range(0,len(line.split("\t"))-1): | |
229 tmplist.append(line.split("\t")[i]) | |
230 tmplist.append(0) | |
231 #tmplist.append(line.strip()) | |
232 tmplist.append("TE_in_gene") | |
233 linelist.append(tmplist) | |
234 tmplist = [] | |
235 | |
236 | |
237 # F[1] gene F[2] | |
238 # ============================== | |
239 # ----------- | |
240 # F[8] F[9] | |
241 | |
242 | |
243 if (start_TE < start_TR) and (start_TR < end_TE) and (start_TE < end_TR) and (end_TR < end_TE): | |
244 for i in range(0,len(line.split("\t"))-1): | |
245 tmplist.append(line.split("\t")[i]) | |
246 #lenTE = (end_TE-start_TE)+1 | |
247 tmplist.append(0) | |
248 tmplist.append("gene_in_TE") | |
249 linelist.append(tmplist) | |
250 tmplist = [] | |
251 | |
252 # F[1]======================F[2] | |
253 # F[8]----------------------------------------------------F[9] | |
254 | |
255 | |
256 if (strand_TR =="+") and (start_TR > start_TE) and (start_TR < end_TE) and (start_TE < end_TR) and (end_TE == end_TR): | |
257 for i in range(0,len(line.split("\t"))-1): | |
258 tmplist.append(line.split("\t")[i]) | |
259 tmplist.append(0) | |
260 #tmplist.append(line.strip()) | |
261 tmplist.append("gene_in_TE") | |
262 linelist.append(tmplist) | |
263 tmplist = [] | |
264 | |
265 # F[1]==================================>F[2] | |
266 # F[8]----------------------------------------------------------F[9] | |
267 | |
268 if (strand_TR =="-") and (start_TR > start_TE) and (start_TR < end_TE) and (start_TE < end_TR) and (end_TE == end_TR): | |
269 for i in range(0,len(line.split("\t"))-1): | |
270 tmplist.append(line.split("\t")[i]) | |
271 tmplist.append(0) | |
272 #tmplist.append(line.strip()) | |
273 tmplist.append("gene_in_TE") | |
274 linelist.append(tmplist) | |
275 tmplist = [] | |
276 | |
277 # F[1]<==================================F[2] | |
278 # F[8]----------------------------------------------------------F[9] | |
279 | |
280 if (strand_TR =="+") and (start_TR == start_TE) and (start_TR < end_TE) and (start_TE < end_TR) and (end_TE > end_TR): | |
281 for i in range(0,len(line.split("\t"))-1): | |
282 tmplist.append(line.split("\t")[i]) | |
283 tmplist.append(0) | |
284 #tmplist.append(line.strip()) | |
285 tmplist.append("gene_in_TE") | |
286 linelist.append(tmplist) | |
287 tmplist = [] | |
288 | |
289 # F[1]==================================>F[2] | |
290 # F[8]----------------------------------------------------------F[9] | |
291 | |
292 if (strand_TR =="-") and (start_TR == start_TE) and (start_TR < end_TE) and (start_TE < end_TR) and (end_TE > end_TR): | |
293 for i in range(0,len(line.split("\t"))-1): | |
294 tmplist.append(line.split("\t")[i]) | |
295 tmplist.append(0) | |
296 #tmplist.append(line.strip()) | |
297 tmplist.append("gene_in_TE") | |
298 linelist.append(tmplist) | |
299 tmplist = [] | |
300 | |
301 # F[1]<==================================F[2] | |
302 # F[8]----------------------------------------------------------F[9] | |
303 | |
304 return linelist | |
305 | |
306 | |
307 def writeOutputFromList(self, listPossitions , outputFile): | |
308 w = open(outputFile,'w') | |
309 for s in listPossitions: | |
310 line= "\t".join(str(item) for item in s) | |
311 w.write("%s\n" % line) | |
312 w.close() | |
313 | |
314 | |
315 def run(self): | |
316 self.checkoption() | |
317 listPossitions = self.getClosestToStartSite(self._inputFile) | |
318 if self._cuffcom_tmap == "": | |
319 self.writeOutputFromList(listPossitions, self._outputFile ) | |
320 else: | |
321 listclasscode = self.getClassCodeCuffcompare(self._cuffcom_tmap, listPossitions) | |
322 self.writeOutputFromList(listclasscode, self._outputFile) | |
323 | |
324 if __name__== "__main__": | |
325 iClosestToStartSite = ClosestToStartSite() | |
326 iClosestToStartSite.setAttributesFromCmdLine() | |
327 iClosestToStartSite.run() | |
328 | |
329 | |
330 | |
331 | |
332 | |
333 |