6
|
1 #
|
|
2 # Copyright INRA-URGI 2009-2010
|
|
3 #
|
|
4 # This software is governed by the CeCILL license under French law and
|
|
5 # abiding by the rules of distribution of free software. You can use,
|
|
6 # modify and/ or redistribute the software under the terms of the CeCILL
|
|
7 # license as circulated by CEA, CNRS and INRIA at the following URL
|
|
8 # "http://www.cecill.info".
|
|
9 #
|
|
10 # As a counterpart to the access to the source code and rights to copy,
|
|
11 # modify and redistribute granted by the license, users are provided only
|
|
12 # with a limited warranty and the software's author, the holder of the
|
|
13 # economic rights, and the successive licensors have only limited
|
|
14 # liability.
|
|
15 #
|
|
16 # In this respect, the user's attention is drawn to the risks associated
|
|
17 # with loading, using, modifying and/or developing or reproducing the
|
|
18 # software by the user in light of its specific status of free software,
|
|
19 # that may mean that it is complicated to manipulate, and that also
|
|
20 # therefore means that it is reserved for developers and experienced
|
|
21 # professionals having in-depth computer knowledge. Users are therefore
|
|
22 # encouraged to load and test the software's suitability as regards their
|
|
23 # requirements in conditions enabling the security of their systems and/or
|
|
24 # data to be ensured and, more generally, to use and operate it in the
|
|
25 # same conditions as regards security.
|
|
26 #
|
|
27 # The fact that you are presently reading this means that you have had
|
|
28 # knowledge of the CeCILL license and that you accept its terms.
|
|
29 #
|
|
30 import os
|
|
31 import sys
|
|
32 import random
|
|
33 import subprocess
|
|
34 from SMART.Java.Python.structure.TranscriptList import TranscriptList
|
|
35 from SMART.Java.Python.structure.Transcript import Transcript
|
|
36 from SMART.Java.Python.misc.Progress import Progress
|
|
37 from commons.core.parsing.FastaParser import FastaParser
|
|
38
|
|
39
|
|
40 class RnaFoldStructure(object):
|
|
41 """
|
|
42 A structure to store the output of RNAFold
|
|
43 @ivar name: the name of the sequence
|
|
44 @type name: string
|
|
45 @ivar sequence: the sequence (with gaps)
|
|
46 @type sequence: string
|
|
47 @ivar structure: the bracket structure
|
|
48 @type structure: string
|
|
49 @ivar energy: the energy of the fold
|
|
50 @type energy: float
|
|
51 @ivar interactions: the interactions inside the structure
|
|
52 @ivar interactions: the interactions inside the structure
|
|
53 """
|
|
54
|
|
55 def __init__(self, name, sequence, structure, energy):
|
|
56 """
|
|
57 Initialize the structure
|
|
58 @param name the name of the sequence
|
|
59 @type name: string
|
|
60 @param sequence: the sequence (with gaps)
|
|
61 @type sequence: string
|
|
62 @param structure: the bracket structure
|
|
63 @type structure: string
|
|
64 @param energy: the energy of the fold
|
|
65 @type energy: float
|
|
66 """
|
|
67 self.name = name
|
|
68 self.sequence = sequence
|
|
69 self.structure = structure
|
|
70 self.energy = energy
|
|
71 self.interactions = None
|
|
72
|
|
73
|
|
74 def analyze(self):
|
|
75 """
|
|
76 Analyze the output, assign the interactions
|
|
77 """
|
|
78 if len(self.sequence) != len(self.structure):
|
|
79 sys.exit("Sizes of sequence and structure differ ('%s' and '%s')!\n" % (self.sequence, self.structure))
|
|
80 stack = []
|
|
81 self.interactions = [None for i in range(len(self.sequence))]
|
|
82 for i in range(len(self.sequence)):
|
|
83 if self.structure[i] == "(":
|
|
84 stack.append(i)
|
|
85 elif self.structure[i] == ")":
|
|
86 if not stack:
|
|
87 sys.exit("Something wrong in the interaction line '%s'!\n" % (self.structure))
|
|
88 otherI = stack.pop()
|
|
89 self.interactions[i] = otherI
|
|
90 self.interactions[otherI] = i
|
|
91 if stack:
|
|
92 sys.exit("Something wrong in the interaction line '%s'!\n" % (self.structure))
|
|
93
|
|
94
|
|
95 def getNbBulges(self, start = None, end = None):
|
|
96 """
|
|
97 Get the number of insertions in a given range (in number of letters)
|
|
98 @param start: where to start the count
|
|
99 @type start: int
|
|
100 @param end: where to end the co
|
|
101 @type end: int
|
|
102 """
|
|
103 if start == None:
|
|
104 start = 0
|
|
105 if end == None:
|
|
106 end = len(self.sequence)
|
|
107 previousInt = None
|
|
108 nbBulges = 0
|
|
109 inRegion = False
|
|
110 for i in range(len(self.sequence)):
|
|
111 if i == start:
|
|
112 inRegion = True
|
|
113 if i > end:
|
|
114 return nbBulges
|
|
115 if inRegion:
|
|
116 if self.interactions[i] == None:
|
|
117 nbBulges += 1
|
|
118 elif previousInt != None and abs(self.interactions[i] - previousInt) != 1:
|
|
119 nbBulges += 1
|
|
120 previousInt = self.interactions[i]
|
|
121 return nbBulges
|
|
122
|
|
123
|
|
124 def getStar(self, start = None, end = None):
|
|
125 """
|
|
126 Get the supposed miRNA*
|
|
127 @param start: where to start the count
|
|
128 @type start: int
|
|
129 @param end: where to end the co
|
|
130 @type end: int
|
|
131 """
|
|
132 if start == None:
|
|
133 start = 0
|
|
134 if end == None:
|
|
135 end = len(self.sequence)
|
|
136 minStar = 1000000
|
|
137 maxStar = 0
|
|
138 inRegion = False
|
|
139 for i in range(len(self.sequence)):
|
|
140 if i == start:
|
|
141 inRegion = True
|
|
142 if i > end:
|
|
143 return (minStar, maxStar)
|
|
144 if inRegion:
|
|
145 if self.interactions[i] != None:
|
|
146 minStar = min(minStar, self.interactions[i])
|
|
147 maxStar = max(maxStar, self.interactions[i])
|
|
148 return (minStar, maxStar)
|
|
149
|
|
150
|
|
151
|
|
152 class RnaFoldLauncher(object):
|
|
153 """
|
|
154 Start and parse a RNAFold instance
|
|
155 @ivar inputTranscriptList: a list of transcripts
|
|
156 @type inputTranscriptList: class L{TranscriptList<TranscriptList>}
|
|
157 @ivar genomeFileParser: a parser to the genome file
|
|
158 @type genomeFileParser: class L{SequenceListParser<SequenceListParser>}
|
|
159 @ivar bothStrands: whether folding is done on both strands
|
|
160 @type bothStrands: bool
|
|
161 @ivar fivePrimeExtension: extension towards the 5' end
|
|
162 @type fivePrimeExtension: int
|
|
163 @ivar threePrimeExtension: extension towards the 3' end
|
|
164 @type threePrimeExtension: int
|
|
165 @ivar inputTranscriptList: the input list of transcripts
|
|
166 @type inputTranscriptList: class L{TranscriptList<TranscriptList>}
|
|
167 @ivar outputTranscriptList: the output list of transcripts
|
|
168 @type outputTranscriptList: class L{TranscriptList<TranscriptList>}
|
|
169 @ivar tmpInputFileName: the name of the temporary input file for RNAFold
|
|
170 @type tmpInputFileName: string
|
|
171 @ivar tmpOutputFileName: the name of the temporary output file for RNAFold
|
|
172 @type tmpOutputFileName: string
|
|
173 @ivar verbosity: verbosity
|
|
174 @type verbosity: int [default: 0]
|
|
175 """
|
|
176
|
|
177 def __init__(self, verbosity = 0):
|
|
178 """
|
|
179 Constructor
|
|
180 @param verbosity: verbosity
|
|
181 @type verbosity: int
|
|
182 """
|
|
183 self.verbosity = verbosity
|
|
184 self.transcriptNames = []
|
|
185 randomNumber = random.randint(0, 100000)
|
|
186 self.bothStrands = True
|
|
187 self.tmpInputFileName = "tmpInput_%d.fas" % (randomNumber)
|
|
188 self.tmpOutputFileName = "tmpOutput_%d.fas" % (randomNumber)
|
|
189 self.outputTranscriptList = None
|
|
190 self.fivePrimeExtension = 0
|
|
191 self.threePrimeExtension = 0
|
|
192
|
|
193
|
|
194 def __del__(self):
|
|
195 for file in (self.tmpInputFileName, self.tmpOutputFileName):
|
|
196 if os.path.isfile(file):
|
|
197 os.remove(file)
|
|
198
|
|
199
|
|
200 def setTranscriptList(self, inputTranscriptList):
|
|
201 """
|
|
202 Set the list of transcripts
|
|
203 @ivar inputTranscriptList: a list of transcripts
|
|
204 @type inputTranscriptList: class L{TranscriptList<TranscriptList>}
|
|
205 """
|
|
206 self.inputTranscriptList = inputTranscriptList
|
|
207
|
|
208
|
|
209 def setExtensions(self, fivePrime, threePrime):
|
|
210 """
|
|
211 Set extension sizes
|
|
212 @ivar fivePrime: extension towards the 5' end
|
|
213 @type fivePrime: int
|
|
214 @ivar threePrime: extension towards the 3' end
|
|
215 @type threePrime: int
|
|
216 """
|
|
217 self.fivePrimeExtension = fivePrime
|
|
218 self.threePrimeExtension = threePrime
|
|
219
|
|
220
|
|
221 def setNbStrands(self, nbStrands):
|
|
222 """
|
|
223 Set number of strands
|
|
224 @ivar nbStrands: if 2, the work is done on both strands
|
|
225 @type nbStrands: int
|
|
226 """
|
|
227 self.nbStrands = nbStrands
|
|
228
|
|
229
|
|
230 def setGenomeFile(self, fileName):
|
|
231 """
|
|
232 Set the genome file
|
|
233 @ivar genomeFileName: the multi-FASTA file containing the genome
|
|
234 @type genomeFileName: a string
|
|
235 """
|
|
236 self.genomeFileParser = FastaParser(fileName, self.verbosity)
|
|
237
|
|
238
|
|
239 def writeInputFile(self, transcript, reverse, fivePrimeExtension, threePrimeExtension):
|
|
240 """
|
|
241 Write the RNAFold input file, containing the sequence corresponding to the transcript
|
|
242 @ivar transcript: a transcript
|
|
243 @type transcript: class L{Transcript<Transcript>}
|
|
244 @ivar reverse: invert the extensions
|
|
245 @type reverse: bool
|
|
246 """
|
|
247 transcriptCopy = Transcript(transcript)
|
|
248 transcriptCopy.removeExons()
|
|
249 if not reverse:
|
|
250 transcriptCopy.extendStart(fivePrimeExtension)
|
|
251 transcriptCopy.extendEnd(threePrimeExtension)
|
|
252 else:
|
|
253 transcriptCopy.extendStart(threePrimeExtension)
|
|
254 transcriptCopy.extendEnd(fivePrimeExtension)
|
|
255 sequence = transcriptCopy.extractSequence(self.genomeFileParser)
|
|
256 handle = open(self.tmpInputFileName, "w")
|
|
257 handle.write(">%s\n%s\n" % (sequence.getName().replace(":", "_").replace(".", "_"), sequence.getSequence()))
|
|
258 handle.close()
|
|
259
|
|
260
|
|
261 def startRnaFold(self):
|
|
262 """
|
|
263 Start RNAFold
|
|
264 """
|
|
265 command = "RNAfold < %s > %s" % (self.tmpInputFileName, self.tmpOutputFileName)
|
|
266 if self.verbosity > 100:
|
|
267 print "Launching command '%s'" % (command)
|
|
268 status = subprocess.call(command, shell=True)
|
|
269 if status != 0:
|
|
270 raise Exception("Problem with RNAFold! Input file is %s, status is: %s" % (self.tmpInputFileName, status))
|
|
271
|
|
272
|
|
273 def parseRnaFoldOutput(self):
|
|
274 """
|
|
275 Parse an output file of RNAFold
|
|
276 @return: an RnaFoldStructure
|
|
277 """
|
|
278 lines = open(self.tmpOutputFileName).readlines()
|
|
279 if len(lines) != 3:
|
|
280 raise Exception("Problem in RNAfold output! '%s'" % (lines))
|
|
281 name = lines[0].strip()[1:].split()[0]
|
|
282 sequence = lines[1].strip()
|
|
283 structure = lines[2].strip().split()[0]
|
|
284 energy = float(lines[2].strip().split(" ", 1)[1][1:-1].strip())
|
|
285 if self.verbosity > 100:
|
|
286 print "Getting sequence %s, structure %s" % (sequence, structure)
|
|
287 return RnaFoldStructure(name, sequence, structure, energy)
|
|
288
|
|
289
|
|
290 def analyzeRnaFoldOutput(self, transcript, rnaFoldOutput, reverse, fivePrimeExtension, threePrimeExtension):
|
|
291 """
|
|
292 Analyze the RNAFold
|
|
293 @ivar transcript: a transcript
|
|
294 @type transcript: class L{Transcript<Transcript>}
|
|
295 @ivar rnaFoldOutput: the output of RNAFold
|
|
296 @type rnaFoldOutput: class L{RnaFoldStructure<RnaFoldStructure>}
|
|
297 @ivar reverse: invert the extensions
|
|
298 @type reverse: bool
|
|
299 @return: a t-uple of energy, number of insertions, number of bulges, strand
|
|
300 """
|
|
301 rnaFoldOutput.analyze()
|
|
302 transcriptSize = transcript.end - transcript.start + 1
|
|
303 start = fivePrimeExtension if not reverse else threePrimeExtension
|
|
304 end = start + transcriptSize
|
|
305 energy = rnaFoldOutput.energy
|
|
306 nbBulges = rnaFoldOutput.getNbBulges(start, end)
|
|
307 (minStar, maxStar) = rnaFoldOutput.getStar(start, end)
|
|
308 minStar += transcript.start - start
|
|
309 maxStar += transcript.start - start
|
|
310 if self.verbosity > 100:
|
|
311 print "Getting structure with energy %d, nbBulges %d, miRna* %d-%d, strand %s" % (energy, nbBulges, minStar, maxStar, "-" if reverse else "+")
|
|
312 return (energy, nbBulges, minStar, maxStar, reverse)
|
|
313
|
|
314
|
|
315 def fold(self, transcript):
|
|
316 """
|
|
317 Fold a transcript (in each strand)
|
|
318 @ivar transcript: a transcript
|
|
319 @type transcript: class L{Transcript<Transcript>}
|
|
320 @return: a t-uple of energy, number of insertions, number of bulges, strand
|
|
321 """
|
|
322 results = [None] * self.nbStrands
|
|
323 strands = [False, True] if self.nbStrands == 2 else [False]
|
|
324 minNbBulges = 1000000
|
|
325 for i, reverse in enumerate(strands):
|
|
326 self.writeInputFile(transcript, reverse, self.fivePrimeExtension, self.threePrimeExtension)
|
|
327 self.startRnaFold()
|
|
328 output = self.parseRnaFoldOutput()
|
|
329 results[i] = self.analyzeRnaFoldOutput(transcript, output, reverse, self.fivePrimeExtension, self.threePrimeExtension)
|
|
330 minNbBulges = min(minNbBulges, results[i][1])
|
|
331 for result in results:
|
|
332 if result[1] == minNbBulges:
|
|
333 return result
|
|
334 return None
|
|
335
|
|
336
|
|
337 def refold(self, transcript):
|
|
338 """
|
|
339 Fold a transcript, knowing where the miRNA starts and end
|
|
340 @ivar transcript: a transcript
|
|
341 @type transcript: class L{Transcript<Transcript>}
|
|
342 @return: the energy
|
|
343 """
|
|
344 miStar = transcript.getTagValue("miRnaStar")
|
|
345 startMiStar = int(miStar.split("-")[0])
|
|
346 endMiStart = int(miStar.split("-")[1])
|
|
347 fivePrimeExtension = max(0, transcript.start - startMiStar) + 5
|
|
348 threePrimeExtension = max(0, endMiStart - transcript.end) + 5
|
|
349 self.writeInputFile(transcript, False, fivePrimeExtension, threePrimeExtension)
|
|
350 self.startRnaFold()
|
|
351 output = self.parseRnaFoldOutput()
|
|
352 result = self.analyzeRnaFoldOutput(transcript, output, False, fivePrimeExtension, threePrimeExtension)
|
|
353 return result[0]
|
|
354
|
|
355
|
|
356 def computeResults(self):
|
|
357 """
|
|
358 Fold all and fill an output transcript list with the values
|
|
359 """
|
|
360 progress = Progress(self.inputTranscriptList.getNbTranscripts(), "Handling transcripts", self.verbosity)
|
|
361 self.outputTranscriptList = TranscriptList()
|
|
362 for transcript in self.inputTranscriptList.getIterator():
|
|
363 result = self.fold(transcript)
|
|
364 transcript.setTagValue("nbBulges", result[1])
|
|
365 transcript.setTagValue("miRnaStar", "%d-%d" % (result[2], result[3]))
|
|
366 transcript.setTagValue("miRNAstrand", result[4])
|
|
367 transcript.setTagValue("energy", self.refold(transcript))
|
|
368 self.outputTranscriptList.addTranscript(transcript)
|
|
369 progress.inc()
|
|
370 progress.done()
|
|
371
|
|
372
|
|
373 def getResults(self):
|
|
374 """
|
|
375 Get an output transcript list with the values
|
|
376 """
|
|
377 if self.outputTranscriptList == None:
|
|
378 self.computeResults()
|
|
379 return self.outputTranscriptList
|