diff smart_toolShed/SMART/Java/Python/toolLauncher/RnaFoldLauncher.py @ 0:e0f8dcca02ed

Uploaded S-MART tool. A toolbox manages RNA-Seq and ChIP-Seq data.
author yufei-luo
date Thu, 17 Jan 2013 10:52:14 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/smart_toolShed/SMART/Java/Python/toolLauncher/RnaFoldLauncher.py	Thu Jan 17 10:52:14 2013 -0500
@@ -0,0 +1,379 @@
+#
+# Copyright INRA-URGI 2009-2010
+# 
+# This software is governed by the CeCILL license under French law and
+# abiding by the rules of distribution of free software. You can use,
+# modify and/ or redistribute the software under the terms of the CeCILL
+# license as circulated by CEA, CNRS and INRIA at the following URL
+# "http://www.cecill.info".
+# 
+# As a counterpart to the access to the source code and rights to copy,
+# modify and redistribute granted by the license, users are provided only
+# with a limited warranty and the software's author, the holder of the
+# economic rights, and the successive licensors have only limited
+# liability.
+# 
+# In this respect, the user's attention is drawn to the risks associated
+# with loading, using, modifying and/or developing or reproducing the
+# software by the user in light of its specific status of free software,
+# that may mean that it is complicated to manipulate, and that also
+# therefore means that it is reserved for developers and experienced
+# professionals having in-depth computer knowledge. Users are therefore
+# encouraged to load and test the software's suitability as regards their
+# requirements in conditions enabling the security of their systems and/or
+# data to be ensured and, more generally, to use and operate it in the
+# same conditions as regards security.
+# 
+# The fact that you are presently reading this means that you have had
+# knowledge of the CeCILL license and that you accept its terms.
+#
+import os
+import sys
+import random
+import subprocess
+from SMART.Java.Python.structure.TranscriptList import TranscriptList
+from SMART.Java.Python.structure.Transcript import Transcript
+from SMART.Java.Python.misc.Progress import Progress
+from commons.core.parsing.FastaParser import FastaParser
+
+
+class RnaFoldStructure(object):
+    """
+    A structure to store the output of RNAFold
+    @ivar name:         the name of the sequence
+    @type name:         string
+    @ivar sequence:     the sequence (with gaps)
+    @type sequence:     string
+    @ivar structure:    the bracket structure
+    @type structure:    string
+    @ivar energy:       the energy of the fold
+    @type energy:       float
+    @ivar interactions: the interactions inside the structure
+    @ivar interactions: the interactions inside the structure
+    """
+
+    def __init__(self, name, sequence, structure, energy):
+        """
+        Initialize the structure
+        @param name       the name of the sequence
+        @type  name:      string
+        @param sequence:  the sequence (with gaps)
+        @type  sequence:  string
+        @param structure: the bracket structure
+        @type  structure: string
+        @param energy:    the energy of the fold
+        @type  energy:    float
+        """
+        self.name         = name        
+        self.sequence     = sequence
+        self.structure    = structure
+        self.energy       = energy
+        self.interactions = None
+
+    
+    def analyze(self):
+        """
+        Analyze the output, assign the interactions
+        """
+        if len(self.sequence) != len(self.structure):
+            sys.exit("Sizes of sequence and structure differ ('%s' and '%s')!\n" % (self.sequence, self.structure))
+        stack                         = []
+        self.interactions = [None for i in range(len(self.sequence))]
+        for i in range(len(self.sequence)):
+            if self.structure[i] == "(":
+                stack.append(i)
+            elif self.structure[i] == ")":
+                if not stack:
+                    sys.exit("Something wrong in the interaction line '%s'!\n" % (self.structure))
+                otherI = stack.pop()
+                self.interactions[i]      = otherI
+                self.interactions[otherI] = i
+        if stack:
+            sys.exit("Something wrong in the interaction line '%s'!\n" % (self.structure))
+
+
+    def getNbBulges(self, start = None, end = None):
+        """
+        Get the number of insertions in a given range (in number of letters)
+        @param start: where to start the count
+        @type    start: int
+        @param end:     where to end the co
+        @type    end:     int
+        """
+        if start == None:
+            start = 0
+        if end == None:
+            end = len(self.sequence)
+        previousInt = None
+        nbBulges    = 0
+        inRegion    = False
+        for i in range(len(self.sequence)):
+            if i == start:
+                inRegion = True
+            if i > end:
+                return nbBulges
+            if inRegion:
+                if self.interactions[i] == None:
+                    nbBulges += 1
+                elif previousInt != None and abs(self.interactions[i] - previousInt) != 1:
+                    nbBulges += 1
+            previousInt = self.interactions[i]
+        return nbBulges
+
+
+    def getStar(self, start = None, end = None):
+        """
+        Get the supposed miRNA*
+        @param start: where to start the count
+        @type    start: int
+        @param end:     where to end the co
+        @type    end:     int
+        """
+        if start == None:
+            start = 0
+        if end == None:
+            end = len(self.sequence)
+        minStar  = 1000000
+        maxStar  = 0
+        inRegion = False
+        for i in range(len(self.sequence)):
+            if i == start:
+                inRegion = True
+            if i > end:
+                return (minStar, maxStar)
+            if inRegion:
+                if self.interactions[i] != None:
+                    minStar = min(minStar, self.interactions[i])
+                    maxStar = max(maxStar, self.interactions[i])
+        return (minStar, maxStar)
+
+
+
+class RnaFoldLauncher(object):
+    """
+    Start and parse a RNAFold instance
+    @ivar inputTranscriptList:  a list of transcripts
+    @type inputTranscriptList:  class L{TranscriptList<TranscriptList>}
+    @ivar genomeFileParser:     a parser to the genome file
+    @type genomeFileParser:     class L{SequenceListParser<SequenceListParser>}
+    @ivar bothStrands:          whether folding is done on both strands
+    @type bothStrands:          bool
+    @ivar fivePrimeExtension:   extension towards the 5' end
+    @type fivePrimeExtension:   int
+    @ivar threePrimeExtension:  extension towards the 3' end
+    @type threePrimeExtension:  int
+    @ivar inputTranscriptList:  the input list of transcripts
+    @type inputTranscriptList:  class L{TranscriptList<TranscriptList>}
+    @ivar outputTranscriptList: the output list of transcripts
+    @type outputTranscriptList: class L{TranscriptList<TranscriptList>}
+    @ivar tmpInputFileName:     the name of the temporary input file for RNAFold
+    @type tmpInputFileName:     string
+    @ivar tmpOutputFileName:    the name of the temporary output file for RNAFold
+    @type tmpOutputFileName:    string
+    @ivar verbosity:            verbosity
+    @type verbosity:            int [default: 0]
+    """
+
+    def __init__(self, verbosity = 0):
+        """
+        Constructor
+        @param verbosity: verbosity
+        @type  verbosity: int
+        """
+        self.verbosity            = verbosity
+        self.transcriptNames      = []
+        randomNumber              = random.randint(0, 100000)
+        self.bothStrands          = True
+        self.tmpInputFileName     = "tmpInput_%d.fas" % (randomNumber)
+        self.tmpOutputFileName    = "tmpOutput_%d.fas" % (randomNumber)
+        self.outputTranscriptList = None
+        self.fivePrimeExtension   = 0
+        self.threePrimeExtension  = 0
+
+
+    def __del__(self):
+        for file in (self.tmpInputFileName, self.tmpOutputFileName):
+            if os.path.isfile(file):
+                os.remove(file)
+
+
+    def setTranscriptList(self, inputTranscriptList):
+        """
+        Set the list of transcripts
+        @ivar inputTranscriptList: a list of transcripts
+        @type inputTranscriptList: class L{TranscriptList<TranscriptList>}
+        """
+        self.inputTranscriptList = inputTranscriptList
+
+
+    def setExtensions(self, fivePrime, threePrime):
+        """
+        Set extension sizes
+        @ivar fivePrime:  extension towards the 5' end
+        @type fivePrime:  int
+        @ivar threePrime: extension towards the 3' end
+        @type threePrime: int
+        """
+        self.fivePrimeExtension  = fivePrime
+        self.threePrimeExtension = threePrime
+
+
+    def setNbStrands(self, nbStrands):
+        """
+        Set number of strands
+        @ivar nbStrands: if 2, the work is done on both strands
+        @type nbStrands: int
+        """
+        self.nbStrands = nbStrands
+
+
+    def setGenomeFile(self, fileName):
+        """
+        Set the genome file
+        @ivar genomeFileName: the multi-FASTA file containing the genome
+        @type genomeFileName: a string
+        """
+        self.genomeFileParser = FastaParser(fileName, self.verbosity)
+
+
+    def writeInputFile(self, transcript, reverse, fivePrimeExtension, threePrimeExtension):
+        """
+        Write the RNAFold input file, containing the sequence corresponding to the transcript
+        @ivar transcript: a transcript
+        @type transcript: class L{Transcript<Transcript>}
+        @ivar reverse:    invert the extensions
+        @type reverse:    bool
+        """
+        transcriptCopy = Transcript(transcript)
+        transcriptCopy.removeExons()
+        if not reverse:
+            transcriptCopy.extendStart(fivePrimeExtension)
+            transcriptCopy.extendEnd(threePrimeExtension)
+        else:
+            transcriptCopy.extendStart(threePrimeExtension)
+            transcriptCopy.extendEnd(fivePrimeExtension)
+        sequence = transcriptCopy.extractSequence(self.genomeFileParser)
+        handle   = open(self.tmpInputFileName, "w")
+        handle.write(">%s\n%s\n" % (sequence.getName().replace(":", "_").replace(".", "_"), sequence.getSequence()))
+        handle.close()
+
+
+    def startRnaFold(self):
+        """
+        Start RNAFold
+        """
+        command = "RNAfold < %s > %s" % (self.tmpInputFileName, self.tmpOutputFileName)
+        if self.verbosity > 100:
+            print "Launching command '%s'" % (command)
+        status = subprocess.call(command, shell=True)
+        if status != 0:
+            raise Exception("Problem with RNAFold! Input file is %s, status is: %s" % (self.tmpInputFileName, status))
+
+    
+    def parseRnaFoldOutput(self):
+        """
+        Parse an output file of RNAFold
+        @return: an RnaFoldStructure
+        """
+        lines = open(self.tmpOutputFileName).readlines()
+        if len(lines) != 3:
+            raise Exception("Problem in RNAfold output! '%s'" % (lines))
+        name      = lines[0].strip()[1:].split()[0]
+        sequence  = lines[1].strip()
+        structure = lines[2].strip().split()[0]
+        energy    = float(lines[2].strip().split(" ", 1)[1][1:-1].strip())
+        if self.verbosity > 100:
+            print "Getting sequence %s, structure %s" % (sequence, structure)
+        return RnaFoldStructure(name, sequence, structure, energy)
+
+
+    def analyzeRnaFoldOutput(self, transcript, rnaFoldOutput, reverse, fivePrimeExtension, threePrimeExtension):
+        """
+        Analyze the RNAFold
+        @ivar transcript:    a transcript
+        @type transcript:    class L{Transcript<Transcript>}
+        @ivar rnaFoldOutput: the output of RNAFold
+        @type rnaFoldOutput: class L{RnaFoldStructure<RnaFoldStructure>}
+        @ivar reverse:       invert the extensions
+        @type reverse:       bool
+        @return:             a t-uple of energy, number of insertions, number of bulges, strand
+        """
+        rnaFoldOutput.analyze()
+        transcriptSize     = transcript.end - transcript.start + 1
+        start              = fivePrimeExtension if not reverse else threePrimeExtension
+        end                = start + transcriptSize
+        energy             = rnaFoldOutput.energy
+        nbBulges           = rnaFoldOutput.getNbBulges(start, end)
+        (minStar, maxStar) = rnaFoldOutput.getStar(start, end)
+        minStar           += transcript.start - start
+        maxStar           += transcript.start - start
+        if self.verbosity > 100:
+            print "Getting structure with energy %d, nbBulges %d, miRna* %d-%d, strand %s" % (energy, nbBulges, minStar, maxStar, "-" if reverse else "+")
+        return (energy, nbBulges, minStar, maxStar, reverse)
+
+    
+    def fold(self, transcript):
+        """
+        Fold a transcript (in each strand)
+        @ivar transcript: a transcript
+        @type transcript: class L{Transcript<Transcript>}
+        @return:          a t-uple of energy, number of insertions, number of bulges, strand
+        """
+        results     = [None] * self.nbStrands
+        strands     = [False, True] if self.nbStrands == 2 else [False]
+        minNbBulges = 1000000
+        for i, reverse in enumerate(strands):
+            self.writeInputFile(transcript, reverse, self.fivePrimeExtension, self.threePrimeExtension)
+            self.startRnaFold()
+            output = self.parseRnaFoldOutput()
+            results[i]  = self.analyzeRnaFoldOutput(transcript, output, reverse, self.fivePrimeExtension, self.threePrimeExtension)
+            minNbBulges = min(minNbBulges, results[i][1])
+        for result in results:
+            if result[1] == minNbBulges:
+                return result
+        return None
+
+
+    def refold(self, transcript):
+        """
+        Fold a transcript, knowing where the miRNA starts and end
+        @ivar transcript: a transcript
+        @type transcript: class L{Transcript<Transcript>}
+        @return:          the energy
+        """
+        miStar              = transcript.getTagValue("miRnaStar")
+        startMiStar         = int(miStar.split("-")[0])
+        endMiStart          = int(miStar.split("-")[1])
+        fivePrimeExtension  = max(0, transcript.start - startMiStar) + 5
+        threePrimeExtension = max(0, endMiStart - transcript.end) + 5
+        self.writeInputFile(transcript, False, fivePrimeExtension, threePrimeExtension)
+        self.startRnaFold()
+        output = self.parseRnaFoldOutput()
+        result = self.analyzeRnaFoldOutput(transcript, output, False, fivePrimeExtension, threePrimeExtension)
+        return result[0]
+
+
+    def computeResults(self):
+        """
+        Fold all and fill an output transcript list with the values
+        """
+        progress = Progress(self.inputTranscriptList.getNbTranscripts(), "Handling transcripts", self.verbosity)
+        self.outputTranscriptList = TranscriptList()
+        for transcript in self.inputTranscriptList.getIterator():
+            result = self.fold(transcript)
+            transcript.setTagValue("nbBulges", result[1])
+            transcript.setTagValue("miRnaStar", "%d-%d" % (result[2], result[3]))
+            transcript.setTagValue("miRNAstrand", result[4])
+            transcript.setTagValue("energy", self.refold(transcript))
+            self.outputTranscriptList.addTranscript(transcript)
+            progress.inc()
+        progress.done()
+
+
+    def getResults(self):
+        """
+        Get an output transcript list with the values
+        """
+        if self.outputTranscriptList == None:
+            self.computeResults()
+        return self.outputTranscriptList