diff smart_toolShed/SMART/Java/Python/ncList/FindOverlapsWithSeveralIntervals.py @ 0:e0f8dcca02ed

Uploaded S-MART tool. A toolbox manages RNA-Seq and ChIP-Seq data.
author yufei-luo
date Thu, 17 Jan 2013 10:52:14 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/smart_toolShed/SMART/Java/Python/ncList/FindOverlapsWithSeveralIntervals.py	Thu Jan 17 10:52:14 2013 -0500
@@ -0,0 +1,182 @@
+#! /usr/bin/env python
+#
+# Copyright INRA-URGI 2009-2010
+# 
+# This software is governed by the CeCILL license under French law and
+# abiding by the rules of distribution of free software. You can use,
+# modify and/ or redistribute the software under the terms of the CeCILL
+# license as circulated by CEA, CNRS and INRIA at the following URL
+# "http://www.cecill.info".
+# 
+# As a counterpart to the access to the source code and rights to copy,
+# modify and redistribute granted by the license, users are provided only
+# with a limited warranty and the software's author, the holder of the
+# economic rights, and the successive licensors have only limited
+# liability.
+# 
+# In this respect, the user's attention is drawn to the risks associated
+# with loading, using, modifying and/or developing or reproducing the
+# software by the user in light of its specific status of free software,
+# that may mean that it is complicated to manipulate, and that also
+# therefore means that it is reserved for developers and experienced
+# professionals having in-depth computer knowledge. Users are therefore
+# encouraged to load and test the software's suitability as regards their
+# requirements in conditions enabling the security of their systems and/or
+# data to be ensured and, more generally, to use and operate it in the
+# same conditions as regards security.
+# 
+# The fact that you are presently reading this means that you have had
+# knowledge of the CeCILL license and that you accept its terms.
+#
+
+import os, struct, time
+from optparse import OptionParser
+from commons.core.parsing.ParserChooser import ParserChooser
+from SMART.Java.Python.structure.Transcript import Transcript
+from SMART.Java.Python.ncList.NCList import NCList
+from SMART.Java.Python.ncList.NCListCursor import NCListCursor
+from SMART.Java.Python.ncList.NCListFilePickle import NCListFilePickle, NCListFileUnpickle
+from SMART.Java.Python.ncList.FileSorter import FileSorter
+from SMART.Java.Python.misc.Progress import Progress
+from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress
+from SMART.Java.Python.ncList.NCListCursor import NCListCursor
+from SMART.Java.Python.ncList.FindOverlapsWithOneInterval import FindOverlapsWithOneInterval
+
+REFERENCE = 0
+QUERY = 1
+TYPETOSTRING = {0: "reference", 1: "query"}
+
+class FindOverlapsWithSeveralIntervals(object):
+    
+    def __init__(self, verbosity = 1):
+        self._parsers            = {}
+        self._outputFileName     = "outputOverlaps.gff3"
+        self._iWriter            = None
+        self._nbLines            = {REFERENCE: 0, QUERY: 0}
+        self._verbosity          = verbosity
+        self._ncLists            = {}
+        self._sortedRefFileNames = None
+        self._transQueryFileName = None
+        self._cursors            = {}
+        self._iFowoi             = FindOverlapsWithOneInterval(self._verbosity)
+        
+    def __del__(self):
+        self.close()
+        for fileName in (self._sortedRefFileNames, self._transQueryFileName):
+            if os.path.exists(fileName):
+                os.remove(fileName)
+    
+    def close(self):
+        self._iFowoi.close()
+        
+    def setRefFileName(self, fileName, format):
+        self.setFileName(fileName, format, REFERENCE)
+        self._sortedRefFileNames = "%s_ref_sorted.pkl" % (os.path.splitext(fileName)[0])
+        
+    def setQueryFileName(self, fileName, format):
+        self.setFileName(fileName, format, QUERY)
+        self._transQueryFileName = "%s_query_trans.pkl" % (os.path.splitext(fileName)[0])
+
+    def setFileName(self, fileName, format, type):
+        chooser = ParserChooser(self._verbosity)
+        chooser.findFormat(format)
+        self._parsers[type]   = chooser.getParser(fileName)
+        
+    def setOutputFileName(self, outputFileName):
+        self._iFowoi.setOutputFileName(outputFileName)
+
+    def _sortRefFile(self):
+        fs = FileSorter(self._parsers[REFERENCE], self._verbosity-4)
+        fs.perChromosome(True)
+        fs.setOutputFileName(self._sortedRefFileNames)
+        fs.sort()
+        self._nbLines[REFERENCE]      = fs.getNbElements()
+        self._nbRefLinesPerChromosome = fs.getNbElementsPerChromosome()
+        self._splittedFileNames       = fs.getOutputFileNames()
+
+    def _translateQueryFile(self):
+        pickler = NCListFilePickle(self._transQueryFileName, self._verbosity)
+        progress = UnlimitedProgress(1000, "Translating query data", self._verbosity-4)
+        cpt      = 0
+        for queryTranscript in self._parsers[QUERY].getIterator():
+            pickler.addTranscript(queryTranscript)
+            progress.inc()
+            cpt += 1
+        progress.done()
+        self._nbLines[QUERY] = cpt
+        self._parsers[QUERY] = NCListFileUnpickle(self._transQueryFileName, self._verbosity)
+            
+    def prepareIntermediateFiles(self):
+        self._sortRefFile()
+        self._translateQueryFile()
+
+    def createNCLists(self):
+        self._ncLists = {}
+        self._indices = {}
+        self._cursors = {}
+        for chromosome, fileName in self._splittedFileNames.iteritems():
+            if self._verbosity > 3:
+                print "  chromosome %s" % (chromosome)
+            ncList = NCList(self._verbosity)
+            ncList.createIndex(True)
+            ncList.setChromosome(chromosome)
+            ncList.setFileName(fileName)
+            ncList.setNbElements(self._nbRefLinesPerChromosome[chromosome])
+            ncList.buildLists()
+            self._ncLists[chromosome] = ncList
+            cursor = NCListCursor(None, ncList, 0, self._verbosity)
+            self._cursors[chromosome] = cursor
+            self._indices[chromosome] = ncList.getIndex()
+        endTime = time.time()
+
+    def compare(self):
+        progress = Progress(self._nbLines[QUERY], "Comparing data", self._verbosity-3)
+        startTime = time.time()
+        for cpt, queryTranscript in enumerate(self._parsers[QUERY].getIterator()):
+            chromosome = queryTranscript.getChromosome()
+            if chromosome not in self._ncLists:
+                continue
+            self._iFowoi.setNCList(self._ncLists[chromosome], self._indices[chromosome])
+            self._iFowoi.setTranscript(queryTranscript)
+            self._iFowoi.compare()
+            self._iFowoi.dumpWriter()
+            progress.inc()
+        progress.done()
+        endTime = time.time()
+        self._timeSpent = endTime - startTime
+
+    def run(self):
+        startTime = time.time()
+        if self._verbosity > 2:
+            print "Creating NC-list..."
+        self.prepareIntermediateFiles()
+        self.createNCLists()
+        endTime = time.time()
+        if self._verbosity > 2:
+            print "    ...done (%.2gs)" % (endTime - startTime)
+        self.compare()
+        self.close()
+        if self._verbosity > 0:
+            print "# queries: %d" % (self._nbLines[QUERY])
+            print "# refs:    %d" % (self._nbLines[REFERENCE])
+            print "# written: %d (%d overlaps)" % (self._iFowoi._nbWritten, self._iFowoi._nbOverlaps)
+            print "time:      %.2gs" % (self._timeSpent)
+
+
+if __name__ == "__main__":
+    description = "FindOverlaps With Several Intervals v1.0.0: Finds overlaps with several query intervals. [Category: Data comparison]"
+
+    parser = OptionParser(description = description)
+    parser.add_option("-i", "--query",       dest="inputQueryFileName", action="store",            type="string",  help="Query input file [compulsory] [format: file in transcript format given by -f]")
+    parser.add_option("-f", "--queryFormat", dest="queryFormat",        action="store",            type="string",  help="format of previous file [compulsory] [format: transcript file format]")
+    parser.add_option("-j", "--ref",         dest="inputRefFileName",   action="store",            type="string",  help="Reference input file [compulsory] [format: file in transcript format given by -g]")
+    parser.add_option("-g", "--refFormat",   dest="refFormat",          action="store",            type="string",  help="format of previous file [compulsory] [format: transcript file format]")
+    parser.add_option("-o", "--output",      dest="outputFileName",     action="store",            type="string",  help="Output file [compulsory] [format: output file in GFF3 format]")
+    parser.add_option("-v", "--verbosity",   dest="verbosity",          action="store", default=1, type="int",     help="Trace level [format: int] [default: 1]")
+    (options, args) = parser.parse_args()
+    
+    iFWSI = FindOverlapsWithSeveralIntervals(options.verbosity)
+    iFWSI.setRefFileName(options.inputRefFileName, options.refFormat)
+    iFWSI.setQueryFileName(options.inputQueryFileName, options.queryFormat)
+    iFWSI.setOutputFileName(options.outputFileName)
+    iFWSI.run()