Mercurial > repos > yufei-luo > s_mart
diff SMART/Java/Python/CompareOverlappingSmallQuery.py @ 36:44d5973c188c
Uploaded
author | m-zytnicki |
---|---|
date | Tue, 30 Apr 2013 15:02:29 -0400 |
parents | |
children | 169d364ddd91 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/CompareOverlappingSmallQuery.py Tue Apr 30 15:02:29 2013 -0400 @@ -0,0 +1,261 @@ +#! /usr/bin/env python +# +# Copyright INRA-URGI 2009-2011 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
#
from optparse import OptionParser
from commons.core.parsing.ParserChooser import ParserChooser
from commons.core.writer.TranscriptWriter import TranscriptWriter
from SMART.Java.Python.structure.Interval import Interval
from SMART.Java.Python.structure.Transcript import Transcript
from SMART.Java.Python.structure.Mapping import Mapping
from SMART.Java.Python.misc.Progress import Progress
from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress

# Bin levels: a level-i bin covers 10**i positions, for i in [MINBIN, MAXBIN].
MINBIN = 3
MAXBIN = 7
# Transcript roles passed to CompareOverlappingSmallQuery._alterTranscript.
REFERENCE = 0
QUERY = 1


def getBin(start, end):
    """Return the identifier of the smallest bin containing [start, end].

    Levels are tried from MINBIN to MAXBIN.  At level i the bin index of a
    position is position // 10**i, and the identifier is
    i * 10**(MAXBIN + 1) + index.  When start and end share no bin at any
    level, the catch-all identifier (MAXBIN + 1) * 10**(MAXBIN + 1) is
    returned.
    """
    levelOffset = 10 ** (MAXBIN + 1)
    for level in range(MINBIN, MAXBIN + 1):
        binSize = 10 ** level
        # Explicit floor division: same result as the Python 2 integer "/"
        # this code was written for, and still integral under Python 3.
        startIndex = int(start // binSize)
        if startIndex == int(end // binSize):
            return int(level * levelOffset + startIndex)
    return int((MAXBIN + 1) * levelOffset)


def getOverlappingBins(start, end):
    """Return the bin identifier ranges that may hold elements overlapping [start, end].

    The result is a list of (firstBin, lastBin) tuples, one per level from
    MINBIN to MAXBIN, followed by a (catchAll, catchAll) tuple for the
    catch-all bin used by getBin.
    """
    levelOffset = 10 ** (MAXBIN + 1)
    bigBin = int((MAXBIN + 1) * levelOffset)
    binRanges = []
    for level in range(MINBIN, MAXBIN + 1):
        binSize = 10 ** level
        firstBin = int(level * levelOffset + int(start // binSize))
        lastBin = int(level * levelOffset + int(end // binSize))
        binRanges.append((firstBin, lastBin))
    binRanges.append((bigBin, bigBin))
    return binRanges


class CompareOverlappingSmallQuery(object):
    """Report the query transcripts that overlap reference transcripts.

    All queries are loaded in memory, indexed by chromosome and bin (see
    getBin); the references are then streamed and compared with the queries
    stored in the bins they may overlap.  Results are written in GFF3.
    """

    def __init__(self, verbosity):
        self.verbosity = verbosity
        self.tableNames = {}
        # Counters reported by displayResults.
        self.nbQueries = 0
        self.nbRefs = 0
        self.nbWritten = 0
        self.nbOverlaps = 0
        # Comparison options, see the corresponding setters.
        self.distance = None
        self.invert = False
        self.antisense = False
        self.collinear = False
        self.pcOverlapQuery = False
        self.pcOverlapRef = False
        self.minOverlap = False
        self.included = False
        self.including = False
        # chromosome -> bin identifier -> list of query transcripts
        self.bins = {}
        # query transcript -> {reference name: number of elements}
        self.overlaps = {}
        self.notOverlapping = False

    def setReferenceFile(self, fileName, format):
        """Create the parser used to read the references from fileName."""
        chooser = ParserChooser(self.verbosity)
        chooser.findFormat(format)
        self.refParser = chooser.getParser(fileName)

    def setQueryFile(self, fileName, format):
        """Create the parser used to read the queries from fileName."""
        chooser = ParserChooser(self.verbosity)
        chooser.findFormat(format)
        self.queryParser = chooser.getParser(fileName)

    def setOutputFile(self, fileName):
        """Create the GFF3 writer that receives the results."""
        self.writer = TranscriptWriter(fileName, "gff3", self.verbosity)

    def setDistance(self, distance):
        """Set the value given to extendExons on every reference (None: no extension)."""
        self.distance = distance

    def setInvert(self, boolean):
        """When set, output the queries that have no recorded overlap."""
        self.invert = boolean

    def setCollinear(self, boolean):
        """When set, only accept pairs having the same direction."""
        self.collinear = boolean

    def setAntisense(self, boolean):
        """When set, only accept pairs having different directions."""
        self.antisense = boolean

    def setMinPercentOverlap(self, pcOverlapQuery, pcOverlapRef):
        """Set the minimum overlap as a percentage of the query and reference sizes."""
        self.pcOverlapQuery = pcOverlapQuery
        self.pcOverlapRef = pcOverlapRef

    def setMinOverlap(self, minOverlap):
        """Set the value passed to overlapWithExon as minimum overlap."""
        self.minOverlap = minOverlap

    def setInclude(self, included, including):
        """Require the query to be included in the reference, and/or to include it."""
        self.included = included
        self.including = including

    def includeNotOverlapping(self, boolean):
        """When set, queries without any overlap are also registered for output."""
        self.notOverlapping = boolean

    def loadQuery(self):
        """Read every query and store it in the bin of its chromosome."""
        progress = UnlimitedProgress(10000, "Reading queries", self.verbosity)
        for transcript in self.queryParser.getIterator():
            if transcript.__class__.__name__ == "Mapping":
                transcript = transcript.getTranscript()
            chromosome = transcript.getChromosome()
            binId = getBin(transcript.getStart(), transcript.getEnd())
            chromosomeBins = self.bins.setdefault(chromosome, {})
            chromosomeBins.setdefault(binId, []).append(transcript)
            # Registering the query now guarantees that it reaches
            # printResults even if no reference ever overlaps it.
            if self.notOverlapping or self.invert:
                self.overlaps[transcript] = {}
            self.nbQueries += 1
            progress.inc()
        progress.done()

    def _compareTwoTranscripts(self, queryTranscript, refTranscript):
        """Return True if the pair passes the overlap test and every active filter."""
        if not queryTranscript.overlapWithExon(refTranscript):
            return False
        sameDirection = queryTranscript.getDirection() == refTranscript.getDirection()
        if self.collinear and not sameDirection:
            return False
        if self.antisense and sameDirection:
            return False
        if self.included and not refTranscript.include(queryTranscript):
            return False
        if self.including and not queryTranscript.include(refTranscript):
            return False
        querySize = queryTranscript.getSize()
        if self.pcOverlapQuery:
            minQueryOverlap = int(querySize * self.pcOverlapQuery / 100.0)
            if not queryTranscript.overlapWithExon(refTranscript, minQueryOverlap):
                return False
        refSize = refTranscript.getSize()
        if self.pcOverlapRef:
            minRefOverlap = int(refSize * self.pcOverlapRef / 100.0)
            if not queryTranscript.overlapWithExon(refTranscript, minRefOverlap):
                return False
        if self.minOverlap and not queryTranscript.overlapWithExon(refTranscript, self.minOverlap):
            return False
        return True

    def _alterTranscript(self, transcript, type):
        """Extend the exons of a reference by the configured distance, and return it.

        Transcripts of any other type are returned unchanged.
        """
        if type == REFERENCE and self.distance is not None:
            transcript.extendExons(self.distance)
        return transcript

    def _compareTranscript(self, refTranscript):
        """Record every stored query that matches refTranscript.

        For each match, self.overlaps[query][reference name] receives the
        "nbElements" tag of the reference (1 when the tag is absent), and
        self.nbOverlaps is increased by the same amount.  An empty list is
        returned when the chromosome holds no query; callers in this file
        ignore the return value.
        """
        refChromosome = refTranscript.getChromosome()
        if refChromosome not in self.bins:
            return []
        chromosomeBins = self.bins[refChromosome]
        refStart = refTranscript.getStart()
        refEnd = refTranscript.getEnd()
        for firstBin, lastBin in getOverlappingBins(refStart, refEnd):
            for binId in range(firstBin, lastBin + 1):
                if binId not in chromosomeBins:
                    continue
                for queryTranscript in chromosomeBins[binId]:
                    if not self._compareTwoTranscripts(queryTranscript, refTranscript):
                        continue
                    if queryTranscript not in self.overlaps:
                        self.overlaps[queryTranscript] = {}
                    # Computed once per match and used for both the
                    # per-query record and the global counter.
                    if "nbElements" in refTranscript.getTagNames():
                        nbElements = int(float(refTranscript.getTagValue("nbElements")))
                    else:
                        nbElements = 1
                    self.overlaps[queryTranscript][refTranscript.getName()] = nbElements
                    self.nbOverlaps += nbElements

    def _updateTranscript(self, queryTranscript):
        """Tag a query with its overlap count and the names of what it overlaps."""
        overlaps = self.overlaps[queryTranscript]
        queryTranscript.setTagValue("nbOverlaps", sum(overlaps.values()))
        if overlaps:
            # The joined list of names is cut to 100 characters.
            queryTranscript.setTagValue("overlapsWith", "--".join(overlaps.keys())[:100])
        return queryTranscript

    def compare(self):
        """Stream the references and compare each of them with the stored queries."""
        progress = UnlimitedProgress(10000, "Comparing references", self.verbosity)
        for refTranscript in self.refParser.getIterator():
            if refTranscript.__class__.__name__ == "Mapping":
                refTranscript = refTranscript.getTranscript()
            refTranscript = self._alterTranscript(refTranscript, REFERENCE)
            self._compareTranscript(refTranscript)
            self.nbRefs += 1
            progress.inc()
        progress.done()

    def printResults(self):
        """Write the selected queries and close the writer.

        In normal mode every registered query is tagged and written; in
        invert mode only the queries without overlap are written, untagged.
        """
        for transcript in self.overlaps:
            if self.invert and self.overlaps[transcript]:
                continue
            if not self.invert:
                transcript = self._updateTranscript(transcript)
            self.writer.addTranscript(transcript)
            self.nbWritten += 1
        self.writer.close()

    def displayResults(self):
        """Print the counters when verbosity is non-zero."""
        if self.verbosity:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print("# queries: %d" % (self.nbQueries))
            print("# refs: %d" % (self.nbRefs))
            print("# written: %d (%d overlaps)" % (self.nbWritten, self.nbOverlaps))

    def run(self):
        """Run the whole pipeline: load, compare, write, report."""
        self.loadQuery()
        self.compare()
        self.printResults()
        self.displayResults()


if __name__ == "__main__":

    description = "Compare Overlapping Small Query v1.0.1: Provide the queries that overlap with a reference, when the query is small. [Category: Data Comparison]"

    # NOTE(review): the bracketed markers in the help texts ([format: ...],
    # [compulsory], [default: ...]) may be read by other tools; they are
    # kept verbatim on purpose - confirm before editing them.
    parser = OptionParser(description = description)
    parser.add_option("-i", "--input1", dest="inputFileName1", action="store", type="string", help="query input file [compulsory] [format: file in transcript format given by -f]")
    parser.add_option("-f", "--format1", dest="format1", action="store", type="string", help="format of previous file [compulsory] [format: transcript file format]")
    parser.add_option("-j", "--input2", dest="inputFileName2", action="store", type="string", help="reference input file [compulsory] [format: file in transcript format given by -g]")
    parser.add_option("-g", "--format2", dest="format2", action="store", type="string", help="format of previous file [compulsory] [format: transcript file format]")
    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [format: output file in GFF3 format]")
    parser.add_option("-O", "--notOverlapping", dest="notOverlapping", action="store_true", default=False, help="also output not overlapping data [format: bool] [default: false]")
    parser.add_option("-d", "--distance", dest="distance", action="store", default=0, type="int", help="accept some distance between query and reference [format: int]")
    parser.add_option("-c", "--collinear", dest="collinear", action="store_true", default=False, help="provide collinear features [format: bool] [default: false]")
    parser.add_option("-a", "--antisense", dest="antisense", action="store_true", default=False, help="provide antisense features [format: bool] [default: false]")
    parser.add_option("-m", "--minOverlap", dest="minOverlap", action="store", default=False, type="int", help="min. #nt overlap [format: bool] [default: false]")
    parser.add_option("-p", "--pcOverlapQuery", dest="pcOverlapQuery", action="store", default=False, type="int", help="min. % overlap of the query [format: bool] [default: false]")
    parser.add_option("-P", "--pcOverlapRef", dest="pcOverlapRef", action="store", default=False, type="int", help="min. % overlap of the reference [format: bool] [default: false]")
    parser.add_option("-k", "--included", dest="included", action="store_true", default=False, help="provide query elements which are nested in reference elements [format: bool] [default: false]")
    parser.add_option("-K", "--including", dest="including", action="store_true", default=False, help="provide query elements in which reference elements are nested [format: bool] [default: false]")
    parser.add_option("-x", "--exclude", dest="exclude", action="store_true", default=False, help="invert the match [format: bool] [default: false]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
    (options, args) = parser.parse_args()

    cosq = CompareOverlappingSmallQuery(options.verbosity)
    cosq.setQueryFile(options.inputFileName1, options.format1)
    cosq.setReferenceFile(options.inputFileName2, options.format2)
    cosq.setOutputFile(options.outputFileName)
    cosq.includeNotOverlapping(options.notOverlapping)
    cosq.setDistance(options.distance)
    cosq.setCollinear(options.collinear)
    cosq.setAntisense(options.antisense)
    cosq.setMinPercentOverlap(options.pcOverlapQuery, options.pcOverlapRef)
    cosq.setMinOverlap(options.minOverlap)
    cosq.setInclude(options.included, options.including)
    cosq.setInvert(options.exclude)
    cosq.run()