| 6 | 1 #! /usr/bin/env python | 
|  | 2 # | 
|  | 3 # Copyright INRA-URGI 2009-2012 | 
|  | 4 # | 
|  | 5 # This software is governed by the CeCILL license under French law and | 
|  | 6 # abiding by the rules of distribution of free software. You can use, | 
|  | 7 # modify and/ or redistribute the software under the terms of the CeCILL | 
|  | 8 # license as circulated by CEA, CNRS and INRIA at the following URL | 
|  | 9 # "http://www.cecill.info". | 
|  | 10 # | 
|  | 11 # As a counterpart to the access to the source code and rights to copy, | 
|  | 12 # modify and redistribute granted by the license, users are provided only | 
|  | 13 # with a limited warranty and the software's author, the holder of the | 
|  | 14 # economic rights, and the successive licensors have only limited | 
|  | 15 # liability. | 
|  | 16 # | 
|  | 17 # In this respect, the user's attention is drawn to the risks associated | 
|  | 18 # with loading, using, modifying and/or developing or reproducing the | 
|  | 19 # software by the user in light of its specific status of free software, | 
|  | 20 # that may mean that it is complicated to manipulate, and that also | 
|  | 21 # therefore means that it is reserved for developers and experienced | 
|  | 22 # professionals having in-depth computer knowledge. Users are therefore | 
|  | 23 # encouraged to load and test the software's suitability as regards their | 
|  | 24 # requirements in conditions enabling the security of their systems and/or | 
|  | 25 # data to be ensured and, more generally, to use and operate it in the | 
|  | 26 # same conditions as regards security. | 
|  | 27 # | 
|  | 28 # The fact that you are presently reading this means that you have had | 
|  | 29 # knowledge of the CeCILL license and that you accept its terms. | 
|  | 30 # | 
|  | 31 import os | 
|  | 32 from optparse import OptionParser, OptionGroup | 
|  | 33 from commons.core.parsing.ParserChooser import ParserChooser | 
|  | 34 from commons.core.writer.Gff3Writer import Gff3Writer | 
|  | 35 from SMART.Java.Python.structure.Transcript import Transcript | 
|  | 36 from SMART.Java.Python.ncList.NCListFilePickle import NCListFileUnpickle | 
|  | 37 from SMART.Java.Python.ncList.FileSorter import FileSorter | 
|  | 38 from SMART.Java.Python.misc.Progress import Progress | 
|  | 39 from SMART.Java.Python.misc import Utils | 
|  | 40 | 
|  | 41 | 
class GetUpDownStream(object):
    """Write the regions flanking each annotation of an input file as GFF3.

    Typical use: setInputFile(), setOutputFile(), setDistances(), then run().
    The input is first sorted into one temporary file per chromosome; those
    temporary files are removed when the object is destroyed.
    """

    def __init__(self, verbosity = 0):
        """Create an idle extractor.

        verbosity -- trace level forwarded to the parser, sorter, progress
                     bar and writer helpers.
        """
        self.verbosity               = verbosity
        self.inputReader             = None
        self.outputWriter            = None
        self.parser                  = None
        self.sortedFileName          = None
        # Distances default to 0 so that run() does not fail with an
        # AttributeError when setDistances() was never called.
        self.upDistance              = 0
        self.downDistance            = 0
        self.nbRead                  = 0
        self.nbWritten               = 0
        self.nbMerges                = 0
        self.nbElementsPerChromosome = {}
        # chromosome -> temporary sorted file, filled in by _sortFile()
        self.splittedFileNames       = {}

    def __del__(self):
        """Remove the temporary per-chromosome files (best effort)."""
        # getattr: __del__ may run on an instance whose __init__ did not
        # complete, in which case the attribute does not exist.
        fileNames = getattr(self, "splittedFileNames", None) or {}
        for fileName in fileNames.values():
            try:
                os.remove(fileName)
            except OSError:
                # Already gone or not removable: keep cleaning the other
                # files instead of aborting on the first failure.
                pass
        # Forget the files so a second call does not try to remove them again.
        self.splittedFileNames = {}

    def setInputFile(self, fileName, format):
        """Select the input file and its format.

        fileName -- path of the annotation file to read
        format   -- format name handed to ParserChooser.findFormat()

        Also derives the name of the sorted pickle file from fileName
        (extension replaced by "_sorted.pkl").
        """
        parserChooser = ParserChooser(self.verbosity)
        parserChooser.findFormat(format, "transcript")
        self.parser = parserChooser.getParser(fileName)
        self.sortedFileName = "%s_sorted.pkl" % (os.path.splitext(fileName)[0])

    def setOutputFile(self, fileName):
        """Open a GFF3 writer on fileName."""
        self.outputWriter = Gff3Writer(fileName, self.verbosity)

    def setDistances(self, up, down):
        """Set the sizes of the upstream and downstream regions to report."""
        self.upDistance   = up
        self.downDistance = down

    def _sortFile(self):
        """Sort the input with FileSorter and record its per-chromosome output.

        Fills splittedFileNames, nbElementsPerChromosome and nbRead from the
        sorter.
        """
        fs = FileSorter(self.parser, self.verbosity-4)
        # presumably asks for one output file per chromosome -- TODO confirm
        fs.perChromosome(True)
        fs.setOutputFileName(self.sortedFileName)
        fs.sort()
        self.splittedFileNames       = fs.getOutputFileNames()
        self.nbElementsPerChromosome = fs.getNbElementsPerChromosome()
        self.nbRead                  = fs.getNbElements()

    def _write(self, start, end, reference, after):
        """Write the region [start, end] next to reference, unless it is empty.

        start, end -- bounds of the flanking region
        reference  -- transcript the region belongs to; provides the
                      chromosome and the name suffix
        after      -- True when the region lies after reference's end,
                      False when it lies before its start

        The region is always written on the "+" strand and is named
        "up_<name>" or "down_<name>".
        """
        if start > end:
            return
        # Before a direction-1 feature, or after any other feature, is "up".
        # (assumes Utils.xor is a plain boolean exclusive or)
        isUp = Utils.xor(reference.getDirection() == 1, after)
        transcript = Transcript()
        transcript.setChromosome(reference.getChromosome())
        transcript.setStart(start)
        transcript.setEnd(end)
        transcript.setDirection("+")
        transcript.setName("%s_%s" % ("up" if isUp else "down", reference.getName()))
        self.outputWriter.addTranscript(transcript)
        self.nbWritten += 1

    def _getFlanking(self, chromosome):
        """Write the flanking regions of every transcript of one chromosome.

        Transcripts are read from the sorted temporary file of chromosome.
        Between two consecutive non-overlapping transcripts, each one gets
        its own flanking region; when the gap is too small for both, the gap
        is shared proportionally to the two requested distances.
        """
        progress    = Progress(self.nbElementsPerChromosome[chromosome], "Analyzing chromosome %s" % (chromosome), self.verbosity)
        parser      = NCListFileUnpickle(self.splittedFileNames[chromosome], self.verbosity)
        previous    = None
        for transcript in parser.getIterator():
            progress.inc()
            transcript.removeExons()
            if previous is None:
                # First transcript: only the region before it, clipped so it
                # never starts below position 1.
                distance = self.upDistance if transcript.getDirection() == 1 else self.downDistance
                start    = max(1, transcript.getStart() - distance)
                self._write(start, transcript.getStart()-1, transcript, False)
                previous = transcript
                continue
            # presumably: transcript lies entirely inside previous -- skip it
            if previous.include(transcript):
                continue
            # Overlapping neighbours: no region is written between them.
            if transcript.overlapWith(previous):
                previous = transcript
                continue
            distancePrevious = self.downDistance if previous.getDirection()   == 1 else self.upDistance
            distanceCurrent  = self.upDistance   if transcript.getDirection() == 1 else self.downDistance
            distance = transcript.getDistance(previous)
            if distancePrevious + distanceCurrent == 0:
                # Nothing requested on either side; also avoids dividing by
                # zero in the proportional split below.
                previous = transcript
                continue
            # NOTE(review): if getDistance() returns current start minus
            # previous end (TODO confirm), distance == distancePrevious +
            # distanceCurrent makes the two regions share one position;
            # a strict ">" may be what is intended here.
            if distance >= distancePrevious + distanceCurrent:
                endPrevious  = previous.getEnd() + distancePrevious
                startCurrent = transcript.getStart() - distanceCurrent
            else:
                # Not enough room: split the gap in proportion to the sizes
                # requested on each side.
                middle       = previous.getEnd() + int((distance-1) * float(distancePrevious) / (distancePrevious + distanceCurrent))
                endPrevious  = middle
                startCurrent = middle+1
            self._write(previous.getEnd() + 1, endPrevious, previous, True)
            self._write(startCurrent, transcript.getStart() - 1, transcript, False)
            previous = transcript
        # Region after the last transcript; skipped when the chromosome file
        # yielded no transcript at all.
        if previous is not None:
            distance = self.downDistance if previous.getDirection() == 1 else self.upDistance
            self._write(previous.getEnd() + 1, previous.getEnd() + distance, previous, True)
        progress.done()

    def run(self):
        """Sort the input, process each chromosome in name order, close the output."""
        self._sortFile()
        try:
            for chromosome in sorted(self.nbElementsPerChromosome.keys()):
                self._getFlanking(chromosome)
        finally:
            # Close the writer even if the analysis of a chromosome fails.
            self.outputWriter.close()
|  | 133 | 
if __name__ == "__main__":

    # parse command line
    description = "Get Up and Down Stream v1.0.0: Get the flanking regions of an annotation. [Category: Data Modification]"

    parser = OptionParser(description = description)
    parser.add_option("-i", "--input",     dest="inputFileName",  action="store",                     type="string", help="input file [compulsory] [format: file in mapping format given by -f]")
    parser.add_option("-f", "--format",    dest="format",         action="store",                     type="string", help="format of the file [compulsory] [format: mapping file format]")
    parser.add_option("-o", "--output",    dest="outputFileName", action="store",                     type="string", help="output file [compulsory] [format: output file in GFF3 format]")
    parser.add_option("-u", "--up",        dest="up",             action="store",      default=0,     type="int",    help="the upstream distance  [format: int]")
    parser.add_option("-d", "--down",      dest="down",           action="store",      default=0,     type="int",    help="the downstream distance  [format: int]")
    parser.add_option("-v", "--verbosity", dest="verbosity",      action="store",      default=1,     type="int",    help="trace level [default: 1] [format: int]")
    (options, args) = parser.parse_args()

    # The options documented as compulsory have no default: report a usage
    # error instead of failing later with a traceback on a None value.
    for dest, flags in (("inputFileName", "-i/--input"), ("format", "-f/--format"), ("outputFileName", "-o/--output")):
        if getattr(options, dest) is None:
            parser.error("option %s is compulsory" % (flags))

    # run the extraction
    guds = GetUpDownStream(options.verbosity)
    guds.setInputFile(options.inputFileName, options.format)
    guds.setOutputFile(options.outputFileName)
    guds.setDistances(options.up, options.down)
    guds.run()