| 
6
 | 
     1 #! /usr/bin/env python
 | 
| 
 | 
     2 #
 | 
| 
 | 
     3 # Copyright INRA-URGI 2009-2010
 | 
| 
 | 
     4 # 
 | 
| 
 | 
     5 # This software is governed by the CeCILL license under French law and
 | 
| 
 | 
     6 # abiding by the rules of distribution of free software. You can use,
 | 
| 
 | 
     7 # modify and/ or redistribute the software under the terms of the CeCILL
 | 
| 
 | 
     8 # license as circulated by CEA, CNRS and INRIA at the following URL
 | 
| 
 | 
     9 # "http://www.cecill.info".
 | 
| 
 | 
    10 # 
 | 
| 
 | 
    11 # As a counterpart to the access to the source code and rights to copy,
 | 
| 
 | 
    12 # modify and redistribute granted by the license, users are provided only
 | 
| 
 | 
    13 # with a limited warranty and the software's author, the holder of the
 | 
| 
 | 
    14 # economic rights, and the successive licensors have only limited
 | 
| 
 | 
    15 # liability.
 | 
| 
 | 
    16 # 
 | 
| 
 | 
    17 # In this respect, the user's attention is drawn to the risks associated
 | 
| 
 | 
    18 # with loading, using, modifying and/or developing or reproducing the
 | 
| 
 | 
    19 # software by the user in light of its specific status of free software,
 | 
| 
 | 
    20 # that may mean that it is complicated to manipulate, and that also
 | 
| 
 | 
    21 # therefore means that it is reserved for developers and experienced
 | 
| 
 | 
    22 # professionals having in-depth computer knowledge. Users are therefore
 | 
| 
 | 
    23 # encouraged to load and test the software's suitability as regards their
 | 
| 
 | 
    24 # requirements in conditions enabling the security of their systems and/or
 | 
| 
 | 
    25 # data to be ensured and, more generally, to use and operate it in the
 | 
| 
 | 
    26 # same conditions as regards security.
 | 
| 
 | 
    27 # 
 | 
| 
 | 
    28 # The fact that you are presently reading this means that you have had
 | 
| 
 | 
    29 # knowledge of the CeCILL license and that you accept its terms.
 | 
| 
 | 
    30 #
 | 
| 
 | 
    31 import os
 | 
| 
 | 
    32 from optparse import OptionParser, OptionGroup
 | 
| 
 | 
    33 from commons.core.parsing.ParserChooser import ParserChooser
 | 
| 
 | 
    34 from commons.core.writer.Gff3Writer import Gff3Writer
 | 
| 
 | 
    35 from SMART.Java.Python.structure.Transcript import Transcript
 | 
| 
 | 
    36 from SMART.Java.Python.ncList.NCListFilePickle import NCListFileUnpickle
 | 
| 
 | 
    37 from SMART.Java.Python.ncList.FileSorter import FileSorter
 | 
| 
 | 
    38 from SMART.Java.Python.misc.Progress import Progress
 | 
| 
 | 
    39 
 | 
| 
 | 
    40 
 | 
| 
 | 
    41 class CollapseReads(object):
 | 
| 
 | 
    42     """
 | 
| 
 | 
    43     Merge two reads if they have exactly the same genomic coordinates
 | 
| 
 | 
    44     """
 | 
| 
 | 
    45 
 | 
| 
 | 
    46     def __init__(self, verbosity = 0):
 | 
| 
 | 
    47         self.verbosity         = verbosity
 | 
| 
 | 
    48         self.inputReader       = None
 | 
| 
 | 
    49         self.outputWriter      = None
 | 
| 
 | 
    50         self.strands           = True
 | 
| 
 | 
    51         self.nbRead            = 0
 | 
| 
 | 
    52         self.nbWritten         = 0
 | 
| 
 | 
    53         self.nbMerges          = 0
 | 
| 
 | 
    54         self.splittedFileNames = {}
 | 
| 
 | 
    55 
 | 
| 
 | 
    56     def __del__(self):
 | 
| 
 | 
    57         for fileName in self.splittedFileNames.values():
 | 
| 
 | 
    58             os.remove(fileName)
 | 
| 
 | 
    59             
 | 
| 
 | 
    60     def close(self):
 | 
| 
 | 
    61         self.outputWriter.close()
 | 
| 
 | 
    62         
 | 
| 
 | 
    63     def setInputFile(self, fileName, format):
 | 
| 
 | 
    64         parserChooser = ParserChooser(self.verbosity)
 | 
| 
18
 | 
    65         parserChooser.findFormat(format)
 | 
| 
6
 | 
    66         self.parser = parserChooser.getParser(fileName)
 | 
| 
 | 
    67         self.sortedFileName = "%s_sorted.pkl" % (os.path.splitext(fileName)[0])
 | 
| 
 | 
    68 
 | 
| 
 | 
    69     def setOutputFile(self, fileName):
 | 
| 
 | 
    70         self.outputWriter = Gff3Writer(fileName, self.verbosity)
 | 
| 
 | 
    71 
 | 
| 
 | 
    72     def getNbElements(self):
 | 
| 
 | 
    73         return self.parser.getNbTranscripts()
 | 
| 
 | 
    74 
 | 
| 
 | 
    75     def _sortFile(self):
 | 
| 
 | 
    76         fs = FileSorter(self.parser, self.verbosity-4)
 | 
| 
 | 
    77         fs.perChromosome(True)
 | 
| 
 | 
    78         fs.setOutputFileName(self.sortedFileName)
 | 
| 
 | 
    79         fs.sort()
 | 
| 
 | 
    80         self.splittedFileNames       = fs.getOutputFileNames()
 | 
| 
 | 
    81         self.nbElementsPerChromosome = fs.getNbElementsPerChromosome()
 | 
| 
 | 
    82         self.nbRead                  = fs.getNbElements()
 | 
| 
 | 
    83         
 | 
| 
 | 
    84     def _iterate(self, chromosome):
 | 
| 
 | 
    85         progress    = Progress(self.nbElementsPerChromosome[chromosome], "Checking chromosome %s" % (chromosome), self.verbosity)
 | 
| 
 | 
    86         transcripts = []
 | 
| 
 | 
    87         parser      = NCListFileUnpickle(self.splittedFileNames[chromosome], self.verbosity)
 | 
| 
 | 
    88         for newTranscript in parser.getIterator():
 | 
| 
 | 
    89             newTranscripts = []
 | 
| 
 | 
    90             for oldTranscript in transcripts:
 | 
| 
 | 
    91                 if self._checkOverlap(newTranscript, oldTranscript):
 | 
| 
 | 
    92                     self._merge(newTranscript, oldTranscript)
 | 
| 
 | 
    93                 elif self._checkPassed(newTranscript, oldTranscript):
 | 
| 
 | 
    94                     self._write(oldTranscript)
 | 
| 
 | 
    95                 else:
 | 
| 
 | 
    96                     newTranscripts.append(oldTranscript)
 | 
| 
 | 
    97             newTranscripts.append(newTranscript)
 | 
| 
 | 
    98             transcripts = newTranscripts
 | 
| 
 | 
    99             progress.inc()
 | 
| 
 | 
   100         for transcript in transcripts:
 | 
| 
 | 
   101             self._write(transcript)
 | 
| 
 | 
   102         progress.done()
 | 
| 
 | 
   103 
 | 
| 
 | 
   104     def _merge(self, transcript1, transcript2):
 | 
| 
 | 
   105         self.nbMerges += 1
 | 
| 
 | 
   106         transcript2.setDirection(transcript1.getDirection())
 | 
| 
 | 
   107         transcript1.merge(transcript2)
 | 
| 
 | 
   108 
 | 
| 
 | 
   109     def _write(self, transcript):
 | 
| 
 | 
   110         self.nbWritten += 1
 | 
| 
 | 
   111         self.outputWriter.addTranscript(transcript)
 | 
| 
 | 
   112 
 | 
| 
 | 
   113     def _checkOverlap(self, transcript1, transcript2):
 | 
| 
 | 
   114         if transcript1.getStart() != transcript2.getStart() or transcript1.getEnd() != transcript2.getEnd():
 | 
| 
 | 
   115             return False
 | 
| 
 | 
   116         return (not self.strands or transcript1.getDirection() == transcript2.getDirection())
 | 
| 
 | 
   117 
 | 
| 
 | 
   118     def _checkPassed(self, transcript1, transcript2):
 | 
| 
 | 
   119         return (transcript2.getStart() < transcript1.getStart())
 | 
| 
 | 
   120 
 | 
| 
 | 
   121     def collapseChromosome(self, chromosome):
 | 
| 
 | 
   122         progress            = Progress(table.getNbElements(), "Analysing chromosome %s" % (chromosome), self.verbosity)
 | 
| 
 | 
   123         command             = "SELECT * FROM %s ORDER BY start ASC, end DESC" % (table.name)
 | 
| 
 | 
   124         transcriptStart     = None
 | 
| 
 | 
   125         transcriptEnd       = None
 | 
| 
 | 
   126         transcriptDirection = None
 | 
| 
 | 
   127         currentTranscript   = None
 | 
| 
 | 
   128         if self.strands:
 | 
| 
 | 
   129             command += ", direction"
 | 
| 
 | 
   130         for index, transcript in table.selectTranscripts(command, True):
 | 
| 
 | 
   131             self.nbRead += 1
 | 
| 
 | 
   132             if not self.strands:
 | 
| 
 | 
   133                 transcript.setDirection("+")
 | 
| 
 | 
   134             if transcriptStart != transcript.getStart() or transcriptEnd != transcript.getEnd() or transcriptDirection != transcript.getDirection():
 | 
| 
 | 
   135                 self.writeTranscript(currentTranscript)
 | 
| 
 | 
   136                 transcriptStart     = transcript.getStart()
 | 
| 
 | 
   137                 transcriptEnd       = transcript.getEnd()
 | 
| 
 | 
   138                 transcriptDirection = transcript.getDirection()
 | 
| 
 | 
   139                 currentTranscript   = transcript
 | 
| 
 | 
   140             else:
 | 
| 
 | 
   141                 currentTranscript.setTagValue("nbElements", (currentTranscript.getTagValue("nbElements") + 1) if "nbElements" in currentTranscript.getTagNames() else 1)
 | 
| 
 | 
   142             progress.inc()
 | 
| 
 | 
   143         self.writeTranscript(currentTranscript)
 | 
| 
 | 
   144         progress.done()
 | 
| 
 | 
   145 
 | 
| 
 | 
   146     def collapse(self):
 | 
| 
 | 
   147         self._sortFile()
 | 
| 
 | 
   148         for chromosome in sorted(self.nbElementsPerChromosome.keys()):
 | 
| 
 | 
   149             self._iterate(chromosome)
 | 
| 
 | 
   150         self.outputWriter.close()
 | 
| 
 | 
   151         if self.verbosity > 1:
 | 
| 
 | 
   152             print "# reads read: %d" % (self.nbRead)
 | 
| 
 | 
   153             print "# reads written: %d (%.2f%%)" % (self.nbWritten, float(self.nbWritten) / self.nbRead * 100)
 | 
| 
 | 
   154             print "# reads merges: %d" % (self.nbMerges)
 | 
| 
 | 
   155 
 | 
| 
 | 
   156 if __name__ == "__main__":
 | 
| 
 | 
   157     
 | 
| 
 | 
   158     # parse command line
 | 
| 
 | 
   159     description = "Collapse Reads v1.0.3: Merge two reads if they have exactly the same genomic coordinates. [Category: Merge]"
 | 
| 
 | 
   160 
 | 
| 
 | 
   161     parser = OptionParser(description = description)
 | 
| 
 | 
   162     parser.add_option("-i", "--input",     dest="inputFileName",  action="store",                     type="string", help="input file [compulsory] [format: file in mapping format given by -f]")
 | 
| 
 | 
   163     parser.add_option("-f", "--format",    dest="format",         action="store",                     type="string", help="format of the file [compulsory] [format: mapping file format]")
 | 
| 
 | 
   164     parser.add_option("-o", "--output",    dest="outputFileName", action="store",                     type="string", help="output file [compulsory] [format: output file in GFF3 format]")
 | 
| 
 | 
   165     parser.add_option("-s", "--strands",   dest="strands",        action="store_true", default=False,                help="merge elements on 2 different strands [format: bool] [default: false]")
 | 
| 
 | 
   166     parser.add_option("-v", "--verbosity", dest="verbosity",      action="store",      default=1,     type="int",    help="trace level [default: 1] [format: int]")
 | 
| 
 | 
   167     (options, args) = parser.parse_args()
 | 
| 
 | 
   168 
 | 
| 
 | 
   169     collapser = CollapseReads(options.verbosity)
 | 
| 
 | 
   170     collapser.setInputFile(options.inputFileName, options.format)
 | 
| 
 | 
   171     collapser.setOutputFile(options.outputFileName)
 | 
| 
 | 
   172     collapser.strands = not options.strands
 | 
| 
 | 
   173     collapser.collapse()
 | 
| 
 | 
   174     collapser.close()
 |