| 6 | 1 #! /usr/bin/env python | 
|  | 2 # | 
|  | 3 # Copyright INRA-URGI 2009-2010 | 
|  | 4 # | 
|  | 5 # This software is governed by the CeCILL license under French law and | 
|  | 6 # abiding by the rules of distribution of free software. You can use, | 
|  | 7 # modify and/ or redistribute the software under the terms of the CeCILL | 
|  | 8 # license as circulated by CEA, CNRS and INRIA at the following URL | 
|  | 9 # "http://www.cecill.info". | 
|  | 10 # | 
|  | 11 # As a counterpart to the access to the source code and rights to copy, | 
|  | 12 # modify and redistribute granted by the license, users are provided only | 
|  | 13 # with a limited warranty and the software's author, the holder of the | 
|  | 14 # economic rights, and the successive licensors have only limited | 
|  | 15 # liability. | 
|  | 16 # | 
|  | 17 # In this respect, the user's attention is drawn to the risks associated | 
|  | 18 # with loading, using, modifying and/or developing or reproducing the | 
|  | 19 # software by the user in light of its specific status of free software, | 
|  | 20 # that may mean that it is complicated to manipulate, and that also | 
|  | 21 # therefore means that it is reserved for developers and experienced | 
|  | 22 # professionals having in-depth computer knowledge. Users are therefore | 
|  | 23 # encouraged to load and test the software's suitability as regards their | 
|  | 24 # requirements in conditions enabling the security of their systems and/or | 
|  | 25 # data to be ensured and, more generally, to use and operate it in the | 
|  | 26 # same conditions as regards security. | 
|  | 27 # | 
|  | 28 # The fact that you are presently reading this means that you have had | 
|  | 29 # knowledge of the CeCILL license and that you accept its terms. | 
|  | 30 # | 
|  | 31 | 
|  | 32 from optparse import OptionParser | 
|  | 33 from commons.core.parsing.FastaParser import FastaParser | 
|  | 34 from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer | 
|  | 35 from SMART.Java.Python.misc.Progress import Progress | 
|  | 36 from SMART.Java.Python.misc.RPlotter import RPlotter | 
|  | 37 from SMART.Java.Python.misc.Utils import * | 
|  | 38 | 
|  | 39 | 
|  | 40 class GetGenomeCoverage(object): | 
|  | 41 | 
|  | 42     def __init__(self, verbosity = 1): | 
|  | 43         self.verbosity       = verbosity | 
|  | 44         self.inputContainer  = None | 
|  | 45         self.referenceParser = None | 
|  | 46         self.outputFileName  = None | 
|  | 47         self.genomeSize      = None | 
|  | 48         self.coverage        = {} | 
|  | 49         self.distribution    = {} | 
|  | 50 | 
|  | 51 | 
|  | 52     def setInputFile(self, fileName, format): | 
|  | 53         self.inputContainer = TranscriptContainer(fileName, format, self.verbosity) | 
|  | 54 | 
|  | 55 | 
|  | 56     def setOutputFile(self, fileName): | 
|  | 57         self.outputFileName = fileName | 
|  | 58 | 
|  | 59 | 
|  | 60     def setReference(self, fileName): | 
|  | 61         self.referenceParser = FastaParser(fileName, self.verbosity) | 
|  | 62 | 
|  | 63 | 
|  | 64     def getReferenceSizes(self): | 
|  | 65         self.genomeSize = 0 | 
|  | 66         for chromosome in self.referenceParser.getRegions(): | 
|  | 67             self.genomeSize += self.referenceParser.getSizeOfRegion(chromosome) | 
|  | 68 | 
|  | 69 | 
|  | 70     def getCoverage(self): | 
|  | 71         progress = Progress(self.inputContainer.getNbTranscripts(), "Reading reads", self.verbosity) | 
|  | 72         for transcript in self.inputContainer.getIterator(): | 
|  | 73             chromosome = transcript.getChromosome() | 
|  | 74             if chromosome not in self.coverage: | 
|  | 75                 self.coverage[chromosome] = {} | 
|  | 76             for exon in transcript.getExons(): | 
|  | 77                 for pos in range(exon.getStart(), exon.getEnd() + 1): | 
|  | 78                     if pos not in self.coverage[chromosome]: | 
|  | 79                         self.coverage[chromosome][pos] = 1 | 
|  | 80                     else: | 
|  | 81                         self.coverage[chromosome][pos] += 1 | 
|  | 82             progress.inc() | 
|  | 83         progress.done() | 
|  | 84 | 
|  | 85 | 
|  | 86     def getDistribution(self): | 
|  | 87         nbNucleotides = sum([len(self.coverage[chromosome].keys()) for chromosome in self.coverage]) | 
|  | 88         progress      = Progress(nbNucleotides, "Building distribution", self.verbosity) | 
|  | 89         for chromosome in self.coverage: | 
|  | 90             for num in self.coverage[chromosome].values(): | 
|  | 91                 if num not in self.distribution: | 
|  | 92                     self.distribution[num] = 1 | 
|  | 93                 else: | 
|  | 94                     self.distribution[num] += 1 | 
|  | 95                 progress.inc() | 
|  | 96         progress.done() | 
|  | 97         self.distribution[0] = self.genomeSize - nbNucleotides | 
|  | 98 | 
|  | 99 | 
|  | 100     def plotDistribution(self): | 
|  | 101         plotter = RPlotter(self.outputFileName, self.verbosity) | 
|  | 102         plotter.setFill(0) | 
|  | 103         plotter.addLine(self.distribution) | 
|  | 104         plotter.plot() | 
|  | 105         print "min/avg/med/max reads per base: %d/%.2f/%.1f/%d" % getMinAvgMedMax(self.distribution) | 
|  | 106 | 
|  | 107 | 
|  | 108     def run(self): | 
|  | 109         self.getReferenceSizes() | 
|  | 110         self.getCoverage() | 
|  | 111         self.getDistribution() | 
|  | 112         self.plotDistribution() | 
|  | 113 | 
|  | 114 | 
|  | 115 if __name__ == "__main__": | 
|  | 116 | 
|  | 117     # parse command line | 
|  | 118     description = "Plot Genome Coverage v1.0.1: Get the coverage of a genome. [Category: Personal]" | 
|  | 119 | 
|  | 120     parser = OptionParser(description = description) | 
|  | 121     parser.add_option("-i", "--input",     dest="inputFileName",  action="store",               type="string", help="reads file [compulsory] [format: file in transcript format given by -f]") | 
|  | 122     parser.add_option("-f", "--format",    dest="format",         action="store",               type="string", help="format of file [compulsory] [format: transcript file format]") | 
|  | 123     parser.add_option("-r", "--reference", dest="reference",      action="store",               type="string", help="sequences file [compulsory] [format: file in FASTA format]") | 
|  | 124     parser.add_option("-o", "--output",    dest="outputFileName", action="store",               type="string", help="output file [compulsory] [format: output file in PNG format]") | 
|  | 125     parser.add_option("-v", "--verbosity", dest="verbosity",      action="store", default=1,    type="int",    help="trace level [format: int]") | 
|  | 126     (options, args) = parser.parse_args() | 
|  | 127 | 
|  | 128     getGenomeCoverage = GetGenomeCoverage(options.verbosity) | 
|  | 129     getGenomeCoverage.setInputFile(options.inputFileName, options.format) | 
|  | 130     getGenomeCoverage.setOutputFile(options.outputFileName) | 
|  | 131     getGenomeCoverage.setReference(options.reference) | 
|  | 132     getGenomeCoverage.run() |