| 6 | 1 #! /usr/bin/env python | 
|  | 2 # | 
|  | 3 # Copyright INRA-URGI 2009-2010 | 
|  | 4 # | 
|  | 5 # This software is governed by the CeCILL license under French law and | 
|  | 6 # abiding by the rules of distribution of free software. You can use, | 
|  | 7 # modify and/ or redistribute the software under the terms of the CeCILL | 
|  | 8 # license as circulated by CEA, CNRS and INRIA at the following URL | 
|  | 9 # "http://www.cecill.info". | 
|  | 10 # | 
|  | 11 # As a counterpart to the access to the source code and rights to copy, | 
|  | 12 # modify and redistribute granted by the license, users are provided only | 
|  | 13 # with a limited warranty and the software's author, the holder of the | 
|  | 14 # economic rights, and the successive licensors have only limited | 
|  | 15 # liability. | 
|  | 16 # | 
|  | 17 # In this respect, the user's attention is drawn to the risks associated | 
|  | 18 # with loading, using, modifying and/or developing or reproducing the | 
|  | 19 # software by the user in light of its specific status of free software, | 
|  | 20 # that may mean that it is complicated to manipulate, and that also | 
|  | 21 # therefore means that it is reserved for developers and experienced | 
|  | 22 # professionals having in-depth computer knowledge. Users are therefore | 
|  | 23 # encouraged to load and test the software's suitability as regards their | 
|  | 24 # requirements in conditions enabling the security of their systems and/or | 
|  | 25 # data to be ensured and, more generally, to use and operate it in the | 
|  | 26 # same conditions as regards security. | 
|  | 27 # | 
|  | 28 # The fact that you are presently reading this means that you have had | 
|  | 29 # knowledge of the CeCILL license and that you accept its terms. | 
|  | 30 # | 
|  | 31 """Get the repartition of some elements in a chromosomes""" | 
|  | 32 | 
|  | 33 import os | 
|  | 34 from optparse import OptionParser | 
|  | 35 from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer | 
|  | 36 from SMART.Java.Python.structure.Transcript import Transcript | 
|  | 37 from commons.core.writer.Gff3Writer import Gff3Writer | 
|  | 38 from SMART.Java.Python.misc.RPlotter import RPlotter | 
|  | 39 from SMART.Java.Python.misc.Progress import Progress | 
|  | 40 from math import * | 
|  | 41 | 
|  | 42 def divideKeyDict(dictionary, ratio): | 
|  | 43     return dict([(key / ratio, dictionary[key]) for key in dictionary]) | 
|  | 44 | 
|  | 45 | 
|  | 46 def setTranscript(chromosome, direction, start, end, name, value): | 
|  | 47     transcript = Transcript() | 
|  | 48     transcript.setChromosome(chromosome) | 
|  | 49     transcript.setDirection(direction) | 
|  | 50     transcript.setStart(start) | 
|  | 51     transcript.setEnd(end) | 
|  | 52     transcript.setName(name) | 
|  | 53     transcript.setTagValue("nbElements", value) | 
|  | 54     return transcript | 
|  | 55 | 
|  | 56 | 
|  | 57 | 
|  | 58 if __name__ == "__main__": | 
|  | 59 | 
|  | 60     magnifyingFactor = 1000 | 
|  | 61 | 
|  | 62     # parse command line | 
|  | 63     description = "Get Distribution v1.0.1: Get the distribution of the genomic coordinates on a genome. [Category: Visualization]" | 
|  | 64 | 
|  | 65     parser = OptionParser(description = description) | 
|  | 66     parser.add_option("-i", "--input",       dest="inputFileName",     action="store",                           type="string", help="input file [compulsory] [format: file in transcript format given by -f]") | 
|  | 67     parser.add_option("-f", "--format",      dest="format",            action="store",                           type="string", help="format of the input file [compulsory] [format: transcript file format]") | 
|  | 68     parser.add_option("-o", "--output",      dest="outputFileName",    action="store",                           type="string", help="output file [compulsory] [format: output file in GFF3 format]") | 
|  | 69     parser.add_option("-r", "--reference",   dest="referenceFileName", action="store",      default=None,        type="string", help="file containing the genome [compulsory] [format: file in FASTA format]") | 
|  | 70     parser.add_option("-n", "--nbBins",      dest="nbBins",            action="store",      default=1000,        type="int",    help="number of bins [default: 1000] [format: int]") | 
|  | 71     parser.add_option("-2", "--bothStrands", dest="bothStrands",       action="store_true", default=False,                      help="plot one curve per strand [format: bool] [default: false]") | 
|  | 72     parser.add_option("-w", "--raw",         dest="raw",               action="store_true", default=False,                      help="plot raw number of occurrences instead of density [format: bool] [default: false]") | 
|  | 73     parser.add_option("-x", "--csv",         dest="csv",               action="store_true", default=False,                      help="write a .csv file [format: bool]") | 
|  | 74     parser.add_option("-c", "--chromosome",  dest="chromosome",        action="store",      default=None,        type="string", help="plot only a chromosome [format: string]") | 
|  | 75     parser.add_option("-s", "--start",       dest="start",             action="store",      default=None,        type="int",    help="start from a given region [format: int]") | 
|  | 76     parser.add_option("-e", "--end",         dest="end",               action="store",      default=None,        type="int",    help="end from a given region [format: int]") | 
|  | 77     parser.add_option("-y", "--yMin",        dest="yMin",              action="store",      default=None,        type="int",    help="minimum value on the y-axis to plot [format: int]") | 
|  | 78     parser.add_option("-Y", "--yMax",        dest="yMax",              action="store",      default=None,        type="int",    help="maximum value on the y-axis to plot [format: int]") | 
|  | 79     parser.add_option("-g", "--gff",         dest="gff",               action="store_true", default=False,                      help="also write GFF3 file [format: bool] [default: false]") | 
|  | 80     parser.add_option("-H", "--height",      dest="height",            action="store",      default=None,        type="int",    help="height of the graphics [format: int] [default: 300]") | 
|  | 81     parser.add_option("-W", "--width",       dest="width",             action="store",      default=None,        type="int",    help="width of the graphics [format: int] [default: 1000]") | 
|  | 82     parser.add_option("-v", "--verbosity",   dest="verbosity",         action="store",      default=1,           type="int",    help="trace level [default: 1] [format: int]") | 
|  | 83     parser.add_option("-l", "--log",         dest="log",               action="store_true", default=False,                      help="write a log file [format: bool]") | 
|  | 84     parser.add_option("-D", "--directory",   dest="working_Dir",       action="store",      default=os.getcwd(), type="string", help="the directory to store the results [format: directory]") | 
|  | 85     (options, args) = parser.parse_args() | 
|  | 86 | 
|  | 87     sizes = {} | 
|  | 88     if options.referenceFileName != None: | 
|  | 89         # get the sizes of the chromosomes | 
|  | 90         referenceHandle = open(options.referenceFileName) | 
|  | 91         name            = None | 
|  | 92         size            = 0 | 
|  | 93         maxSize         = 0 | 
|  | 94         for line in referenceHandle: | 
|  | 95             line = line.strip() | 
|  | 96             if line == "": continue | 
|  | 97             if line[0] == ">": | 
|  | 98                 if name != None: | 
|  | 99                     if options.verbosity > 10: | 
|  | 100                         print name | 
|  | 101                     sizes[name] = size | 
|  | 102                     maxSize     = max(maxSize, size) | 
|  | 103                     size        = 0 | 
|  | 104                 name = line[1:] | 
|  | 105             else: | 
|  | 106                 size += len(line) | 
|  | 107         sizes[name] = size | 
|  | 108         maxSize     = max(maxSize, size) | 
|  | 109         if options.verbosity > 1: | 
|  | 110             print "done" | 
|  | 111         start = 0 | 
|  | 112         end   = maxSize | 
|  | 113     else: | 
|  | 114         if options.chromosome == None or options.start == None or options.end == None: | 
|  | 115             raise Exception("Missing chromosome or start and end positions, or reference file") | 
|  | 116         maxSize                   = options.end | 
|  | 117         sizes[options.chromosome] = options.end | 
|  | 118         start                     = options.start | 
|  | 119         end                       = options.end | 
|  | 120 | 
|  | 121 | 
|  | 122     tmp1      = int(maxSize / float(options.nbBins)) | 
|  | 123     tmp2      = 10 ** (len("%d" % (tmp1))-2) | 
|  | 124     sliceSize = int((tmp1 / tmp2) * tmp2) | 
|  | 125 | 
|  | 126     bins      = dict() | 
|  | 127     binsPlus  = dict() | 
|  | 128     binsMinus = dict() | 
|  | 129     for chromosome in sizes: | 
|  | 130         bins[chromosome]      = dict([(i * sliceSize + 1, 0) for i in range(start / sliceSize, sizes[chromosome] / sliceSize + 1)]) | 
|  | 131         binsPlus[chromosome]  = dict([(i * sliceSize + 1, 0) for i in range(start / sliceSize, sizes[chromosome] / sliceSize + 1)]) | 
|  | 132         binsMinus[chromosome] = dict([(i * sliceSize + 1, 0) for i in range(start / sliceSize, sizes[chromosome] / sliceSize + 1)]) | 
|  | 133 | 
|  | 134     parser   = TranscriptContainer(options.inputFileName, options.format, options.verbosity) | 
|  | 135     progress = Progress(parser.getNbTranscripts(), "Parsing %s" % (options.inputFileName), options.verbosity) | 
|  | 136     maxSlice = 0 | 
|  | 137     # count the number of reads | 
|  | 138     for transcript in parser.getIterator(): | 
|  | 139         if options.chromosome == None or (transcript.getChromosome() == options.chromosome and transcript.getStart() >= start and transcript.getStart() <= end): | 
|  | 140             if transcript.getDirection() == 1: | 
|  | 141                 binsPlus[transcript.getChromosome()][(transcript.getStart() / sliceSize) * sliceSize + 1] += 1 | 
|  | 142             else: | 
|  | 143                 binsMinus[transcript.getChromosome()][(transcript.getStart() / sliceSize) * sliceSize + 1] += 1 | 
|  | 144             bins[transcript.getChromosome()][(transcript.getStart() / sliceSize) * sliceSize + 1] += 1 | 
|  | 145             maxSlice = max(maxSlice, transcript.getStart() / sliceSize) | 
|  | 146         progress.inc() | 
|  | 147     progress.done() | 
|  | 148 | 
|  | 149     # compute densities | 
|  | 150     densityPlus = dict() | 
|  | 151     for chromosome in bins: | 
|  | 152         densityPlus[chromosome] = dict([(bin, 0) for bin in binsPlus[chromosome]]) | 
|  | 153         for bin in binsPlus[chromosome]: | 
|  | 154             densityPlus[chromosome][bin] = float(binsPlus[chromosome][bin]) / sliceSize * magnifyingFactor | 
|  | 155         # correct densities for first and last bins | 
|  | 156         if start % sliceSize != 0: | 
|  | 157             densityPlus[chromosome][(start / sliceSize) * sliceSize + 1] = float(binsPlus[chromosome][(start / sliceSize) * sliceSize + 1]) / (sliceSize - (start % sliceSize)) * magnifyingFactor | 
|  | 158         if sizes[chromosome] % sliceSize != 0: | 
|  | 159             densityPlus[chromosome][(sizes[chromosome] / sliceSize) * sliceSize + 1] = float(binsPlus[chromosome][(sizes[chromosome] / sliceSize) * sliceSize + 1]) / (sizes[chromosome] % sliceSize) * magnifyingFactor | 
|  | 160     densityMinus = dict() | 
|  | 161     for chromosome in binsMinus: | 
|  | 162         densityMinus[chromosome] = dict([(bin, 0) for bin in binsMinus[chromosome]]) | 
|  | 163         for bin in binsMinus[chromosome]: | 
|  | 164             densityMinus[chromosome][bin] = float(binsMinus[chromosome][bin]) / sliceSize * magnifyingFactor | 
|  | 165         # correct densities for first and last bins | 
|  | 166         if start % sliceSize != 0: | 
|  | 167             densityMinus[chromosome][(start / sliceSize) * sliceSize + 1] = float(binsMinus[chromosome][(start / sliceSize) * sliceSize + 1]) / (sliceSize - (start % sliceSize)) * magnifyingFactor | 
|  | 168         if sizes[chromosome] % sliceSize != 0: | 
|  | 169             densityMinus[chromosome][(sizes[chromosome] / sliceSize) * sliceSize + 1] = float(binsMinus[chromosome][(sizes[chromosome] / sliceSize) * sliceSize + 1]) / (sizes[chromosome] % sliceSize) * magnifyingFactor | 
|  | 170     density = dict() | 
|  | 171     for chromosome in bins: | 
|  | 172         density[chromosome] = dict([(bin, 0) for bin in bins[chromosome]]) | 
|  | 173         for bin in bins[chromosome]: | 
|  | 174             density[chromosome][bin] = densityPlus[chromosome][bin] + densityMinus[chromosome][bin] | 
|  | 175 | 
|  | 176     for chromosome in densityMinus: | 
|  | 177         for bin in densityMinus[chromosome]: | 
|  | 178             densityMinus[chromosome][bin] *= -1 | 
|  | 179         for bin in binsMinus[chromosome]: | 
|  | 180             binsMinus[chromosome][bin] *= -1 | 
|  | 181 | 
|  | 182     for chromosome in density: | 
|  | 183         maxX = max(bins[chromosome].keys()) | 
|  | 184         if maxX <= 1000: | 
|  | 185             unit  = "nt." | 
|  | 186             ratio = 1.0 | 
|  | 187         elif maxX <= 1000000: | 
|  | 188             unit  = "kb" | 
|  | 189             ratio = 1000.0 | 
|  | 190         else: | 
|  | 191             unit  = "Mb" | 
|  | 192             ratio = 1000000.0 | 
|  | 193         outputFileName = "%s_%s" % (options.outputFileName, chromosome) | 
|  | 194         if options.start != None and options.end != None: | 
|  | 195             outputFileName += ":%d-%d" % (options.start, options.end) | 
|  | 196         outputFileName += ".png" | 
|  | 197         plotter = RPlotter(outputFileName, options.verbosity) | 
|  | 198         plotter.setXLabel("Position on %s (in %s)" % (chromosome.replace("_", " "), unit)) | 
|  | 199         plotter.setYLabel("# reads") | 
|  | 200         if options.bothStrands: | 
|  | 201             plotter.setImageSize(1000, 300) | 
|  | 202         else: | 
|  | 203             plotter.setImageSize(1000, 200) | 
|  | 204         if options.height != None: | 
|  | 205             plotter.setHeight(options.height) | 
|  | 206         if options.width != None: | 
|  | 207             plotter.setWidth(options.width) | 
|  | 208         if options.yMax != None: | 
|  | 209             plotter.setMinimumY(options.yMin) | 
|  | 210         if options.yMax != None: | 
|  | 211             plotter.setMaximumY(options.yMax) | 
|  | 212         if options.bothStrands : | 
|  | 213             if options.raw: | 
|  | 214                 plotter.addLine(divideKeyDict(binsPlus[chromosome], ratio)) | 
|  | 215             else: | 
|  | 216                 plotter.addLine(divideKeyDict(densityPlus[chromosome], ratio)) | 
|  | 217             if options.raw: | 
|  | 218                 plotter.addLine(divideKeyDict(binsMinus[chromosome], ratio)) | 
|  | 219             else: | 
|  | 220                 plotter.addLine(divideKeyDict(densityMinus[chromosome], ratio)) | 
|  | 221         else: | 
|  | 222             if options.raw: | 
|  | 223                 plotter.addLine(divideKeyDict(bins[chromosome], ratio)) | 
|  | 224             else: | 
|  | 225                 plotter.addLine(divideKeyDict(density[chromosome], ratio)) | 
|  | 226         plotter.plot() | 
|  | 227 | 
|  | 228     if options.csv: | 
|  | 229         outputFileName = "%s" % (options.outputFileName) | 
|  | 230         if options.chromosome != None: | 
|  | 231             outputFileName += "_%s" % (options.chromosome) | 
|  | 232         if options.start != None and options.end != None: | 
|  | 233             outputFileName += ":%d-%d" % (options.start, options.end) | 
|  | 234         outputFileName += ".csv" | 
|  | 235         csvHandle = open(outputFileName, "w") | 
|  | 236         for slice in range(start / sliceSize, maxSlice + 1): | 
|  | 237             csvHandle.write(";%d-%d" % (slice * sliceSize + 1, (slice+1) * sliceSize)) | 
|  | 238         csvHandle.write("\n") | 
|  | 239         if options.bothStrands: | 
|  | 240             for chromosome in densityPlus: | 
|  | 241                 if len(densityPlus[chromosome]) > 0: | 
|  | 242                     csvHandle.write("%s [+]" % (chromosome)) | 
|  | 243                     for slice in sorted(densityPlus[chromosome].keys()): | 
|  | 244                         csvHandle.write(";%.2f" % (densityPlus[chromosome][slice])) | 
|  | 245                     csvHandle.write("\n") | 
|  | 246                 if len(densityMinus[chromosome]) > 0: | 
|  | 247                     csvHandle.write("%s [-]" % (chromosome)) | 
|  | 248                     for slice in sorted(densityPlus[chromosome].keys()): | 
|  | 249                         csvHandle.write(";%.2f" % (-densityMinus[chromosome][slice])) | 
|  | 250                     csvHandle.write("\n") | 
|  | 251         else: | 
|  | 252             for chromosome in density: | 
|  | 253                 if len(density[chromosome]) > 0: | 
|  | 254                     csvHandle.write(chromosome) | 
|  | 255                     for slice in sorted(density[chromosome].keys()): | 
|  | 256                         csvHandle.write(";%.2f" % (density[chromosome][slice])) | 
|  | 257                     csvHandle.write("\n") | 
|  | 258         csvHandle.close() | 
|  | 259 | 
|  | 260     if options.gff: | 
|  | 261         chromosome = "" if options.chromosome == None                         else options.chromosome.capitalize() | 
|  | 262         start      = "" if options.start      == None                         else "%d" % (options.start) | 
|  | 263         end        = "" if options.end        == None                         else "%d" % (options.end) | 
|  | 264         link1      = "" if options.start      == None and options.end == None else ":" | 
|  | 265         link2      = "" if options.start      == None and options.end == None else "-" | 
|  | 266         writer     = Gff3Writer("%s%s%s%s%s.gff3" % (options.outputFileName, link1, start, link2, end), options.verbosity) | 
|  | 267         cpt = 1 | 
|  | 268         if options.raw: | 
|  | 269             valuesPlus  = binsPlus | 
|  | 270             valuesMinus = binsMinus | 
|  | 271             values      = bins | 
|  | 272         else: | 
|  | 273             valuesPlus  = densityPlus | 
|  | 274             valuesMinus = densityMinus | 
|  | 275             values      = density | 
|  | 276         if options.bothStrands: | 
|  | 277             for chromosome in values: | 
|  | 278                 for slice in valuesPlus[chromosome]: | 
|  | 279                     writer.addTranscript(setTranscript(chromosome, 1, slice, slice + sliceSize, "region%d" % (cpt), valuesPlus[chromosome][slice])) | 
|  | 280                     cpt += 1 | 
|  | 281                 for slice in valuesMinus[chromosome]: | 
|  | 282                     writer.addTranscript(setTranscript(chromosome, -1, slice, slice + sliceSize, "region%d" % (cpt), - valuesMinus[chromosome][slice])) | 
|  | 283                     cpt += 1 | 
|  | 284         else: | 
|  | 285             for chromosome in values: | 
|  | 286                 for slice in values[chromosome]: | 
|  | 287                     writer.addTranscript(setTranscript(chromosome, 1, slice, slice + sliceSize, "region%d" % (cpt), values[chromosome][slice])) | 
|  | 288                     cpt += 1 | 
|  | 289         writer.write() | 
|  | 290 | 
|  | 291 |