# HG changeset patch # User m-zytnicki # Date 1386949036 18000 # Node ID 80a080dc3ee82a50022a36c0dc3cecefdae15df8 # Parent 809ed01c801457258b54e358d1b7073c7498f877 Deleted selected files diff -r 809ed01c8014 -r 80a080dc3ee8 SMART/Java/Python/getDistribution.py --- a/SMART/Java/Python/getDistribution.py Mon Dec 09 04:29:22 2013 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,291 +0,0 @@ -#! /usr/bin/env python -# -# Copyright INRA-URGI 2009-2010 -# -# This software is governed by the CeCILL license under French law and -# abiding by the rules of distribution of free software. You can use, -# modify and/ or redistribute the software under the terms of the CeCILL -# license as circulated by CEA, CNRS and INRIA at the following URL -# "http://www.cecill.info". -# -# As a counterpart to the access to the source code and rights to copy, -# modify and redistribute granted by the license, users are provided only -# with a limited warranty and the software's author, the holder of the -# economic rights, and the successive licensors have only limited -# liability. -# -# In this respect, the user's attention is drawn to the risks associated -# with loading, using, modifying and/or developing or reproducing the -# software by the user in light of its specific status of free software, -# that may mean that it is complicated to manipulate, and that also -# therefore means that it is reserved for developers and experienced -# professionals having in-depth computer knowledge. Users are therefore -# encouraged to load and test the software's suitability as regards their -# requirements in conditions enabling the security of their systems and/or -# data to be ensured and, more generally, to use and operate it in the -# same conditions as regards security. -# -# The fact that you are presently reading this means that you have had -# knowledge of the CeCILL license and that you accept its terms. -# -"""Get the repartition of some elements in a chromosomes""" - -import os -from optparse import OptionParser -from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer -from SMART.Java.Python.structure.Transcript import Transcript -from commons.core.writer.Gff3Writer import Gff3Writer -from SMART.Java.Python.misc.RPlotter import RPlotter -from SMART.Java.Python.misc.Progress import Progress -from math import * - -def divideKeyDict(dictionary, ratio): - return dict([(key / ratio, dictionary[key]) for key in dictionary]) - - -def setTranscript(chromosome, direction, start, end, name, value): - transcript = Transcript() - transcript.setChromosome(chromosome) - transcript.setDirection(direction) - transcript.setStart(start) - transcript.setEnd(end) - transcript.setName(name) - transcript.setTagValue("nbElements", value) - return transcript - - - -if __name__ == "__main__": - - magnifyingFactor = 1000 - - # parse command line - description = "Get Distribution v1.0.1: Get the distribution of the genomic coordinates on a genome. [Category: Visualization]" - - parser = OptionParser(description = description) - parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]") - parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of the input file [compulsory] [format: transcript file format]") - parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in GFF3 format]") - parser.add_option("-r", "--reference", dest="referenceFileName", action="store", default=None, type="string", help="file containing the genome [compulsory] [format: file in FASTA format]") - parser.add_option("-n", "--nbBins", dest="nbBins", action="store", default=1000, type="int", help="number of bins [default: 1000] [format: int]") - parser.add_option("-2", "--bothStrands", dest="bothStrands", action="store_true", default=False, help="plot one curve per strand [format: bool] [default: false]") - parser.add_option("-w", "--raw", dest="raw", action="store_true", default=False, help="plot raw number of occurrences instead of density [format: bool] [default: false]") - parser.add_option("-x", "--csv", dest="csv", action="store_true", default=False, help="write a .csv file [format: bool]") - parser.add_option("-c", "--chromosome", dest="chromosome", action="store", default=None, type="string", help="plot only a chromosome [format: string]") - parser.add_option("-s", "--start", dest="start", action="store", default=None, type="int", help="start from a given region [format: int]") - parser.add_option("-e", "--end", dest="end", action="store", default=None, type="int", help="end from a given region [format: int]") - parser.add_option("-y", "--yMin", dest="yMin", action="store", default=None, type="int", help="minimum value on the y-axis to plot [format: int]") - parser.add_option("-Y", "--yMax", dest="yMax", action="store", default=None, type="int", help="maximum value on the y-axis to plot [format: int]") - parser.add_option("-g", "--gff", dest="gff", action="store_true", default=False, help="also write GFF3 file [format: bool] [default: false]") - parser.add_option("-H", "--height", dest="height", action="store", default=None, type="int", help="height of the graphics [format: int] [default: 300]") - parser.add_option("-W", "--width", dest="width", action="store", default=None, type="int", help="width of the graphics [format: int] [default: 1000]") - parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [default: 1] [format: int]") - parser.add_option("-l", "--log", dest="log", action="store_true", default=False, help="write a log file [format: bool]") - parser.add_option("-D", "--directory", dest="working_Dir", action="store", default=os.getcwd(), type="string", help="the directory to store the results [format: directory]") - (options, args) = parser.parse_args() - - sizes = {} - if options.referenceFileName != None: - # get the sizes of the chromosomes - referenceHandle = open(options.referenceFileName) - name = None - size = 0 - maxSize = 0 - for line in referenceHandle: - line = line.strip() - if line == "": continue - if line[0] == ">": - if name != None: - if options.verbosity > 10: - print name - sizes[name] = size - maxSize = max(maxSize, size) - size = 0 - name = line[1:] - else: - size += len(line) - sizes[name] = size - maxSize = max(maxSize, size) - if options.verbosity > 1: - print "done" - start = 0 - end = maxSize - else: - if options.chromosome == None or options.start == None or options.end == None: - raise Exception("Missing chromosome or start and end positions, or reference file") - maxSize = options.end - sizes[options.chromosome] = options.end - start = options.start - end = options.end - - - tmp1 = int(maxSize / float(options.nbBins)) - tmp2 = 10 ** (len("%d" % (tmp1))-2) - sliceSize = int((tmp1 / tmp2) * tmp2) - - bins = dict() - binsPlus = dict() - binsMinus = dict() - for chromosome in sizes: - bins[chromosome] = dict([(i * sliceSize + 1, 0) for i in range(start / sliceSize, sizes[chromosome] / sliceSize + 1)]) - binsPlus[chromosome] = dict([(i * sliceSize + 1, 0) for i in range(start / sliceSize, sizes[chromosome] / sliceSize + 1)]) - binsMinus[chromosome] = dict([(i * sliceSize + 1, 0) for i in range(start / sliceSize, sizes[chromosome] / sliceSize + 1)]) - - parser = TranscriptContainer(options.inputFileName, options.format, options.verbosity) - progress = Progress(parser.getNbTranscripts(), "Parsing %s" % (options.inputFileName), options.verbosity) - maxSlice = 0 - # count the number of reads - for transcript in parser.getIterator(): - if options.chromosome == None or (transcript.getChromosome() == options.chromosome and transcript.getStart() >= start and transcript.getStart() <= end): - if transcript.getDirection() == 1: - binsPlus[transcript.getChromosome()][(transcript.getStart() / sliceSize) * sliceSize + 1] += 1 - else: - binsMinus[transcript.getChromosome()][(transcript.getStart() / sliceSize) * sliceSize + 1] += 1 - bins[transcript.getChromosome()][(transcript.getStart() / sliceSize) * sliceSize + 1] += 1 - maxSlice = max(maxSlice, transcript.getStart() / sliceSize) - progress.inc() - progress.done() - - # compute densities - densityPlus = dict() - for chromosome in bins: - densityPlus[chromosome] = dict([(bin, 0) for bin in binsPlus[chromosome]]) - for bin in binsPlus[chromosome]: - densityPlus[chromosome][bin] = float(binsPlus[chromosome][bin]) / sliceSize * magnifyingFactor - # correct densities for first and last bins - if start % sliceSize != 0: - densityPlus[chromosome][(start / sliceSize) * sliceSize + 1] = float(binsPlus[chromosome][(start / sliceSize) * sliceSize + 1]) / (sliceSize - (start % sliceSize)) * magnifyingFactor - if sizes[chromosome] % sliceSize != 0: - densityPlus[chromosome][(sizes[chromosome] / sliceSize) * sliceSize + 1] = float(binsPlus[chromosome][(sizes[chromosome] / sliceSize) * sliceSize + 1]) / (sizes[chromosome] % sliceSize) * magnifyingFactor - densityMinus = dict() - for chromosome in binsMinus: - densityMinus[chromosome] = dict([(bin, 0) for bin in binsMinus[chromosome]]) - for bin in binsMinus[chromosome]: - densityMinus[chromosome][bin] = float(binsMinus[chromosome][bin]) / sliceSize * magnifyingFactor - # correct densities for first and last bins - if start % sliceSize != 0: - densityMinus[chromosome][(start / sliceSize) * sliceSize + 1] = float(binsMinus[chromosome][(start / sliceSize) * sliceSize + 1]) / (sliceSize - (start % sliceSize)) * magnifyingFactor - if sizes[chromosome] % sliceSize != 0: - densityMinus[chromosome][(sizes[chromosome] / sliceSize) * sliceSize + 1] = float(binsMinus[chromosome][(sizes[chromosome] / sliceSize) * sliceSize + 1]) / (sizes[chromosome] % sliceSize) * magnifyingFactor - density = dict() - for chromosome in bins: - density[chromosome] = dict([(bin, 0) for bin in bins[chromosome]]) - for bin in bins[chromosome]: - density[chromosome][bin] = densityPlus[chromosome][bin] + densityMinus[chromosome][bin] - - for chromosome in densityMinus: - for bin in densityMinus[chromosome]: - densityMinus[chromosome][bin] *= -1 - for bin in binsMinus[chromosome]: - binsMinus[chromosome][bin] *= -1 - - for chromosome in density: - maxX = max(bins[chromosome].keys()) - if maxX <= 1000: - unit = "nt." - ratio = 1.0 - elif maxX <= 1000000: - unit = "kb" - ratio = 1000.0 - else: - unit = "Mb" - ratio = 1000000.0 - outputFileName = "%s_%s" % (options.outputFileName, chromosome) - if options.start != None and options.end != None: - outputFileName += ":%d-%d" % (options.start, options.end) - outputFileName += ".png" - plotter = RPlotter(outputFileName, options.verbosity) - plotter.setXLabel("Position on %s (in %s)" % (chromosome.replace("_", " "), unit)) - plotter.setYLabel("# reads") - if options.bothStrands: - plotter.setImageSize(1000, 300) - else: - plotter.setImageSize(1000, 200) - if options.height != None: - plotter.setHeight(options.height) - if options.width != None: - plotter.setWidth(options.width) - if options.yMax != None: - plotter.setMinimumY(options.yMin) - if options.yMax != None: - plotter.setMaximumY(options.yMax) - if options.bothStrands : - if options.raw: - plotter.addLine(divideKeyDict(binsPlus[chromosome], ratio)) - else: - plotter.addLine(divideKeyDict(densityPlus[chromosome], ratio)) - if options.raw: - plotter.addLine(divideKeyDict(binsMinus[chromosome], ratio)) - else: - plotter.addLine(divideKeyDict(densityMinus[chromosome], ratio)) - else: - if options.raw: - plotter.addLine(divideKeyDict(bins[chromosome], ratio)) - else: - plotter.addLine(divideKeyDict(density[chromosome], ratio)) - plotter.plot() - - if options.csv: - outputFileName = "%s" % (options.outputFileName) - if options.chromosome != None: - outputFileName += "_%s" % (options.chromosome) - if options.start != None and options.end != None: - outputFileName += ":%d-%d" % (options.start, options.end) - outputFileName += ".csv" - csvHandle = open(outputFileName, "w") - for slice in range(start / sliceSize, maxSlice + 1): - csvHandle.write(";%d-%d" % (slice * sliceSize + 1, (slice+1) * sliceSize)) - csvHandle.write("\n") - if options.bothStrands: - for chromosome in densityPlus: - if len(densityPlus[chromosome]) > 0: - csvHandle.write("%s [+]" % (chromosome)) - for slice in sorted(densityPlus[chromosome].keys()): - csvHandle.write(";%.2f" % (densityPlus[chromosome][slice])) - csvHandle.write("\n") - if len(densityMinus[chromosome]) > 0: - csvHandle.write("%s [-]" % (chromosome)) - for slice in sorted(densityPlus[chromosome].keys()): - csvHandle.write(";%.2f" % (-densityMinus[chromosome][slice])) - csvHandle.write("\n") - else: - for chromosome in density: - if len(density[chromosome]) > 0: - csvHandle.write(chromosome) - for slice in sorted(density[chromosome].keys()): - csvHandle.write(";%.2f" % (density[chromosome][slice])) - csvHandle.write("\n") - csvHandle.close() - - if options.gff: - chromosome = "" if options.chromosome == None else options.chromosome.capitalize() - start = "" if options.start == None else "%d" % (options.start) - end = "" if options.end == None else "%d" % (options.end) - link1 = "" if options.start == None and options.end == None else ":" - link2 = "" if options.start == None and options.end == None else "-" - writer = Gff3Writer("%s%s%s%s%s.gff3" % (options.outputFileName, link1, start, link2, end), options.verbosity) - cpt = 1 - if options.raw: - valuesPlus = binsPlus - valuesMinus = binsMinus - values = bins - else: - valuesPlus = densityPlus - valuesMinus = densityMinus - values = density - if options.bothStrands: - for chromosome in values: - for slice in valuesPlus[chromosome]: - writer.addTranscript(setTranscript(chromosome, 1, slice, slice + sliceSize, "region%d" % (cpt), valuesPlus[chromosome][slice])) - cpt += 1 - for slice in valuesMinus[chromosome]: - writer.addTranscript(setTranscript(chromosome, -1, slice, slice + sliceSize, "region%d" % (cpt), - valuesMinus[chromosome][slice])) - cpt += 1 - else: - for chromosome in values: - for slice in values[chromosome]: - writer.addTranscript(setTranscript(chromosome, 1, slice, slice + sliceSize, "region%d" % (cpt), values[chromosome][slice])) - cpt += 1 - writer.write() - -