# HG changeset patch # User m-zytnicki # Date 1380525566 14400 # Node ID 169d364ddd916d691503f5e47a1d619be2b86b5b # Parent e454402ba9d98be7c6f48d0d61af3ccc57fb4d2d Uploaded diff -r e454402ba9d9 -r 169d364ddd91 SMART/Java/Python/CompareOverlapping.py --- a/SMART/Java/Python/CompareOverlapping.py Wed Sep 18 08:51:22 2013 -0400 +++ b/SMART/Java/Python/CompareOverlapping.py Mon Sep 30 03:19:26 2013 -0400 @@ -206,6 +206,8 @@ if self._verbosity > 2: print "Creating %s NC-list..." % (TYPETOSTRING[type]) self._convertedFileNames[type] = "%s_%d_%d.ncl" % (self._inputFileNames[type], self._randInt, type) + if "SMARTTMPPATH" in os.environ: + self._convertedFileNames[type] = os.path.join(os.environ["SMARTTMPPATH"], self._convertedFileNames[type]) ncLists = ConvertToNCList(self._verbosity) ncLists.setInputFileName(self._inputFileNames[type], self._inputFileFormats[type]) ncLists.setOutputFileName(self._convertedFileNames[type]) diff -r e454402ba9d9 -r 169d364ddd91 SMART/Java/Python/CompareOverlappingSmallQuery.py --- a/SMART/Java/Python/CompareOverlappingSmallQuery.py Wed Sep 18 08:51:22 2013 -0400 +++ b/SMART/Java/Python/CompareOverlappingSmallQuery.py Mon Sep 30 03:19:26 2013 -0400 @@ -74,9 +74,6 @@ self.collinear = False self.pcOverlapQuery = False self.pcOverlapRef = False - self.minOverlap = False - self.included = False - self.including = False self.bins = {} self.overlaps = {} self.notOverlapping = False @@ -110,13 +107,6 @@ self.pcOverlapQuery = pcOverlapQuery self.pcOverlapRef = pcOverlapRef - def setMinOverlap(self, minOverlap): - self.minOverlap = minOverlap - - def setInclude(self, included, including): - self.included = included - self.including = including - def includeNotOverlapping(self, boolean): self.notOverlapping = boolean @@ -145,18 +135,12 @@ return False if self.antisense and queryTranscript.getDirection() == refTranscript.getDirection(): return False - if self.included and not refTranscript.include(queryTranscript): - return False - if self.including and not queryTranscript.include(refTranscript): - return False querySize = queryTranscript.getSize() if self.pcOverlapQuery and not queryTranscript.overlapWithExon(refTranscript, int(querySize * self.pcOverlapQuery / 100.0)): return False refSize = refTranscript.getSize() if self.pcOverlapRef and not queryTranscript.overlapWithExon(refTranscript, int(refSize * self.pcOverlapRef / 100.0)): return False - if self.minOverlap and not queryTranscript.overlapWithExon(refTranscript, self.minOverlap): - return False return True def _alterTranscript(self, transcript, type): @@ -237,11 +221,8 @@ parser.add_option("-d", "--distance", dest="distance", action="store", default=0, type="int", help="accept some distance between query and reference [format: int]") parser.add_option("-c", "--collinear", dest="collinear", action="store_true", default=False, help="provide collinear features [format: bool] [default: false]") parser.add_option("-a", "--antisense", dest="antisense", action="store_true", default=False, help="provide antisense features [format: bool] [default: false]") - parser.add_option("-m", "--minOverlap", dest="minOverlap", action="store", default=False, type="int", help="min. #nt overlap [format: bool] [default: false]") parser.add_option("-p", "--pcOverlapQuery", dest="pcOverlapQuery", action="store", default=False, type="int", help="min. % overlap of the query [format: bool] [default: false]") parser.add_option("-P", "--pcOverlapRef", dest="pcOverlapRef", action="store", default=False, type="int", help="min. % overlap of the reference [format: bool] [default: false]") - parser.add_option("-k", "--included", dest="included", action="store_true", default=False, help="provide query elements which are nested in reference elements [format: bool] [default: false]") - parser.add_option("-K", "--including", dest="including", action="store_true", default=False, help="provide query elements in which reference elements are nested [format: bool] [default: false]") parser.add_option("-x", "--exclude", dest="exclude", action="store_true", default=False, help="invert the match [format: bool] [default: false]") parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]") (options, args) = parser.parse_args() @@ -255,7 +236,5 @@ cosq.setCollinear(options.collinear) cosq.setAntisense(options.antisense) cosq.setMinPercentOverlap(options.pcOverlapQuery, options.pcOverlapRef) - cosq.setMinOverlap(options.minOverlap) - cosq.setInclude(options.included, options.including) cosq.setInvert(options.exclude) cosq.run() diff -r e454402ba9d9 -r 169d364ddd91 SMART/Java/Python/CompareOverlappingSmallRef.py --- a/SMART/Java/Python/CompareOverlappingSmallRef.py Wed Sep 18 08:51:22 2013 -0400 +++ b/SMART/Java/Python/CompareOverlappingSmallRef.py Mon Sep 30 03:19:26 2013 -0400 @@ -72,11 +72,8 @@ self.antisense = False self.collinear = False self.distance = None - self.minOverlap = False self.pcOverlapQuery = False self.pcOverlapRef = False - self.included = False - self.including = False self.bins = {} self.notOverlapping = False @@ -109,10 +106,6 @@ self.pcOverlapQuery = pcOverlapQuery self.pcOverlapRef = pcOverlapRef - def setInclude(self, included, including): - self.included = included - self.including = including - def includeNotOverlapping(self, boolean): self.notOverlapping = boolean @@ -146,18 +139,12 @@ return False if self.antisense and queryTranscript.getDirection() == refTranscript.getDirection(): return False - if self.included and not queryTranscript.isIncluded(refTranscript): - return False - if self.including and not refTranscript.isIncluded(queryTranscript): - return False querySize = queryTranscript.getSize() if self.pcOverlapQuery and not queryTranscript.overlapWithExon(refTranscript, int(querySize * self.pcOverlapQuery / 100.0)): return False refSize = refTranscript.getSize() if self.pcOverlapRef and not queryTranscript.overlapWithExon(refTranscript, int(refSize * self.pcOverlapRef / 100.0)): return False - if self.minOverlap and not queryTranscript.overlapWithExon(refTranscript, self.minOverlap): - return False return True def _compareTranscript(self, queryTranscript): @@ -180,9 +167,11 @@ return overlaps def _updateTranscript(self, queryTranscript, overlaps): - queryTranscript.setTagValue("nbOverlaps", sum(overlaps.values())) if overlaps: + queryTranscript.setTagValue("nbOverlaps", sum(overlaps.values())) queryTranscript.setTagValue("overlapsWith", "--".join(overlaps.keys())[:100]) + else: + queryTranscript.setTagValue("nbOverlaps", "0") return queryTranscript def compare(self): @@ -222,17 +211,14 @@ parser.add_option("-j", "--input2", dest="inputFileName2", action="store", type="string", help="reference input file [compulsory] [format: file in transcript format given by -g]") parser.add_option("-g", "--format2", dest="format2", action="store", type="string", help="format of previous file [compulsory] [format: transcript file format]") parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [format: output file in GFF3 format]") - parser.add_option("-O", "--notOverlapping", dest="notOverlapping", action="store_true", default=False, help="also output not overlapping data [format: bool] [default: false]") - parser.add_option("-d", "--distance", dest="distance", action="store", default=0, type="int", help="accept some distance between query and reference [format: int]") - parser.add_option("-c", "--collinear", dest="collinear", action="store_true", default=False, help="provide collinear features [format: bool] [default: false]") - parser.add_option("-a", "--antisense", dest="antisense", action="store_true", default=False, help="provide antisense features [format: bool] [default: false]") - parser.add_option("-m", "--minOverlap", dest="minOverlap", action="store", default=False, type="int", help="min. #nt overlap [format: bool] [default: false]") - parser.add_option("-p", "--pcOverlapQuery", dest="pcOverlapQuery", action="store", default=False, type="int", help="min. % overlap of the query [format: bool] [default: false]") - parser.add_option("-P", "--pcOverlapRef", dest="pcOverlapRef", action="store", default=False, type="int", help="min. % overlap of the reference [format: bool] [default: false]") - parser.add_option("-k", "--included", dest="included", action="store_true", default=False, help="provide query elements which are nested in reference elements [format: bool] [default: false]") - parser.add_option("-K", "--including", dest="including", action="store_true", default=False, help="provide query elements in which reference elements are nested [format: bool] [default: false]") - parser.add_option("-x", "--exclude", dest="exclude", action="store_true", default=False, help="invert the match [format: bool] [default: false]") - parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]") + parser.add_option("-O", "--notOverlapping", dest="notOverlapping", action="store_true", default=False, help="also output not overlapping data [format: bool] [default: false]") + parser.add_option("-d", "--distance", dest="distance", action="store", default=0, type="int", help="accept some distance between query and reference [format: int]") + parser.add_option("-c", "--collinear", dest="collinear", action="store_true", default=False, help="provide collinear features [format: bool] [default: false]") + parser.add_option("-a", "--antisense", dest="antisense", action="store_true", default=False, help="provide antisense features [format: bool] [default: false]") + parser.add_option("-p", "--pcOverlapQuery", dest="pcOverlapQuery", action="store", default=False, type="int", help="min. % overlap of the query [format: bool] [default: false]") + parser.add_option("-P", "--pcOverlapRef", dest="pcOverlapRef", action="store", default=False, type="int", help="min. % overlap of the reference [format: bool] [default: false]") + parser.add_option("-x", "--exclude", dest="exclude", action="store_true", default=False, help="invert the match [format: bool] [default: false]") + parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]") (options, args) = parser.parse_args() cosr = CompareOverlappingSmallRef(options.verbosity) @@ -242,9 +228,7 @@ cosr.includeNotOverlapping(options.notOverlapping) cosr.setDistance(options.distance) cosr.setAntisense(options.antisense) - cosr.setInclude(options.included, options.including) cosr.setInvert(options.exclude) - cosr.setMinOverlap(options.minOverlap) cosr.setMinPercentOverlap(options.pcOverlapQuery, options.pcOverlapRef) cosr.run() diff -r e454402ba9d9 -r 169d364ddd91 SMART/Java/Python/ComputeCoverage.py --- a/SMART/Java/Python/ComputeCoverage.py Wed Sep 18 08:51:22 2013 -0400 +++ b/SMART/Java/Python/ComputeCoverage.py Mon Sep 30 03:19:26 2013 -0400 @@ -130,7 +130,7 @@ parser.add_option("-g", "--format2", dest="format2", action="store", type="string", help="format of the second file [compulsory] [format: transcript file format]") parser.add_option("-t", "--introns", dest="introns", action="store_true", default=False, help="also include introns [format: boolean] [default: false]") parser.add_option("-o", "--output", dest="outputFileName", action="store", default=None, type="string", help="output file [format: output file in GFF3 format]") - parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [default: 1] [format: int]") + parser.add_option("-v", "--verbosity", dest="verbosity", action="store", type="int", help="trace level [default: 1] [format: int]") (options, args) = parser.parse_args() computer = CoverageComputer(options.verbosity) diff -r e454402ba9d9 -r 169d364ddd91 SMART/Java/Python/CountLoci.py --- a/SMART/Java/Python/CountLoci.py Wed Sep 18 08:51:22 2013 -0400 +++ b/SMART/Java/Python/CountLoci.py Mon Sep 30 03:19:26 2013 -0400 @@ -85,7 +85,7 @@ "five_prime_UTR": "%sfive.gff3" % (self.outputBase), \ "three_prime_UTR": "%sthree.gff3" % (self.outputBase), \ "mRNA": "%smrna.gff3" % (self.outputBase), \ - "ncRNA": "%sncRNA.gff3" % (self.outputBase), \ + "ncRNA": "%sncRNA.gff3" % (self.outputBase), \ "transposable_element_gene": "%sTE.gff3" % (self.outputBase), \ "vic": "%svicinity.gff3" % (self.outputBase)} self.tmpFileNames.extend(self.referenceFiles.values()) diff -r e454402ba9d9 -r 169d364ddd91 SMART/Java/Python/GetDistribution.py --- a/SMART/Java/Python/GetDistribution.py Wed Sep 18 08:51:22 2013 -0400 +++ b/SMART/Java/Python/GetDistribution.py Mon Sep 30 03:19:26 2013 -0400 @@ -45,28 +45,33 @@ class GetDistribution(object): def __init__(self, verbosity): - self.verbosity = verbosity - self.sizes = None - self.twoStrands = False - self.start = 1 - self.names = ["nbElements"] - self.average = False - self.nbValues = {} - self.height = 300 - self.width = 600 - self.colors = None - self.gffFileName = None - self.csvFileName = None - self.yMin = None - self.yMax = None - self.chromosome = None - self.merge = False - self.nbTranscripts = None + self.verbosity = verbosity + self.sizes = None + self.nbBins = None + self.sliceSize = None + self.twoStrands = False + self.start = 1 + self.names = ["nbElements"] + self.average = False + self.nbValues = {} + self.height = 300 + self.width = 600 + self.dots = False + self.colors = None + self.gffFileName = None + self.csvFileName = None + self.yMin = None + self.yMax = None + self.chromosome = None + self.merge = False + self.nbTranscripts = None + self.factors = None + self.thicknessCurve = 1 + self.sizePoliceLegend = 1.5 - def setInputFile(self, fileName, format): - chooser = ParserChooser(self.verbosity) - chooser.findFormat(format) - self.parser = chooser.getParser(fileName) + def setInputFiles(self, fileNames, format): + self.fileNames = fileNames + self.format = format def setReferenceFile(self, fileName): if fileName == None: @@ -77,7 +82,7 @@ self.maxSize = max(self.sizes.values()) def setRegion(self, chromosome, start, end): - if chromosome == None: + if chromosome == None or start == None or end == None: return self.maxSize = options.end self.sizes = {chromosome: end} @@ -90,13 +95,20 @@ self.outputFileName = fileName def setNbBins(self, nbBins): - self.nbBins = nbBins + if nbBins != None: + self.nbBins = int(nbBins) + + def setBinSize(self, binSize): + if binSize != None: + self.sliceSize = int(binSize) def set2Strands(self, twoStrands): self.twoStrands = twoStrands def setNames(self, names): self.names = names + if len(self.names) == 1 and len(self.fileNames) > 1: + self.names = ["file %d" % (i+1) for i in range(len(self.fileNames))] def setAverage(self, average): self.average = average @@ -104,10 +116,16 @@ def setNormalization(self, normalization): self.normalization = normalization + def setNormalizationFactors(self, factors): + self.factors = dict([name, 1.0] for name in self.names) if factors == None else dict(zip(self.names, factors)) + def setImageSize(self, height, width): self.height = height self.width = width + def setDots(self, dots): + self.dots = dots + def setYLimits(self, yMin, yMax): self.yMin = yMin self.yMax = yMax @@ -124,15 +142,29 @@ def mergePlots(self, merge): self.merge = merge + def setThicknessCurve(self, thickness) : + self.thickness = thickness + + def setSizePoliceLegend(self, sizePoliceLegend): + self.sizePoliceLegend = sizePoliceLegend + def _estimateSizes(self): - progress = UnlimitedProgress(10000, "Reading input for chromosome size estimate", self.verbosity) - self.sizes = {} - for self.nbTranscripts, transcript in enumerate(self.parser.getIterator()): - chromosome = transcript.getChromosome() - start = transcript.getStart() - self.sizes[chromosome] = max(start, self.sizes.get(chromosome, 0)) - progress.inc() - progress.done() + self.sizes = {} + self.nbTranscripts = {} + for fileName in self.fileNames: + progress = UnlimitedProgress(10000, "Reading %s for chromosome size estimate" % (fileName), self.verbosity) + parserChooser = ParserChooser(self.verbosity) + parserChooser.findFormat(self.format) + parser = parserChooser.getParser(fileName) + for nbTranscripts, transcript in enumerate(parser.getIterator()): + if transcript.__class__.__name__ == "Mapping": + transcript = transcript.getTranscript() + chromosome = transcript.getChromosome() + start = transcript.getStart() + self.sizes[chromosome] = max(start, self.sizes.get(chromosome, 0)) + progress.inc() + progress.done() + self.nbTranscripts[fileName] = nbTranscripts def _computeSliceSize(self): if self.nbBins == 0: @@ -156,37 +188,50 @@ self.bins[chromosome][name][strand] = dict([(i * self.sliceSize + 1, 0.0) for i in range(self.start / self.sliceSize, self.sizes[chromosome] / self.sliceSize + 1)]) def _populateBins(self): - if self.nbTranscripts == None: - progress = UnlimitedProgress(10000, "Counting data", self.verbosity) - else: - progress = Progress(self.nbTranscripts, "Counting data", self.verbosity) - for transcript in self.parser.getIterator(): - if transcript.__class__.__name__ == "Mapping": - transcript = transcript.getTranscript() - progress.inc() - chromosome = transcript.getChromosome() - start = transcript.getStart() - if self.chromosome and (chromosome != self.chromosome or start < self.start or start > self.end): - continue - strand = transcript.getDirection() if self.twoStrands else 0 - if self.nbBins != 0: - bin = (start / self.sliceSize) * self.sliceSize + 1 + for id, fileName in enumerate(self.fileNames): + if self.nbTranscripts == None: + progress = UnlimitedProgress(10000, "Counting data", self.verbosity) else: - bin = start - for name in self.names: - value = float(transcript.tags.get(name, 1)) - self.bins[chromosome][name][strand][bin] = self.bins[chromosome][name][strand].get(bin, 0) + value - self.nbValues[name] = self.nbValues.get(name, 0) + value - progress.done() + progress = Progress(self.nbTranscripts[fileName], "Counting data", self.verbosity) + parserChooser = ParserChooser(self.verbosity) + parserChooser.findFormat(self.format) + parser = parserChooser.getParser(fileName) + for transcript in parser.getIterator(): + if transcript.__class__.__name__ == "Mapping": + transcript = transcript.getTranscript() + progress.inc() + chromosome = transcript.getChromosome() + start = transcript.getStart() + if self.chromosome and (chromosome != self.chromosome or start < self.start or start > self.end): + continue + strand = transcript.getDirection() if self.twoStrands else 0 + if self.nbBins != 0: + bin = (start / self.sliceSize) * self.sliceSize + 1 + else: + bin = start + if len(self.fileNames) > 1: + nbElements = transcript.getTagValue("nbElements") if "nbElements" in transcript.getTagNames() else 1 + name = self.names[id] + self.bins[chromosome][name][strand][bin] = self.bins[chromosome][name][strand].get(bin, 0) + nbElements + self.nbValues[name] = self.nbValues.get(name, 0) + nbElements + else: + for name in self.names: + value = float(transcript.tags.get(name, 1)) + self.bins[chromosome][name][strand][bin] = self.bins[chromosome][name][strand].get(bin, 0) + value + self.nbValues[name] = self.nbValues.get(name, 0) + value + progress.done() - def _normalize(self): - average = float(sum(self.nbValues)) / len(self.nbValues.keys()) - factors = dict([name, float(average) / self.nbValues[name]] for name in self.nbValues) + def _normalizeFactors(self): for chromosome in self.bins: for name in self.bins[chromosome]: for strand in self.bins[chromosome][name]: for bin in self.bins[chromosome][name][strand]: - self.bins[chromosome][name][strand][bin] *= factors[name] + self.bins[chromosome][name][strand][bin] *= self.factors[name] + + def _normalize(self): + average = float(sum(self.nbValues.values())) / len(self.nbValues.keys()) + self.factors = dict([name, float(average) / self.nbValues[name]] for name in self.nbValues) + self._normalizeFactors() def _computeAverage(self): for chromosome in self.bins: @@ -198,6 +243,10 @@ def _getPlotter(self, chromosome): plot = RPlotter("%s_%s.png" % (os.path.splitext(self.outputFileName)[0], chromosome), self.verbosity) plot.setImageSize(self.width, self.height) + plot.setLineWidth(self.thickness) + plot.setSizePoliceLegend(self.sizePoliceLegend) + if self.dots: + plot.setPoints(True) if self.sizes[chromosome] <= 1000: unit = "nt." ratio = 1.0 @@ -212,10 +261,12 @@ if self.yMax != None: plot.setMaximumY(self.yMax) plot.setXLabel("Position on %s (in %s)" % (chromosome.replace("_", " "), unit)) - plot.setLegend(True) + if len(self.names) > 1: + plot.setLegend(True, True) for i, name in enumerate(self.bins[chromosome]): for strand in self.bins[chromosome][name]: - fullName = "%s %s" % (name.replace("_", " ")[:6], STRANDTOSTR[strand]) + #fullName = "%s %s" % (name.replace("_", " ")[:6], STRANDTOSTR[strand]) + fullName = name.replace("_", " ")[:6] factor = 1 if strand == 0 else strand correctedLine = dict([(key / ratio, value * factor) for key, value in self.bins[chromosome][name][strand].iteritems()]) plot.addLine(correctedLine, fullName, self.colors[i] if self.colors else None) @@ -299,11 +350,14 @@ def run(self): if self.sizes == None: self._estimateSizes() - self._computeSliceSize() + if self.sliceSize == None: + self._computeSliceSize() self._initBins() self._populateBins() if self.normalization: self._normalize() + if self.factors != None: + self._normalizeFactors() if self.average: self._computeAverage() self._plot() @@ -318,34 +372,40 @@ description = "Get Distribution v1.0.2: Get the distribution of the genomic coordinates on a genome. [Category: Visualization]" parser = OptionParser(description = description) - parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]") - parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of the input file [compulsory] [format: transcript file format]") - parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in GFF3 format]") - parser.add_option("-r", "--reference", dest="referenceFileName", action="store", default=None, type="string", help="file containing the genome [format: file in FASTA format]") - parser.add_option("-b", "--nbBins", dest="nbBins", action="store", default=1000, type="int", help="number of bins [default: 1000] [format: int]") - parser.add_option("-2", "--bothStrands", dest="bothStrands", action="store_true", default=False, help="plot one curve per strand [format: bool] [default: false]") - parser.add_option("-c", "--chromosome", dest="chromosome", action="store", default=None, type="string", help="plot only a chromosome [format: string]") - parser.add_option("-s", "--start", dest="start", action="store", default=None, type="int", help="start from a given region [format: int]") - parser.add_option("-e", "--end", dest="end", action="store", default=None, type="int", help="end from a given region [format: int]") - parser.add_option("-y", "--yMin", dest="yMin", action="store", default=None, type="int", help="minimum value on the y-axis to plot [format: int]") - parser.add_option("-Y", "--yMax", dest="yMax", action="store", default=None, type="int", help="maximum value on the y-axis to plot [format: int]") - parser.add_option("-x", "--csv", dest="csv", action="store", default=None, help="write a .csv file [format: output file in CSV format] [default: None]") - parser.add_option("-g", "--gff", dest="gff", action="store", default=None, help="also write GFF3 file [format: output file in GFF format] [default: None]") - parser.add_option("-H", "--height", dest="height", action="store", default=300, type="int", help="height of the graphics [format: int] [default: 300]") - parser.add_option("-W", "--width", dest="width", action="store", default=600, type="int", help="width of the graphics [format: int] [default: 1000]") - parser.add_option("-a", "--average", dest="average", action="store_true", default=False, help="plot average (instead of sum) [default: false] [format: boolean]") - parser.add_option("-n", "--names", dest="names", action="store", default="nbElements", type="string", help="name for the tags (separated by commas and no space) [default: None] [format: string]") - parser.add_option("-l", "--color", dest="colors", action="store", default=None, type="string", help="color of the lines (separated by commas and no space) [format: string]") - parser.add_option("-z", "--normalize", dest="normalize", action="store_true", default=False, help="normalize data (when panels are different) [format: bool] [default: false]") - parser.add_option("-m", "--merge", dest="mergePlots", action="store_true", default=False, help="merge all plots in one figure [format: bool] [default: false]") - parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [default: 1] [format: int]") + parser.add_option("-i", "--input", dest="inputFileNames", action="store", type="string", help="input files separated by commas [compulsory] [format: string]") + parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of the input file [compulsory] [format: transcript file format]") + parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in GFF3 format]") + parser.add_option("-r", "--reference", dest="referenceFileName", action="store", default=None, type="string", help="file containing the genome [format: file in FASTA format]") + parser.add_option("-b", "--nbBins", dest="nbBins", action="store", default=1000, type="int", help="number of bins [default: 1000] [format: int]") + parser.add_option("-B", "--binSize", dest="binSize", action="store", default=None, type="int", help="bin size [default: None] [format: int]") + parser.add_option("-2", "--bothStrands", dest="bothStrands", action="store_true", default=False, help="plot one curve per strand [format: bool] [default: false]") + parser.add_option("-c", "--chromosome", dest="chromosome", action="store", default=None, type="string", help="plot only a chromosome [format: string]") + parser.add_option("-s", "--start", dest="start", action="store", default=None, type="int", help="start from a given region [format: int]") + parser.add_option("-e", "--end", dest="end", action="store", default=None, type="int", help="end from a given region [format: int]") + parser.add_option("-y", "--yMin", dest="yMin", action="store", default=None, type="int", help="minimum value on the y-axis to plot [format: int]") + parser.add_option("-Y", "--yMax", dest="yMax", action="store", default=None, type="int", help="maximum value on the y-axis to plot [format: int]") + parser.add_option("-x", "--csv", dest="csv", action="store", default=None, help="write a .csv file [format: output file in CSV format] [default: None]") + parser.add_option("-g", "--gff", dest="gff", action="store", default=None, help="also write GFF3 file [format: output file in GFF format] [default: None]") + parser.add_option("-H", "--height", dest="height", action="store", default=500, type="int", help="height of the graphics [format: int] [default: 300]") + parser.add_option("-W", "--width", dest="width", action="store", default=800, type="int", help="width of the graphics [format: int] [default: 1000]") + parser.add_option("-t", "--thickness", dest="lineThickness", action="store", default=1, type="int", help="thickness of the lines [format : int] [default : 1]") + parser.add_option("-d", "--policeLegend", dest="sizePoliceLegend", action="store", default=1.5, type="float", help="size of the police of the legend [format : float] [default : 1.5]") + parser.add_option("-D", "--dots", dest="dots", action="store_true", default=False, help="plot dots instead of lines [format : bool] [default : false]") + parser.add_option("-a", "--average", dest="average", action="store_true", default=False, help="plot average (instead of sum) [default: false] [format: boolean]") + parser.add_option("-n", "--names", dest="names", action="store", default="nbElements", type="string", help="name for the tags (separated by commas and no space) [default: None] [format: string]") + parser.add_option("-l", "--color", dest="colors", action="store", default=None, type="string", help="color of the lines (separated by commas and no space) [format: string]") + parser.add_option("-z", "--normalize", dest="normalize", action="store_true", default=False, help="normalize data (when panels are different) [format: bool] [default: false]") + parser.add_option("-Z", "--normalizeFac", dest="normalizeFactors", action="store", default=None, help="normalize data with given factors (when panels are different) [format: string]") + parser.add_option("-m", "--merge", dest="mergePlots", action="store_true", default=False, help="merge all plots in one figure [format: bool] [default: false]") + parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [default: 1] [format: int]") (options, args) = parser.parse_args() gt = GetDistribution(options.verbosity) - gt.setInputFile(options.inputFileName, options.format) + gt.setInputFiles(options.inputFileNames.split(","), options.format) gt.setOutputFile(options.outputFileName) gt.setReferenceFile(options.referenceFileName) - gt.setNbBins(int(options.nbBins)) + gt.setNbBins(options.nbBins) + gt.setBinSize(options.binSize) gt.set2Strands(options.bothStrands) gt.setRegion(options.chromosome, options.start, options.end) gt.setNormalization(options.normalize) @@ -355,8 +415,12 @@ gt.writeGff(options.gff) gt.setImageSize(options.height, options.width) gt.setNames(options.names.split(",")) + gt.setThicknessCurve(options.lineThickness) + gt.setSizePoliceLegend(options.sizePoliceLegend) gt.setColors(None if options.colors == None else options.colors.split(",")) + gt.setDots(options.dots) gt.setNormalization(options.normalize) + gt.setNormalizationFactors(None if options.normalizeFactors == None else [float(factor) for factor in options.normalizeFactors.split(",")]) gt.mergePlots(options.mergePlots) gt.run() diff -r e454402ba9d9 -r 169d364ddd91 SMART/Java/Python/GetFlanking.py --- a/SMART/Java/Python/GetFlanking.py Wed Sep 18 08:51:22 2013 -0400 +++ b/SMART/Java/Python/GetFlanking.py Mon Sep 30 03:19:26 2013 -0400 @@ -44,190 +44,188 @@ TAG_REGION = "_region" TAGS_REGION = {-1: "_upstream", 0: "", 1: "_downstream"} TAGS_RREGION = {-1: "upstream", 0: "overlapping", 1: "downstream"} -TAGS_SENSE = {-1: "antisense", 0: "", 1: "collinear"} +TAGS_SENSE = {-1: "antisense", 0: "", 1: "colinear"} STRANDSTOSTR = {-1: "(-)", 0: "", 1: "(+)"} -def getOrderKey(transcript, direction, input): - if direction == 1: - if input == QUERY: - return (transcript.getEnd(), -transcript.getStart()) - return (transcript.getStart(), -transcript.getEnd()) - if input == QUERY: - return (-transcript.getStart(), transcript.getEnd()) - return (-transcript.getEnd(), transcript.getStart()) +def getOrderKey(transcript, direction): + if direction == 1: + return transcript.getEnd() + return - transcript.getStart() + +def isInGoodRegion(transcriptRef, transcriptQuery, direction): + if direction == 1: + return transcriptQuery.getEnd() > transcriptRef.getEnd() + return transcriptQuery.getStart() < transcriptRef.getStart() class GetFlanking(object): - def __init__(self, verbosity): - self.verbosity = verbosity - self.transcripts = dict([id, {}] for id in INPUTS) - self.directions = [] - self.noOverlap = False - self.colinear = False - self.antisense = False - self.distance = None - self.minDistance = None - self.maxDistance = None - self.tagName = "flanking" + def __init__(self, verbosity): + self.verbosity = verbosity + self.transcripts = dict([id, {}] for id in INPUTS) + self.directions = [] + self.noOverlap = False + self.colinear = False + self.antisense = False + self.distance = None + self.minDistance = None + self.maxDistance = None + self.tagName = "flanking" - def setInputFile(self, fileName, format, id): - chooser = ParserChooser(self.verbosity) - chooser.findFormat(format) - parser = chooser.getParser(fileName) - for transcript in parser.getIterator(): - chromosome = transcript.getChromosome() - if chromosome not in self.transcripts[id]: - self.transcripts[id][chromosome] = [] - self.transcripts[id][chromosome].append(transcript) + def setInputFile(self, fileName, format, id): + chooser = ParserChooser(self.verbosity) + chooser.findFormat(format) + parser = chooser.getParser(fileName) + for transcript in parser.getIterator(): + chromosome = transcript.getChromosome() + if chromosome not in self.transcripts[id]: + self.transcripts[id][chromosome] = [] + self.transcripts[id][chromosome].append(transcript) - def setOutputFile(self, fileName): - self.writer = TranscriptWriter(fileName, "gff3", self.verbosity) + def setOutputFile(self, fileName): + self.writer = TranscriptWriter(fileName, "gff3", self.verbosity) + + def addUpstreamDirection(self, upstream): + if upstream: + self.directions.append(-1) + + def addDownstreamDirection(self, downstream): + if downstream: + self.directions.append(1) - def addUpstreamDirection(self, upstream): - if upstream: - self.directions.append(-1) + def setColinear(self, colinear): + self.colinear = colinear + + def setAntisense(self, antisense): + self.antisense = antisense - def addDownstreamDirection(self, downstream): - if downstream: - self.directions.append(1) + def setNoOverlap(self, noOverlap): + self.noOverlap = noOverlap - def setColinear(self, colinear): - self.colinear = colinear + def setMinDistance(self, distance): + self.minDistance = distance + + def setMaxDistance(self, distance): + self.maxDistance = distance - def setAntisense(self, antisense): - self.antisense = antisense - - def setNoOverlap(self, noOverlap): - self.noOverlap = noOverlap + def setNewTagName(self, tagName): + self.tagName = tagName - def setMinDistance(self, distance): - self.minDistance = distance - - def setMaxDistance(self, distance): - self.maxDistance = distance - - def setNewTagName(self, tagName): - self.tagName = tagName + def match(self, transcriptRef, transcriptQuery, direction): + if self.noOverlap and transcriptRef.overlapWith(transcriptQuery): + return False + if self.colinear and transcriptRef.getDirection() != transcriptQuery.getDirection(): + return False + if self.antisense and transcriptRef.getDirection() == transcriptQuery.getDirection(): + return False + if self.minDistance != None or self.maxDistance != None: + distance = transcriptRef.getDistance(transcriptQuery) + if self.minDistance != None and distance < self.minDistance: + return False + if self.maxDistance != None and distance > self.maxDistance: + return False + return True - def match(self, transcriptQuery, transcriptRef, direction): - #print "comparing", transcriptQuery, "with", transcriptRef, "on direction", direction - if direction == 1 and transcriptRef.getEnd() < transcriptQuery.getStart(): - return False - if direction == -1 and transcriptQuery.getEnd() < transcriptRef.getStart(): - return False - if self.noOverlap and transcriptRef.overlapWith(transcriptQuery): - return False - if self.colinear and transcriptRef.getDirection() != transcriptQuery.getDirection(): - return False - if self.antisense and transcriptRef.getDirection() == transcriptQuery.getDirection(): - return False - if self.minDistance != None or self.maxDistance != None: - distance = transcriptRef.getDistance(transcriptQuery) - if self.minDistance != None and distance < self.minDistance: - return False - if self.maxDistance != None and distance > self.maxDistance: - return False - return True - - def getFlanking(self, chromosome, direction): - if chromosome not in self.transcripts[REFERENCE]: - return - sortedTranscripts = dict([id, {}] for id in INPUTS) - for id in INPUTS: - sortedTranscripts[id] = sorted(self.transcripts[id][chromosome], key = lambda t: getOrderKey(t, direction, id)) - refIndex = 0 - progress = Progress(len(sortedTranscripts[QUERY]), "Reading chr %s %s" % (chromosome, STRANDSTOSTR[direction]), self.verbosity) - for query in sortedTranscripts[QUERY]: - #print "Q: ", query - #print "R1: ", sortedTranscripts[REFERENCE][refIndex] - while not self.match(query, sortedTranscripts[REFERENCE][refIndex], direction): - refIndex += 1 - if refIndex == len(sortedTranscripts[REFERENCE]): - progress.done() - #print "done" - return - #print "R2: ", sortedTranscripts[REFERENCE][refIndex] - self.flankings[query][direction] = sortedTranscripts[REFERENCE][refIndex] - progress.inc() - progress.done() + def getFlanking(self, direction): + for chromosome in sorted(self.transcripts[REFERENCE].keys()): + if chromosome not in self.transcripts[QUERY]: + continue + sortedTranscripts = dict([id, {}] for id in INPUTS) + for id in INPUTS: + sortedTranscripts[id] = sorted(self.transcripts[id][chromosome], key = lambda t: getOrderKey(t, direction)) + refIndex = 0 + currentRefs = [] + outputs = set() + progress = Progress(len(sortedTranscripts[QUERY]), "Reading chr %s %s" % (chromosome, STRANDSTOSTR[direction]), self.verbosity) + for query in sortedTranscripts[QUERY]: + while refIndex < len(sortedTranscripts[REFERENCE]) and isInGoodRegion(sortedTranscripts[REFERENCE][refIndex], query, direction): + currentRefs.append(sortedTranscripts[REFERENCE][refIndex]) + refIndex += 1 + nextCurrentRefs = [] + for currentRef in currentRefs: + if self.match(currentRef, query, direction): + if currentRef not in self.flankings: + self.flankings[currentRef] = {} + self.flankings[currentRef][direction * currentRef.getDirection()] = query + else: + nextCurrentRefs.append(currentRef) + currentRefs = nextCurrentRefs + progress.inc() + progress.done() - def setTags(self, query, reference, direction): - refName = reference.getTagValue("ID") - if refName == None: - refName = reference.getName() - if refName == None: - refName = reference.__str__() - query.setTagValue("%s%s" % (self.tagName, TAGS_REGION[direction*query.getDirection()]), refName) - query.setTagValue("%s_%s%s" % (TAG_DISTANCE, self.tagName, TAGS_REGION[direction*query.getDirection()]), query.getDistance(reference)) - query.setTagValue("%s_%s" % (TAG_SENSE, self.tagName), TAGS_SENSE[query.getDirection() * reference.getDirection()]) - if direction == 0: - query.setTagValue("%s_%s" % (TAG_REGION, self.tagName), TAGS_RREGION[cmp(query.getRelativeDistance(reference), 0)]) - for tag in reference.getTagNames(): - if tag not in ("quality", "feature"): - query.setTagValue("%s%s_%s" % (self.tagName, TAGS_REGION[direction*query.getDirection()], tag), reference.getTagValue(tag)) - return query + def setTags(self, query, reference, direction): + refName = reference.getTagValue("ID") + if refName == None: + refName = reference.getName() + if refName == None: + refName = reference.__str__() + query.setTagValue("%s%s" % (self.tagName, TAGS_REGION[direction]), refName) + query.setTagValue("%s_%s%s" % (TAG_DISTANCE, self.tagName, TAGS_REGION[direction]), query.getDistance(reference)) + if direction == 0: + query.setTagValue("%s_%s" % (TAG_SENSE, self.tagName), TAGS_SENSE[query.getDirection() * reference.getDirection()]) + query.setTagValue("%s_%s" % (TAG_REGION, self.tagName), TAGS_RREGION[cmp(query.getRelativeDistance(reference), 0)]) + for tag in reference.getTagNames(): + if tag not in ("quality", "feature"): + query.setTagValue("%s%s_%s" % (self.tagName, TAGS_REGION[direction], tag), reference.getTagValue(tag)) + return query - def write(self): - progress = Progress(len(self.flankings.keys()), "Printing data", self.verbosity) - for transcriptQuery in self.flankings.keys(): - if not self.flankings[transcriptQuery]: - self.writer.addTranscript(transcriptQuery) - elif self.directions: - for direction in self.directions: - #relativeDirection = direction if transcriptQuery.getDirection() == 1 else - direction - relativeDirection = direction * transcriptQuery.getDirection() - if relativeDirection in self.flankings[transcriptQuery]: - transcriptRef = self.flankings[transcriptQuery][relativeDirection] - transcriptQuery = self.setTags(transcriptQuery, transcriptRef, relativeDirection) - self.writer.addTranscript(transcriptQuery) - else: - transcriptRef = sorted(self.flankings[transcriptQuery].values(), key = lambda transcriptRef: transcriptQuery.getDistance(transcriptRef))[0] - self.writer.addTranscript(self.setTags(transcriptQuery, transcriptRef, 0)) - progress.inc() - progress.done() + def write(self): + outputs = set() + progress = Progress(len(self.flankings.keys()), "Printing data", self.verbosity) + for transcriptRef in self.flankings.keys(): + if self.directions: + for direction in self.directions: + if direction in self.flankings[transcriptRef]: + query = self.flankings[transcriptRef][direction] + outputs.add(self.setTags(query, transcriptRef, direction)) + else: + if self.flankings[transcriptRef]: + query = sorted(self.flankings[transcriptRef].values(), key = lambda query: query.getDistance(transcriptRef))[0] + outputs.add(self.setTags(query, transcriptRef, 0)) + progress.inc() + for transcript in sorted(list(outputs), key = lambda flanking: (flanking.getChromosome(), flanking.getStart(), flanking.getEnd())): + self.writer.addTranscript(transcript) + self.writer.close() + progress.done() - def run(self): - for chromosome in sorted(self.transcripts[QUERY].keys()): - self.flankings = dict([query, {}] for query in self.transcripts[QUERY][chromosome]) - for direction in STRANDS: - #print "comparison", chromosome, direction - self.getFlanking(chromosome, direction) - self.write() - self.writer.close() + def run(self): + self.flankings = {} + for direction in STRANDS: + self.getFlanking(direction) + self.write() if __name__ == "__main__": - - description = "Get Flanking v1.0.1: Get the flanking regions of a set of reference. [Category: Data Selection]" + + description = "Get Flanking v1.0.1: Get the flanking regions of a set of reference. [Category: Data Selection]" - parser = OptionParser(description = description) - parser.add_option("-i", "--input1", dest="inputFileName1", action="store", type="string", help="query input file [compulsory] [format: file in transcript format given by -f]") - parser.add_option("-f", "--format1", dest="format1", action="store", type="string", help="format of previous file [compulsory] [format: transcript file format]") - parser.add_option("-j", "--input2", dest="inputFileName2", action="store", type="string", help="reference input file [compulsory] [format: file in transcript format given by -g]") - parser.add_option("-g", "--format2", dest="format2", action="store", type="string", help="format of previous file [compulsory] [format: transcript file format]") - parser.add_option("-5", "--upstream", dest="upstream", action="store_true", default=False, help="output upstream elements [format: boolean] [default: False]") - parser.add_option("-3", "--downstream", dest="downstream", action="store_true", default=False, help="output downstream elements [format: boolean] [default: False]") - parser.add_option("-c", "--colinear", dest="colinear", action="store_true", default=False, help="find first colinear element [format: boolean] [default: False]") - parser.add_option("-a", "--antisense", dest="antisense", action="store_true", default=False, help="find first anti-sense element [format: boolean] [default: False]") - parser.add_option("-e", "--noOverlap", dest="noOverlap", action="store_true", default=False, help="do not consider elements which are overlapping reference elements [format: boolean] [default: False]") - parser.add_option("-d", "--minDistance", dest="minDistance", action="store", default=None, type="int", help="minimum distance between 2 elements [format: int]") - parser.add_option("-D", "--maxDistance", dest="maxDistance", action="store", default=None, type="int", help="maximum distance between 2 elements [format: int]") - parser.add_option("-t", "--tag", dest="tagName", action="store", default="flanking", type="string", help="name of the new tag [format: string] [default: flanking]") - parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [format: output file in GFF3 format]") - parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]") - (options, args) = parser.parse_args() + parser = OptionParser(description = description) + parser.add_option("-i", "--input1", dest="inputFileName1", action="store", type="string", help="query input file [compulsory] [format: file in transcript format given by -f]") + parser.add_option("-f", "--format1", dest="format1", action="store", type="string", help="format of previous file [compulsory] [format: transcript file format]") + parser.add_option("-j", "--input2", dest="inputFileName2", action="store", type="string", help="reference input file [compulsory] [format: file in transcript format given by -g]") + parser.add_option("-g", "--format2", dest="format2", action="store", type="string", help="format of previous file [compulsory] [format: transcript file format]") + parser.add_option("-5", "--upstream", dest="upstream", action="store_true", default=False, help="output upstream elements [format: boolean] [default: False]") + parser.add_option("-3", "--downstream", dest="downstream", action="store_true", default=False, help="output downstream elements [format: boolean] [default: False]") + parser.add_option("-c", "--colinear", dest="colinear", action="store_true", default=False, help="find first colinear element [format: boolean] [default: False]") + parser.add_option("-a", "--antisense", dest="antisense", action="store_true", default=False, help="find first anti-sense element [format: boolean] [default: False]") + parser.add_option("-e", "--noOverlap", dest="noOverlap", action="store_true", default=False, help="do not consider elements which are overlapping reference elements [format: boolean] [default: False]") + parser.add_option("-d", "--minDistance", dest="minDistance", action="store", default=None, type="int", help="minimum distance between 2 elements [format: int]") + parser.add_option("-D", "--maxDistance", dest="maxDistance", action="store", default=None, type="int", help="maximum distance between 2 elements [format: int]") + parser.add_option("-t", "--tag", dest="tagName", action="store", default="flanking", type="string", help="name of the new tag [format: string] [default: flanking]") + parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [format: output file in GFF3 format]") + parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]") + (options, args) = parser.parse_args() - gf = GetFlanking(options.verbosity) - gf.setInputFile(options.inputFileName1, options.format1, QUERY) - gf.setInputFile(options.inputFileName2, options.format2, REFERENCE) - gf.setOutputFile(options.outputFileName) - gf.addUpstreamDirection(options.upstream) - gf.addDownstreamDirection(options.downstream) - gf.setColinear(options.colinear) - gf.setAntisense(options.antisense) - gf.setNoOverlap(options.noOverlap) - gf.setMinDistance(options.minDistance) - gf.setMaxDistance(options.maxDistance) - gf.setNewTagName(options.tagName) - gf.run() + gf = GetFlanking(options.verbosity) + gf.setInputFile(options.inputFileName1, options.format1, QUERY) + gf.setInputFile(options.inputFileName2, options.format2, REFERENCE) + gf.setOutputFile(options.outputFileName) + gf.addUpstreamDirection(options.upstream) + gf.addDownstreamDirection(options.downstream) + gf.setColinear(options.colinear) + gf.setAntisense(options.antisense) + gf.setNoOverlap(options.noOverlap) + gf.setMinDistance(options.minDistance) + gf.setMaxDistance(options.maxDistance) + gf.setNewTagName(options.tagName) + gf.run() diff -r e454402ba9d9 -r 169d364ddd91 SMART/Java/Python/GetReadSizes.py --- a/SMART/Java/Python/GetReadSizes.py Wed Sep 18 08:51:22 2013 -0400 +++ b/SMART/Java/Python/GetReadSizes.py Mon Sep 30 03:19:26 2013 -0400 @@ -52,6 +52,7 @@ self.sizes = {} self.factors = {} self.regions = None + self.percentage = False self.tmpDatName = None self.tmpRName = None self.width = 800 @@ -66,6 +67,8 @@ chooser.findFormat(format) for cpt, fileName in enumerate(fileNames): self.parsers[self.names[cpt]] = chooser.getParser(fileName) + if not self.factors: + self.factors = dict([name, 1.0] for name in self.names) def setOutputFileName(self, fileName): self.outputFileName = fileName @@ -82,12 +85,17 @@ self.colors = colors def setFactors(self, factors): - self.factors = dict(zip(self.names, factors)) + if factors: + self.factors = dict(zip(self.names, factors)) def setRegionsFile(self, fileName): if fileName != None: self._loadRegions(fileName) + def setPercentage(self, percentage): + self.percentage = percentage + self.xLab = "% reads" + def setImageSize(self, width, height): if width != None: self.width = width @@ -165,6 +173,14 @@ def _checkQuorum(self, region): return (max([sum(self.sizes[region][name].values()) for name in self.sizes[region]]) > 0) + def _computePercentage(self): + for region in self.sizes: + for name in self.sizes[region]: + if self.sizes[region][name]: + sumData = float(sum(self.sizes[region][name].values())) + for size in self.sizes[region][name]: + self.sizes[region][name][size] = self.sizes[region][name][size] / sumData * 100 + def _writeData(self, region): self.tmpDatName = "tmpFile%d.dat" % (self.number) handle = open(self.tmpDatName, "w") @@ -223,6 +239,8 @@ self.log.info("START Get Read Sizes") for name in self.names: self._parse(name) + if self.percentage: + self._computePercentage() self._plot() self._cleanFiles() self.log.info("END Get Read Sizes") @@ -243,6 +261,7 @@ parser.add_option("-c", "--colors", dest="colors", action="store", default=None, type="string", help="colors of the bars, separated by commas [format: string]") parser.add_option("-a", "--factors", dest="factors", action="store", default=None, type="string", help="normalization factors, separated by commas [format: string]") parser.add_option("-r", "--regions", dest="regionsFileName", action="store", default=None, type="string", help="regions to plot [format: transcript file in GFF format]") + parser.add_option("-p", "--percent", dest="percentage", action="store_true", default=False, help="compute percentage instead [format: boolean] [default: false]") parser.add_option("-z", "--width", dest="width", action="store", default=800, type="int", help="width of the image [format: int] [default: 800]") parser.add_option("-Z", "--height", dest="height", action="store", default=300, type="int", help="height of the image [format: int] [default: 300]") parser.add_option("-A", "--arial", dest="arial", action="store_true", default=False, help="use Arial font [format: boolean] [default: false]") @@ -252,11 +271,12 @@ iGetReadSizes.setNames(options.names.split(",")) iGetReadSizes.setInputFiles(options.inputFileNames.split(","), options.format) iGetReadSizes.setOutputFileName(options.outputFileName) - iGetReadSizes.setLabs(options.xLab, options.yLab) iGetReadSizes.setSizes(options.minSize, options.maxSize) iGetReadSizes.setColors(None if options.colors == None else options.colors.split(",")) iGetReadSizes.setFactors(None if options.factors == None else map(float, options.factors.split(","))) iGetReadSizes.setRegionsFile(options.regionsFileName) + iGetReadSizes.setPercentage(options.percentage) iGetReadSizes.setImageSize(options.width, options.height) + iGetReadSizes.setLabs(options.xLab, options.yLab) iGetReadSizes.setArial(options.arial) iGetReadSizes.run() diff -r e454402ba9d9 -r 169d364ddd91 SMART/Java/Python/cleanGff.py --- a/SMART/Java/Python/cleanGff.py Wed Sep 18 08:51:22 2013 -0400 +++ b/SMART/Java/Python/cleanGff.py Mon Sep 30 03:19:26 2013 -0400 @@ -43,158 +43,153 @@ count = {} class ParsedLine(object): - def __init__(self, line, cpt): - self.line = line - self.cpt = cpt - self.parse() + def __init__(self, line, cpt): + self.line = line + self.cpt = cpt + self.parse() - def parse(self): - self.line = self.line.strip() - self.splittedLine = self.line.split(None, 8) - if len(self.splittedLine) < 9: - raise Exception("Line '%s' has less than 9 fields. Exiting..." % (self.line)) - self.type = self.splittedLine[2] - self.parseOptions() - self.getId() - self.getParents() + def parse(self): + self.line = self.line.strip() + self.splittedLine = self.line.split(None, 8) + if len(self.splittedLine) < 9: + raise Exception("Line '%s' has less than 9 fields. Exiting..." % (self.line)) + self.type = self.splittedLine[2] + self.parseOptions() + self.getId() + self.getParents() - def parseOptions(self): - self.parsedOptions = {} - for option in self.splittedLine[8].split(";"): - option = option.strip() - if option == "": continue - posSpace = option.find(" ") - posEqual = option.find("=") - if posEqual != -1 and (posEqual < posSpace or posSpace == -1): - key, value = option.split("=", 1) - elif posSpace != -1: - key, value = option.split(None, 1) - else: - key = "ID" - value = option - self.parsedOptions[key.strip()] = value.strip(" \"") + def parseOptions(self): + self.parsedOptions = {} + for option in self.splittedLine[8].split(";"): + option = option.strip() + if option == "": continue + posSpace = option.find(" ") + posEqual = option.find("=") + if posEqual != -1 and (posEqual < posSpace or posSpace == -1): + key, value = option.split("=", 1) + elif posSpace != -1: + key, value = option.split(None, 1) + else: + key = "ID" + value = option + self.parsedOptions[key.strip()] = value.strip(" \"") - def getId(self): - for key in self.parsedOptions: - if key.lower() == "id": - self.id = self.parsedOptions[key] - return - if "Parent" in self.parsedOptions: - parent = self.parsedOptions["Parent"].split(",")[0] - if parent not in count: - count[parent] = {} - if self.type not in count[parent]: - count[parent][self.type] = 0 - count[parent][self.type] += 1 - self.id = "%s-%s-%d" % (parent, self.type, count[parent][self.type]) - else: - self.id = "smart%d" % (self.cpt) - self.parsedOptions["ID"] = self.id + def getId(self): + for key in self.parsedOptions: + if key.lower() == "id": + self.id = self.parsedOptions[key] + return + if "Parent" in self.parsedOptions: + parent = self.parsedOptions["Parent"].split(",")[0] + if parent not in count: + count[parent] = {} + if self.type not in count[parent]: + count[parent][self.type] = 0 + count[parent][self.type] += 1 + self.id = "%s-%s-%d" % (parent, self.type, count[parent][self.type]) + else: + self.id = "smart%d" % (self.cpt) + self.parsedOptions["ID"] = self.id - def getParents(self): - for key in self.parsedOptions: - if key.lower() in ("parent", "derives_from"): - self.parents = self.parsedOptions[key].split(",") - return - self.parents = None + def getParents(self): + for key in self.parsedOptions: + if key.lower() in ("parent", "derives_from"): + self.parents = self.parsedOptions[key].split(",") + return + self.parents = None - def removeParent(self): - for key in self.parsedOptions.keys(): - if key.lower() in ("parent", "derives_from"): - del self.parsedOptions[key] + def removeParent(self): + for key in self.parsedOptions.keys(): + if key.lower() in ("parent", "derives_from"): + del self.parsedOptions[key] - def export(self): - self.splittedLine[8] = ";".join(["%s=%s" % (key, value) for key, value in self.parsedOptions.iteritems()]) - return "%s\n" % ("\t".join(self.splittedLine)) + def export(self): + self.splittedLine[8] = ";".join(["%s=%s" % (key, value) for key, value in self.parsedOptions.iteritems()]) + return "%s\n" % ("\t".join(self.splittedLine)) class CleanGff(object): - def __init__(self, verbosity = 1): - self.verbosity = verbosity - self.lines = {} - self.acceptedTypes = [] - self.parents = [] - self.children = {} + def __init__(self, verbosity = 1): + self.verbosity = verbosity + self.lines = {} + self.acceptedTypes = [] + self.parents = [] + self.children = {} - def setInputFileName(self, name): - self.inputFile = open(name) - - def setOutputFileName(self, name): - self.outputFile = open(name, "w") - - def setAcceptedTypes(self, types): - self.acceptedTypes = types + def setInputFileName(self, name): + self.inputFile = open(name) + + def setOutputFileName(self, name): + self.outputFile = open(name, "w") - def parse(self): - progress = UnlimitedProgress(100000, "Reading input file", self.verbosity) - for cpt, line in enumerate(self.inputFile): - if not line or line[0] == "#": continue - if line[0] == ">": break - parsedLine = ParsedLine(line, cpt) - if parsedLine.type in self.acceptedTypes: - if parsedLine.id in self.lines: - cpt = 1 - while "%s-%d" % (parsedLine.id, cpt) in self.lines: - cpt += 1 - parsedLine.id = "%s-%d" % (parsedLine.id, cpt) - self.lines[parsedLine.id] = parsedLine - progress.inc() - progress.done() + def setAcceptedTypes(self, types): + self.acceptedTypes = types + + def parse(self): + progress = UnlimitedProgress(100000, "Reading input file", self.verbosity) + for cpt, line in enumerate(self.inputFile): + if not line or line[0] == "#": continue + if line[0] == ">": break + parsedLine = ParsedLine(line, cpt) + if parsedLine.type in self.acceptedTypes: + self.lines[parsedLine.id] = parsedLine + progress.inc() + progress.done() - def sort(self): - progress = Progress(len(self.lines.keys()), "Sorting file", self.verbosity) - for line in self.lines.values(): - parentFound = False - if line.parents: - for parent in line.parents: - if parent in self.lines: - parentFound = True - if parent in self.children: - self.children[parent].append(line) - else: - self.children[parent] = [line] - if not parentFound: - line.removeParent() - self.parents.append(line) - progress.inc() - progress.done() + def sort(self): + progress = Progress(len(self.lines.keys()), "Sorting file", self.verbosity) + for line in self.lines.values(): + parentFound = False + if line.parents: + for parent in line.parents: + if parent in self.lines: + parentFound = True + if parent in self.children: + self.children[parent].append(line) + else: + self.children[parent] = [line] + if not parentFound: + line.removeParent() + self.parents.append(line) + progress.inc() + progress.done() - def write(self): - progress = Progress(len(self.parents), "Writing output file", self.verbosity) - for line in self.parents: - self.writeLine(line) - progress.inc() - self.outputFile.close() - progress.done() + def write(self): + progress = Progress(len(self.parents), "Writing output file", self.verbosity) + for line in self.parents: + self.writeLine(line) + progress.inc() + self.outputFile.close() + progress.done() - def writeLine(self, line): - self.outputFile.write(line.export()) - if line.id in self.children: - for child in self.children[line.id]: - self.writeLine(child) + def writeLine(self, line): + self.outputFile.write(line.export()) + if line.id in self.children: + for child in self.children[line.id]: + self.writeLine(child) - def run(self): - self.parse() - self.sort() - self.write() + def run(self): + self.parse() + self.sort() + self.write() if __name__ == "__main__": - - # parse command line - description = "Clean GFF v1.0.3: Clean a GFF file (as given by NCBI) and outputs a GFF3 file. [Category: Other]" + + # parse command line + description = "Clean GFF v1.0.3: Clean a GFF file (as given by NCBI) and outputs a GFF3 file. [Category: Other]" - parser = OptionParser(description = description) - parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file name [compulsory] [format: file in GFF format]") - parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in GFF3 format]") - parser.add_option("-t", "--types", dest="types", action="store", default="mRNA,exon", type="string", help="list of comma-separated types that you want to keep [format: string] [default: mRNA,exon]") - parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]") - (options, args) = parser.parse_args() + parser = OptionParser(description = description) + parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file name [compulsory] [format: file in GFF format]") + parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in GFF3 format]") + parser.add_option("-t", "--types", dest="types", action="store", default="mRNA,exon", type="string", help="list of comma-separated types that you want to keep [format: string] [default: mRNA,exon]") + parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]") + (options, args) = parser.parse_args() - cleanGff = CleanGff(options.verbosity) - cleanGff.setInputFileName(options.inputFileName) - cleanGff.setOutputFileName(options.outputFileName) - cleanGff.setAcceptedTypes(options.types.split(",")) - cleanGff.run() + cleanGff = CleanGff(options.verbosity) + cleanGff.setInputFileName(options.inputFileName) + cleanGff.setOutputFileName(options.outputFileName) + cleanGff.setAcceptedTypes(options.types.split(",")) + cleanGff.run() diff -r e454402ba9d9 -r 169d364ddd91 SMART/Java/Python/cleaning/GffCleaner.py --- a/SMART/Java/Python/cleaning/GffCleaner.py Wed Sep 18 08:51:22 2013 -0400 +++ b/SMART/Java/Python/cleaning/GffCleaner.py Mon Sep 30 03:19:26 2013 -0400 @@ -127,11 +127,6 @@ if line[0] == ">": break parsedLine = ParsedLine(line, cpt) if self.acceptedTypes == None or parsedLine.type in self.acceptedTypes: - if parsedLine.id in self.lines: - cpt = 1 - while "%s-%d" % (parsedLine.id, cpt) in self.lines: - cpt += 1 - parsedLine.id = "%s-%d" % (parsedLine.id, cpt) self.lines[parsedLine.id] = parsedLine progress.inc() progress.done() diff -r e454402ba9d9 -r 169d364ddd91 SMART/Java/Python/clusterize.py --- a/SMART/Java/Python/clusterize.py Wed Sep 18 08:51:22 2013 -0400 +++ b/SMART/Java/Python/clusterize.py Mon Sep 30 03:19:26 2013 -0400 @@ -104,8 +104,7 @@ else: progress = Progress(self.nbElementsPerChromosome[chromosome], "Checking chromosome %s" % (chromosome), self.verbosity) parser = NCListFileUnpickle(self.splittedFileNames[chromosome], self.verbosity) - transcripts = [] - self.nbElements = 0 + transcripts = [] for newTranscript in parser.getIterator(): newTranscripts = [] if newTranscript.__class__.__name__ == "Mapping": @@ -119,7 +118,6 @@ newTranscripts.append(oldTranscript) newTranscripts.append(newTranscript) transcripts = newTranscripts - self.nbElements += 1 progress.inc() for transcript in transcripts: self._write(transcript) diff -r e454402ba9d9 -r 169d364ddd91 SMART/Java/Python/getRandomRegions.py --- a/SMART/Java/Python/getRandomRegions.py Wed Sep 18 08:51:22 2013 -0400 +++ b/SMART/Java/Python/getRandomRegions.py Mon Sep 30 03:19:26 2013 -0400 @@ -44,224 +44,228 @@ class RandomRegionsGenerator(object): - def __init__(self, verbosity): - self.verbosity = verbosity - self.strands = False - self.distribution = "uniform" - self.transcripts = None - self.sequenceParser = None - random.seed() + def __init__(self, verbosity): + self.verbosity = verbosity + self.strands = False + self.distribution = "uniform" + self.transcripts = None + self.sequenceParser = None + random.seed() - def setInput(self, fileName): - self.sequenceParser = FastaParser(fileName, self.verbosity) + def setInput(self, fileName): + self.sequenceParser = FastaParser(fileName, self.verbosity) - def setGenomeSize(self, size): - self.genomeSize = size + def setGenomeSize(self, size): + self.genomeSize = size - def setChromosomeName(self, name): - self.chromosomeName = name + def setChromosomeName(self, name): + self.chromosomeName = name - def setAnnotation(self, fileName, format): - parser = TranscriptContainer(fileName, format, self.verbosity) - self.transcripts = [] - for transcript in parser.getIterator(): - self.transcripts.append(transcript) - self.setNumber(len(self.transcripts)) - self.setSize(0) + def setAnnotation(self, fileName, format): + parser = TranscriptContainer(fileName, format, self.verbosity) + self.transcripts = [] + for transcript in parser.getIterator(): + self.transcripts.append(transcript) + self.setNumber(len(self.transcripts)) + self.setSize(0) - def setOutputFile(self, fileName): - self.outputFileName = fileName + def setOutputFile(self, fileName): + self.outputFileName = fileName - def setSize(self, size): - self.minSize = size - self.maxSize = size + def setSize(self, size): + self.minSize = size + self.maxSize = size - def setMinSize(self, size): - self.minSize = size + def setMinSize(self, size): + self.minSize = size - def setMaxSize(self, size): - self.maxSize = size + def setMaxSize(self, size): + self.maxSize = size - def setNumber(self, number): - self.number = number + def setNumber(self, number): + self.number = number - def setStrands(self, strands): - self.strands = strands + def setStrands(self, strands): + self.strands = strands - def setMaxDistribution(self, maxElements): - if maxElements == None: - return - self.maxElements = maxElements - self.distribution = "gaussian" + def setMaxDistribution(self, maxElements): + if maxElements == None: + return + self.maxElements = maxElements + self.distribution = "gaussian" - def setDeviationDistribution(self, deviation): - if deviation == None: - return - self.deviation = deviation - self.distribution = "gaussian" + def setDeviationDistribution(self, deviation): + if deviation == None: + return + self.deviation = deviation + self.distribution = "gaussian" - def getSizes(self): - if self.sequenceParser == None: - self.chromosomes = [self.chromosomeName] - self.sizes = {self.chromosomeName: self.genomeSize} - self.cumulatedSize = self.genomeSize - self.cumulatedSizes = {self.chromosomeName: self.genomeSize} - return - self.chromosomes = self.sequenceParser.getRegions() - self.sizes = {} - self.cumulatedSize = 0 - self.cumulatedSizes = {} - for chromosome in self.chromosomes: - self.sizes[chromosome] = self.sequenceParser.getSizeOfRegion(chromosome) - self.cumulatedSize += self.sizes[chromosome] - self.cumulatedSizes[chromosome] = self.cumulatedSize + def getSizes(self): + if self.sequenceParser == None: + self.chromosomes = [self.chromosomeName] + self.sizes = {self.chromosomeName: self.genomeSize} + self.cumulatedSize = self.genomeSize + self.cumulatedSizes = {self.chromosomeName: self.genomeSize} + return + self.chromosomes = self.sequenceParser.getRegions() + self.sizes = {} + self.cumulatedSize = 0 + self.cumulatedSizes = {} + for chromosome in self.chromosomes: + self.sizes[chromosome] = self.sequenceParser.getSizeOfRegion(chromosome) + self.cumulatedSize += self.sizes[chromosome] + self.cumulatedSizes[chromosome] = self.cumulatedSize - def findPosition(self, size = None): - if size == None: - size = random.randint(self.minSize, self.maxSize) - integer = random.randint(0, self.cumulatedSize) - for chromosome in self.chromosomes: - if self.cumulatedSizes[chromosome] > integer: - break - start = random.randint(1, self.sizes[chromosome] - size) - return (chromosome, start, size) + def findPosition(self, size = None): + if size == None: + size = random.randint(self.minSize, self.maxSize) + integer = random.randint(0, self.cumulatedSize) + for chromosome in self.chromosomes: + if self.cumulatedSizes[chromosome] > integer: + break + start = random.randint(1, self.sizes[chromosome] - size) + return (chromosome, start, size) - def createTranscript(self, chromosome, start, size, strand, cpt): - transcript = Transcript() - transcript.setChromosome(chromosome) - transcript.setStart(start) - transcript.setEnd(start + size-1) - transcript.setDirection(strand) - transcript.setName("rand_%d" % (cpt)) - return transcript + def createTranscript(self, chromosome, start, size, strand, cpt): + transcript = Transcript() + transcript.setChromosome(chromosome) + transcript.setEnd(start + size-1) + transcript.setStart(start) + transcript.setDirection(strand) + transcript.setName("rand_%d" % (cpt)) + return transcript - def moveTranscript(self, chromosome, start, transcript): - while transcript.getEnd() + start - transcript.getStart() > self.cumulatedSizes[chromosome]: - chromosome, start, size = self.findPosition(transcript.getEnd() - transcript.getStart()) - transcript.setChromosome(chromosome) - oldStart, oldEnd = transcript.getStart(), transcript.getEnd() - if transcript.getNbExons() > 1: - for exon in transcript.getNbExons(): - oldExonStart, oldExonEnd = exon.getStart(), exon.getEnd() - exon.setStart(oldExonStart + start - oldStart) - exon.setEnd(oldExonEnd + start - oldStart) - transcript.setStart(start) - transcript.setEnd(oldEnd + start - oldStart) - return [transcript] + def moveTranscript(self, chromosome, start, transcript): + while transcript.getEnd() + start - transcript.getStart() > self.cumulatedSizes[chromosome]: + chromosome, start, size = self.findPosition(transcript.getEnd() - transcript.getStart()) + newTranscript = Transcript() + newTranscript.setChromosome(chromosome) + newTranscript.tags = transcript.tags + if transcript.getNbExons() > 1: + for exon in transcript.getNbExons(): + newExon = Interval() + newExon.setChromosome(chromosome) + newExon.setEnd(exon.getEnd() + start - transcript.getStart()) + newExon.setStart(exon.getStart() + start - transcript.getStart()) + newTranscript.addExon(newExon) + newTranscript.setEnd(transcript.getEnd() + start - transcript.getStart()) + newTranscript.setStart(start) + newTranscript.setDirection(transcript.getDirection()) + return [newTranscript] - def createUniformCluster(self, chromosome, start, size, strand, cpt): - transcript = self.createTranscript(chromosome, start, size, strand, cpt) - return [transcript] - - - def findNbTranscripts(self, cpt): - return min(int(round(math.exp(random.random() * math.log(self.maxElements)))), self.number - cpt + 1) + def createUniformCluster(self, chromosome, start, size, strand, cpt): + transcript = self.createTranscript(chromosome, start, size, strand, cpt) + return [transcript] - def getDev(self): - deviation = 0.0 - for j in range(repetitions): - deviation += random.randint(-self.deviation, self.deviation) - deviation /= repetitions - deviation = int(round(deviation)) - return deviation + def findNbTranscripts(self, cpt): + return min(int(round(math.exp(random.random() * math.log(self.maxElements)))), self.number - cpt + 1) - def createGaussianCluster(self, chromosome, start, size, strand, cpt): - transcripts = [] - nbTranscripts = self.findNbTranscripts(cpt) - for i in range(nbTranscripts): - transcript = self.createTranscript(chromosome, start + self.getDev(), size + self.getDev(), strand, cpt + i) - transcripts.append(transcript) - return transcripts + def getDev(self): + deviation = 0.0 + for j in range(repetitions): + deviation += random.randint(-self.deviation, self.deviation) + deviation /= repetitions + deviation = int(round(deviation)) + return deviation + + + def createGaussianCluster(self, chromosome, start, size, strand, cpt): + transcripts = [] + nbTranscripts = self.findNbTranscripts(cpt) + for i in range(nbTranscripts): + transcript = self.createTranscript(chromosome, start + self.getDev(), size + self.getDev(), strand, cpt + i) + transcripts.append(transcript) + return transcripts - def writeRegions(self): - writer = Gff3Writer(self.outputFileName, self.verbosity) - outputFile = open(self.outputFileName, "w") - progress = Progress(self.number, "Writing to %s" % (self.outputFileName), self.verbosity) - i = 0 - while i < self.number: - chromosome, start, size = self.findPosition() - strand = random.choice([-1, 1]) if self.strands else 1 - if self.transcripts != None: - transcripts = self.moveTranscript(chromosome, start, self.transcripts[i]) - elif self.distribution == "uniform": - transcripts = self.createUniformCluster(chromosome, start, size, strand, i+1) - else: - transcripts = self.createGaussianCluster(chromosome, start, size, strand, i+1) - for transcript in transcripts: - writer.addTranscript(transcript) - i += 1 - progress.inc() - progress.done() - outputFile.close() - writer.write() - writer.close() + def writeRegions(self): + writer = Gff3Writer(self.outputFileName, self.verbosity) + outputFile = open(self.outputFileName, "w") + progress = Progress(self.number, "Writing to %s" % (self.outputFileName), self.verbosity) + i = 0 + while i < self.number: + chromosome, start, size = self.findPosition() + strand = random.choice([-1, 1]) if self.strands else 1 + if self.transcripts != None: + transcripts = self.moveTranscript(chromosome, start, self.transcripts[i]) + elif self.distribution == "uniform": + transcripts = self.createUniformCluster(chromosome, start, size, strand, i+1) + else: + transcripts = self.createGaussianCluster(chromosome, start, size, strand, i+1) + for transcript in transcripts: + writer.addTranscript(transcript) + i += 1 + progress.inc() + progress.done() + outputFile.close() + writer.write() + writer.close() - def run(self): - self.getSizes() - self.writeRegions() + def run(self): + self.getSizes() + self.writeRegions() if __name__ == "__main__": - - # parse command line - description = "Get Random Regions v1.0.2: Get some random coordinates on a genome. May use uniform or gaussian distribution (in gaussion distribution, # of element per cluster follows a power law). [Category: Other]" + + # parse command line + description = "Get Random Regions v1.0.2: Get some random coordinates on a genome. May use uniform or gaussian distribution (in gaussion distribution, # of element per cluster follows a power law). [Category: Other]" - parser = OptionParser(description = description) - parser.add_option("-r", "--reference", dest="reference", action="store", default=None, type="string", help="file that contains the sequences [format: file in FASTA format]") - parser.add_option("-S", "--referenceSize", dest="referenceSize", action="store", default=None, type="int", help="size of the chromosome (when no reference is given) [format: int]") - parser.add_option("-c", "--chromosome", dest="chromosome", action="store", default=None, type="string", help="name of the chromosome (when no reference is given) [format: string]") - parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in FASTA format]") - parser.add_option("-i", "--input", dest="inputFileName", action="store", default=None, type="string", help="optional file containing regions to shuffle [format: file in transcript format given by -f]") - parser.add_option("-f", "--format", dest="format", action="store", default=None, type="string", help="format of the previous file [format: transcript file format]") - parser.add_option("-s", "--size", dest="size", action="store", default=None, type="int", help="size of the regions (if no region set is provided) [format: int]") - parser.add_option("-z", "--minSize", dest="minSize", action="store", default=None, type="int", help="minimum size of the regions (if no region set nor a fixed size are provided) [format: int]") - parser.add_option("-Z", "--maxSize", dest="maxSize", action="store", default=None, type="int", help="maximum size of the regions (if no region set nor a fixed size are provided) [format: int]") - parser.add_option("-n", "--number", dest="number", action="store", default=None, type="int", help="number of regions (if no region set is provided) [format: int]") - parser.add_option("-t", "--strands", dest="strands", action="store_true", default=False, help="use both strands (if no region set is provided) [format: boolean]") - parser.add_option("-m", "--max", dest="max", action="store", default=None, type="int", help="max. # reads in a cluster (for Gaussian dist.) [format: int]") - parser.add_option("-d", "--deviation", dest="deviation", action="store", default=None, type="int", help="deviation around the center of the cluster (for Gaussian dist.) [format: int]") - parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]") - (options, args) = parser.parse_args() + parser = OptionParser(description = description) + parser.add_option("-r", "--reference", dest="reference", action="store", default=None, type="string", help="file that contains the sequences [format: file in FASTA format]") + parser.add_option("-S", "--referenceSize", dest="referenceSize", action="store", default=None, type="int", help="size of the chromosome (when no reference is given) [format: int]") + parser.add_option("-c", "--chromosome", dest="chromosome", action="store", default=None, type="string", help="name of the chromosome (when no reference is given) [format: string]") + parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in FASTA format]") + parser.add_option("-i", "--input", dest="inputFileName", action="store", default=None, type="string", help="optional file containing regions to shuffle [format: file in transcript format given by -f]") + parser.add_option("-f", "--format", dest="format", action="store", default=None, type="string", help="format of the previous file [format: transcript file format]") + parser.add_option("-s", "--size", dest="size", action="store", default=None, type="int", help="size of the regions (if no region set is provided) [format: int]") + parser.add_option("-z", "--minSize", dest="minSize", action="store", default=None, type="int", help="minimum size of the regions (if no region set nor a fixed size are provided) [format: int]") + parser.add_option("-Z", "--maxSize", dest="maxSize", action="store", default=None, type="int", help="maximum size of the regions (if no region set nor a fixed size are provided) [format: int]") + parser.add_option("-n", "--number", dest="number", action="store", default=None, type="int", help="number of regions (if no region set is provided) [format: int]") + parser.add_option("-t", "--strands", dest="strands", action="store_true", default=False, help="use both strands (if no region set is provided) [format: boolean]") + parser.add_option("-m", "--max", dest="max", action="store", default=None, type="int", help="max. # reads in a cluster (for Gaussian dist.) [format: int]") + parser.add_option("-d", "--deviation", dest="deviation", action="store", default=None, type="int", help="deviation around the center of the cluster (for Gaussian dist.) [format: int]") + parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]") + (options, args) = parser.parse_args() - rrg = RandomRegionsGenerator(options.verbosity) - if options.reference == None: - rrg.setGenomeSize(options.referenceSize) - rrg.setChromosomeName(options.chromosome) - else: - rrg.setInput(options.reference) - rrg.setOutputFile(options.outputFileName) - if options.inputFileName == None: - if options.size != None: - rrg.setSize(options.size) - else: - rrg.setMinSize(options.minSize) - rrg.setMaxSize(options.maxSize) - rrg.setNumber(options.number) - rrg.setStrands(options.strands) - else: - rrg.setAnnotation(options.inputFileName, options.format) - rrg.setMaxDistribution(options.max) - rrg.setDeviationDistribution(options.deviation) - rrg.run() + rrg = RandomRegionsGenerator(options.verbosity) + if options.reference == None: + rrg.setGenomeSize(options.referenceSize) + rrg.setChromosomeName(options.chromosome) + else: + rrg.setInput(options.reference) + rrg.setOutputFile(options.outputFileName) + if options.inputFileName == None: + if options.size != None: + rrg.setSize(options.size) + else: + rrg.setMinSize(options.minSize) + rrg.setMaxSize(options.maxSize) + rrg.setNumber(options.number) + rrg.setStrands(options.strands) + else: + rrg.setAnnotation(options.inputFileName, options.format) + rrg.setMaxDistribution(options.max) + rrg.setDeviationDistribution(options.deviation) + rrg.run() diff -r e454402ba9d9 -r 169d364ddd91 SMART/Java/Python/mySql/MySqlConnection.py --- a/SMART/Java/Python/mySql/MySqlConnection.py Wed Sep 18 08:51:22 2013 -0400 +++ b/SMART/Java/Python/mySql/MySqlConnection.py Mon Sep 30 03:19:26 2013 -0400 @@ -88,18 +88,6 @@ self.connection.commit() - def executeManyFormattedQueries(self, command, lines, insertion = False): - cursor = self.connection.cursor() - query = MySqlQuery(cursor, self.verbosity) - for line in lines: - result = query.executeFormat(command, line) - self.connection.commit() - if insertion: - return result - else: - return query - - def executeManyQueriesIterator(self, table): cursor = self.connection.cursor() query = MySqlQuery(cursor, self.verbosity) @@ -113,25 +101,9 @@ self.connection.commit() - def executeManyFormattedQueriesIterator(self, table): + def executeFormattedQuery(self, command, *parameters): cursor = self.connection.cursor() query = MySqlQuery(cursor, self.verbosity) - try: - for command, values in table.getIterator(): - query.executeFormat(command, values) - self.connection.commit() - except: - for command, values in table.getIterator(): - query.execute(command, values) - self.connection.commit() - - - def executeFormattedQuery(self, command, parameters, insertion = False): - cursor = self.connection.cursor() - query = MySqlQuery(cursor, self.verbosity) - result = query.executeFormat(command, parameters) + query.executeFormat(command, parameters) self.connection.commit() - if insertion: - return result - else: - return query \ No newline at end of file + return query diff -r e454402ba9d9 -r 169d364ddd91 SMART/Java/Python/mySql/MySqlQuery.py --- a/SMART/Java/Python/mySql/MySqlQuery.py Wed Sep 18 08:51:22 2013 -0400 +++ b/SMART/Java/Python/mySql/MySqlQuery.py Mon Sep 30 03:19:26 2013 -0400 @@ -91,4 +91,4 @@ def show(self): for line in self.getIterator(): - print line \ No newline at end of file + print line diff -r e454402ba9d9 -r 169d364ddd91 SMART/Java/Python/mySql/MySqlTable.py --- a/SMART/Java/Python/mySql/MySqlTable.py Wed Sep 18 08:51:22 2013 -0400 +++ b/SMART/Java/Python/mySql/MySqlTable.py Mon Sep 30 03:19:26 2013 -0400 @@ -116,18 +116,6 @@ self.mySqlConnection.executeManyQueries(commands) - def insertManyFormatted(self, lines): - """ - Insert many lines - @param lines: the list of values - @type lines: list of lists - """ - replacer = ["?"] * len(self.variables) - command = "INSERT INTO '%s' (%s) VALUES (%s)" % (self.name, ", ".join(self.variables), ", ".join(replacer)) - values = [[line[variable] for variable in self.variables] for line in lines] - self.mySqlConnection.executeManyFormattedQueries(command, values) - - def rename(self, name): """ Rename the table @@ -229,10 +217,6 @@ @type values: dict @return: the id of the added row """ - sqlValues = [values[variable] for variable in self.variables] - command = "INSERT INTO '%s' (%%s) VALUES (%s)" % (self.name, ", ".join(self.variables)) - id = self.mySqlConnection.executeFormattedQueryQuery(command, sqlValues, True) - return id sqlValues = [] for variable in self.variables: sqlValues.append(self.formatSql(values[variable], self.types[variable], self.sizes[variable])) @@ -347,3 +331,4 @@ query = self.mySqlConnection.executeQuery("SELECT * FROM '%s'" % (self.name)) print query.getLines() + diff -r e454402ba9d9 -r 169d364ddd91 SMART/Java/Python/mySql/MySqlTranscriptTable.py --- a/SMART/Java/Python/mySql/MySqlTranscriptTable.py Wed Sep 18 08:51:22 2013 -0400 +++ b/SMART/Java/Python/mySql/MySqlTranscriptTable.py Mon Sep 30 03:19:26 2013 -0400 @@ -146,4 +146,4 @@ def setDefaultTagValue(self, name, value): - super(MySqlTranscriptTable, self).setDefaultTagValue(Transcript.getSqlVariables().index("tags")+1, name, value) \ No newline at end of file + super(MySqlTranscriptTable, self).setDefaultTagValue(Transcript.getSqlVariables().index("tags")+1, name, value) diff -r e454402ba9d9 -r 169d364ddd91 SMART/Java/Python/ncList/NCList.py --- a/SMART/Java/Python/ncList/NCList.py Wed Sep 18 08:51:22 2013 -0400 +++ b/SMART/Java/Python/ncList/NCList.py Mon Sep 30 03:19:26 2013 -0400 @@ -108,10 +108,12 @@ self._offsets[fileType] = offset def _setFileNames(self, fileName): + print "Got file name", fileName if self._chromosome != None and fileName != None: coreName = os.path.splitext(fileName)[0] if "SMARTTMPPATH" in os.environ: coreName = os.path.join(os.environ["SMARTTMPPATH"], coreName) + print "Used core name", coreName self._hFileName = "%s_H.bin" % (coreName) self._lFileName = "%s_L.bin" % (coreName) self._tFileName = "%s_T.bin" % (coreName) diff -r e454402ba9d9 -r 169d364ddd91 SMART/Java/Python/plot.py --- a/SMART/Java/Python/plot.py Wed Sep 18 08:51:22 2013 -0400 +++ b/SMART/Java/Python/plot.py Mon Sep 30 03:19:26 2013 -0400 @@ -134,11 +134,7 @@ values = dict([i * step + minValue, 0] for i in range(0, self.nbBars)) top = (self.nbBars - 1) * step + minValue for key, value in line.iteritems(): - divisor = float(maxValue - minValue) * self.nbBars - tmpMinValue = top - if divisor != 0: - tmpMinValue = min(top, int(math.floor((key - minValue) / divisor))) - newKey = tmpMinValue * step + minValue + newKey = min(top, int(math.floor((key - minValue) / float(maxValue - minValue) * self.nbBars)) * step + minValue) values[newKey] += value return values diff -r e454402ba9d9 -r 169d364ddd91 SMART/Java/Python/plotCoverage.py --- a/SMART/Java/Python/plotCoverage.py Wed Sep 18 08:51:22 2013 -0400 +++ b/SMART/Java/Python/plotCoverage.py Mon Sep 30 03:19:26 2013 -0400 @@ -32,7 +32,7 @@ from optparse import OptionParser from SMART.Java.Python.structure.Interval import Interval from SMART.Java.Python.structure.Transcript import Transcript -from commons.core.parsing.ParserChooser import ParserChooser +from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer from SMART.Java.Python.misc.RPlotter import RPlotter from SMART.Java.Python.misc.Progress import Progress from commons.core.parsing.FastaParser import FastaParser @@ -42,440 +42,430 @@ colorLine = "black" def parseTargetField(field): - strand = "+" - splittedFieldSpace = field.split() - splittedFieldPlus = field.split("+", 4) - if len(splittedFieldSpace) == 3: - id, start, end = splittedFieldSpace - elif len(splittedFieldSpace) == 4: - id, start, end, strand = splittedFieldSpace - elif len(splittedFieldPlus) == 3: - id, start, end = splittedFieldPlus - elif len(splittedFieldPlus) == 4: - id, start, end, strand = splittedFieldPlus - else: - raise Exception("Cannot parse Target field '%s'." % (field)) - return (id, int(start), int(end), strand) + strand = "+" + splittedFieldSpace = field.split() + splittedFieldPlus = field.split("+", 4) + if len(splittedFieldSpace) == 3: + id, start, end = splittedFieldSpace + elif len(splittedFieldSpace) == 4: + id, start, end, strand = splittedFieldSpace + elif len(splittedFieldPlus) == 3: + id, start, end = splittedFieldPlus + elif len(splittedFieldPlus) == 4: + id, start, end, strand = splittedFieldPlus + else: + raise Exception("Cannot parse Target field '%s'." % (field)) + return (id, int(start), int(end), strand) class SimpleTranscript(object): - def __init__(self, transcript1, transcript2, color = None): - self.start = max(0, transcript1.getStart() - transcript2.getStart()) - self.end = min(transcript2.getEnd() - transcript2.getStart(), transcript1.getEnd() - transcript2.getStart()) - self.strand = transcript1.getDirection() * transcript2.getDirection() - self.exons = [] - for exon in transcript1.getExons(): - if exon.getEnd() >= transcript2.getStart() and exon.getStart() <= transcript2.getEnd(): - start = max(0, exon.getStart() - transcript2.getStart()) - end = min(transcript2.getEnd() - transcript2.getStart(), exon.getEnd() - transcript2.getStart()) - self.addExon(start, end, self.strand, color) + def __init__(self, transcript1, transcript2, color = None): + self.start = max(0, transcript1.getStart() - transcript2.getStart()) + self.end = min(transcript2.getEnd() - transcript2.getStart(), transcript1.getEnd() - transcript2.getStart()) + self.strand = transcript1.getDirection() * transcript2.getDirection() + self.exons = [] + for exon in transcript1.getExons(): + if exon.getEnd() >= transcript2.getStart() and exon.getStart() <= transcript2.getEnd(): + start = max(0, exon.getStart() - transcript2.getStart()) + end = min(transcript2.getEnd() - transcript2.getStart(), exon.getEnd() - transcript2.getStart()) + self.addExon(start, end, self.strand, color) - def addExon(self, start, end, strand, color): - exon = SimpleExon(start, end, strand, color) - self.exons.append(exon) + def addExon(self, start, end, strand, color): + exon = SimpleExon(start, end, strand, color) + self.exons.append(exon) - def getRScript(self, yOffset, height): - rString = "" - previousEnd = None - for exon in sorted(self.exons, key=lambda exon: exon.start): - if previousEnd != None: - rString += "segments(%.1f, %.1f, %.1f, %.1f, col = \"%s\")\n" % (previousEnd, yOffset + height / 4.0, exon.start, yOffset + height / 4.0, colorLine) - rString += exon.getRScript(yOffset, height) - previousEnd = exon.end - return rString + def getRScript(self, yOffset, height): + rString = "" + previousEnd = None + for exon in sorted(self.exons, key=lambda exon: exon.start): + if previousEnd != None: + rString += "segments(%.1f, %.1f, %.1f, %.1f, col = \"%s\")\n" % (previousEnd, yOffset + height / 4.0, exon.start, yOffset + height / 4.0, colorLine) + rString += exon.getRScript(yOffset, height) + previousEnd = exon.end + return rString class SimpleExon(object): - def __init__(self, start, end, strand, color = None): - self.start = start - self.end = end - self.strand = strand - self.color = color - - def getRScript(self, yOffset, height): - color = self.color if self.color != None else colors[self.strand] - return "rect(%.1f, %.1f, %.1f, %.1f, col=\"%s\", border = \"%s\")\n" % (self.start, yOffset, self.end, yOffset + height / 2.0, color, colorLine) + def __init__(self, start, end, strand, color = None): + self.start = start + self.end = end + self.strand = strand + self.color = color + + def getRScript(self, yOffset, height): + color = self.color if self.color != None else colors[self.strand] + return "rect(%.1f, %.1f, %.1f, %.1f, col=\"%s\", border = \"%s\")\n" % (self.start, yOffset, self.end, yOffset + height / 2.0, color, colorLine) class Plotter(object): - - def __init__(self, seed, index, verbosity): - self.seed = seed - self.index = index - self.verbosity = verbosity - self.maxCoverage = 0 - self.maxOverlap = 0 - self.log = "" - self.merge = False - self.width = 1500 - self.heigth = 1000 - self.xLabel = "" - self.yLabel = "" - self.title = None - self.absPath = os.getcwd() - self.coverageDataFileName = "tmpFile_%d_%s.dat" % (seed, index) - self.coverageScript = "" - self.overlapScript = "" - self.outputFileName = None + + def __init__(self, seed, index, verbosity): + self.seed = seed + self.index = index + self.verbosity = verbosity + self.maxCoverage = 0 + self.maxOverlap = 0 + self.log = "" + self.merge = False + self.width = 1500 + self.heigth = 1000 + self.xLabel = "" + self.yLabel = "" + self.title = None + self.absPath = os.getcwd() + self.coverageDataFileName = "tmpFile_%d_%s.dat" % (seed, index) + self.coverageScript = "" + self.overlapScript = "" + self.outputFileName = None - def setOutputFileName(self, fileName): - self.outputFileName = fileName + def setOutputFileName(self, fileName): + self.outputFileName = fileName - def setTranscript(self, transcript): - self.transcript = transcript - self.name = transcript.getName() - self.size = transcript.getEnd() - transcript.getStart() + 1 - if self.title == None: - self.title = self.name - else: - self.title += " " + self.name + def setTranscript(self, transcript): + self.transcript = transcript + self.name = transcript.getName() + self.size = transcript.getEnd() - transcript.getStart() + 1 + if self.title == None: + self.title = self.name + else: + self.title += " " + self.name - def setTitle(self, title): - self.title = title + " " + self.name + def setTitle(self, title): + self.title = title + " " + self.name - def setPlotSize(self, width, height): - self.width = width - self.height = height + def setPlotSize(self, width, height): + self.width = width + self.height = height - def setLabels(self, xLabel, yLabel): - self.xLabel = xLabel - self.yLabel = yLabel + def setLabels(self, xLabel, yLabel): + self.xLabel = xLabel + self.yLabel = yLabel - def setMerge(self, merge): - self.merge = merge + def setMerge(self, merge): + self.merge = merge - def setCoverageData(self, coverage): - outputCoveragePerStrand = dict([strand, 0] for strand in strands) - outputCoverage = 0 - dataFile = open(os.path.abspath(self.coverageDataFileName), "w") - for position in range(self.size+1): - sumValue = 0 - found = False - dataFile.write("%d\t" % (position)) - for strand in strands: - value = coverage[strand].get(position, 0) - sumValue += value - dataFile.write("%d\t" % (value)) - if value > 0: - found = True - outputCoveragePerStrand[strand] += 1 - self.maxCoverage = max(self.maxCoverage, sumValue) - dataFile.write("%d\n" % (sumValue)) - if found: - outputCoverage += 1 - dataFile.close() - self.log += "%s (%d nt):\n - both strands: %d (%.0f%%)\n - (+) strand: %d (%.0f%%)\n - (-) strand: %d (%.0f%%)\n" % (self.name, self.size, outputCoverage, float(outputCoverage) / self.size * 100, outputCoveragePerStrand[1], float(outputCoveragePerStrand[1]) / self.size * 100, outputCoveragePerStrand[-1], float(outputCoveragePerStrand[-1]) / self.size * 100) - self.coverageScript += "data = scan(\"%s\", list(pos = -666, minus = -666, plus = -666, sumValue = -666), sep=\"\t\")\n" % (os.path.abspath(self.coverageDataFileName)) - self.coverageScript += "lines(x = data$pos, y = data$minus, col = \"%s\")\n" % (colors[-1]) - self.coverageScript += "lines(x = data$pos, y = data$plus, col = \"%s\")\n" % (colors[1]) - self.coverageScript += "lines(x = data$pos, y = data$sumValue, col = \"%s\")\n" % (colors[0]) + def setCoverageData(self, coverage): + outputCoveragePerStrand = dict([strand, 0] for strand in strands) + outputCoverage = 0 + dataFile = open(os.path.abspath(self.coverageDataFileName), "w") + for position in range(self.size+1): + sumValue = 0 + found = False + dataFile.write("%d\t" % (position)) + for strand in strands: + value = coverage[strand].get(position, 0) + sumValue += value + dataFile.write("%d\t" % (value)) + if value > 0: + found = True + outputCoveragePerStrand[strand] += 1 + self.maxCoverage = max(self.maxCoverage, sumValue) + dataFile.write("%d\n" % (sumValue)) + if found: + outputCoverage += 1 + dataFile.close() + self.log += "%s (%d nt):\n - both strands: %d (%.0f%%)\n - (+) strand: %d (%.0f%%)\n - (-) strand: %d (%.0f%%)\n" % (self.name, self.size, outputCoverage, float(outputCoverage) / self.size * 100, outputCoveragePerStrand[1], float(outputCoveragePerStrand[1]) / self.size * 100, outputCoveragePerStrand[-1], float(outputCoveragePerStrand[-1]) / self.size * 100) + self.coverageScript += "data = scan(\"%s\", list(pos = -666, minus = -666, plus = -666, sumValue = -666), sep=\"\t\")\n" % (os.path.abspath(self.coverageDataFileName)) + self.coverageScript += "lines(x = data$pos, y = data$minus, col = \"%s\")\n" % (colors[-1]) + self.coverageScript += "lines(x = data$pos, y = data$plus, col = \"%s\")\n" % (colors[1]) + self.coverageScript += "lines(x = data$pos, y = data$sumValue, col = \"%s\")\n" % (colors[0]) - def setOverlapData(self, overlap): - height = 1 - self.maxOverlap = (len(overlap) + 1) * height - thisElement = SimpleTranscript(self.transcript, self.transcript, "black") - self.overlapScript += thisElement.getRScript(0, height) - for cpt, transcript in enumerate(sorted(overlap, cmp=lambda c1, c2: c1.start - c2.start if c1.start != c2.start else c1.end - c2.end)): - self.overlapScript += transcript.getRScript((cpt + 1) * height, height) + def setOverlapData(self, overlap): + height = 1 + self.maxOverlap = (len(overlap) + 1) * height + thisElement = SimpleTranscript(self.transcript, self.transcript, "black") + self.overlapScript += thisElement.getRScript(0, height) + for cpt, transcript in enumerate(sorted(overlap, cmp=lambda c1, c2: c1.start - c2.start if c1.start != c2.start else c1.end - c2.end)): + self.overlapScript += transcript.getRScript((cpt + 1) * height, height) - def getFirstLine(self, suffix = None): - return "png(file = \"%s_%s%s.png\", width = %d, height = %d, bg = \"white\")\n" % (self.outputFileName, self.name, "" if suffix == None or self.merge else "_%s" % (suffix), self.width, self.height) + def getFirstLine(self, suffix = None): + return "png(file = \"%s_%s%s.png\", width = %d, height = %d, bg = \"white\")\n" % (self.outputFileName, self.name, "" if suffix == None or self.merge else "_%s" % (suffix), self.width, self.height) - def getLastLine(self): - return "dev.off()\n" + def getLastLine(self): + return "dev.off()\n" - def startR(self, fileName, script): - scriptFile = open(fileName, "w") - scriptFile.write(script) - scriptFile.close() - command = "R CMD BATCH %s" % (fileName) - status = subprocess.call(command, shell=True) - if status != 0: - raise Exception("Problem with the execution of script file %s, status is: %s" % (fileName, status)) + def startR(self, fileName, script): + scriptFile = open(fileName, "w") + scriptFile.write(script) + scriptFile.close() + command = "R CMD BATCH %s" % (fileName) + status = subprocess.call(command, shell=True) + if status != 0: + raise Exception("Problem with the execution of script file %s, status is: %s" % (fileName, status)) - def plot(self): - if self.merge: - fileName = "%s_%d_%s.R" % (self.outputFileName, self.seed, self.index) - plotLine = "plot(x = NA, y = NA, xlab=\"%s\", ylab=\"%s\", panel.first = grid(lwd = 1.0), xlim = c(0, %d), ylim = c(0, %d), cex.axis = 2, cex.lab = 2, cex.main=2, main = \"%s\")\n" % (self.xLabel, self.yLabel, self.size, max(self.maxCoverage, self.maxOverlap), self.title) - script = self.getFirstLine() + plotLine + self.overlapScript + self.coverageScript + self.getLastLine() - self.startR(fileName, script) - else: - fileName = "%s_%d_%s_overlap.R" % (self.outputFileName, self.seed, self.index) - plotLine = "plot(x = NA, y = NA, xlab=\"%s\", ylab=\"%s\", panel.first = grid(lwd = 1.0), xlim = c(0, %d), ylim = c(0, %d), cex.axis = 2, cex.lab = 2, cex.main=2, main = \"%s\")\n" % (self.xLabel, self.yLabel, self.size, self.maxOverlap, self.title) - script = self.getFirstLine("overlap") + plotLine + self.overlapScript + self.getLastLine() - self.startR(fileName, script) - fileName = "%s_%d_%s_coverage.R" % (self.outputFileName, self.seed, self.index) - plotLine = "plot(x = NA, y = NA, xlab=\"%s\", ylab=\"%s\", panel.first = grid(lwd = 1.0), xlim = c(0, %d), ylim = c(0, %d), cex.axis = 2, cex.lab = 2, cex.main=2, main = \"%s\")\n" % (self.xLabel, self.yLabel, self.size, self.maxCoverage, self.title) - script = self.getFirstLine("coverage") + plotLine + self.coverageScript + self.getLastLine() - self.startR(fileName, script) + def plot(self): + print "outputfileName is written in :", self.outputFileName + if self.merge: + fileName = "%s_%d_%s.R" % (self.outputFileName, self.seed, self.index) + plotLine = "plot(x = NA, y = NA, xlab=\"%s\", ylab=\"%s\", panel.first = grid(lwd = 1.0), xlim = c(0, %d), ylim = c(0, %d), cex.axis = 2, cex.lab = 2, cex.main=2, main = \"%s\")\n" % (self.xLabel, self.yLabel, self.size, max(self.maxCoverage, self.maxOverlap), self.title) + script = self.getFirstLine() + plotLine + self.overlapScript + self.coverageScript + self.getLastLine() + self.startR(fileName, script) + else: + fileName = "%s_%d_%s_overlap.R" % (self.outputFileName, self.seed, self.index) + print "overlap file is written in :", fileName + plotLine = "plot(x = NA, y = NA, xlab=\"%s\", ylab=\"%s\", panel.first = grid(lwd = 1.0), xlim = c(0, %d), ylim = c(0, %d), cex.axis = 2, cex.lab = 2, cex.main=2, main = \"%s\")\n" % (self.xLabel, self.yLabel, self.size, self.maxOverlap, self.title) + script = self.getFirstLine("overlap") + plotLine + self.overlapScript + self.getLastLine() + self.startR(fileName, script) + fileName = "%s_%d_%s_coverage.R" % (self.outputFileName, self.seed, self.index) + plotLine = "plot(x = NA, y = NA, xlab=\"%s\", ylab=\"%s\", panel.first = grid(lwd = 1.0), xlim = c(0, %d), ylim = c(0, %d), cex.axis = 2, cex.lab = 2, cex.main=2, main = \"%s\")\n" % (self.xLabel, self.yLabel, self.size, self.maxCoverage, self.title) + script = self.getFirstLine("coverage") + plotLine + self.coverageScript + self.getLastLine() + self.startR(fileName, script) class PlotParser(object): - def __init__(self, verbosity): - self.verbosity = verbosity - self.parsers = [None, None] - self.sequenceParser = None - self.seed = random.randint(0, 10000) - self.title = "" - self.merge = False + def __init__(self, verbosity): + self.verbosity = verbosity + self.parsers = [None, None] + self.sequenceParser = None + self.seed = random.randint(0, 10000) + self.title = "" + self.merge = False - def __del__(self): - for fileName in glob.glob("tmpFile_%d*.dat" % (self.seed)): - os.remove(fileName) - for fileName in glob.glob("%s*.R" % (os.path.abspath(self.outputFileName))): - os.remove(fileName) - for fileName in glob.glob("%s*.Rout" % (os.path.abspath(self.outputFileName))): - os.remove(fileName) + def __del__(self): + for fileName in glob.glob("tmpFile_%d*.dat" % (self.seed)): + os.remove(fileName) + for fileName in glob.glob("%s*.R" % (os.path.abspath(self.outputFileName))): + os.remove(fileName) + for fileName in glob.glob("%s*.Rout" % (os.path.abspath(self.outputFileName))): + os.remove(fileName) - def addInput(self, inputNb, fileName, fileFormat): - if fileName == None: - return - chooser = ParserChooser(self.verbosity) - chooser.findFormat(fileFormat) - self.parsers[inputNb] = chooser.getParser(fileName) - if inputNb == 0: - self.parsers[1] = self.parsers[0] + def addInput(self, inputNb, fileName, fileFormat): + if fileName == None: + return + self.parsers[inputNb] = TranscriptContainer(fileName, fileFormat, self.verbosity) + if inputNb == 0: + self.parsers[1] = self.parsers[0] - def addSequence(self, fileName): - if fileName == None: - return - self.sequenceParser = FastaParser(fileName, self.verbosity) + def addSequence(self, fileName): + if fileName == None: + return + self.sequenceParser = FastaParser(fileName, self.verbosity) - def setOutput(self, fileName): - self.outputFileName = fileName + def setOutput(self, fileName): + self.outputFileName = fileName - def setPlotSize(self, width, height): - self.width = width - self.height = height + def setPlotSize(self, width, height): + self.width = width + self.height = height - def setLabels(self, xLabel, yLabel): - self.xLabel = xLabel - self.yLabel = yLabel + def setLabels(self, xLabel, yLabel): + self.xLabel = xLabel + self.yLabel = yLabel - def setTitle(self, title): - self.title = title + def setTitle(self, title): + self.title = title - def setMerge(self, merge): - self.merge = merge + def setMerge(self, merge): + self.merge = merge - def initializeDataFromSequences(self): - self.sizes = {} - self.coverage = {} - self.overlap = {} - for region in self.sequenceParser.getRegions(): - self.sizes[region] = self.sequenceParser.getSizeOfRegion(region) - self.coverage[region] = {} - self.overlap[region] = [] - for strand in strands: - self.coverage[region][strand] = {} - self.coverage[region][strand][1] = 0 - self.coverage[region][strand][self.sizes[region]] = 0 + def initializeDataFromSequences(self): + self.sizes = {} + self.coverage = {} + self.overlap = {} + for region in self.sequenceParser.getRegions(): + self.sizes[region] = self.sequenceParser.getSizeOfRegion(region) + self.coverage[region] = {} + self.overlap[region] = [] + for strand in strands: + self.coverage[region][strand] = {} + self.coverage[region][strand][1] = 0 + self.coverage[region][strand][self.sizes[region]] = 0 + - def initializeDataFromTranscripts(self): - self.coverage = dict([i, None] for i in range(self.parsers[1].getNbTranscripts())) - self.overlap = dict([i, None] for i in range(self.parsers[1].getNbTranscripts())) - self.sizes = dict([i, 0] for i in range(self.parsers[1].getNbTranscripts())) - progress = Progress(self.parsers[1].getNbTranscripts(), "Reading regions", self.verbosity) - for cpt, transcript in enumerate(self.parsers[1].getIterator()): - self.coverage[cpt] = {} - self.overlap[cpt] = [] - for strand in strands: - self.coverage[cpt][strand] = {} - self.coverage[cpt][strand][0] = 0 - self.coverage[cpt][strand][transcript.getEnd() - transcript.getStart()] = 0 - for exon in transcript.getExons(): - self.sizes[cpt] += exon.getSize() - progress.inc() - progress.done() - - def initialize(self): - if self.sequenceParser == None: - self.initializeDataFromTranscripts() - else: - self.initializeDataFromSequences() + def initializeDataFromTranscripts(self): + self.coverage = dict([i, None] for i in range(self.parsers[1].getNbTranscripts())) + self.overlap = dict([i, None] for i in range(self.parsers[1].getNbTranscripts())) + self.sizes = dict([i, 0] for i in range(self.parsers[1].getNbTranscripts())) + self.parsers[0].findData() + progress = Progress(self.parsers[1].getNbTranscripts(), "Reading regions", self.verbosity) + for cpt, transcript in enumerate(self.parsers[1].getIterator()): + self.coverage[cpt] = {} + self.overlap[cpt] = [] + for strand in strands: + self.coverage[cpt][strand] = {} + self.coverage[cpt][strand][0] = 0 + self.coverage[cpt][strand][transcript.getEnd() - transcript.getStart()] = 0 + for exon in transcript.getExons(): + self.sizes[cpt] += exon.getSize() + progress.inc() + progress.done() - def computeCoverage(self, transcript1, transcript2, id): - strand = transcript1.getDirection() * transcript2.getDirection() - for exon1 in transcript1.getExons(): - for exon2 in transcript2.getExons(): - if exon1.overlapWith(exon2): - for position in range(max(exon1.getStart(), exon2.getStart()), min(exon1.getEnd(), exon2.getEnd()) + 1): - relativePosition = position - transcript2.getStart() + 1 - self.coverage[id][strand][relativePosition] = self.coverage[id][strand].get(relativePosition, 0) + 1 + def initialize(self): + if self.sequenceParser == None: + self.initializeDataFromTranscripts() + else: + self.initializeDataFromSequences() - def computeOverlap(self, transcript1, transcript2, id): - simpleTranscript = SimpleTranscript(transcript1, transcript2) - self.overlap[id].append(simpleTranscript) - - def compute2TranscriptFiles(self): - progress = Progress(self.parsers[1].getNbTranscripts(), "Comparing regions", self.verbosity) - for cpt2, transcript2 in enumerate(self.parsers[1].getIterator()): - for transcript1 in self.parsers[0].getIterator(): - if transcript1.overlapWithExon(transcript2): - self.computeCoverage(transcript1, transcript2, cpt2) - self.computeOverlap(transcript1, transcript2, cpt2) - progress.inc() - progress.done() + def computeCoverage(self, transcript1, transcript2, id): + strand = transcript1.getDirection() * transcript2.getDirection() + for exon1 in transcript1.getExons(): + for exon2 in transcript2.getExons(): + if exon1.overlapWith(exon2): + for position in range(max(exon1.getStart(), exon2.getStart()), min(exon1.getEnd(), exon2.getEnd()) + 1): + relativePosition = position - transcript2.getStart() + 1 + self.coverage[id][strand][relativePosition] = self.coverage[id][strand].get(relativePosition, 0) + 1 - def extractReferenceQueryMapping(self, mapping): - queryTranscript = mapping.getTranscript() - referenceTranscript = Transcript() - referenceTranscript.setChromosome(queryTranscript.getChromosome()) - referenceTranscript.setName(queryTranscript.getChromosome()) - referenceTranscript.setDirection("+") - referenceTranscript.setEnd(self.sizes[queryTranscript.getChromosome()]) - referenceTranscript.setStart(1) - return (referenceTranscript, queryTranscript) + def computeOverlap(self, transcript1, transcript2, id): + simpleTranscript = SimpleTranscript(transcript1, transcript2) + self.overlap[id].append(simpleTranscript) + + def compute2TranscriptFiles(self): + progress = Progress(self.parsers[1].getNbTranscripts(), "Comparing regions", self.verbosity) + for cpt2, transcript2 in enumerate(self.parsers[1].getIterator()): + for transcript1 in self.parsers[0].getIterator(): + if transcript1.overlapWithExon(transcript2): + self.computeCoverage(transcript1, transcript2, cpt2) + self.computeOverlap(transcript1, transcript2, cpt2) + progress.inc() + progress.done() - def extractReferenceQuery(self, inputTranscript): - if "Target" not in inputTranscript.getTagNames(): - raise Exception("Cannot extract Target field in line '%s'." % (inputTranscript)) - id, start, end, strand = parseTargetField(inputTranscript.getTagValue("Target")) - if id not in self.sizes: - raise Exception("Target id '%s' of transcript '%s' does not correspond to anything in FASTA file." % (id, inputTranscript)) - referenceTranscript = Transcript() - referenceTranscript.setChromosome(id) - referenceTranscript.setName(id) - referenceTranscript.setDirection("+") - referenceTranscript.setEnd(self.sizes[id]) - referenceTranscript.setStart(1) - queryTranscript = Transcript() - queryTranscript.setChromosome(id) - queryTranscript.setName(id) - queryTranscript.setStart(start) - queryTranscript.setEnd(end) - queryTranscript.setDirection(strand) - if inputTranscript.getNbExons() > 1: - factor = float(end - start) / (inputTranscript.getEnd() - inputTranscript.getStart()) - for exon in inputTranscript.getExons(): - newExon = Interval() - newExon.setChromosome(id) - newExon.setDirection(strand) - if "Target" in inputTranscript.getTagNames(): - id, start, end, strand = parseTargetField(exon.getTagValue("Target")) - newExon.setStart(start) - newExon.setEnd(end) - else: - newExon.setStart(int(round((exon.getStart() - inputTranscript.getStart()) * factor)) + start) - newExon.setEnd( int(round((exon.getEnd() - inputTranscript.getStart()) * factor)) + start) - queryTranscript.addExon(newExon) - return (referenceTranscript, queryTranscript) + def extractReferenceQuery(self, inputTranscript): + if "Target" not in inputTranscript.getTagNames(): + raise Exception("Cannot extract Target field in line '%s'." % (inputTranscript)) + id, start, end, strand = parseTargetField(inputTranscript.getTagValue("Target")) + if id not in self.sizes: + raise Exception("Target id '%s' of transcript '%s' does not correspond to anything in FASTA file." % (id, inputTranscript)) + referenceTranscript = Transcript() + referenceTranscript.setChromosome(id) + referenceTranscript.setName(id) + referenceTranscript.setDirection("+") + referenceTranscript.setEnd(self.sizes[id]) + referenceTranscript.setStart(1) + queryTranscript = Transcript() + queryTranscript.setChromosome(id) + queryTranscript.setName(id) + queryTranscript.setStart(start) + queryTranscript.setEnd(end) + queryTranscript.setDirection(strand) + if inputTranscript.getNbExons() > 1: + factor = float(end - start) / (inputTranscript.getEnd() - inputTranscript.getStart()) + for exon in inputTranscript.getExons(): + newExon = Interval() + newExon.setChromosome(id) + newExon.setDirection(strand) + if "Target" in inputTranscript.getTagNames(): + id, start, end, strand = parseTargetField(exon.getTagValue("Target")) + newExon.setStart(start) + newExon.setEnd(end) + else: + newExon.setStart(int(round((exon.getStart() - inputTranscript.getStart()) * factor)) + start) + newExon.setEnd( int(round((exon.getEnd() - inputTranscript.getStart()) * factor)) + start) + queryTranscript.addExon(newExon) + return (referenceTranscript, queryTranscript) - def compute1TranscriptFiles(self): - progress = Progress(self.parsers[1].getNbItems(), "Comparing regions", self.verbosity) - for transcript in self.parsers[1].getIterator(): - if transcript.__class__.__name__ == "Mapping": - referenceTranscript, queryTranscript = self.extractReferenceQueryMapping(transcript) - else: - referenceTranscript, queryTranscript = self.extractReferenceQuery(transcript) - self.computeCoverage(queryTranscript, referenceTranscript, referenceTranscript.getName()) - self.computeOverlap(queryTranscript, referenceTranscript, referenceTranscript.getName()) - progress.inc() - progress.done() + def compute1TranscriptFiles(self): + progress = Progress(self.parsers[1].getNbTranscripts(), "Comparing regions", self.verbosity) + for transcript in self.parsers[1].getIterator(): + referenceTranscript, queryTranscript = self.extractReferenceQuery(transcript) + self.computeCoverage(queryTranscript, referenceTranscript, referenceTranscript.getName()) + self.computeOverlap(queryTranscript, referenceTranscript, referenceTranscript.getName()) + progress.inc() + progress.done() - def compute(self): - if self.sequenceParser == None: - self.compute2TranscriptFiles() - else: - self.compute1TranscriptFiles() + def compute(self): + if self.sequenceParser == None: + self.compute2TranscriptFiles() + else: + self.compute1TranscriptFiles() - def plotTranscript(self, index, transcript): - plotter = Plotter(self.seed, index, self.verbosity) - plotter.setOutputFileName(self.outputFileName) - plotter.setTranscript(transcript) - plotter.setTitle(self.title) - plotter.setLabels(self.xLabel, self.yLabel) - plotter.setPlotSize(self.width, self.height) - plotter.setCoverageData(self.coverage[index]) - plotter.setOverlapData(self.overlap[index]) - plotter.setMerge(self.merge) - plotter.plot() - output = plotter.log - return output - - def plot1TranscriptFile(self): - self.outputCoverage = {} - self.outputCoveragePerStrand = {} - output = "" - progress = Progress(len(self.sequenceParser.getRegions()), "Plotting regions", self.verbosity) - for cpt2, region in enumerate(self.sequenceParser.getRegions()): - transcript = Transcript() - transcript.setName(region) - transcript.setDirection("+") - transcript.setEnd(self.sizes[region]) - transcript.setStart(1) - output += self.plotTranscript(region, transcript) - progress.inc() - progress.done() - if self.verbosity > 0: - print output + def plotTranscript(self, index, transcript): + plotter = Plotter(self.seed, index, self.verbosity) + plotter.setOutputFileName(self.outputFileName) + plotter.setTranscript(transcript) + plotter.setTitle(self.title) + plotter.setLabels(self.xLabel, self.yLabel) + plotter.setPlotSize(self.width, self.height) + plotter.setCoverageData(self.coverage[index]) + plotter.setOverlapData(self.overlap[index]) + plotter.setMerge(self.merge) + plotter.plot() + output = plotter.log + return output + + def plot1TranscriptFile(self): + self.outputCoverage = {} + self.outputCoveragePerStrand = {} + output = "" + progress = Progress(len(self.sequenceParser.getRegions()), "Plotting regions", self.verbosity) + for cpt2, region in enumerate(self.sequenceParser.getRegions()): + transcript = Transcript() + transcript.setName(region) + transcript.setDirection("+") + transcript.setEnd(self.sizes[region]) + transcript.setStart(1) + output += self.plotTranscript(region, transcript) + progress.inc() + progress.done() + if self.verbosity > 0: + print output - def plot2TranscriptFiles(self): - self.outputCoverage = [0] * self.parsers[1].getNbTranscripts() - self.outputCoveragePerStrand = [None] * self.parsers[1].getNbTranscripts() - for cpt in range(self.parsers[1].getNbTranscripts()): - self.outputCoveragePerStrand[cpt] = dict([strand, 0] for strand in strands) - progress = Progress(self.parsers[1].getNbTranscripts(), "Plotting regions", self.verbosity) - output = "" - for cpt2, transcript2 in enumerate(self.parsers[1].getIterator()): - output += self.plotTranscript(cpt2, transcript2) - progress.inc() - progress.done() - if self.verbosity > 0: - print output + def plot2TranscriptFiles(self): + self.outputCoverage = [0] * self.parsers[1].getNbTranscripts() + self.outputCoveragePerStrand = [None] * self.parsers[1].getNbTranscripts() + for cpt in range(self.parsers[1].getNbTranscripts()): + self.outputCoveragePerStrand[cpt] = dict([strand, 0] for strand in strands) + progress = Progress(self.parsers[1].getNbTranscripts(), "Plotting regions", self.verbosity) + output = "" + for cpt2, transcript2 in enumerate(self.parsers[1].getIterator()): + output += self.plotTranscript(cpt2, transcript2) + progress.inc() + progress.done() + if self.verbosity > 0: + print output - def plot(self): - if self.sequenceParser == None: - self.plot2TranscriptFiles() - else: - self.plot1TranscriptFile() + def plot(self): + if self.sequenceParser == None: + self.plot2TranscriptFiles() + else: + self.plot1TranscriptFile() - def start(self): - self.initialize() - self.compute() - self.plot() + def start(self): + self.initialize() + self.compute() + self.plot() if __name__ == "__main__": - - # parse command line - description = "Plot Coverage v1.0.1: Plot the coverage of the first data with respect to the second one. [Category: Visualization]" + + # parse command line + description = "Plot Coverage v1.0.1: Plot the coverage of the first data with respect to the second one. [Category: Visualization]" - parser = OptionParser(description = description) - parser.add_option("-i", "--input1", dest="inputFileName1", action="store", type="string", help="input file 1 [compulsory] [format: file in transcript or mapping format given by -f]") - parser.add_option("-f", "--inputFormat1", dest="inputFormat1", action="store", type="string", help="format of input file 1 [compulsory] [format: transcript or mapping file format]") - parser.add_option("-j", "--input2", dest="inputFileName2", action="store", type="string", help="input file 2 [compulsory] [format: file in transcript format given by -g]") - parser.add_option("-g", "--inputFormat2", dest="inputFormat2", action="store", type="string", help="format of input file 2 [compulsory] [format: transcript file format]") - parser.add_option("-q", "--sequence", dest="inputSequence", action="store", default=None, type="string", help="input sequence file [format: file in FASTA format] [default: None]") - parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in PNG format]") - parser.add_option("-w", "--width", dest="width", action="store", default=1500, type="int", help="width of the plots (in px) [format: int] [default: 1500]") - parser.add_option("-e", "--height", dest="height", action="store", default=1000, type="int", help="height of the plots (in px) [format: int] [default: 1000]") - parser.add_option("-t", "--title", dest="title", action="store", default="", type="string", help="title of the plots [format: string]") - parser.add_option("-x", "--xlab", dest="xLabel", action="store", default="", type="string", help="label on the x-axis [format: string]") - parser.add_option("-y", "--ylab", dest="yLabel", action="store", default="", type="string", help="label on the y-axis [format: string]") - parser.add_option("-p", "--plusColor", dest="plusColor", action="store", default="red", type="string", help="color for the elements on the plus strand [format: string] [default: red]") - parser.add_option("-m", "--minusColor", dest="minusColor", action="store", default="blue", type="string", help="color for the elements on the minus strand [format: string] [default: blue]") - parser.add_option("-s", "--sumColor", dest="sumColor", action="store", default="black", type="string", help="color for 2 strands coverage line [format: string] [default: black]") - parser.add_option("-l", "--lineColor", dest="lineColor", action="store", default="black", type="string", help="color for the lines [format: string] [default: black]") - parser.add_option("-1", "--merge", dest="merge", action="store_true", default=False, help="merge the 2 plots in 1 [format: boolean] [default: false]") - parser.add_option("-D", "--directory", dest="working_Dir", action="store", default=os.getcwd(), type="string", help="the directory to store the results [format: directory]") - parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]") - (options, args) = parser.parse_args() + parser = OptionParser(description = description) + parser.add_option("-i", "--input1", dest="inputFileName1", action="store", type="string", help="input file 1 [compulsory] [format: file in transcript format given by -f]") + parser.add_option("-f", "--inputFormat1", dest="inputFormat1", action="store", type="string", help="format of input file 1 [compulsory] [format: transcript file format]") + parser.add_option("-j", "--input2", dest="inputFileName2", action="store", type="string", help="input file 2 [compulsory] [format: file in transcript format given by -g]") + parser.add_option("-g", "--inputFormat2", dest="inputFormat2", action="store", type="string", help="format of input file 2 [compulsory] [format: transcript file format]") + parser.add_option("-q", "--sequence", dest="inputSequence", action="store", default=None, type="string", help="input sequence file [format: file in FASTA format] [default: None]") + parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in PNG format]") + parser.add_option("-w", "--width", dest="width", action="store", default=1500, type="int", help="width of the plots (in px) [format: int] [default: 1500]") + parser.add_option("-e", "--height", dest="height", action="store", default=1000, type="int", help="height of the plots (in px) [format: int] [default: 1000]") + parser.add_option("-t", "--title", dest="title", action="store", default="", type="string", help="title of the plots [format: string]") + parser.add_option("-x", "--xlab", dest="xLabel", action="store", default="", type="string", help="label on the x-axis [format: string]") + parser.add_option("-y", "--ylab", dest="yLabel", action="store", default="", type="string", help="label on the y-axis [format: string]") + parser.add_option("-p", "--plusColor", dest="plusColor", action="store", default="red", type="string", help="color for the elements on the plus strand [format: string] [default: red]") + parser.add_option("-m", "--minusColor", dest="minusColor", action="store", default="blue", type="string", help="color for the elements on the minus strand [format: string] [default: blue]") + parser.add_option("-s", "--sumColor", dest="sumColor", action="store", default="black", type="string", help="color for 2 strands coverage line [format: string] [default: black]") + parser.add_option("-l", "--lineColor", dest="lineColor", action="store", default="black", type="string", help="color for the lines [format: string] [default: black]") + parser.add_option("-1", "--merge", dest="merge", action="store_true", default=False, help="merge the 2 plots in 1 [format: boolean] [default: false]") + parser.add_option("-D", "--directory", dest="working_Dir", action="store", default=os.getcwd(), type="string", help="the directory to store the results [format: directory]") + parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]") + (options, args) = parser.parse_args() - colors[1] = options.plusColor - colors[-1] = options.minusColor - colors[0] = options.sumColor - colorLine = options.lineColor + colors[1] = options.plusColor + colors[-1] = options.minusColor + colors[0] = options.sumColor + colorLine = options.lineColor - pp = PlotParser(options.verbosity) - pp.addInput(0, options.inputFileName1, options.inputFormat1) - pp.addInput(1, options.inputFileName2, options.inputFormat2) - pp.addSequence(options.inputSequence) - pp.setOutput(options.outputFileName if os.path.isabs(options.outputFileName) else os.path.join(options.working_Dir, options.outputFileName)) - pp.setPlotSize(options.width, options.height) - pp.setLabels(options.xLabel, options.yLabel) - pp.setTitle(options.title) - pp.setMerge(options.merge) - pp.start() + pp = PlotParser(options.verbosity) + pp.addInput(0, options.inputFileName1, options.inputFormat1) + pp.addInput(1, options.inputFileName2, options.inputFormat2) + pp.addSequence(options.inputSequence) + pp.setOutput(options.outputFileName if os.path.isabs(options.outputFileName) else os.path.join(options.working_Dirpath, options.outputFileName)) + pp.setPlotSize(options.width, options.height) + pp.setLabels(options.xLabel, options.yLabel) + pp.setTitle(options.title) + pp.setMerge(options.merge) + pp.start() + diff -r e454402ba9d9 -r 169d364ddd91 SMART/Java/Python/structure/Interval.py --- a/SMART/Java/Python/structure/Interval.py Wed Sep 18 08:51:22 2013 -0400 +++ b/SMART/Java/Python/structure/Interval.py Mon Sep 30 03:19:26 2013 -0400 @@ -139,7 +139,7 @@ if not chromosome: self.seqname = None else: - self.seqname = chromosome.replace(".", "_").replace("|", "_") + self.seqname = chromosome.replace("|", "_") def setStart(self, start): diff -r e454402ba9d9 -r 169d364ddd91 SMART/Java/Python/structure/Transcript.py --- a/SMART/Java/Python/structure/Transcript.py Wed Sep 18 08:51:22 2013 -0400 +++ b/SMART/Java/Python/structure/Transcript.py Mon Sep 30 03:19:26 2013 -0400 @@ -347,31 +347,6 @@ newTranscript.exons = theseExons return newTranscript - - def getIntersection(self, transcript): - """ - Get the intersection between this transcript and another one - @param transcript: object to be compared to - @type transcript: class L{Transcript} - @return: an other transcript - """ - if self.getChromosome() != transcript.getChromosome() or self.getDirection() != transcript.getDirection(): - return None - newTranscript = Transcript() - newTranscript.setDirection(self.getDirection()) - newTranscript.setChromosome(self.getChromosome()) - newTranscript.setName("%s_intersect_%s" % (self.getName(), transcript.getName())) - newExons = [] - for thisExon in self.getExons(): - for thatExon in transcript.getExons(): - newExon = thisExon.getIntersection(thatExon) - if newExon != None: - newExons.append(newExon) - if not newExons: - return None - newTranscript.exons = newExons - return newTranscript - def getSqlVariables(cls): """ diff -r e454402ba9d9 -r 169d364ddd91 SMART/galaxy/CompareOverlappingAdapt.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/galaxy/CompareOverlappingAdapt.xml Mon Sep 30 03:19:26 2013 -0400 @@ -0,0 +1,153 @@ + + Provide the queries that overlap with a reference. + + PYTHONPATH + + + ../Java/Python/CompareOverlappingAdapt.py -i $formatType.inputFileName1 + #if $formatType.FormatInputFileName1 == 'bed': + -f bed + #elif $formatType.FormatInputFileName1 == 'gff': + -f gff + #elif $formatType.FormatInputFileName1 == 'gff2': + -f gff2 + #elif $formatType.FormatInputFileName1 == 'gff3': + -f gff3 + #elif $formatType.FormatInputFileName1 == 'sam': + -f sam + #elif $formatType.FormatInputFileName1 == 'gtf': + -f gtf + #end if + -j $formatType2.inputFileName2 + #if $formatType2.FormatInputFileName2 == 'bed': + -g bed + #elif $formatType2.FormatInputFileName2 == 'gff': + -g gff + #elif $formatType2.FormatInputFileName2 == 'gff2': + -g gff2 + #elif $formatType2.FormatInputFileName2 == 'gff3': + -g gff3 + #elif $formatType2.FormatInputFileName2 == 'sam': + -g sam + #elif $formatType2.FormatInputFileName2 == 'gtf': + -g gtf + #end if + -o $outputFileGff + #if $OptionDistance.Dist == 'Yes': + -d $OptionDistance.distance + #end if + #if $OptionCollinearOrAntiSens.OptionCA == 'Collinear': + -c + #elif $OptionCollinearOrAntiSens.OptionCA == 'AntiSens': + -a + #end if + $InvertMatch + $NotOverlapping + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +This script may be the most important one. It basically compares two sets of transcripts and keeps those from the first set which overlap with the second one. The first set is considered as the query set (basically, your data) and the second one is the reference set (RefSeq data, for example). + +It is vital to understand that it will output the elements of the first file which overlap with the elements of the second one. + +Various modifiers are also available: + +-Invert selection (report those which do not overlap). + +-Restrict to collinear / anti-sense overlapping data. + +-Keep the query data even if they do not strictly overlap with the reference data, but are located not further away than *n* nucleotide from some reference data. + +Some option reverses the selection. Put in other words, it performs the comparison as usual, and outputs all those query data which do not overlap. + + diff -r e454402ba9d9 -r 169d364ddd91 commons/core/parsing/FastaParser.py --- a/commons/core/parsing/FastaParser.py Wed Sep 18 08:51:22 2013 -0400 +++ b/commons/core/parsing/FastaParser.py Mon Sep 30 03:19:26 2013 -0400 @@ -80,7 +80,7 @@ if self.currentLine != None: if self.currentLine[0] != ">": raise Exception("First line is weird: %s" % (self.currentLine)) - name = self.currentLine[1:].split()[0].replace("|", "_").replace(".", "_") + name = self.currentLine[1:].split()[0] self.currentLine = None for line in self.handle: @@ -89,7 +89,7 @@ pass elif line[0] == ">": if name == None: - name = line[1:].split()[0].replace("|", "_").replace(".", "_") + name = line[1:].split()[0] else: self.currentLine = line return Sequence(name, string) diff -r e454402ba9d9 -r 169d364ddd91 commons/core/parsing/WigParser.py --- a/commons/core/parsing/WigParser.py Wed Sep 18 08:51:22 2013 -0400 +++ b/commons/core/parsing/WigParser.py Mon Sep 30 03:19:26 2013 -0400 @@ -85,11 +85,12 @@ Create an index name for a file """ directoryName = os.path.dirname(self.fileName) + baseName = os.path.splitext(os.path.basename(self.fileName))[0] if strand == None: strandName = "" else: strandName = "+" if strand == 1 else "-" - indexName = os.path.join(directoryName, ".%s%s.index" % (chromosome, strandName)) + indexName = os.path.join(directoryName, ".%s.%s%s.index" % (baseName, chromosome, strandName)) return indexName diff -r e454402ba9d9 -r 169d364ddd91 commons/core/writer/MySqlTranscriptWriter.py --- a/commons/core/writer/MySqlTranscriptWriter.py Wed Sep 18 08:51:22 2013 -0400 +++ b/commons/core/writer/MySqlTranscriptWriter.py Mon Sep 30 03:19:26 2013 -0400 @@ -164,7 +164,7 @@ @type transcriptListParser: class L{TranscriptListParser} """ self.transcriptListParser = transcriptListParser - self.mySqlConnection.executeManyFormattedQueriesIterator(self) + self.mySqlConnection.executeManyQueriesIterator(self) def getIterator(self): @@ -178,8 +178,7 @@ self.createTable(chromosome) self.nbTranscriptsByChromosome[chromosome] = self.nbTranscriptsByChromosome.get(chromosome, 0) + 1 values = transcript.getSqlValues() - #yield "INSERT INTO '%s' (%s) VALUES (%s)" % (self.tables[chromosome].name, ", ".join(self.tables[chromosome].variables), ", ".join([MySqlTable.formatSql(values[variable], self.tables[chromosome].types[variable], self.tables[chromosome].sizes[variable]) for variable in self.tables[chromosome].variables])) - yield ("INSERT INTO '%s' (%s) VALUES (%s)" % (self.tables[chromosome].name, ", ".join(self.tables[chromosome].variables), ", ".join(["?"] * len(self.tables[chromosome].variables))), [values[variable] for variable in self.tables[chromosome].variables]) + yield "INSERT INTO '%s' (%s) VALUES (%s)" % (self.tables[chromosome].name, ", ".join(self.tables[chromosome].variables), ", ".join([MySqlTable.formatSql(values[variable], self.tables[chromosome].types[variable], self.tables[chromosome].sizes[variable]) for variable in self.tables[chromosome].variables])) progress.inc() progress.done() @@ -191,7 +190,7 @@ """ for chromosome in self.transcriptValues: if chromosome in self.transcriptValues: - self.tables[chromosome].insertManyFormatted(self.transcriptValues[chromosome]) + self.tables[chromosome].insertMany(self.transcriptValues[chromosome]) self.transcriptValues = {} self.toBeWritten = False @@ -212,4 +211,4 @@ Drop the tables """ for chromosome in self.tables: - self.tables[chromosome].remove() \ No newline at end of file + self.tables[chromosome].remove() diff -r e454402ba9d9 -r 169d364ddd91 tool_conf.xml --- a/tool_conf.xml Wed Sep 18 08:51:22 2013 -0400 +++ b/tool_conf.xml Mon Sep 30 03:19:26 2013 -0400 @@ -1,48 +1,45 @@ -
-