diff SMART/Java/Python/getRandomRegions.py @ 46:169d364ddd91

Uploaded
author m-zytnicki
date Mon, 30 Sep 2013 03:19:26 -0400
parents 769e306b7933
children
line wrap: on
line diff
--- a/SMART/Java/Python/getRandomRegions.py	Wed Sep 18 08:51:22 2013 -0400
+++ b/SMART/Java/Python/getRandomRegions.py	Mon Sep 30 03:19:26 2013 -0400
@@ -44,224 +44,228 @@
 
 class RandomRegionsGenerator(object):
 
-    def __init__(self, verbosity):
-        self.verbosity      = verbosity
-        self.strands        = False
-        self.distribution   = "uniform"
-        self.transcripts    = None
-        self.sequenceParser = None
-        random.seed()
+	def __init__(self, verbosity):
+		self.verbosity      = verbosity
+		self.strands        = False
+		self.distribution   = "uniform"
+		self.transcripts    = None
+		self.sequenceParser = None
+		random.seed()
 
 
-    def setInput(self, fileName):
-        self.sequenceParser = FastaParser(fileName, self.verbosity)
+	def setInput(self, fileName):
+		self.sequenceParser = FastaParser(fileName, self.verbosity)
 
 
-    def setGenomeSize(self, size):
-        self.genomeSize = size
+	def setGenomeSize(self, size):
+		self.genomeSize = size
 
 
-    def setChromosomeName(self, name):
-        self.chromosomeName = name
+	def setChromosomeName(self, name):
+		self.chromosomeName = name
 
 
-    def setAnnotation(self, fileName, format):
-        parser           = TranscriptContainer(fileName, format, self.verbosity)
-        self.transcripts = []
-        for transcript in parser.getIterator():
-            self.transcripts.append(transcript)
-        self.setNumber(len(self.transcripts))
-        self.setSize(0)
+	def setAnnotation(self, fileName, format):
+		parser           = TranscriptContainer(fileName, format, self.verbosity)
+		self.transcripts = []
+		for transcript in parser.getIterator():
+			self.transcripts.append(transcript)
+		self.setNumber(len(self.transcripts))
+		self.setSize(0)
 
 
-    def setOutputFile(self, fileName):
-        self.outputFileName = fileName
+	def setOutputFile(self, fileName):
+		self.outputFileName = fileName
 
 
-    def setSize(self, size):
-        self.minSize = size
-        self.maxSize = size
+	def setSize(self, size):
+		self.minSize = size
+		self.maxSize = size
 
 
-    def setMinSize(self, size):
-        self.minSize = size
+	def setMinSize(self, size):
+		self.minSize = size
 
 
-    def setMaxSize(self, size):
-        self.maxSize = size
+	def setMaxSize(self, size):
+		self.maxSize = size
 
 
-    def setNumber(self, number):
-        self.number = number
+	def setNumber(self, number):
+		self.number = number
 
 
-    def setStrands(self, strands):
-        self.strands = strands
+	def setStrands(self, strands):
+		self.strands = strands
 
 
-    def setMaxDistribution(self, maxElements):
-        if maxElements == None:
-            return
-        self.maxElements = maxElements
-        self.distribution = "gaussian"
+	def setMaxDistribution(self, maxElements):
+		if maxElements == None:
+			return
+		self.maxElements = maxElements
+		self.distribution = "gaussian"
 
 
-    def setDeviationDistribution(self, deviation):
-        if deviation == None:
-            return
-        self.deviation = deviation
-        self.distribution = "gaussian"
+	def setDeviationDistribution(self, deviation):
+		if deviation == None:
+			return
+		self.deviation = deviation
+		self.distribution = "gaussian"
 
 
-    def getSizes(self):
-        if self.sequenceParser == None:
-            self.chromosomes    = [self.chromosomeName]
-            self.sizes          = {self.chromosomeName: self.genomeSize}
-            self.cumulatedSize  = self.genomeSize
-            self.cumulatedSizes = {self.chromosomeName: self.genomeSize}
-            return
-        self.chromosomes    = self.sequenceParser.getRegions()
-        self.sizes          = {}
-        self.cumulatedSize  = 0
-        self.cumulatedSizes = {}
-        for chromosome in self.chromosomes:
-            self.sizes[chromosome]          = self.sequenceParser.getSizeOfRegion(chromosome)
-            self.cumulatedSize             += self.sizes[chromosome]
-            self.cumulatedSizes[chromosome] = self.cumulatedSize
+	def getSizes(self):
+		if self.sequenceParser == None:
+			self.chromosomes    = [self.chromosomeName]
+			self.sizes          = {self.chromosomeName: self.genomeSize}
+			self.cumulatedSize  = self.genomeSize
+			self.cumulatedSizes = {self.chromosomeName: self.genomeSize}
+			return
+		self.chromosomes    = self.sequenceParser.getRegions()
+		self.sizes          = {}
+		self.cumulatedSize  = 0
+		self.cumulatedSizes = {}
+		for chromosome in self.chromosomes:
+			self.sizes[chromosome]          = self.sequenceParser.getSizeOfRegion(chromosome)
+			self.cumulatedSize             += self.sizes[chromosome]
+			self.cumulatedSizes[chromosome] = self.cumulatedSize
 
 
-    def findPosition(self, size = None):
-        if size == None:
-            size = random.randint(self.minSize, self.maxSize)
-        integer = random.randint(0, self.cumulatedSize)
-        for chromosome in self.chromosomes:
-            if self.cumulatedSizes[chromosome] > integer:
-                break
-        start = random.randint(1, self.sizes[chromosome] - size)
-        return (chromosome, start, size)
+	def findPosition(self, size = None):
+		if size == None:
+			size = random.randint(self.minSize, self.maxSize)
+		integer = random.randint(0, self.cumulatedSize)
+		for chromosome in self.chromosomes:
+			if self.cumulatedSizes[chromosome] > integer:
+				break
+		start = random.randint(1, self.sizes[chromosome] - size)
+		return (chromosome, start, size)
 
 
-    def createTranscript(self, chromosome, start, size, strand, cpt):
-        transcript = Transcript()
-        transcript.setChromosome(chromosome)
-        transcript.setStart(start)
-        transcript.setEnd(start + size-1)
-        transcript.setDirection(strand)
-        transcript.setName("rand_%d" % (cpt))
-        return transcript
+	def createTranscript(self, chromosome, start, size, strand, cpt):
+		transcript = Transcript()
+		transcript.setChromosome(chromosome)
+		transcript.setEnd(start + size-1)
+		transcript.setStart(start)
+		transcript.setDirection(strand)
+		transcript.setName("rand_%d" % (cpt))
+		return transcript
 
 
-    def moveTranscript(self, chromosome, start, transcript):
-        while transcript.getEnd() + start - transcript.getStart() > self.cumulatedSizes[chromosome]:
-            chromosome, start, size = self.findPosition(transcript.getEnd() - transcript.getStart())
-        transcript.setChromosome(chromosome)
-        oldStart, oldEnd = transcript.getStart(), transcript.getEnd()
-        if transcript.getNbExons() > 1:
-            for exon in transcript.getNbExons():
-                oldExonStart, oldExonEnd = exon.getStart(), exon.getEnd()
-                exon.setStart(oldExonStart + start - oldStart)
-                exon.setEnd(oldExonEnd + start - oldStart)
-        transcript.setStart(start)
-        transcript.setEnd(oldEnd + start - oldStart)
-        return [transcript]
+	def moveTranscript(self, chromosome, start, transcript):
+		while transcript.getEnd() + start - transcript.getStart() > self.cumulatedSizes[chromosome]:
+			chromosome, start, size = self.findPosition(transcript.getEnd() - transcript.getStart())
+		newTranscript = Transcript()
+		newTranscript.setChromosome(chromosome)
+		newTranscript.tags = transcript.tags
+		if transcript.getNbExons() > 1:
+			for exon in transcript.getNbExons():
+				newExon = Interval()
+				newExon.setChromosome(chromosome)
+				newExon.setEnd(exon.getEnd() + start - transcript.getStart())
+				newExon.setStart(exon.getStart() + start - transcript.getStart())
+				newTranscript.addExon(newExon)
+		newTranscript.setEnd(transcript.getEnd() + start - transcript.getStart())
+		newTranscript.setStart(start)
+		newTranscript.setDirection(transcript.getDirection())
+		return [newTranscript]
 
 
-    def createUniformCluster(self, chromosome, start, size, strand, cpt):
-        transcript = self.createTranscript(chromosome, start, size, strand, cpt)
-        return [transcript]
-
-
-    def findNbTranscripts(self, cpt):
-        return min(int(round(math.exp(random.random() * math.log(self.maxElements)))), self.number - cpt + 1)
+	def createUniformCluster(self, chromosome, start, size, strand, cpt):
+		transcript = self.createTranscript(chromosome, start, size, strand, cpt)
+		return [transcript]
 
 
-    def getDev(self):
-        deviation = 0.0
-        for j in range(repetitions):
-            deviation += random.randint(-self.deviation, self.deviation)
-        deviation /= repetitions
-        deviation  = int(round(deviation))
-        return deviation
+	def findNbTranscripts(self, cpt):
+		return min(int(round(math.exp(random.random() * math.log(self.maxElements)))), self.number - cpt + 1)
 
 
-    def createGaussianCluster(self, chromosome, start, size, strand, cpt):
-        transcripts   = []
-        nbTranscripts = self.findNbTranscripts(cpt)
-        for i in range(nbTranscripts):
-            transcript = self.createTranscript(chromosome, start + self.getDev(), size + self.getDev(), strand, cpt + i)
-            transcripts.append(transcript)
-        return transcripts
+	def getDev(self):
+		deviation = 0.0
+		for j in range(repetitions):
+			deviation += random.randint(-self.deviation, self.deviation)
+		deviation /= repetitions
+		deviation  = int(round(deviation))
+		return deviation
+
+
+	def createGaussianCluster(self, chromosome, start, size, strand, cpt):
+		transcripts   = []
+		nbTranscripts = self.findNbTranscripts(cpt)
+		for i in range(nbTranscripts):
+			transcript = self.createTranscript(chromosome, start + self.getDev(), size + self.getDev(), strand, cpt + i)
+			transcripts.append(transcript)
+		return transcripts
 
 
-    def writeRegions(self):
-        writer     = Gff3Writer(self.outputFileName, self.verbosity)
-        outputFile = open(self.outputFileName, "w")
-        progress   = Progress(self.number, "Writing to %s" % (self.outputFileName), self.verbosity)
-        i          = 0
-        while i < self.number:
-            chromosome, start, size = self.findPosition()
-            strand                  = random.choice([-1, 1]) if self.strands else 1
-            if self.transcripts != None:
-                transcripts = self.moveTranscript(chromosome, start, self.transcripts[i])
-            elif self.distribution == "uniform":
-                transcripts = self.createUniformCluster(chromosome, start, size, strand, i+1)
-            else:
-                transcripts = self.createGaussianCluster(chromosome, start, size, strand, i+1)
-            for transcript in transcripts:
-                writer.addTranscript(transcript)
-                i += 1
-                progress.inc()
-        progress.done()
-        outputFile.close()
-        writer.write()
-        writer.close()
+	def writeRegions(self):
+		writer     = Gff3Writer(self.outputFileName, self.verbosity)
+		outputFile = open(self.outputFileName, "w")
+		progress   = Progress(self.number, "Writing to %s" % (self.outputFileName), self.verbosity)
+		i          = 0
+		while i < self.number:
+			chromosome, start, size = self.findPosition()
+			strand                  = random.choice([-1, 1]) if self.strands else 1
+			if self.transcripts != None:
+				transcripts = self.moveTranscript(chromosome, start, self.transcripts[i])
+			elif self.distribution == "uniform":
+				transcripts = self.createUniformCluster(chromosome, start, size, strand, i+1)
+			else:
+				transcripts = self.createGaussianCluster(chromosome, start, size, strand, i+1)
+			for transcript in transcripts:
+				writer.addTranscript(transcript)
+				i += 1
+				progress.inc()
+		progress.done()
+		outputFile.close()
+		writer.write()
+		writer.close()
 
 
-    def run(self):
-        self.getSizes()
-        self.writeRegions()
+	def run(self):
+		self.getSizes()
+		self.writeRegions()
 
 
 if __name__ == "__main__":
-    
-    # parse command line
-    description = "Get Random Regions v1.0.2: Get some random coordinates on a genome. May use uniform or gaussian distribution (in gaussion distribution, # of element per cluster follows a power law). [Category: Other]"
+	
+	# parse command line
+	description = "Get Random Regions v1.0.2: Get some random coordinates on a genome. May use uniform or gaussian distribution (in gaussion distribution, # of element per cluster follows a power law). [Category: Other]"
 
-    parser = OptionParser(description = description)
-    parser.add_option("-r", "--reference",     dest="reference",      action="store",      default=None,  type="string", help="file that contains the sequences [format: file in FASTA format]")
-    parser.add_option("-S", "--referenceSize", dest="referenceSize",  action="store",      default=None,  type="int",    help="size of the chromosome (when no reference is given) [format: int]")
-    parser.add_option("-c", "--chromosome",    dest="chromosome",     action="store",      default=None,  type="string", help="name of the chromosome (when no reference is given) [format: string]")
-    parser.add_option("-o", "--output",        dest="outputFileName", action="store",                     type="string", help="output file [compulsory] [format: output file in FASTA format]")
-    parser.add_option("-i", "--input",         dest="inputFileName",  action="store",      default=None,  type="string", help="optional file containing regions to shuffle [format: file in transcript format given by -f]")
-    parser.add_option("-f", "--format",        dest="format",         action="store",      default=None,  type="string", help="format of the previous file [format: transcript file format]")
-    parser.add_option("-s", "--size",          dest="size",           action="store",      default=None,  type="int",    help="size of the regions (if no region set is provided) [format: int]")
-    parser.add_option("-z", "--minSize",       dest="minSize",        action="store",      default=None,  type="int",    help="minimum size of the regions (if no region set nor a fixed size are provided) [format: int]")
-    parser.add_option("-Z", "--maxSize",       dest="maxSize",        action="store",      default=None,  type="int",    help="maximum size of the regions (if no region set nor a fixed size are provided) [format: int]")
-    parser.add_option("-n", "--number",        dest="number",         action="store",      default=None,  type="int",    help="number of regions (if no region set is provided) [format: int]")
-    parser.add_option("-t", "--strands",       dest="strands",        action="store_true", default=False,                help="use both strands (if no region set is provided) [format: boolean]")
-    parser.add_option("-m", "--max",           dest="max",            action="store",      default=None,  type="int",    help="max. # reads in a cluster (for Gaussian dist.) [format: int]")
-    parser.add_option("-d", "--deviation",     dest="deviation",      action="store",      default=None,  type="int",    help="deviation around the center of the cluster (for Gaussian dist.) [format: int]")
-    parser.add_option("-v", "--verbosity",     dest="verbosity",      action="store",      default=1,     type="int",    help="trace level [format: int]")
-    (options, args) = parser.parse_args()
+	parser = OptionParser(description = description)
+	parser.add_option("-r", "--reference",     dest="reference",      action="store",      default=None,  type="string", help="file that contains the sequences [format: file in FASTA format]")
+	parser.add_option("-S", "--referenceSize", dest="referenceSize",  action="store",      default=None,  type="int",    help="size of the chromosome (when no reference is given) [format: int]")
+	parser.add_option("-c", "--chromosome",    dest="chromosome",     action="store",      default=None,  type="string", help="name of the chromosome (when no reference is given) [format: string]")
+	parser.add_option("-o", "--output",        dest="outputFileName", action="store",                     type="string", help="output file [compulsory] [format: output file in FASTA format]")
+	parser.add_option("-i", "--input",         dest="inputFileName",  action="store",      default=None,  type="string", help="optional file containing regions to shuffle [format: file in transcript format given by -f]")
+	parser.add_option("-f", "--format",        dest="format",         action="store",      default=None,  type="string", help="format of the previous file [format: transcript file format]")
+	parser.add_option("-s", "--size",          dest="size",           action="store",      default=None,  type="int",    help="size of the regions (if no region set is provided) [format: int]")
+	parser.add_option("-z", "--minSize",       dest="minSize",        action="store",      default=None,  type="int",    help="minimum size of the regions (if no region set nor a fixed size are provided) [format: int]")
+	parser.add_option("-Z", "--maxSize",       dest="maxSize",        action="store",      default=None,  type="int",    help="maximum size of the regions (if no region set nor a fixed size are provided) [format: int]")
+	parser.add_option("-n", "--number",        dest="number",         action="store",      default=None,  type="int",    help="number of regions (if no region set is provided) [format: int]")
+	parser.add_option("-t", "--strands",       dest="strands",        action="store_true", default=False,                help="use both strands (if no region set is provided) [format: boolean]")
+	parser.add_option("-m", "--max",           dest="max",            action="store",      default=None,  type="int",    help="max. # reads in a cluster (for Gaussian dist.) [format: int]")
+	parser.add_option("-d", "--deviation",     dest="deviation",      action="store",      default=None,  type="int",    help="deviation around the center of the cluster (for Gaussian dist.) [format: int]")
+	parser.add_option("-v", "--verbosity",     dest="verbosity",      action="store",      default=1,     type="int",    help="trace level [format: int]")
+	(options, args) = parser.parse_args()
 
-    rrg = RandomRegionsGenerator(options.verbosity)
-    if options.reference == None:
-        rrg.setGenomeSize(options.referenceSize)
-        rrg.setChromosomeName(options.chromosome)
-    else:
-        rrg.setInput(options.reference)
-    rrg.setOutputFile(options.outputFileName)
-    if options.inputFileName == None:
-        if options.size != None:
-            rrg.setSize(options.size)
-        else:
-            rrg.setMinSize(options.minSize)
-            rrg.setMaxSize(options.maxSize)
-        rrg.setNumber(options.number)
-        rrg.setStrands(options.strands)
-    else:
-        rrg.setAnnotation(options.inputFileName, options.format)
-    rrg.setMaxDistribution(options.max)
-    rrg.setDeviationDistribution(options.deviation)
-    rrg.run()
+	rrg = RandomRegionsGenerator(options.verbosity)
+	if options.reference == None:
+		rrg.setGenomeSize(options.referenceSize)
+		rrg.setChromosomeName(options.chromosome)
+	else:
+		rrg.setInput(options.reference)
+	rrg.setOutputFile(options.outputFileName)
+	if options.inputFileName == None:
+		if options.size != None:
+			rrg.setSize(options.size)
+		else:
+			rrg.setMinSize(options.minSize)
+			rrg.setMaxSize(options.maxSize)
+		rrg.setNumber(options.number)
+		rrg.setStrands(options.strands)
+	else:
+		rrg.setAnnotation(options.inputFileName, options.format)
+	rrg.setMaxDistribution(options.max)
+	rrg.setDeviationDistribution(options.deviation)
+	rrg.run()