comparison SMART/Java/Python/getRandomRegions.py @ 46:169d364ddd91

Uploaded
author m-zytnicki
date Mon, 30 Sep 2013 03:19:26 -0400
parents 769e306b7933
children
comparison
equal deleted inserted replaced
45:e454402ba9d9 46:169d364ddd91
42 repetitions = 100 42 repetitions = 100
43 43
44 44
45 class RandomRegionsGenerator(object): 45 class RandomRegionsGenerator(object):
46 46
47 def __init__(self, verbosity): 47 def __init__(self, verbosity):
48 self.verbosity = verbosity 48 self.verbosity = verbosity
49 self.strands = False 49 self.strands = False
50 self.distribution = "uniform" 50 self.distribution = "uniform"
51 self.transcripts = None 51 self.transcripts = None
52 self.sequenceParser = None 52 self.sequenceParser = None
53 random.seed() 53 random.seed()
54 54
55 55
56 def setInput(self, fileName): 56 def setInput(self, fileName):
57 self.sequenceParser = FastaParser(fileName, self.verbosity) 57 self.sequenceParser = FastaParser(fileName, self.verbosity)
58 58
59 59
60 def setGenomeSize(self, size): 60 def setGenomeSize(self, size):
61 self.genomeSize = size 61 self.genomeSize = size
62 62
63 63
64 def setChromosomeName(self, name): 64 def setChromosomeName(self, name):
65 self.chromosomeName = name 65 self.chromosomeName = name
66 66
67 67
68 def setAnnotation(self, fileName, format): 68 def setAnnotation(self, fileName, format):
69 parser = TranscriptContainer(fileName, format, self.verbosity) 69 parser = TranscriptContainer(fileName, format, self.verbosity)
70 self.transcripts = [] 70 self.transcripts = []
71 for transcript in parser.getIterator(): 71 for transcript in parser.getIterator():
72 self.transcripts.append(transcript) 72 self.transcripts.append(transcript)
73 self.setNumber(len(self.transcripts)) 73 self.setNumber(len(self.transcripts))
74 self.setSize(0) 74 self.setSize(0)
75 75
76 76
77 def setOutputFile(self, fileName): 77 def setOutputFile(self, fileName):
78 self.outputFileName = fileName 78 self.outputFileName = fileName
79 79
80 80
81 def setSize(self, size): 81 def setSize(self, size):
82 self.minSize = size 82 self.minSize = size
83 self.maxSize = size 83 self.maxSize = size
84 84
85 85
86 def setMinSize(self, size): 86 def setMinSize(self, size):
87 self.minSize = size 87 self.minSize = size
88 88
89 89
90 def setMaxSize(self, size): 90 def setMaxSize(self, size):
91 self.maxSize = size 91 self.maxSize = size
92 92
93 93
94 def setNumber(self, number): 94 def setNumber(self, number):
95 self.number = number 95 self.number = number
96 96
97 97
98 def setStrands(self, strands): 98 def setStrands(self, strands):
99 self.strands = strands 99 self.strands = strands
100 100
101 101
102 def setMaxDistribution(self, maxElements): 102 def setMaxDistribution(self, maxElements):
103 if maxElements == None: 103 if maxElements == None:
104 return 104 return
105 self.maxElements = maxElements 105 self.maxElements = maxElements
106 self.distribution = "gaussian" 106 self.distribution = "gaussian"
107 107
108 108
109 def setDeviationDistribution(self, deviation): 109 def setDeviationDistribution(self, deviation):
110 if deviation == None: 110 if deviation == None:
111 return 111 return
112 self.deviation = deviation 112 self.deviation = deviation
113 self.distribution = "gaussian" 113 self.distribution = "gaussian"
114 114
115 115
116 def getSizes(self): 116 def getSizes(self):
117 if self.sequenceParser == None: 117 if self.sequenceParser == None:
118 self.chromosomes = [self.chromosomeName] 118 self.chromosomes = [self.chromosomeName]
119 self.sizes = {self.chromosomeName: self.genomeSize} 119 self.sizes = {self.chromosomeName: self.genomeSize}
120 self.cumulatedSize = self.genomeSize 120 self.cumulatedSize = self.genomeSize
121 self.cumulatedSizes = {self.chromosomeName: self.genomeSize} 121 self.cumulatedSizes = {self.chromosomeName: self.genomeSize}
122 return 122 return
123 self.chromosomes = self.sequenceParser.getRegions() 123 self.chromosomes = self.sequenceParser.getRegions()
124 self.sizes = {} 124 self.sizes = {}
125 self.cumulatedSize = 0 125 self.cumulatedSize = 0
126 self.cumulatedSizes = {} 126 self.cumulatedSizes = {}
127 for chromosome in self.chromosomes: 127 for chromosome in self.chromosomes:
128 self.sizes[chromosome] = self.sequenceParser.getSizeOfRegion(chromosome) 128 self.sizes[chromosome] = self.sequenceParser.getSizeOfRegion(chromosome)
129 self.cumulatedSize += self.sizes[chromosome] 129 self.cumulatedSize += self.sizes[chromosome]
130 self.cumulatedSizes[chromosome] = self.cumulatedSize 130 self.cumulatedSizes[chromosome] = self.cumulatedSize
131 131
132 132
133 def findPosition(self, size = None): 133 def findPosition(self, size = None):
134 if size == None: 134 if size == None:
135 size = random.randint(self.minSize, self.maxSize) 135 size = random.randint(self.minSize, self.maxSize)
136 integer = random.randint(0, self.cumulatedSize) 136 integer = random.randint(0, self.cumulatedSize)
137 for chromosome in self.chromosomes: 137 for chromosome in self.chromosomes:
138 if self.cumulatedSizes[chromosome] > integer: 138 if self.cumulatedSizes[chromosome] > integer:
139 break 139 break
140 start = random.randint(1, self.sizes[chromosome] - size) 140 start = random.randint(1, self.sizes[chromosome] - size)
141 return (chromosome, start, size) 141 return (chromosome, start, size)
142 142
143 143
144 def createTranscript(self, chromosome, start, size, strand, cpt): 144 def createTranscript(self, chromosome, start, size, strand, cpt):
145 transcript = Transcript() 145 transcript = Transcript()
146 transcript.setChromosome(chromosome) 146 transcript.setChromosome(chromosome)
147 transcript.setStart(start) 147 transcript.setEnd(start + size-1)
148 transcript.setEnd(start + size-1) 148 transcript.setStart(start)
149 transcript.setDirection(strand) 149 transcript.setDirection(strand)
150 transcript.setName("rand_%d" % (cpt)) 150 transcript.setName("rand_%d" % (cpt))
151 return transcript 151 return transcript
152 152
153 153
154 def moveTranscript(self, chromosome, start, transcript): 154 def moveTranscript(self, chromosome, start, transcript):
155 while transcript.getEnd() + start - transcript.getStart() > self.cumulatedSizes[chromosome]: 155 while transcript.getEnd() + start - transcript.getStart() > self.cumulatedSizes[chromosome]:
156 chromosome, start, size = self.findPosition(transcript.getEnd() - transcript.getStart()) 156 chromosome, start, size = self.findPosition(transcript.getEnd() - transcript.getStart())
157 transcript.setChromosome(chromosome) 157 newTranscript = Transcript()
158 oldStart, oldEnd = transcript.getStart(), transcript.getEnd() 158 newTranscript.setChromosome(chromosome)
159 if transcript.getNbExons() > 1: 159 newTranscript.tags = transcript.tags
160 for exon in transcript.getNbExons(): 160 if transcript.getNbExons() > 1:
161 oldExonStart, oldExonEnd = exon.getStart(), exon.getEnd() 161 for exon in transcript.getNbExons():
162 exon.setStart(oldExonStart + start - oldStart) 162 newExon = Interval()
163 exon.setEnd(oldExonEnd + start - oldStart) 163 newExon.setChromosome(chromosome)
164 transcript.setStart(start) 164 newExon.setEnd(exon.getEnd() + start - transcript.getStart())
165 transcript.setEnd(oldEnd + start - oldStart) 165 newExon.setStart(exon.getStart() + start - transcript.getStart())
166 return [transcript] 166 newTranscript.addExon(newExon)
167 167 newTranscript.setEnd(transcript.getEnd() + start - transcript.getStart())
168 168 newTranscript.setStart(start)
169 def createUniformCluster(self, chromosome, start, size, strand, cpt): 169 newTranscript.setDirection(transcript.getDirection())
170 transcript = self.createTranscript(chromosome, start, size, strand, cpt) 170 return [newTranscript]
171 return [transcript] 171
172 172
173 173 def createUniformCluster(self, chromosome, start, size, strand, cpt):
174 def findNbTranscripts(self, cpt): 174 transcript = self.createTranscript(chromosome, start, size, strand, cpt)
175 return min(int(round(math.exp(random.random() * math.log(self.maxElements)))), self.number - cpt + 1) 175 return [transcript]
176 176
177 177
178 def getDev(self): 178 def findNbTranscripts(self, cpt):
179 deviation = 0.0 179 return min(int(round(math.exp(random.random() * math.log(self.maxElements)))), self.number - cpt + 1)
180 for j in range(repetitions): 180
181 deviation += random.randint(-self.deviation, self.deviation) 181
182 deviation /= repetitions 182 def getDev(self):
183 deviation = int(round(deviation)) 183 deviation = 0.0
184 return deviation 184 for j in range(repetitions):
185 185 deviation += random.randint(-self.deviation, self.deviation)
186 186 deviation /= repetitions
187 def createGaussianCluster(self, chromosome, start, size, strand, cpt): 187 deviation = int(round(deviation))
188 transcripts = [] 188 return deviation
189 nbTranscripts = self.findNbTranscripts(cpt) 189
190 for i in range(nbTranscripts): 190
191 transcript = self.createTranscript(chromosome, start + self.getDev(), size + self.getDev(), strand, cpt + i) 191 def createGaussianCluster(self, chromosome, start, size, strand, cpt):
192 transcripts.append(transcript) 192 transcripts = []
193 return transcripts 193 nbTranscripts = self.findNbTranscripts(cpt)
194 194 for i in range(nbTranscripts):
195 195 transcript = self.createTranscript(chromosome, start + self.getDev(), size + self.getDev(), strand, cpt + i)
196 def writeRegions(self): 196 transcripts.append(transcript)
197 writer = Gff3Writer(self.outputFileName, self.verbosity) 197 return transcripts
198 outputFile = open(self.outputFileName, "w") 198
199 progress = Progress(self.number, "Writing to %s" % (self.outputFileName), self.verbosity) 199
200 i = 0 200 def writeRegions(self):
201 while i < self.number: 201 writer = Gff3Writer(self.outputFileName, self.verbosity)
202 chromosome, start, size = self.findPosition() 202 outputFile = open(self.outputFileName, "w")
203 strand = random.choice([-1, 1]) if self.strands else 1 203 progress = Progress(self.number, "Writing to %s" % (self.outputFileName), self.verbosity)
204 if self.transcripts != None: 204 i = 0
205 transcripts = self.moveTranscript(chromosome, start, self.transcripts[i]) 205 while i < self.number:
206 elif self.distribution == "uniform": 206 chromosome, start, size = self.findPosition()
207 transcripts = self.createUniformCluster(chromosome, start, size, strand, i+1) 207 strand = random.choice([-1, 1]) if self.strands else 1
208 else: 208 if self.transcripts != None:
209 transcripts = self.createGaussianCluster(chromosome, start, size, strand, i+1) 209 transcripts = self.moveTranscript(chromosome, start, self.transcripts[i])
210 for transcript in transcripts: 210 elif self.distribution == "uniform":
211 writer.addTranscript(transcript) 211 transcripts = self.createUniformCluster(chromosome, start, size, strand, i+1)
212 i += 1 212 else:
213 progress.inc() 213 transcripts = self.createGaussianCluster(chromosome, start, size, strand, i+1)
214 progress.done() 214 for transcript in transcripts:
215 outputFile.close() 215 writer.addTranscript(transcript)
216 writer.write() 216 i += 1
217 writer.close() 217 progress.inc()
218 218 progress.done()
219 219 outputFile.close()
220 def run(self): 220 writer.write()
221 self.getSizes() 221 writer.close()
222 self.writeRegions() 222
223
224 def run(self):
225 self.getSizes()
226 self.writeRegions()
223 227
224 228
225 if __name__ == "__main__": 229 if __name__ == "__main__":
226 230
227 # parse command line 231 # parse command line
228 description = "Get Random Regions v1.0.2: Get some random coordinates on a genome. May use uniform or gaussian distribution (in gaussion distribution, # of element per cluster follows a power law). [Category: Other]" 232 description = "Get Random Regions v1.0.2: Get some random coordinates on a genome. May use uniform or gaussian distribution (in gaussion distribution, # of element per cluster follows a power law). [Category: Other]"
229 233
230 parser = OptionParser(description = description) 234 parser = OptionParser(description = description)
231 parser.add_option("-r", "--reference", dest="reference", action="store", default=None, type="string", help="file that contains the sequences [format: file in FASTA format]") 235 parser.add_option("-r", "--reference", dest="reference", action="store", default=None, type="string", help="file that contains the sequences [format: file in FASTA format]")
232 parser.add_option("-S", "--referenceSize", dest="referenceSize", action="store", default=None, type="int", help="size of the chromosome (when no reference is given) [format: int]") 236 parser.add_option("-S", "--referenceSize", dest="referenceSize", action="store", default=None, type="int", help="size of the chromosome (when no reference is given) [format: int]")
233 parser.add_option("-c", "--chromosome", dest="chromosome", action="store", default=None, type="string", help="name of the chromosome (when no reference is given) [format: string]") 237 parser.add_option("-c", "--chromosome", dest="chromosome", action="store", default=None, type="string", help="name of the chromosome (when no reference is given) [format: string]")
234 parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in FASTA format]") 238 parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in FASTA format]")
235 parser.add_option("-i", "--input", dest="inputFileName", action="store", default=None, type="string", help="optional file containing regions to shuffle [format: file in transcript format given by -f]") 239 parser.add_option("-i", "--input", dest="inputFileName", action="store", default=None, type="string", help="optional file containing regions to shuffle [format: file in transcript format given by -f]")
236 parser.add_option("-f", "--format", dest="format", action="store", default=None, type="string", help="format of the previous file [format: transcript file format]") 240 parser.add_option("-f", "--format", dest="format", action="store", default=None, type="string", help="format of the previous file [format: transcript file format]")
237 parser.add_option("-s", "--size", dest="size", action="store", default=None, type="int", help="size of the regions (if no region set is provided) [format: int]") 241 parser.add_option("-s", "--size", dest="size", action="store", default=None, type="int", help="size of the regions (if no region set is provided) [format: int]")
238 parser.add_option("-z", "--minSize", dest="minSize", action="store", default=None, type="int", help="minimum size of the regions (if no region set nor a fixed size are provided) [format: int]") 242 parser.add_option("-z", "--minSize", dest="minSize", action="store", default=None, type="int", help="minimum size of the regions (if no region set nor a fixed size are provided) [format: int]")
239 parser.add_option("-Z", "--maxSize", dest="maxSize", action="store", default=None, type="int", help="maximum size of the regions (if no region set nor a fixed size are provided) [format: int]") 243 parser.add_option("-Z", "--maxSize", dest="maxSize", action="store", default=None, type="int", help="maximum size of the regions (if no region set nor a fixed size are provided) [format: int]")
240 parser.add_option("-n", "--number", dest="number", action="store", default=None, type="int", help="number of regions (if no region set is provided) [format: int]") 244 parser.add_option("-n", "--number", dest="number", action="store", default=None, type="int", help="number of regions (if no region set is provided) [format: int]")
241 parser.add_option("-t", "--strands", dest="strands", action="store_true", default=False, help="use both strands (if no region set is provided) [format: boolean]") 245 parser.add_option("-t", "--strands", dest="strands", action="store_true", default=False, help="use both strands (if no region set is provided) [format: boolean]")
242 parser.add_option("-m", "--max", dest="max", action="store", default=None, type="int", help="max. # reads in a cluster (for Gaussian dist.) [format: int]") 246 parser.add_option("-m", "--max", dest="max", action="store", default=None, type="int", help="max. # reads in a cluster (for Gaussian dist.) [format: int]")
243 parser.add_option("-d", "--deviation", dest="deviation", action="store", default=None, type="int", help="deviation around the center of the cluster (for Gaussian dist.) [format: int]") 247 parser.add_option("-d", "--deviation", dest="deviation", action="store", default=None, type="int", help="deviation around the center of the cluster (for Gaussian dist.) [format: int]")
244 parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]") 248 parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
245 (options, args) = parser.parse_args() 249 (options, args) = parser.parse_args()
246 250
247 rrg = RandomRegionsGenerator(options.verbosity) 251 rrg = RandomRegionsGenerator(options.verbosity)
248 if options.reference == None: 252 if options.reference == None:
249 rrg.setGenomeSize(options.referenceSize) 253 rrg.setGenomeSize(options.referenceSize)
250 rrg.setChromosomeName(options.chromosome) 254 rrg.setChromosomeName(options.chromosome)
251 else: 255 else:
252 rrg.setInput(options.reference) 256 rrg.setInput(options.reference)
253 rrg.setOutputFile(options.outputFileName) 257 rrg.setOutputFile(options.outputFileName)
254 if options.inputFileName == None: 258 if options.inputFileName == None:
255 if options.size != None: 259 if options.size != None:
256 rrg.setSize(options.size) 260 rrg.setSize(options.size)
257 else: 261 else:
258 rrg.setMinSize(options.minSize) 262 rrg.setMinSize(options.minSize)
259 rrg.setMaxSize(options.maxSize) 263 rrg.setMaxSize(options.maxSize)
260 rrg.setNumber(options.number) 264 rrg.setNumber(options.number)
261 rrg.setStrands(options.strands) 265 rrg.setStrands(options.strands)
262 else: 266 else:
263 rrg.setAnnotation(options.inputFileName, options.format) 267 rrg.setAnnotation(options.inputFileName, options.format)
264 rrg.setMaxDistribution(options.max) 268 rrg.setMaxDistribution(options.max)
265 rrg.setDeviationDistribution(options.deviation) 269 rrg.setDeviationDistribution(options.deviation)
266 rrg.run() 270 rrg.run()
267 271