6
|
1 #! /usr/bin/env python
|
|
2 #
|
|
3 # Copyright INRA-URGI 2009-2010
|
|
4 #
|
|
5 # This software is governed by the CeCILL license under French law and
|
|
6 # abiding by the rules of distribution of free software. You can use,
|
|
7 # modify and/ or redistribute the software under the terms of the CeCILL
|
|
8 # license as circulated by CEA, CNRS and INRIA at the following URL
|
|
9 # "http://www.cecill.info".
|
|
10 #
|
|
11 # As a counterpart to the access to the source code and rights to copy,
|
|
12 # modify and redistribute granted by the license, users are provided only
|
|
13 # with a limited warranty and the software's author, the holder of the
|
|
14 # economic rights, and the successive licensors have only limited
|
|
15 # liability.
|
|
16 #
|
|
17 # In this respect, the user's attention is drawn to the risks associated
|
|
18 # with loading, using, modifying and/or developing or reproducing the
|
|
19 # software by the user in light of its specific status of free software,
|
|
20 # that may mean that it is complicated to manipulate, and that also
|
|
21 # therefore means that it is reserved for developers and experienced
|
|
22 # professionals having in-depth computer knowledge. Users are therefore
|
|
23 # encouraged to load and test the software's suitability as regards their
|
|
24 # requirements in conditions enabling the security of their systems and/or
|
|
25 # data to be ensured and, more generally, to use and operate it in the
|
|
26 # same conditions as regards security.
|
|
27 #
|
|
28 # The fact that you are presently reading this means that you have had
|
|
29 # knowledge of the CeCILL license and that you accept its terms.
|
|
30 #
|
|
31 """Find random regions in a genome"""
|
|
32
|
|
33 import random, math
|
|
34 from optparse import OptionParser
|
|
35 from commons.core.parsing.FastaParser import *
|
|
36 from commons.core.writer.Gff3Writer import *
|
|
37 from commons.core.writer.MySqlTranscriptWriter import *
|
|
38 from SMART.Java.Python.misc.Progress import *
|
|
39 from SMART.Java.Python.structure.Transcript import Transcript
|
|
40 from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer
|
|
41
|
|
# Number of uniform integer draws that RandomRegionsGenerator.getDev()
# averages to produce a single deviation value.
repetitions = 100
|
|
43
|
|
44
|
|
class RandomRegionsGenerator(object):
    """Pick random regions on a genome and hand them to a Gff3Writer.

    Regions are either created from scratch (one region per draw in
    "uniform" mode, a cluster of jittered regions per draw in "gaussian"
    mode) or obtained by moving the transcripts read from an annotation
    file to random positions.
    """

    def __init__(self, verbosity):
        """Store the verbosity level, set default options and seed ``random``."""
        self.verbosity = verbosity
        self.strands = False
        self.distribution = "uniform"
        self.transcripts = None
        self.sequenceParser = None
        # Values normally provided through the setters below.
        self.genomeSize = None
        self.chromosomeName = None
        self.outputFileName = None
        self.minSize = None
        self.maxSize = None
        self.number = None
        # Neutral defaults so that gaussian mode still works when only one
        # of setMaxDistribution() / setDeviationDistribution() was called:
        # clusters of one element, no jitter.
        self.maxElements = 1
        self.deviation = 0
        random.seed()

    def setInput(self, fileName):
        """Use the FASTA file *fileName* as the source of chromosome sizes."""
        self.sequenceParser = FastaParser(fileName, self.verbosity)

    def setGenomeSize(self, size):
        """Set the chromosome size used when no reference file is given."""
        self.genomeSize = size

    def setChromosomeName(self, name):
        """Set the chromosome name used when no reference file is given."""
        self.chromosomeName = name

    def setAnnotation(self, fileName, format):
        """Load the transcripts of *fileName* so that they can be shuffled.

        The number of regions becomes the number of transcripts read, and
        the region size is set to 0 (each transcript keeps its own length).
        """
        parser = TranscriptContainer(fileName, format, self.verbosity)
        self.transcripts = list(parser.getIterator())
        self.setNumber(len(self.transcripts))
        self.setSize(0)

    def setOutputFile(self, fileName):
        """Set the name of the output file."""
        self.outputFileName = fileName

    def setSize(self, size):
        """Use the fixed region size *size* (minimum and maximum are equal)."""
        self.minSize = size
        self.maxSize = size

    def setMinSize(self, size):
        """Set the minimum region size."""
        self.minSize = size

    def setMaxSize(self, size):
        """Set the maximum region size."""
        self.maxSize = size

    def setNumber(self, number):
        """Set the number of regions to produce."""
        self.number = number

    def setStrands(self, strands):
        """Choose whether regions get a random strand (True) or strand 1."""
        self.strands = strands

    def setMaxDistribution(self, maxElements):
        """Set the maximum cluster size and switch to gaussian mode.

        Does nothing when *maxElements* is None.
        """
        if maxElements is None:
            return
        self.maxElements = maxElements
        self.distribution = "gaussian"

    def setDeviationDistribution(self, deviation):
        """Set the jitter amplitude and switch to gaussian mode.

        Does nothing when *deviation* is None.
        """
        if deviation is None:
            return
        self.deviation = deviation
        self.distribution = "gaussian"

    def getSizes(self):
        """Fill ``chromosomes``, ``sizes``, ``cumulatedSize`` and ``cumulatedSizes``.

        Without a sequence parser the genome is the single chromosome
        described by ``chromosomeName`` / ``genomeSize``; otherwise the
        names and sizes are requested from the parser.
        """
        if self.sequenceParser is None:
            self.chromosomes = [self.chromosomeName]
            self.sizes = {self.chromosomeName: self.genomeSize}
            self.cumulatedSize = self.genomeSize
            self.cumulatedSizes = {self.chromosomeName: self.genomeSize}
            return
        self.chromosomes = self.sequenceParser.getRegions()
        self.sizes = {}
        self.cumulatedSize = 0
        self.cumulatedSizes = {}
        for chromosome in self.chromosomes:
            self.sizes[chromosome] = self.sequenceParser.getSizeOfRegion(chromosome)
            self.cumulatedSize += self.sizes[chromosome]
            self.cumulatedSizes[chromosome] = self.cumulatedSize

    def findPosition(self, size = None):
        """Draw a random (chromosome, start, size) triple.

        The chromosome is drawn with a probability proportional to its
        size among the chromosomes long enough for the region; the start
        is 1-based and chosen so that ``start + size - 1`` does not exceed
        the chromosome size.  When *size* is None it is drawn uniformly
        in [minSize, maxSize].

        Raises ValueError when no chromosome can hold the region.
        """
        if size is None:
            size = random.randint(self.minSize, self.maxSize)
        # A size of 0 is the placeholder set by setAnnotation(); such a
        # draw still has to land on an existing base.
        span = max(size, 1)
        if not self.sizes or max(self.sizes.values()) < span:
            raise ValueError("No chromosome is large enough for a region of size %d" % (size))
        while True:
            # randrange() excludes its upper bound, so the offset always
            # falls strictly inside one chromosome.
            offset = random.randrange(self.cumulatedSize)
            for chromosome in self.chromosomes:
                if self.cumulatedSizes[chromosome] > offset:
                    break
            # Redraw when the selected chromosome is too short.
            if self.sizes[chromosome] >= span:
                break
        start = random.randint(1, self.sizes[chromosome] - span + 1)
        return (chromosome, start, size)

    def createTranscript(self, chromosome, start, size, strand, cpt):
        """Build a Transcript named ``rand_<cpt>`` covering [start, start + size - 1]."""
        transcript = Transcript()
        transcript.setChromosome(chromosome)
        transcript.setStart(start)
        transcript.setEnd(start + size-1)
        transcript.setDirection(strand)
        transcript.setName("rand_%d" % (cpt))
        return transcript

    def moveTranscript(self, chromosome, start, transcript):
        """Move *transcript* (and its exons) so that it starts at *start*.

        If the transcript would run past the end of *chromosome*, a new
        position that can hold it is drawn first.  Returns a one-element
        list holding the moved transcript.
        """
        length = transcript.getEnd() - transcript.getStart() + 1
        # Compare with the size of this chromosome, not the cumulated size.
        if start + length - 1 > self.sizes[chromosome]:
            chromosome, start, _ = self.findPosition(length)
        transcript.setChromosome(chromosome)
        oldStart, oldEnd = transcript.getStart(), transcript.getEnd()
        shift = start - oldStart
        if transcript.getNbExons() > 1:
            # NOTE(review): assumes Transcript exposes getExons(); the former
            # code iterated over the integer returned by getNbExons().
            for exon in transcript.getExons():
                oldExonStart, oldExonEnd = exon.getStart(), exon.getEnd()
                exon.setStart(oldExonStart + shift)
                exon.setEnd(oldExonEnd + shift)
        transcript.setStart(start)
        transcript.setEnd(oldEnd + shift)
        return [transcript]

    def createUniformCluster(self, chromosome, start, size, strand, cpt):
        """Return a one-element list holding a single new transcript."""
        return [self.createTranscript(chromosome, start, size, strand, cpt)]

    def findNbTranscripts(self, cpt):
        """Draw the number of transcripts of the next cluster.

        The draw is ``exp(u * log(maxElements))`` with ``u`` uniform in
        [0, 1), rounded, and capped by ``number - cpt + 1``.
        """
        drawn = int(round(math.exp(random.random() * math.log(self.maxElements))))
        return min(drawn, self.number - cpt + 1)

    def getDev(self):
        """Return a rounded average of ``repetitions`` draws in [-deviation, deviation]."""
        total = 0.0
        for _ in range(repetitions):
            total += random.randint(-self.deviation, self.deviation)
        return int(round(total / repetitions))

    def createGaussianCluster(self, chromosome, start, size, strand, cpt):
        """Create a cluster of transcripts jittered around (start, size).

        Each transcript gets its own start and size deviation; the result
        is clamped so that every transcript stays inside the chromosome
        and is at least one base long.
        """
        chromosomeSize = self.sizes[chromosome]
        transcripts = []
        for i in range(self.findNbTranscripts(cpt)):
            newStart = min(max(start + self.getDev(), 1), chromosomeSize)
            newSize = min(max(size + self.getDev(), 1), chromosomeSize - newStart + 1)
            transcripts.append(self.createTranscript(chromosome, newStart, newSize, strand, cpt + i))
        return transcripts

    def writeRegions(self):
        """Generate ``number`` regions and pass them to a Gff3Writer."""
        # The writer is given the output file name; no second handle on
        # the same file is opened here.
        writer = Gff3Writer(self.outputFileName, self.verbosity)
        try:
            progress = Progress(self.number, "Writing to %s" % (self.outputFileName), self.verbosity)
            i = 0
            while i < self.number:
                chromosome, start, size = self.findPosition()
                strand = random.choice([-1, 1]) if self.strands else 1
                if self.transcripts is not None:
                    transcripts = self.moveTranscript(chromosome, start, self.transcripts[i])
                elif self.distribution == "uniform":
                    transcripts = self.createUniformCluster(chromosome, start, size, strand, i+1)
                else:
                    transcripts = self.createGaussianCluster(chromosome, start, size, strand, i+1)
                # A cluster may hold several transcripts: count each of them.
                for transcript in transcripts:
                    writer.addTranscript(transcript)
                    i += 1
                    progress.inc()
            progress.done()
            writer.write()
        finally:
            writer.close()

    def run(self):
        """Compute the chromosome sizes, then generate and write the regions."""
        self.getSizes()
        self.writeRegions()
|
|
223
|
|
224
|
|
if __name__ == "__main__":

    # parse command line
    description = "Get Random Regions v1.0.2: Get some random coordinates on a genome. May use uniform or gaussian distribution (in gaussion distribution, # of element per cluster follows a power law). [Category: Other]"

    parser = OptionParser(description = description)
    parser.add_option("-r", "--reference", dest="reference", action="store", default=None, type="string", help="file that contains the sequences [format: file in FASTA format]")
    parser.add_option("-S", "--referenceSize", dest="referenceSize", action="store", default=None, type="int", help="size of the chromosome (when no reference is given) [format: int]")
    parser.add_option("-c", "--chromosome", dest="chromosome", action="store", default=None, type="string", help="name of the chromosome (when no reference is given) [format: string]")
    parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in FASTA format]")
    parser.add_option("-i", "--input", dest="inputFileName", action="store", default=None, type="string", help="optional file containing regions to shuffle [format: file in transcript format given by -f]")
    parser.add_option("-f", "--format", dest="format", action="store", default=None, type="string", help="format of the previous file [format: transcript file format]")
    parser.add_option("-s", "--size", dest="size", action="store", default=None, type="int", help="size of the regions (if no region set is provided) [format: int]")
    parser.add_option("-z", "--minSize", dest="minSize", action="store", default=None, type="int", help="minimum size of the regions (if no region set nor a fixed size are provided) [format: int]")
    parser.add_option("-Z", "--maxSize", dest="maxSize", action="store", default=None, type="int", help="maximum size of the regions (if no region set nor a fixed size are provided) [format: int]")
    parser.add_option("-n", "--number", dest="number", action="store", default=None, type="int", help="number of regions (if no region set is provided) [format: int]")
    parser.add_option("-t", "--strands", dest="strands", action="store_true", default=False, help="use both strands (if no region set is provided) [format: boolean]")
    parser.add_option("-m", "--max", dest="max", action="store", default=None, type="int", help="max. # reads in a cluster (for Gaussian dist.) [format: int]")
    parser.add_option("-d", "--deviation", dest="deviation", action="store", default=None, type="int", help="deviation around the center of the cluster (for Gaussian dist.) [format: int]")
    parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]")
    (options, args) = parser.parse_args()

    # Reject option combinations the generator cannot work with, instead
    # of failing later with an unrelated error (or writing nothing).
    if options.outputFileName is None:
        parser.error("an output file is required (-o)")
    if options.reference is None and (options.referenceSize is None or options.chromosome is None):
        parser.error("give either a reference file (-r) or both a chromosome size (-S) and a chromosome name (-c)")
    if options.inputFileName is None:
        if options.number is None:
            parser.error("the number of regions is required (-n) when no input file is given")
        if options.size is None:
            if options.minSize is None or options.maxSize is None:
                parser.error("give either a region size (-s) or both a minimum (-z) and a maximum (-Z) size")
            if options.minSize > options.maxSize:
                parser.error("the minimum size (-z) cannot be greater than the maximum size (-Z)")
    elif options.format is None:
        parser.error("the format of the input file is required (-f)")

    rrg = RandomRegionsGenerator(options.verbosity)
    if options.reference is None:
        rrg.setGenomeSize(options.referenceSize)
        rrg.setChromosomeName(options.chromosome)
    else:
        rrg.setInput(options.reference)
    rrg.setOutputFile(options.outputFileName)
    if options.inputFileName is None:
        if options.size is not None:
            rrg.setSize(options.size)
        else:
            rrg.setMinSize(options.minSize)
            rrg.setMaxSize(options.maxSize)
        rrg.setNumber(options.number)
        rrg.setStrands(options.strands)
    else:
        rrg.setAnnotation(options.inputFileName, options.format)
    # Both setters ignore None, so they can be called unconditionally.
    rrg.setMaxDistribution(options.max)
    rrg.setDeviationDistribution(options.deviation)
    rrg.run()
|
|
267
|