smart_toolShed/SMART/Java/Python/mapperAnalyzer.py @ 0:e0f8dcca02ed

Uploaded S-MART tool, a toolbox for managing RNA-Seq and ChIP-Seq data.
author yufei-luo
date Thu, 17 Jan 2013 10:52:14 -0500
#! /usr/bin/env python
#
# Copyright INRA-URGI 2009-2010
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.
#
"""
Read a mapping file (many formats supported) and select some of the mappings.
The mappings must be sorted by read name.
"""
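# Illustrative command line (file names are hypothetical; "seqmap" and "fasta"
# are the default mapping and sequence formats of the option parser below):
#   python mapperAnalyzer.py -i hits.seqmap -f seqmap -q reads.fasta -k fasta -o selected.gff3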
import os, random, shelve
from optparse import OptionParser, OptionGroup
from commons.core.parsing.ParserChooser import ParserChooser
from commons.core.parsing.FastaParser import FastaParser
from commons.core.parsing.FastqParser import FastqParser
from commons.core.parsing.GffParser import GffParser
from commons.core.writer.BedWriter import BedWriter
from commons.core.writer.UcscWriter import UcscWriter
from commons.core.writer.GbWriter import GbWriter
from commons.core.writer.Gff2Writer import Gff2Writer
from commons.core.writer.Gff3Writer import Gff3Writer
from commons.core.writer.FastaWriter import FastaWriter
from commons.core.writer.FastqWriter import FastqWriter
from commons.core.writer.MySqlTranscriptWriter import MySqlTranscriptWriter
from SMART.Java.Python.mySql.MySqlConnection import MySqlConnection
from SMART.Java.Python.mySql.MySqlTable import MySqlTable
from SMART.Java.Python.misc.RPlotter import RPlotter
from SMART.Java.Python.misc.Progress import Progress
from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress

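# Module-level thresholds: exons separated by an intron shorter than
# 'distanceExons' nucleotides may be merged (see --mergeExons), and spliced
# mappings whose shortest exon is below 'exonSize' nucleotides are rejected
# when exon checking is enabled (see --removeExons).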
distanceExons = 20
exonSize = 20

class MapperAnalyzer(object):
    """
    Analyse the output of a mapping parser: filter the mappings, rank them and write them to GFF3
    """

    def __init__(self, verbosity = 0):
        self.verbosity = verbosity
        self.mySqlConnection = MySqlConnection(verbosity)
        self.tooShort = 0
        self.tooManyMismatches = 0
        self.tooManyGaps = 0
        self.tooShortExons = 0
        self.tooManyMappings = 0
        self.nbMappings = 0
        self.nbSequences = 0
        self.nbAlreadyMapped = 0
        self.nbAlreadyMappedSequences = 0
        self.nbWrittenMappings = 0
        self.nbWrittenSequences = 0
        self.parser = None
        self.logHandle = None
        self.randomNumber = random.randint(0, 100000)
        self.gff3Writer = None
        self.alreadyMappedReader = None
        self.unmatchedWriter = None
        self.sequenceListParser = None
        self.sequences = None
        self.alreadyMapped = None
        self.mappedNamesTable = None
        self.minSize = None
        self.minId = None
        self.maxMismatches = None
        self.maxGaps = None
        self.maxMappings = None
        self.merge = False
        self.checkExons = False
        self.suffix = None
        self.tmpDirectory = "%s%s" % (os.environ["SMARTMPPATH"], os.sep) if "SMARTMPPATH" in os.environ else ""

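    # Release temporary resources (shelve files, temporary MySQL table,
    # writers and log handle) when the analyzer is garbage-collected.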
    def __del__(self):
        if self.sequences != None:
            self.sequences.close()
        if self.alreadyMapped != None:
            self.alreadyMapped.close()
        if self.mappedNamesTable != None:
            self.mappedNamesTable.remove()
        if self.gff3Writer != None:
            self.gff3Writer.close()

        if self.logHandle != None:
            self.logHandle.close()

    def setMappingFile(self, fileName, format):
        parserChooser = ParserChooser(self.verbosity)
        parserChooser.findFormat(format, "mapping")
        self.parser = parserChooser.getParser(fileName)


    def setSequenceFile(self, fileName, format):
        if format == "fasta":
            self.sequenceListParser = FastaParser(fileName, self.verbosity)
        elif format == "fastq":
            self.sequenceListParser = FastqParser(fileName, self.verbosity)
        else:
            raise Exception("Do not understand sequence format %s" % (format))


    def setOutputFile(self, fileName, title):
        self.gff3Writer = Gff3Writer(fileName, self.verbosity)
        self.gff3Writer.setTitle(title)


    def setAlreadyMatched(self, fileName):
        self.alreadyMappedReader = GffParser(fileName, self.verbosity)


    def setRemainingFile(self, fileName, format):
        if format == "fasta":
            self.unmatchedWriter = FastaWriter("%s_unmatched.fasta" % (fileName), self.verbosity)
        elif format == "fastq":
            self.unmatchedWriter = FastqWriter("%s_unmatched.fastq" % (fileName), self.verbosity)
        else:
            raise Exception("Do not understand %s format." % (format))
        self.mappedNamesTable = MySqlTable(self.mySqlConnection, "mappedNames_%d" % (self.randomNumber), self.verbosity)
        self.mappedNamesTable.create(["name"], {"name": "char"}, {"name": 50})
        self.mappedNamesTable.createIndex("iNameMapped", ["name", ], True)


    def setLog(self, fileName):
        self.logHandle = open(fileName, "w")


    def setMinSize(self, size):
        self.minSize = size


    def setMinId(self, id):
        self.minId = id


    def setMaxMismatches(self, mismatches):
        self.maxMismatches = mismatches


    def setMaxGaps(self, gaps):
        self.maxGaps = gaps


    def setMaxMappings(self, mappings):
        self.maxMappings = mappings


    def mergeExons(self, b):
        self.merge = b


    def acceptShortExons(self, b):
        self.checkExons = not b


    def countMappings(self):
        self.nbMappings = self.parser.getNbMappings()
        if self.verbosity > 0:
            print "%i matches found" % (self.nbMappings)

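    # Remember the names of the reads already present in the appended GFF3
    # file (option --append), so that they are not processed again.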
    def storeAlreadyMapped(self):
        self.alreadyMapped = shelve.open("%stmpAlreadyMapped_%d" % (self.tmpDirectory, self.randomNumber))
        progress = Progress(self.alreadyMappedReader.getNbTranscripts(), "Reading already mapped reads", self.verbosity)
        self.nbAlreadyMappedSequences = 0
        for transcript in self.alreadyMappedReader.getIterator():
            if not self.alreadyMapped.has_key(transcript.getName()):
                self.alreadyMapped[transcript.getName()] = 1
                self.nbAlreadyMappedSequences += 1
            progress.inc()
        progress.done()
        self.nbAlreadyMapped = self.alreadyMappedReader.getNbTranscripts()

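    # Store the length of every input read in an on-disk shelve, keyed by the
    # first word of the sequence name.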
    def storeSequences(self):
        self.sequences = shelve.open("%stmpSequences_%d" % (self.tmpDirectory, self.randomNumber))
        progress = Progress(self.sequenceListParser.getNbSequences(), "Reading sequences", self.verbosity)
        for sequence in self.sequenceListParser.getIterator():
            self.sequences[sequence.getName().split(" ")[0]] = len(sequence.getSequence())
            self.nbSequences += 1
            progress.inc()
        progress.done()
        if self.verbosity > 0:
            print "%i sequences read" % (self.nbSequences)

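    # Check that the mapping file is grouped by read name: every read name must
    # appear in a single contiguous block, otherwise per-read processing fails.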
    def checkOrder(self):
        names = shelve.open("%stmpNames_%d" % (self.tmpDirectory, self.randomNumber))
        previousName = None
        progress = Progress(self.nbMappings, "Checking mapping file", self.verbosity)
        for mapping in self.parser.getIterator():
            name = mapping.queryInterval.getName()
            if name != previousName and previousName != None:
                if names.has_key(previousName):
                    raise Exception("Error! Input mapping file is not ordered! (Name '%s' occurs at least twice)" % (previousName))
                names[previousName] = 1
            previousName = name
            progress.inc()
        progress.done()
        names.close()


    def checkPreviouslyMapped(self, name):
        if self.alreadyMappedReader == None:
            return False
        return self.alreadyMapped.has_key(name)

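    # Return the original length of a read. Read names in the sequence file may
    # carry a "/1" suffix that the mapper stripped; the first successful lookup
    # decides whether the suffix has to be re-appended for the following reads.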
    def findOriginalSize(self, name):
        alternate = "%s/1" % (name)
        if (self.suffix == None) or (not self.suffix):
            if self.sequences.has_key(name):
                self.suffix = False
                return self.sequences[name]
            if self.suffix == None:
                self.suffix = True
            else:
                raise Exception("Cannot find name %s" % (name))
        if (self.suffix):
            if self.sequences.has_key(alternate):
                return self.sequences[alternate]
        raise Exception("Cannot find name %s" % (name))

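    # Apply the per-mapping filters (relative size, identity, mismatches, gaps,
    # exon size) and log the reason whenever a mapping is rejected.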
    def checkErrors(self, mapping):
        accepted = True
        # short size
        if self.minSize != None and mapping.size * 100 < self.minSize * mapping.queryInterval.size:
            self.tooShort += 1
            accepted = False
            if self.logHandle != None:
                self.logHandle.write("size of mapping %s is too short (%i instead of %i)\n" % (str(mapping), mapping.size, mapping.queryInterval.size))
        # low identity
        if self.minId != None and mapping.getTagValue("identity") < self.minId:
            self.tooManyMismatches += 1
            accepted = False
            if self.logHandle != None:
                self.logHandle.write("mapping %s has a low identity rate\n" % (str(mapping)))
        # too many mismatches
        if self.maxMismatches != None and mapping.getTagValue("nbMismatches") > self.maxMismatches:
            self.tooManyMismatches += 1
            accepted = False
            if self.logHandle != None:
                self.logHandle.write("mapping %s has more mismatches than %i\n" % (str(mapping), self.maxMismatches))
        # too many gaps
        if self.maxGaps != None and mapping.getTagValue("nbGaps") > self.maxGaps:
            self.tooManyGaps += 1
            accepted = False
            if self.logHandle != None:
                self.logHandle.write("mapping %s has more gaps than %i\n" % (str(mapping), self.maxGaps))
        # short exons
        if self.checkExons and len(mapping.subMappings) > 1 and min([subMapping.targetInterval.getSize() for subMapping in mapping.subMappings]) < exonSize:
            self.tooShortExons += 1
            accepted = False
            if self.logHandle != None:
                self.logHandle.write("sequence %s maps as too short exons\n" % (mapping))
        return accepted

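    # A read is kept only if its total number of reported hits (using the
    # "nbOccurrences" tag when the mapper provides it) does not exceed the
    # --number threshold.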
    def checkNbMappings(self, mappings):
        nbOccurrences = 0
        for mapping in mappings:
            nbOccurrences += 1 if "nbOccurrences" not in mapping.getTagNames() else mapping.getTagValue("nbOccurrences")
        if (self.maxMappings != None and nbOccurrences > self.maxMappings):
            self.tooManyMappings += 1
            if self.logHandle != None:
                self.logHandle.write("sequence %s maps %i times\n" % (mappings[0].queryInterval.getName(), nbOccurrences))
            return False
        return (nbOccurrences > 0)

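    # Sort the mappings of one read by increasing error score, number them
    # (occurrence, rank), mark equally scored hits as ties and record the
    # region of the best hit on the secondary ones.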
    def sortMappings(self, mappings):
        nbOccurrences = 0
        for mapping in mappings:
            nbOccurrences += 1 if "nbOccurrences" not in mapping.getTagNames() else mapping.getTagValue("nbOccurrences")

        orderedMappings = sorted(mappings, key = lambda mapping: mapping.getErrorScore())
        cpt = 1
        rank = 1
        previousMapping = None
        previousScore = None
        wasLastTie = False
        rankedMappings = []
        bestRegion = "%s:%d-%d" % (orderedMappings[0].targetInterval.getChromosome(), orderedMappings[0].targetInterval.getStart(), orderedMappings[0].targetInterval.getEnd())
        for mapping in orderedMappings:
            mapping.setNbOccurrences(nbOccurrences)
            mapping.setOccurrence(cpt)

            score = mapping.getErrorScore()
            if previousScore != None and previousScore == score:
                if "Rank" in previousMapping.getTagNames():
                    if not wasLastTie:
                        previousMapping.setRank("%sTie" % (rank))
                    mapping.setRank("%sTie" % (rank))
                    wasLastTie = True
            else:
                rank = cpt
                mapping.setRank(rank)
                wasLastTie = False
            if cpt != 1:
                mapping.setBestRegion(bestRegion)

            rankedMappings.append(mapping)
            previousMapping = mapping
            previousScore = score
            cpt += 1
        return rankedMappings

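    # Process all the mappings of a single read: optionally merge close exons,
    # restore the original read size, filter each hit, then (if the read is
    # kept) register its name and write the surviving hits to the GFF3 output.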
    def processMappings(self, mappings):
        if not mappings:
            return
        selectedMappings = []
        name = mappings[0].queryInterval.getName()
        size = self.findOriginalSize(name)
        for mapping in mappings:
            if self.merge:
                mapping.mergeExons(distanceExons)
            mapping.queryInterval.size = size
            if self.checkErrors(mapping):
                selectedMappings.append(mapping)

        if self.checkNbMappings(selectedMappings):
            if self.unmatchedWriter != None:
                query = self.mySqlConnection.executeQuery("INSERT INTO %s (name) VALUES ('%s')" % (self.mappedNamesTable.name, name if not self.suffix else "%s/1" % (name)))
            self.nbWrittenSequences += 1
            mappings = self.sortMappings(selectedMappings)
            for mapping in mappings:
                self.nbWrittenMappings += 1
                self.gff3Writer.addTranscript(mapping.getTranscript())

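    # Read the mapping file a second time, group consecutive hits that share
    # the same read name, skip reads already present in the appended file, and
    # hand each group over to processMappings().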
    def readMappings(self):
        previousQueryName = None
        mappings = []
        self.parser.reset()
        progress = Progress(self.nbMappings, "Reading mappings", self.verbosity)
        for mapping in self.parser.getIterator():
            queryName = mapping.queryInterval.getName().split(" ")[0]
            if self.checkPreviouslyMapped(queryName):
                if self.logHandle != None:
                    self.logHandle.write("Mapping %s has already been mapped.\n" % (queryName))
            else:
                if previousQueryName == queryName:
                    mappings.append(mapping)
                else:
                    if previousQueryName != None:
                        self.processMappings(mappings)
                    previousQueryName = queryName
                    mappings = [mapping, ]
            progress.inc()
        self.processMappings(mappings)
        self.gff3Writer.write()
        self.gff3Writer.close()
        progress.done()

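    # Write the sequences whose names were never registered in the mapped-names
    # table, i.e. the reads for which no mapping was kept (option --remaining).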
    def writeUnmatched(self):
        progress = Progress(self.nbSequences, "Reading unmatched sequences", self.verbosity)
        for sequence in self.sequenceListParser.getIterator():
            name = sequence.getName().split(" ")[0]
            query = self.mySqlConnection.executeQuery("SELECT * FROM %s WHERE name = '%s' LIMIT 1" % (self.mappedNamesTable.name, name))
            if query.isEmpty():
                self.unmatchedWriter.addSequence(sequence)
            progress.inc()
        progress.done()

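    # Whole pipeline: count the mappings, check the file order, load the read
    # lengths, optionally load the already mapped reads, filter and write the
    # mappings, and finally dump the unmatched sequences if requested.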
    def analyze(self):
        self.countMappings()
        self.checkOrder()
        self.storeSequences()
        if self.alreadyMappedReader != None:
            self.storeAlreadyMapped()
        self.readMappings()
        if self.unmatchedWriter != None:
            self.writeUnmatched()



if __name__ == "__main__":

    # parse command line
    description = "Mapper Analyzer v1.0.1: Read the output of an aligner, print statistics and possibly translate into BED or GBrowse formats. [Category: Conversion]"

    parser = OptionParser(description = description)
    compGroup = OptionGroup(parser, "Compulsory options")
    filtGroup = OptionGroup(parser, "Filtering options")
    tranGroup = OptionGroup(parser, "Transformation options")
    outpGroup = OptionGroup(parser, "Output options")
    otheGroup = OptionGroup(parser, "Other options")
    compGroup.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file (output of the tool) [compulsory] [format: file in mapping format given by -f]")
    compGroup.add_option("-f", "--format", dest="format", action="store", default="seqmap", type="string", help="format of the file [compulsory] [format: mapping file format]")
    compGroup.add_option("-q", "--sequences", dest="sequencesFileName", action="store", type="string", help="file of the sequences [compulsory] [format: file in sequence format given by -k]")
    compGroup.add_option("-k", "--seqFormat", dest="sequenceFormat", action="store", default="fasta", type="string", help="format of the sequences: fasta or fastq [default: fasta] [format: sequence file format]")
    compGroup.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in GFF3 format]")
    filtGroup.add_option("-n", "--number", dest="number", action="store", default=None, type="int", help="max. number of occurrences of a sequence [format: int]")
    filtGroup.add_option("-s", "--size", dest="size", action="store", default=None, type="int", help="minimum percentage of size [format: int]")
    filtGroup.add_option("-d", "--identity", dest="identity", action="store", default=None, type="int", help="minimum percentage of identity [format: int]")
    filtGroup.add_option("-m", "--mismatch", dest="mismatch", action="store", default=None, type="int", help="maximum number of mismatches [format: int]")
    filtGroup.add_option("-p", "--gap", dest="gap", action="store", default=None, type="int", help="maximum number of gaps [format: int]")
    tranGroup.add_option("-e", "--mergeExons", dest="mergeExons", action="store_true", default=False, help="merge exons when introns are short [format: bool] [default: false]")
    tranGroup.add_option("-x", "--removeExons", dest="removeExons", action="store_true", default=False, help="remove transcripts when exons are short [format: bool] [default: false]")
    outpGroup.add_option("-t", "--title", dest="title", action="store", default="SMART", type="string", help="title of the UCSC track [format: string] [default: SMART]")
    outpGroup.add_option("-r", "--remaining", dest="remaining", action="store_true", default=False, help="print the unmatched sequences [format: bool] [default: false]")
    otheGroup.add_option("-a", "--append", dest="appendFileName", action="store", default=None, type="string", help="append to GFF3 file [format: file in GFF3 format]")
    otheGroup.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [default: 1] [format: int]")
    otheGroup.add_option("-l", "--log", dest="log", action="store_true", default=False, help="write a log file [format: bool] [default: false]")
    parser.add_option_group(compGroup)
    parser.add_option_group(filtGroup)
    parser.add_option_group(tranGroup)
    parser.add_option_group(outpGroup)
    parser.add_option_group(otheGroup)
    (options, args) = parser.parse_args()


    analyzer = MapperAnalyzer(options.verbosity)
    analyzer.setMappingFile(options.inputFileName, options.format)
    analyzer.setSequenceFile(options.sequencesFileName, options.sequenceFormat)
    analyzer.setOutputFile(options.outputFileName, options.title)
    if options.appendFileName != None:
        analyzer.setAlreadyMatched(options.appendFileName)
    if options.remaining:
        analyzer.setRemainingFile(options.outputFileName, options.sequenceFormat)
    if options.number != None:
        analyzer.setMaxMappings(options.number)
    if options.size != None:
        analyzer.setMinSize(options.size)
    if options.identity != None:
        analyzer.setMinId(options.identity)
    if options.mismatch != None:
        analyzer.setMaxMismatches(options.mismatch)
    if options.gap != None:
        analyzer.setMaxGaps(options.gap)
    if options.mergeExons:
        analyzer.mergeExons(True)
    if options.removeExons:
        analyzer.acceptShortExons(False)
    if options.log:
        analyzer.setLog("%s.log" % (options.outputFileName))
    analyzer.analyze()

    if options.verbosity > 0:
        print "kept %i sequences over %i (%f%%)" % (analyzer.nbWrittenSequences, analyzer.nbSequences, float(analyzer.nbWrittenSequences) / analyzer.nbSequences * 100)
        if options.appendFileName != None:
            print "kept %i sequences over %i (%f%%) including already mapped sequences" % (analyzer.nbWrittenSequences + analyzer.nbAlreadyMappedSequences, analyzer.nbSequences, float(analyzer.nbWrittenSequences + analyzer.nbAlreadyMappedSequences) / analyzer.nbSequences * 100)
        print "kept %i mappings over %i (%f%%)" % (analyzer.nbWrittenMappings, analyzer.nbMappings, float(analyzer.nbWrittenMappings) / analyzer.nbMappings * 100)
        if options.appendFileName != None:
            print "kept %i mappings over %i (%f%%) including already mapped" % (analyzer.nbWrittenMappings + analyzer.nbAlreadyMapped, analyzer.nbMappings, float(analyzer.nbWrittenMappings + analyzer.nbAlreadyMapped) / analyzer.nbMappings * 100)
        print "removed %i too short mappings (%f%%)" % (analyzer.tooShort, float(analyzer.tooShort) / analyzer.nbMappings * 100)
        print "removed %i mappings with too many mismatches (%f%%)" % (analyzer.tooManyMismatches, float(analyzer.tooManyMismatches) / analyzer.nbMappings * 100)
        print "removed %i mappings with too many gaps (%f%%)" % (analyzer.tooManyGaps, float(analyzer.tooManyGaps) / analyzer.nbMappings * 100)
        print "removed %i mappings with too short exons (%f%%)" % (analyzer.tooShortExons, float(analyzer.tooShortExons) / analyzer.nbMappings * 100)
        print "removed %i sequences with too many hits (%f%%)" % (analyzer.tooManyMappings, float(analyzer.tooManyMappings) / analyzer.nbSequences * 100)
        print "%i sequences have no mapping (%f%%)" % (analyzer.nbSequences - analyzer.nbWrittenSequences, float(analyzer.nbSequences - analyzer.nbWrittenSequences) / analyzer.nbSequences * 100)
        if options.appendFileName != None:
            print "%i sequences have no mapping (%f%%) excluding already mapped sequences" % (analyzer.nbSequences - analyzer.nbWrittenSequences - analyzer.nbAlreadyMappedSequences, float(analyzer.nbSequences - analyzer.nbWrittenSequences - analyzer.nbAlreadyMappedSequences) / analyzer.nbSequences * 100)