comparison SMART/Java/Python/mapperAnalyzer.py @ 6:769e306b7933

Change the repository level.
author yufei-luo
date Fri, 18 Jan 2013 04:54:14 -0500
#! /usr/bin/env python
#
# Copyright INRA-URGI 2009-2010
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.
#
31 """
32 Read a mapping file (many formats supported) and select some of them
33 Mappings should be sorted by read names
34 """
import os, random, shelve
from optparse import OptionParser, OptionGroup
from commons.core.parsing.ParserChooser import ParserChooser
from commons.core.parsing.FastaParser import FastaParser
from commons.core.parsing.FastqParser import FastqParser
from commons.core.parsing.GffParser import GffParser
from commons.core.writer.BedWriter import BedWriter
from commons.core.writer.UcscWriter import UcscWriter
from commons.core.writer.GbWriter import GbWriter
from commons.core.writer.Gff2Writer import Gff2Writer
from commons.core.writer.Gff3Writer import Gff3Writer
from commons.core.writer.FastaWriter import FastaWriter
from commons.core.writer.FastqWriter import FastqWriter
from commons.core.writer.MySqlTranscriptWriter import MySqlTranscriptWriter
from SMART.Java.Python.mySql.MySqlConnection import MySqlConnection
from SMART.Java.Python.mySql.MySqlTable import MySqlTable
from SMART.Java.Python.misc.RPlotter import RPlotter
from SMART.Java.Python.misc.Progress import Progress
from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress


distanceExons = 20
exonSize = 20


class MapperAnalyzer(object):
    """
    Analyse the output of a parser
    """

    def __init__(self, verbosity = 0):
        self.verbosity = verbosity
        self.mySqlConnection = MySqlConnection(verbosity)
        self.tooShort = 0
        self.tooManyMismatches = 0
        self.tooManyGaps = 0
        self.tooShortExons = 0
        self.tooManyMappings = 0
        self.nbMappings = 0
        self.nbSequences = 0
        self.nbAlreadyMapped = 0
        self.nbAlreadyMappedSequences = 0
        self.nbWrittenMappings = 0
        self.nbWrittenSequences = 0
        self.parser = None
        self.logHandle = None
        self.randomNumber = random.randint(0, 100000)
        self.gff3Writer = None
        self.alreadyMappedReader = None
        self.unmatchedWriter = None
        self.sequenceListParser = None
        self.sequences = None
        self.alreadyMapped = None
        self.mappedNamesTable = None
        self.minSize = None
        self.minId = None
        self.maxMismatches = None
        self.maxGaps = None
        self.maxMappings = None
        self.merge = False
        self.checkExons = False
        self.suffix = None
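        # Temporary shelve files are prefixed with $SMARTMPPATH when that
        # environment variable is set; otherwise they are created in the
        # current working directory.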
        self.tmpDirectory = "%s%s" % (os.environ["SMARTMPPATH"], os.sep) if "SMARTMPPATH" in os.environ else ""


    def __del__(self):
        if self.sequences != None:
            self.sequences.close()
        if self.alreadyMapped != None:
            self.alreadyMapped.close()
        if self.mappedNamesTable != None:
            self.mappedNamesTable.remove()
        if self.gff3Writer != None:
            self.gff3Writer.close()

        if self.logHandle != None:
            self.logHandle.close()


    def setMappingFile(self, fileName, format):
        parserChooser = ParserChooser(self.verbosity)
        parserChooser.findFormat(format, "mapping")
        self.parser = parserChooser.getParser(fileName)


    def setSequenceFile(self, fileName, format):
        if format == "fasta":
            self.sequenceListParser = FastaParser(fileName, self.verbosity)
        elif format == "fastq":
            self.sequenceListParser = FastqParser(fileName, self.verbosity)
        else:
            raise Exception("Do not understand sequence format %s" % (format))


    def setOutputFile(self, fileName, title):
        self.gff3Writer = Gff3Writer(fileName, self.verbosity)
        self.gff3Writer.setTitle(title)


    def setAlreadyMatched(self, fileName):
        self.alreadyMappedReader = GffParser(fileName, self.verbosity)


    def setRemainingFile(self, fileName, format):
        if format == "fasta":
            self.unmatchedWriter = FastaWriter("%s_unmatched.fasta" % (fileName), self.verbosity)
        elif format == "fastq":
            self.unmatchedWriter = FastqWriter("%s_unmatched.fastq" % (fileName), self.verbosity)
        else:
            raise Exception("Do not understand %s format." % (format))
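        # Record the name of every read that obtains at least one accepted
        # mapping, so that writeUnmatched() can later output the reads that
        # never appear in this table.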
        self.mappedNamesTable = MySqlTable(self.mySqlConnection, "mappedNames_%d" % (self.randomNumber), self.verbosity)
        self.mappedNamesTable.create(["name"], {"name": "char"}, {"name": 50})
        self.mappedNamesTable.createIndex("iNameMapped", ["name", ], True)


    def setLog(self, fileName):
        self.logHandle = open(fileName, "w")


    def setMinSize(self, size):
        self.minSize = size


    def setMinId(self, id):
        self.minId = id


    def setMaxMismatches(self, mismatches):
        self.maxMismatches = mismatches


    def setMaxGaps(self, gaps):
        self.maxGaps = gaps


    def setMaxMappings(self, mappings):
        self.maxMappings = mappings


    def mergeExons(self, b):
        self.merge = b


    def acceptShortExons(self, b):
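        # Accepting short exons disables the short-exon filter applied in
        # checkErrors().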
        self.checkExons = not b


    def countMappings(self):
        self.nbMappings = self.parser.getNbMappings()
        if self.verbosity > 0:
            print "%i matches found" % (self.nbMappings)


    def storeAlreadyMapped(self):
        self.alreadyMapped = shelve.open("%stmpAlreadyMapped_%d" % (self.tmpDirectory, self.randomNumber))
        progress = Progress(self.alreadyMappedReader.getNbTranscripts(), "Reading already mapped reads", self.verbosity)
        self.nbAlreadyMappedSequences = 0
        for transcript in self.alreadyMappedReader.getIterator():
            if not self.alreadyMapped.has_key(transcript.getName()):
                self.alreadyMapped[transcript.getName()] = 1
                self.nbAlreadyMappedSequences += 1
            progress.inc()
        progress.done()
        self.nbAlreadyMapped = self.alreadyMappedReader.getNbTranscripts()


    def storeSequences(self):
        self.sequences = shelve.open("%stmpSequences_%d" % (self.tmpDirectory, self.randomNumber))
        progress = Progress(self.sequenceListParser.getNbSequences(), "Reading sequences", self.verbosity)
        for sequence in self.sequenceListParser.getIterator():
            self.sequences[sequence.getName().split(" ")[0]] = len(sequence.getSequence())
            self.nbSequences += 1
            progress.inc()
        progress.done()
        if self.verbosity > 0:
            print "%i sequences read" % (self.nbSequences)


    def checkOrder(self):
        names = shelve.open("%stmpNames_%d" % (self.tmpDirectory, self.randomNumber))
        previousName = None
        progress = Progress(self.nbMappings, "Checking mapping file", self.verbosity)
        for mapping in self.parser.getIterator():
            name = mapping.queryInterval.getName()
            if name != previousName and previousName != None:
                if names.has_key(previousName):
                    raise Exception("Error! Input mapping file is not ordered! (Name '%s' occurs at least twice)" % (previousName))
                names[previousName] = 1
            previousName = name
            progress.inc()
        progress.done()
        names.close()


    def checkPreviouslyMapped(self, name):
        if self.alreadyMappedReader == None:
            return False
        return self.alreadyMapped.has_key(name)


    def findOriginalSize(self, name):
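        # Some aligners append a "/1" suffix to read names; whether the
        # suffix is needed is decided on the first successful lookup and
        # cached in self.suffix.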
        alternate = "%s/1" % (name)
        if (self.suffix == None) or (not self.suffix):
            if self.sequences.has_key(name):
                self.suffix = False
                return self.sequences[name]
            if self.suffix == None:
                self.suffix = True
            else:
                raise Exception("Cannot find name %s" % (name))
        if (self.suffix):
            if self.sequences.has_key(alternate):
                return self.sequences[alternate]
        raise Exception("Cannot find name %s" % (name))


    def checkErrors(self, mapping):
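        # Apply each configured filter in turn; a rejected mapping is still
        # checked against the remaining filters, so every failure is counted
        # and logged. The size filter compares percentages: the mapping is
        # rejected when its aligned size is less than minSize% of the read
        # length.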
        accepted = True
        # short size
        if self.minSize != None and mapping.size * 100 < self.minSize * mapping.queryInterval.size:
            self.tooShort += 1
            accepted = False
            if self.logHandle != None:
                self.logHandle.write("size of mapping %s is too short (%i instead of %i)\n" % (str(mapping), mapping.size, mapping.queryInterval.size))
        # low identity
        if self.minId != None and mapping.getTagValue("identity") < self.minId:
            self.tooManyMismatches += 1
            accepted = False
            if self.logHandle != None:
                self.logHandle.write("mapping %s has a low identity rate\n" % (str(mapping)))
        # too many mismatches
        if self.maxMismatches != None and mapping.getTagValue("nbMismatches") > self.maxMismatches:
            self.tooManyMismatches += 1
            accepted = False
            if self.logHandle != None:
                self.logHandle.write("mapping %s has more mismatches than %i\n" % (str(mapping), self.maxMismatches))
        # too many gaps
        if self.maxGaps != None and mapping.getTagValue("nbGaps") > self.maxGaps:
            self.tooManyGaps += 1
            accepted = False
            if self.logHandle != None:
                self.logHandle.write("mapping %s has more gaps than %i\n" % (str(mapping), self.maxGaps))
        # short exons
        if self.checkExons and len(mapping.subMappings) > 1 and min([subMapping.targetInterval.getSize() for subMapping in mapping.subMappings]) < exonSize:
            self.tooShortExons += 1
            accepted = False
            if self.logHandle != None:
                self.logHandle.write("sequence %s maps as too short exons\n" % (mapping))
        return accepted


    def checkNbMappings(self, mappings):
        nbOccurrences = 0
        for mapping in mappings:
            nbOccurrences += 1 if "nbOccurrences" not in mapping.getTagNames() else mapping.getTagValue("nbOccurrences")
        if (self.maxMappings != None and nbOccurrences > self.maxMappings):
            self.tooManyMappings += 1
            if self.logHandle != None:
                self.logHandle.write("sequence %s maps %i times\n" % (mappings[0].queryInterval.getName(), nbOccurrences))
            return False
        return (nbOccurrences > 0)


    def sortMappings(self, mappings):
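        # Sort the mappings of one read by increasing error score, then
        # number them: the best mapping gets rank 1, mappings with equal
        # scores share a rank suffixed with "Tie", and every mapping except
        # the best one is tagged with the region of the best hit.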
        nbOccurrences = 0
        for mapping in mappings:
            nbOccurrences += 1 if "nbOccurrences" not in mapping.getTagNames() else mapping.getTagValue("nbOccurrences")

        orderedMappings = sorted(mappings, key = lambda mapping: mapping.getErrorScore())
        cpt = 1
        rank = 1
        previousMapping = None
        previousScore = None
        wasLastTie = False
        rankedMappings = []
        bestRegion = "%s:%d-%d" % (orderedMappings[0].targetInterval.getChromosome(), orderedMappings[0].targetInterval.getStart(), orderedMappings[0].targetInterval.getEnd())
        for mapping in orderedMappings:
            mapping.setNbOccurrences(nbOccurrences)
            mapping.setOccurrence(cpt)

            score = mapping.getErrorScore()
            if previousScore != None and previousScore == score:
                if "Rank" in previousMapping.getTagNames():
                    if not wasLastTie:
                        previousMapping.setRank("%sTie" % (rank))
                mapping.setRank("%sTie" % (rank))
                wasLastTie = True
            else:
                rank = cpt
                mapping.setRank(rank)
                wasLastTie = False
            if cpt != 1:
                mapping.setBestRegion(bestRegion)

            rankedMappings.append(mapping)
            previousMapping = mapping
            previousScore = score
            cpt += 1
        return rankedMappings


    def processMappings(self, mappings):
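        # Restore the original read length, apply the filters, and, if the
        # read still has an acceptable number of mappings, rank them and
        # write them to the GFF3 output (also recording the read name when
        # the "remaining" output is requested).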
        if not mappings:
            return
        selectedMappings = []
        name = mappings[0].queryInterval.getName()
        size = self.findOriginalSize(name)
        for mapping in mappings:
            if self.merge:
                mapping.mergeExons(distanceExons)
            mapping.queryInterval.size = size
            if self.checkErrors(mapping):
                selectedMappings.append(mapping)

        if self.checkNbMappings(selectedMappings):
            if self.unmatchedWriter != None:
                query = self.mySqlConnection.executeQuery("INSERT INTO %s (name) VALUES ('%s')" % (self.mappedNamesTable.name, name if not self.suffix else "%s/1" % (name)))
            self.nbWrittenSequences += 1
            mappings = self.sortMappings(selectedMappings)
            for mapping in mappings:
                self.nbWrittenMappings += 1
                self.gff3Writer.addTranscript(mapping.getTranscript())


    def readMappings(self):
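        # Mappings are read sequentially; consecutive mappings that share the
        # same query name are grouped and handed to processMappings() as one
        # batch, which is why the input file must be sorted by read name.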
        previousQueryName = None
        mappings = []
        self.parser.reset()
        progress = Progress(self.nbMappings, "Reading mappings", self.verbosity)
        for mapping in self.parser.getIterator():
            queryName = mapping.queryInterval.getName().split(" ")[0]
            if self.checkPreviouslyMapped(queryName):
                if self.logHandle != None:
                    self.logHandle.write("Mapping %s has already been mapped.\n" % (queryName))
            else:
                if previousQueryName == queryName:
                    mappings.append(mapping)
                else:
                    if previousQueryName != None:
                        self.processMappings(mappings)
                    previousQueryName = queryName
                    mappings = [mapping, ]
            progress.inc()
        self.processMappings(mappings)
        self.gff3Writer.write()
        self.gff3Writer.close()
        progress.done()


    def writeUnmatched(self):
        progress = Progress(self.nbSequences, "Reading unmatched sequences", self.verbosity)
        for sequence in self.sequenceListParser.getIterator():
            name = sequence.getName().split(" ")[0]
            query = self.mySqlConnection.executeQuery("SELECT * FROM %s WHERE name = '%s' LIMIT 1" % (self.mappedNamesTable.name, name))
            if query.isEmpty():
                self.unmatchedWriter.addSequence(sequence)
            progress.inc()
        progress.done()


    def analyze(self):
        self.countMappings()
        self.checkOrder()
        self.storeSequences()
        if self.alreadyMappedReader != None:
            self.storeAlreadyMapped()
        self.readMappings()
        if self.unmatchedWriter != None:
            self.writeUnmatched()



if __name__ == "__main__":

    # parse command line
    description = "Mapper Analyzer v1.0.1: Read the output of an aligner, print statistics and possibly translate into BED or GBrowse formats. [Category: Conversion]"

    parser = OptionParser(description = description)
    compGroup = OptionGroup(parser, "Compulsory options")
    filtGroup = OptionGroup(parser, "Filtering options")
    tranGroup = OptionGroup(parser, "Transformation options")
    outpGroup = OptionGroup(parser, "Output options")
    otheGroup = OptionGroup(parser, "Other options")
    compGroup.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file (output of the aligner) [compulsory] [format: file in mapping format given by -f]")
    compGroup.add_option("-f", "--format", dest="format", action="store", default="seqmap", type="string", help="format of the file [compulsory] [format: mapping file format]")
    compGroup.add_option("-q", "--sequences", dest="sequencesFileName", action="store", type="string", help="file of the sequences [compulsory] [format: file in sequence format given by -k]")
    compGroup.add_option("-k", "--seqFormat", dest="sequenceFormat", action="store", default="fasta", type="string", help="format of the sequences: fasta or fastq [default: fasta] [format: sequence file format]")
    compGroup.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in GFF3 format]")
    filtGroup.add_option("-n", "--number", dest="number", action="store", default=None, type="int", help="max. number of occurrences of a sequence [format: int]")
    filtGroup.add_option("-s", "--size", dest="size", action="store", default=None, type="int", help="minimum percentage of size [format: int]")
    filtGroup.add_option("-d", "--identity", dest="identity", action="store", default=None, type="int", help="minimum percentage of identity [format: int]")
    filtGroup.add_option("-m", "--mismatch", dest="mismatch", action="store", default=None, type="int", help="maximum number of mismatches [format: int]")
    filtGroup.add_option("-p", "--gap", dest="gap", action="store", default=None, type="int", help="maximum number of gaps [format: int]")
    tranGroup.add_option("-e", "--mergeExons", dest="mergeExons", action="store_true", default=False, help="merge exons when introns are short [format: bool] [default: false]")
    tranGroup.add_option("-x", "--removeExons", dest="removeExons", action="store_true", default=False, help="remove transcripts when exons are short [format: bool] [default: false]")
    outpGroup.add_option("-t", "--title", dest="title", action="store", default="SMART", type="string", help="title of the UCSC track [format: string] [default: SMART]")
    outpGroup.add_option("-r", "--remaining", dest="remaining", action="store_true", default=False, help="print the unmatched sequences [format: bool] [default: false]")
    otheGroup.add_option("-a", "--append", dest="appendFileName", action="store", default=None, type="string", help="append to GFF3 file [format: file in GFF3 format]")
    otheGroup.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [default: 1] [format: int]")
    otheGroup.add_option("-l", "--log", dest="log", action="store_true", default=False, help="write a log file [format: bool] [default: false]")
    parser.add_option_group(compGroup)
    parser.add_option_group(filtGroup)
    parser.add_option_group(tranGroup)
    parser.add_option_group(outpGroup)
    parser.add_option_group(otheGroup)
    (options, args) = parser.parse_args()


    analyzer = MapperAnalyzer(options.verbosity)
    analyzer.setMappingFile(options.inputFileName, options.format)
    analyzer.setSequenceFile(options.sequencesFileName, options.sequenceFormat)
    analyzer.setOutputFile(options.outputFileName, options.title)
    if options.appendFileName != None:
        analyzer.setAlreadyMatched(options.appendFileName)
    if options.remaining:
        analyzer.setRemainingFile(options.outputFileName, options.sequenceFormat)
    if options.number != None:
        analyzer.setMaxMappings(options.number)
    if options.size != None:
        analyzer.setMinSize(options.size)
    if options.identity != None:
        analyzer.setMinId(options.identity)
    if options.mismatch != None:
        analyzer.setMaxMismatches(options.mismatch)
    if options.gap != None:
        analyzer.setMaxGaps(options.gap)
    if options.mergeExons:
        analyzer.mergeExons(True)
    if options.removeExons:
        analyzer.acceptShortExons(False)
    if options.log:
        analyzer.setLog("%s.log" % (options.outputFileName))
    analyzer.analyze()

    if options.verbosity > 0:
        print "kept %i sequences over %i (%f%%)" % (analyzer.nbWrittenSequences, analyzer.nbSequences, float(analyzer.nbWrittenSequences) / analyzer.nbSequences * 100)
        if options.appendFileName != None:
            print "kept %i sequences over %i (%f%%) including already mapped sequences" % (analyzer.nbWrittenSequences + analyzer.nbAlreadyMappedSequences, analyzer.nbSequences, float(analyzer.nbWrittenSequences + analyzer.nbAlreadyMappedSequences) / analyzer.nbSequences * 100)
        print "kept %i mappings over %i (%f%%)" % (analyzer.nbWrittenMappings, analyzer.nbMappings, float(analyzer.nbWrittenMappings) / analyzer.nbMappings * 100)
        if options.appendFileName != None:
            print "kept %i mappings over %i (%f%%) including already mapped" % (analyzer.nbWrittenMappings + analyzer.nbAlreadyMapped, analyzer.nbMappings, float(analyzer.nbWrittenMappings + analyzer.nbAlreadyMapped) / analyzer.nbMappings * 100)
        print "removed %i too short mappings (%f%%)" % (analyzer.tooShort, float(analyzer.tooShort) / analyzer.nbMappings * 100)
        print "removed %i mappings with too many mismatches (%f%%)" % (analyzer.tooManyMismatches, float(analyzer.tooManyMismatches) / analyzer.nbMappings * 100)
        print "removed %i mappings with too many gaps (%f%%)" % (analyzer.tooManyGaps, float(analyzer.tooManyGaps) / analyzer.nbMappings * 100)
        print "removed %i mappings with too short exons (%f%%)" % (analyzer.tooShortExons, float(analyzer.tooShortExons) / analyzer.nbMappings * 100)
        print "removed %i sequences with too many hits (%f%%)" % (analyzer.tooManyMappings, float(analyzer.tooManyMappings) / analyzer.nbSequences * 100)
        print "%i sequences have no mapping (%f%%)" % (analyzer.nbSequences - analyzer.nbWrittenSequences, float(analyzer.nbSequences - analyzer.nbWrittenSequences) / analyzer.nbSequences * 100)
        if options.appendFileName != None:
            print "%i sequences have no mapping (%f%%) excluding already mapped sequences" % (analyzer.nbSequences - analyzer.nbWrittenSequences - analyzer.nbAlreadyMappedSequences, float(analyzer.nbSequences - analyzer.nbWrittenSequences - analyzer.nbAlreadyMappedSequences) / analyzer.nbSequences * 100)