#! /usr/bin/env python
#
# Copyright INRA-URGI 2009-2010
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.
#
"""
Read a mapping file (many formats are supported) and select some of the mappings.
Mappings should be sorted by read name.
"""
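
# Example invocation (a sketch only: the file names below are hypothetical,
# while the options themselves are defined in the __main__ block at the end
# of this script):
#
#   python mapperAnalyzer.py -i mappings.seqmap -f seqmap \
#       -q reads.fastq -k fastq -o selected.gff3 -n 1 -d 90
#
# This keeps, for each read of reads.fastq, the mappings of mappings.seqmap
# that pass the filters (here at most 1 hit and at least 90% identity) and
# writes them to selected.gff3.
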
import os, random, shelve
from optparse import OptionParser, OptionGroup
from commons.core.parsing.ParserChooser import ParserChooser
from commons.core.parsing.FastaParser import FastaParser
from commons.core.parsing.FastqParser import FastqParser
from commons.core.parsing.GffParser import GffParser
from commons.core.writer.BedWriter import BedWriter
from commons.core.writer.UcscWriter import UcscWriter
from commons.core.writer.GbWriter import GbWriter
from commons.core.writer.Gff2Writer import Gff2Writer
from commons.core.writer.Gff3Writer import Gff3Writer
from commons.core.writer.FastaWriter import FastaWriter
from commons.core.writer.FastqWriter import FastqWriter
from commons.core.writer.MySqlTranscriptWriter import MySqlTranscriptWriter
from SMART.Java.Python.mySql.MySqlConnection import MySqlConnection
from SMART.Java.Python.mySql.MySqlTable import MySqlTable
from SMART.Java.Python.misc.RPlotter import RPlotter
from SMART.Java.Python.misc.Progress import Progress
from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress


distanceExons = 20
exonSize      = 20


class MapperAnalyzer(object):
    """
    Analyse the mappings read by a parser and keep those which pass the filters
    """
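
    # Typical programmatic use (a minimal sketch mirroring the __main__ block
    # at the bottom of this file; the file names are hypothetical):
    #
    #   analyzer = MapperAnalyzer(verbosity = 1)
    #   analyzer.setMappingFile("mappings.seqmap", "seqmap")
    #   analyzer.setSequenceFile("reads.fastq", "fastq")
    #   analyzer.setOutputFile("selected.gff3", "SMART")
    #   analyzer.analyze()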

    def __init__(self, verbosity = 0):
        self.verbosity                = verbosity
        self.mySqlConnection          = MySqlConnection(verbosity)
        self.tooShort                 = 0
        self.tooManyMismatches        = 0
        self.tooManyGaps              = 0
        self.tooShortExons            = 0
        self.tooManyMappings          = 0
        self.nbMappings               = 0
        self.nbSequences              = 0
        self.nbAlreadyMapped          = 0
        self.nbAlreadyMappedSequences = 0
        self.nbWrittenMappings        = 0
        self.nbWrittenSequences       = 0
        self.parser                   = None
        self.logHandle                = None
        self.randomNumber             = random.randint(0, 100000)
        self.gff3Writer               = None
        self.alreadyMappedReader      = None
        self.unmatchedWriter          = None
        self.sequenceListParser       = None
        self.sequences                = None
        self.alreadyMapped            = None
        self.mappedNamesTable         = None
        self.minSize                  = None
        self.minId                    = None
        self.maxMismatches            = None
        self.maxGaps                  = None
        self.maxMappings              = None
        self.merge                    = False
        self.checkExons               = False
        self.suffix                   = None
        self.tmpDirectory             = "%s%s" % (os.environ["SMARTMPPATH"], os.sep) if "SMARTMPPATH" in os.environ else ""


    def __del__(self):
        if self.sequences != None:
            self.sequences.close()
        if self.alreadyMapped != None:
            self.alreadyMapped.close()
        if self.mappedNamesTable != None:
            self.mappedNamesTable.remove()
        if self.gff3Writer != None:
            self.gff3Writer.close()
        if self.logHandle != None:
            self.logHandle.close()


    def setMappingFile(self, fileName, format):
        parserChooser = ParserChooser(self.verbosity)
        parserChooser.findFormat(format, "mapping")
        self.parser = parserChooser.getParser(fileName)


    def setSequenceFile(self, fileName, format):
        if format == "fasta":
            self.sequenceListParser = FastaParser(fileName, self.verbosity)
        elif format == "fastq":
            self.sequenceListParser = FastqParser(fileName, self.verbosity)
        else:
            raise Exception("Do not understand sequence format %s" % (format))


    def setOutputFile(self, fileName, title):
        self.gff3Writer = Gff3Writer(fileName, self.verbosity)
        self.gff3Writer.setTitle(title)


    def setAlreadyMatched(self, fileName):
        self.alreadyMappedReader = GffParser(fileName, self.verbosity)


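    # When a "remaining" file is requested, the names of the reads that get at
    # least one accepted mapping are stored in a temporary MySQL table, so that
    # the still unmatched reads can be written back afterwards (see
    # writeUnmatched below).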
    def setRemainingFile(self, fileName, format):
        if format == "fasta":
            self.unmatchedWriter = FastaWriter("%s_unmatched.fasta" % (fileName), self.verbosity)
        elif format == "fastq":
            self.unmatchedWriter = FastqWriter("%s_unmatched.fastq" % (fileName), self.verbosity)
        else:
            raise Exception("Do not understand %s format." % (format))
        self.mappedNamesTable = MySqlTable(self.mySqlConnection, "mappedNames_%d" % (self.randomNumber), self.verbosity)
        self.mappedNamesTable.create(["name"], {"name": "char"}, {"name": 50})
        self.mappedNamesTable.createIndex("iNameMapped", ["name", ], True)


    def setLog(self, fileName):
        self.logHandle = open(fileName, "w")


    def setMinSize(self, size):
        self.minSize = size


    def setMinId(self, id):
        self.minId = id


    def setMaxMismatches(self, mismatches):
        self.maxMismatches = mismatches


    def setMaxGaps(self, gaps):
        self.maxGaps = gaps


    def setMaxMappings(self, mappings):
        self.maxMappings = mappings


    def mergeExons(self, b):
        self.merge = b


    def acceptShortExons(self, b):
        self.checkExons = not b


    def countMappings(self):
        self.nbMappings = self.parser.getNbMappings()
        if self.verbosity > 0:
            print "%i matches found" % (self.nbMappings)


    def storeAlreadyMapped(self):
        self.alreadyMapped            = shelve.open("%stmpAlreadyMapped_%d" % (self.tmpDirectory, self.randomNumber))
        progress                      = Progress(self.alreadyMappedReader.getNbTranscripts(), "Reading already mapped reads", self.verbosity)
        self.nbAlreadyMappedSequences = 0
        for transcript in self.alreadyMappedReader.getIterator():
            if not self.alreadyMapped.has_key(transcript.getName()):
                self.alreadyMapped[transcript.getName()] = 1
                self.nbAlreadyMappedSequences           += 1
            progress.inc()
        progress.done()
        self.nbAlreadyMapped = self.alreadyMappedReader.getNbTranscripts()


    def storeSequences(self):
        self.sequences = shelve.open("%stmpSequences_%d" % (self.tmpDirectory, self.randomNumber))
        progress       = Progress(self.sequenceListParser.getNbSequences(), "Reading sequences", self.verbosity)
        for sequence in self.sequenceListParser.getIterator():
            self.sequences[sequence.getName().split(" ")[0]] = len(sequence.getSequence())
            self.nbSequences += 1
            progress.inc()
        progress.done()
        if self.verbosity > 0:
            print "%i sequences read" % (self.nbSequences)


    def checkOrder(self):
        # Make sure that all mappings of a read are consecutive in the input
        # file: record each read name once it has been left, and fail if it
        # shows up again later.
        names        = shelve.open("%stmpNames_%d" % (self.tmpDirectory, self.randomNumber))
        previousName = None
        progress     = Progress(self.nbMappings, "Checking mapping file", self.verbosity)
        for mapping in self.parser.getIterator():
            name = mapping.queryInterval.getName()
            if name != previousName:
                if previousName != None:
                    if names.has_key(previousName):
                        raise Exception("Error! Input mapping file is not ordered! (Name '%s' occurs at least twice)" % (previousName))
                    names[previousName] = 1
                previousName = name
            progress.inc()
        progress.done()
        names.close()


    def checkPreviouslyMapped(self, name):
        if self.alreadyMappedReader == None:
            return False
        return self.alreadyMapped.has_key(name)


    def findOriginalSize(self, name):
        # Some aligners append a "/1" suffix to the read names; detect this on
        # the first query and remember it for the following ones.
        alternate    = "%s/1" % (name)
        if (self.suffix == None) or (not self.suffix):
            if self.sequences.has_key(name):
                self.suffix = False
                return self.sequences[name]
            if self.suffix == None:
                self.suffix = True
            else:
                raise Exception("Cannot find name %s" % (name))
        if (self.suffix):
            if self.sequences.has_key(alternate):
                return self.sequences[alternate]
        raise Exception("Cannot find name %s" % (name))


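    # Return True when a mapping passes the size, identity, mismatch, gap and
    # exon-length filters; every rejection is counted and, when a log file is
    # set, written to it.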
    def checkErrors(self, mapping):
        accepted = True
        # short size
        if self.minSize != None and mapping.size * 100 < self.minSize * mapping.queryInterval.size:
            self.tooShort += 1
            accepted    = False
            if self.logHandle != None:
                self.logHandle.write("size of mapping %s is too short (%i instead of %i)\n" % (str(mapping), mapping.size, mapping.queryInterval.size))
        # low identity
        if self.minId != None and mapping.getTagValue("identity") < self.minId:
            self.tooManyMismatches += 1
            accepted                = False
            if self.logHandle != None:
                self.logHandle.write("mapping %s has a low identity rate\n" % (str(mapping)))
        # too many mismatches
        if self.maxMismatches != None and mapping.getTagValue("nbMismatches") > self.maxMismatches:
            self.tooManyMismatches += 1
            accepted                = False
            if self.logHandle != None:
                self.logHandle.write("mapping %s has more mismatches than %i\n" % (str(mapping), self.maxMismatches))
        # too many gaps
        if self.maxGaps != None and mapping.getTagValue("nbGaps") > self.maxGaps:
            self.tooManyGaps += 1
            accepted         = False
            if self.logHandle != None:
                self.logHandle.write("mapping %s has more gaps than %i\n" % (str(mapping), self.maxGaps))
        # short exons
        if self.checkExons and len(mapping.subMappings) > 1 and min([subMapping.targetInterval.getSize() for subMapping in mapping.subMappings]) < exonSize:
            self.tooShortExons += 1
            accepted            = False
            if self.logHandle != None:
                self.logHandle.write("sequence %s maps as too short exons\n" % (mapping))
        return accepted


    def checkNbMappings(self, mappings):
        nbOccurrences = 0
        for mapping in mappings:
            nbOccurrences += 1 if "nbOccurrences" not in mapping.getTagNames() else mapping.getTagValue("nbOccurrences")
        if (self.maxMappings != None and nbOccurrences > self.maxMappings):
            self.tooManyMappings += 1
            if self.logHandle != None:
                self.logHandle.write("sequence %s maps %i times\n" % (mappings[0].queryInterval.getName(), nbOccurrences))
            return False
        return (nbOccurrences > 0)


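    # Rank the mappings of a read by increasing error score: tag each one with
    # its occurrence number and rank, mark equal scores with a "Tie" suffix,
    # and point every non-best mapping to the region of the best one.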
    def sortMappings(self, mappings):
        nbOccurrences = 0
        for mapping in mappings:
            nbOccurrences += 1 if "nbOccurrences" not in mapping.getTagNames() else mapping.getTagValue("nbOccurrences")

        orderedMappings = sorted(mappings, key = lambda mapping: mapping.getErrorScore())
        cpt             = 1
        rank            = 1
        previousMapping = None
        previousScore   = None
        wasLastTie      = False
        rankedMappings  = []
        bestRegion      = "%s:%d-%d" % (orderedMappings[0].targetInterval.getChromosome(), orderedMappings[0].targetInterval.getStart(), orderedMappings[0].targetInterval.getEnd())
        for mapping in orderedMappings:
            mapping.setNbOccurrences(nbOccurrences)
            mapping.setOccurrence(cpt)

            score = mapping.getErrorScore()
            if previousScore != None and previousScore == score:
                if "Rank" in previousMapping.getTagNames():
                    if not wasLastTie:
                        previousMapping.setRank("%sTie" % (rank))
                    mapping.setRank("%sTie" % (rank))
                    wasLastTie = True
            else:
                rank = cpt
                mapping.setRank(rank)
                wasLastTie = False
            if cpt != 1:
                mapping.setBestRegion(bestRegion)

            rankedMappings.append(mapping)
            previousMapping = mapping
            previousScore   = score
            cpt            += 1
        return rankedMappings


    def processMappings(self, mappings):
        if not mappings:
            return
        selectedMappings = []
        name             = mappings[0].queryInterval.getName()
        size             = self.findOriginalSize(name)
        for mapping in mappings:
            if self.merge:
                mapping.mergeExons(distanceExons)
            mapping.queryInterval.size = size
            if self.checkErrors(mapping):
                selectedMappings.append(mapping)

        if self.checkNbMappings(selectedMappings):
            if self.unmatchedWriter != None:
                query = self.mySqlConnection.executeQuery("INSERT INTO %s (name) VALUES ('%s')" % (self.mappedNamesTable.name, name if not self.suffix else "%s/1" % (name)))
            self.nbWrittenSequences += 1
            mappings = self.sortMappings(selectedMappings)
            for mapping in mappings:
                self.nbWrittenMappings += 1
                self.gff3Writer.addTranscript(mapping.getTranscript())


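    # Stream through the mapping file, grouping consecutive records that share
    # the same query name (hence the requirement that the input be sorted by
    # read name), and filter and write each group.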
    def readMappings(self):
        previousQueryName = None
        mappings          = []
        self.parser.reset()
        progress = Progress(self.nbMappings, "Reading mappings", self.verbosity)
        for mapping in self.parser.getIterator():
            queryName = mapping.queryInterval.getName().split(" ")[0]
            if self.checkPreviouslyMapped(queryName):
                if self.logHandle != None:
                    self.logHandle.write("Mapping %s has already been mapped.\n" % (queryName))
            else:
                if previousQueryName == queryName:
                    mappings.append(mapping)
                else:
                    if previousQueryName != None:
                        self.processMappings(mappings)
                    previousQueryName = queryName
                    mappings          = [mapping, ]
            progress.inc()
        self.processMappings(mappings)
        self.gff3Writer.write()
        self.gff3Writer.close()
        progress.done()


    def writeUnmatched(self):
        progress = Progress(self.nbSequences, "Reading unmatched sequences", self.verbosity)
        for sequence in self.sequenceListParser.getIterator():
            name = sequence.getName().split(" ")[0]
            query = self.mySqlConnection.executeQuery("SELECT * FROM %s WHERE name = '%s' LIMIT 1" % (self.mappedNamesTable.name, name))
            if query.isEmpty():
                self.unmatchedWriter.addSequence(sequence)
            progress.inc()
        progress.done()


    def analyze(self):
        self.countMappings()
        self.checkOrder()
        self.storeSequences()
        if self.alreadyMappedReader != None:
            self.storeAlreadyMapped()
        self.readMappings()
        if self.unmatchedWriter != None:
            self.writeUnmatched()


if __name__ == "__main__":

    # parse command line
    description = "Mapper Analyzer v1.0.1: Read the output of an aligner, print statistics and possibly translate into BED or GBrowse formats. [Category: Conversion]"

    parser = OptionParser(description = description)
    compGroup = OptionGroup(parser, "Compulsory options")
    filtGroup = OptionGroup(parser, "Filtering options")
    tranGroup = OptionGroup(parser, "Transformation options")
    outpGroup = OptionGroup(parser, "Output options")
    otheGroup = OptionGroup(parser, "Other options")
    compGroup.add_option("-i", "--input",            dest="inputFileName",     action="store",                        type="string", help="input file (output of the tool) [compulsory] [format: file in mapping format given by -f]")
    compGroup.add_option("-f", "--format",           dest="format",            action="store",      default="seqmap", type="string", help="format of the file [compulsory] [format: mapping file format]")
    compGroup.add_option("-q", "--sequences",        dest="sequencesFileName", action="store",                        type="string", help="file of the sequences [compulsory] [format: file in sequence format given by -k]")
    compGroup.add_option("-k", "--seqFormat",        dest="sequenceFormat",    action="store",      default="fasta",  type="string", help="format of the sequences: fasta or fastq [default: fasta] [format: sequence file format]")
    compGroup.add_option("-o", "--output",           dest="outputFileName",    action="store",                        type="string", help="output file [compulsory] [format: output file in GFF3 format]")
    filtGroup.add_option("-n", "--number",           dest="number",            action="store",      default=None,     type="int",    help="max. number of occurrences of a sequence [format: int]")
    filtGroup.add_option("-s", "--size",             dest="size",              action="store",      default=None,     type="int",    help="minimum percentage of size [format: int]")
    filtGroup.add_option("-d", "--identity",         dest="identity",          action="store",      default=None,     type="int",    help="minimum percentage of identity [format: int]")
    filtGroup.add_option("-m", "--mismatch",         dest="mismatch",          action="store",      default=None,     type="int",    help="maximum number of mismatches [format: int]")
    filtGroup.add_option("-p", "--gap",              dest="gap",               action="store",      default=None,     type="int",    help="maximum number of gaps [format: int]")
    tranGroup.add_option("-e", "--mergeExons",       dest="mergeExons",        action="store_true", default=False,                   help="merge exons when introns are short [format: bool] [default: false]")
    tranGroup.add_option("-x", "--removeExons",      dest="removeExons",       action="store_true", default=False,                   help="remove transcripts when exons are short [format: bool] [default: false]")
    outpGroup.add_option("-t", "--title",            dest="title",             action="store",      default="SMART",  type="string", help="title of the UCSC track [format: string] [default: SMART]")
    outpGroup.add_option("-r", "--remaining",        dest="remaining",         action="store_true", default=False,                   help="print the unmatched sequences [format: bool] [default: false]")
    otheGroup.add_option("-a", "--append",           dest="appendFileName",    action="store",      default=None,     type="string", help="append to GFF3 file [format: file in GFF3 format]")
    otheGroup.add_option("-v", "--verbosity",        dest="verbosity",         action="store",      default=1,        type="int",    help="trace level [default: 1] [format: int]")
    otheGroup.add_option("-l", "--log",              dest="log",               action="store_true", default=False,                   help="write a log file [format: bool] [default: false]")
    parser.add_option_group(compGroup)
    parser.add_option_group(filtGroup)
    parser.add_option_group(tranGroup)
    parser.add_option_group(outpGroup)
    parser.add_option_group(otheGroup)
    (options, args) = parser.parse_args()

    analyzer = MapperAnalyzer(options.verbosity)
    analyzer.setMappingFile(options.inputFileName, options.format)
    analyzer.setSequenceFile(options.sequencesFileName, options.sequenceFormat)
    analyzer.setOutputFile(options.outputFileName, options.title)
    if options.appendFileName != None:
        analyzer.setAlreadyMatched(options.appendFileName)
    if options.remaining:
        analyzer.setRemainingFile(options.outputFileName, options.sequenceFormat)
    if options.number != None:
        analyzer.setMaxMappings(options.number)
    if options.size != None:
        analyzer.setMinSize(options.size)
    if options.identity != None:
        analyzer.setMinId(options.identity)
    if options.mismatch != None:
        analyzer.setMaxMismatches(options.mismatch)
    if options.gap != None:
        analyzer.setMaxGaps(options.gap)
    if options.mergeExons:
        analyzer.mergeExons(True)
    if options.removeExons:
        analyzer.acceptShortExons(False)
    if options.log:
        analyzer.setLog("%s.log" % (options.outputFileName))
    analyzer.analyze()

    if options.verbosity > 0:
        print "kept %i sequences over %s (%f%%)" % (analyzer.nbWrittenSequences, analyzer.nbSequences, float(analyzer.nbWrittenSequences) / analyzer.nbSequences * 100)
        if options.appendFileName != None:
            print "kept %i sequences over %s (%f%%) including already mapped sequences" % (analyzer.nbWrittenSequences + analyzer.nbAlreadyMappedSequences, analyzer.nbSequences, float(analyzer.nbWrittenSequences + analyzer.nbAlreadyMappedSequences) / analyzer.nbSequences * 100)
        print "kept %i mappings over %i (%f%%)" % (analyzer.nbWrittenMappings, analyzer.nbMappings, float(analyzer.nbWrittenMappings) / analyzer.nbMappings * 100)
        if options.appendFileName != None:
            print "kept %i mappings over %i (%f%%) including already mapped" % (analyzer.nbWrittenMappings + analyzer.nbAlreadyMapped, analyzer.nbMappings, float(analyzer.nbWrittenMappings + analyzer.nbAlreadyMapped) / analyzer.nbMappings * 100)
        print "removed %i too short mappings (%f%%)" % (analyzer.tooShort, float(analyzer.tooShort) / analyzer.nbMappings * 100)
        print "removed %i mappings with too many mismatches (%f%%)" % (analyzer.tooManyMismatches, float(analyzer.tooManyMismatches) / analyzer.nbMappings * 100)
        print "removed %i mappings with too many gaps (%f%%)" % (analyzer.tooManyGaps, float(analyzer.tooManyGaps) / analyzer.nbMappings * 100)
        print "removed %i mappings with too short exons (%f%%)" % (analyzer.tooShortExons, float(analyzer.tooShortExons) / analyzer.nbMappings * 100)
        print "removed %i sequences with too many hits (%f%%)" % (analyzer.tooManyMappings, float(analyzer.tooManyMappings) / analyzer.nbSequences * 100)
        print "%i sequences have no mapping (%f%%)" % (analyzer.nbSequences - analyzer.nbWrittenSequences, float(analyzer.nbSequences - analyzer.nbWrittenSequences) / analyzer.nbSequences * 100)
        if options.appendFileName != None:
            print "%i sequences have no mapping (%f%%) excluding already mapped sequences" % (analyzer.nbSequences - analyzer.nbWrittenSequences - analyzer.nbAlreadyMappedSequences, float(analyzer.nbSequences - analyzer.nbWrittenSequences - analyzer.nbAlreadyMappedSequences) / analyzer.nbSequences * 100)