diff commons/core/parsing/SamParser.py @ 36:44d5973c188c

Uploaded
author m-zytnicki
date Tue, 30 Apr 2013 15:02:29 -0400
parents 769e306b7933
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/commons/core/parsing/SamParser.py	Tue Apr 30 15:02:29 2013 -0400
@@ -0,0 +1,234 @@
+#
+# Copyright INRA-URGI 2009-2010
+# 
+# This software is governed by the CeCILL license under French law and
+# abiding by the rules of distribution of free software. You can use,
+# modify and/ or redistribute the software under the terms of the CeCILL
+# license as circulated by CEA, CNRS and INRIA at the following URL
+# "http://www.cecill.info".
+# 
+# As a counterpart to the access to the source code and rights to copy,
+# modify and redistribute granted by the license, users are provided only
+# with a limited warranty and the software's author, the holder of the
+# economic rights, and the successive licensors have only limited
+# liability.
+# 
+# In this respect, the user's attention is drawn to the risks associated
+# with loading, using, modifying and/or developing or reproducing the
+# software by the user in light of its specific status of free software,
+# that may mean that it is complicated to manipulate, and that also
+# therefore means that it is reserved for developers and experienced
+# professionals having in-depth computer knowledge. Users are therefore
+# encouraged to load and test the software's suitability as regards their
+# requirements in conditions enabling the security of their systems and/or
+# data to be ensured and, more generally, to use and operate it in the
+# same conditions as regards security.
+# 
+# The fact that you are presently reading this means that you have had
+# knowledge of the CeCILL license and that you accept its terms.
+#
+import re
+import sys
+from commons.core.parsing.MapperParser import MapperParser
+from SMART.Java.Python.structure.Mapping import Mapping
+from SMART.Java.Python.structure.SubMapping import SubMapping
+from SMART.Java.Python.structure.Interval import Interval
+
+class SamParser(MapperParser):
+    """A class that parses SAM format (as given by BWA)"""
+
+    def __init__(self, fileName, verbosity = 0):
+        super(SamParser, self).__init__(fileName, verbosity)
+
+
+    def __del__(self):
+        super(SamParser, self).__del__()
+
+
+    def getFileFormats():
+        return ["sam"]
+    getFileFormats = staticmethod(getFileFormats)
+
+
+    def skipFirstLines(self):
+        pass
+
+
+    def getInfos(self):
+        self.chromosomes = set()
+        self.nbMappings  = 0
+        self.size        = 0
+        self.reset()
+        if self.verbosity >= 10:
+            print "Getting information on SAM file"
+        self.reset()
+        for line in self.handle:
+            line = line.strip()
+            if line == "" or line[0] == "@":
+                continue
+            parts      = line.split("\t")
+            chromosome = parts[2]
+            if chromosome != "*":
+                self.chromosomes.add(chromosome)
+            self.nbMappings += 1
+            self.size += len(parts[8])
+            if self.verbosity >= 10 and self.nbMappings % 100000 == 0:
+                sys.stdout.write("    %d mappings read\r" % (self.nbMappings))
+                sys.stdout.flush()
+        self.reset()
+        if self.verbosity >= 10:
+            print "    %d mappings read" % (self.nbMappings)
+            print "Done."
+
+
+    def parseLine(self, line):
+
+        line = line.strip()
+        if line[0] == "@":
+            return
+
+        fields = line.split("\t")
+        if len(fields) < 11:
+            raise Exception("Line %d '%s' does not look like a SAM line (number of fields is %d instead of 11)" % (self.currentLineNb, line, len(fields)))
+
+        name = fields[0]
+        flag = int(fields[1])
+
+        if (flag & 0x4) == 0x4:
+            return None
+
+        direction       = 1 if (flag & 0x10) == 0x0 else -1
+        chromosome      = fields[2]
+        genomeStart     = int(fields[3])
+        quality         = fields[4]
+        cigar           = fields[5]
+        mate            = fields[6]
+        mateGenomeStart = fields[7]
+        gapSize         = fields[8]
+        sequence        = fields[9]
+        quality         = fields[10]
+        tags            = fields[11:]
+
+        if mateGenomeStart != "*":
+            mateGenomeStart = int(mateGenomeStart)
+
+        mapping       = Mapping()
+        nbOccurrences = 1
+        nbMismatches  = 0
+        nbMatches     = 0
+        nbGaps        = 0
+        subMapping    = None
+        queryOffset   = 0
+        targetOffset  = 0
+        currentNumber = 0
+        readStart     = None
+
+        for tag in tags:
+            key = tag[:2]
+            if key == "X0":
+                nbOccurrences = int(tag[5:])
+            elif key == "X1":
+                nbOccurrences += int(tag[5:])
+            elif key == "XM":
+                nbMismatches = int(tag[5:])
+        mapping.setTagValue("nbOccurrences", nbOccurrences)
+        mapping.setTagValue("quality", int(fields[4]))
+
+        for char in cigar:
+            m = re.match(r"[0-9]", char)
+            if m != None:
+                currentNumber = currentNumber * 10 + (ord(char) - ord("0"))
+                continue
+            # match
+            m = re.match(r"[M]", char)
+            if m != None:
+                if readStart == None:
+                    readStart = queryOffset
+                if subMapping == None:
+                    subMapping = SubMapping()
+                    subMapping.setSize(currentNumber)
+                    subMapping.setDirection(direction)
+                    subMapping.queryInterval.setName(name)
+                    subMapping.queryInterval.setStart(queryOffset)
+                    subMapping.queryInterval.setDirection(direction)
+                    subMapping.targetInterval.setChromosome(chromosome)
+                    subMapping.targetInterval.setStart(genomeStart + targetOffset)
+                    subMapping.targetInterval.setDirection(1)
+                nbMatches    += currentNumber
+                targetOffset += currentNumber
+                queryOffset  += currentNumber
+                currentNumber = 0
+                continue
+            # insertion on the read
+            m = re.match(r"[I]", char)
+            if m != None:
+                nbGaps       += 1
+                queryOffset  += currentNumber
+                currentNumber = 0
+                continue
+            # insertion on the genome
+            m = re.match(r"[D]", char)
+            if m != None:
+                if subMapping != None:
+                    subMapping.queryInterval.setEnd(queryOffset - 1)
+                    subMapping.targetInterval.setEnd(genomeStart + targetOffset - 1)
+                    mapping.addSubMapping(subMapping)
+                subMapping    = None
+                nbGaps       += 1
+                targetOffset += currentNumber
+                currentNumber = 0
+                continue
+            # intron
+            m = re.match(r"[N]", char)
+            if m != None:
+                if subMapping != None:
+                    subMapping.queryInterval.setEnd(queryOffset - 1)
+                    subMapping.targetInterval.setEnd(genomeStart + targetOffset - 1)
+                    mapping.addSubMapping(subMapping)
+                subMapping    = None
+                targetOffset += currentNumber
+                currentNumber = 0
+                continue
+            # soft clipping (substitution)
+            m = re.match(r"[S]", char)
+            if m != None:
+                nbMismatches += currentNumber
+                targetOffset += currentNumber
+                queryOffset  += currentNumber
+                currentNumber = 0
+                continue
+            # hard clipping
+            m = re.match(r"[H]", char)
+            if m != None:
+                targetOffset += currentNumber
+                queryOffset  += currentNumber
+                currentNumber = 0
+                continue
+            # padding
+            m = re.match(r"[P]", char)
+            if m != None:
+                continue
+            raise Exception("Do not understand paramer '%s' in line %s" % (char, line))
+
+        if subMapping != None:
+            subMapping.queryInterval.setEnd(queryOffset - 1)
+            subMapping.targetInterval.setEnd(genomeStart + targetOffset - 1)
+            mapping.addSubMapping(subMapping)
+
+        mapping.queryInterval.setStart(readStart)
+        mapping.queryInterval.setEnd(queryOffset - 1)
+        mapping.targetInterval.setEnd(genomeStart + targetOffset - 1)
+        mapping.setNbMismatches(nbMismatches)
+        mapping.setNbGaps(nbGaps)
+
+        mapping.queryInterval.setName(name)
+        mapping.queryInterval.setDirection(direction)
+        mapping.targetInterval.setChromosome(chromosome)
+        mapping.targetInterval.setStart(genomeStart)
+        mapping.targetInterval.setDirection(direction)
+        mapping.setSize(len(sequence))
+        mapping.setDirection(direction)
+
+        return mapping
+
+