| 36 | 1 # | 
|  | 2 # Copyright INRA-URGI 2009-2010 | 
|  | 3 # | 
|  | 4 # This software is governed by the CeCILL license under French law and | 
|  | 5 # abiding by the rules of distribution of free software. You can use, | 
|  | 6 # modify and/ or redistribute the software under the terms of the CeCILL | 
|  | 7 # license as circulated by CEA, CNRS and INRIA at the following URL | 
|  | 8 # "http://www.cecill.info". | 
|  | 9 # | 
|  | 10 # As a counterpart to the access to the source code and rights to copy, | 
|  | 11 # modify and redistribute granted by the license, users are provided only | 
|  | 12 # with a limited warranty and the software's author, the holder of the | 
|  | 13 # economic rights, and the successive licensors have only limited | 
|  | 14 # liability. | 
|  | 15 # | 
|  | 16 # In this respect, the user's attention is drawn to the risks associated | 
|  | 17 # with loading, using, modifying and/or developing or reproducing the | 
|  | 18 # software by the user in light of its specific status of free software, | 
|  | 19 # that may mean that it is complicated to manipulate, and that also | 
|  | 20 # therefore means that it is reserved for developers and experienced | 
|  | 21 # professionals having in-depth computer knowledge. Users are therefore | 
|  | 22 # encouraged to load and test the software's suitability as regards their | 
|  | 23 # requirements in conditions enabling the security of their systems and/or | 
|  | 24 # data to be ensured and, more generally, to use and operate it in the | 
|  | 25 # same conditions as regards security. | 
|  | 26 # | 
|  | 27 # The fact that you are presently reading this means that you have had | 
|  | 28 # knowledge of the CeCILL license and that you accept its terms. | 
|  | 29 # | 
|  | 30 import re | 
|  | 31 import sys | 
|  | 32 from SMART.Java.Python.structure.Mapping import Mapping | 
|  | 33 from SMART.Java.Python.structure.SubMapping import SubMapping | 
|  | 34 from commons.core.parsing.MapperParser import MapperParser | 
|  | 35 from SMART.Java.Python.misc import Utils | 
|  | 36 from SMART.Java.Python.misc.Utils import getHammingDistance | 
|  | 37 | 
|  | 38 | 
|  | 39 class AxtParser(MapperParser): | 
|  | 40     """A class that parses AXT (as given by Mosaik)""" | 
|  | 41 | 
|  | 42     def __init__(self, fileName, verbosity = 0): | 
|  | 43         super(AxtParser, self).__init__(fileName, verbosity) | 
|  | 44         self.queryLine = None | 
|  | 45         self.subjectLine = None | 
|  | 46 | 
|  | 47     def __del__(self): | 
|  | 48         super(AxtParser, self).__del__() | 
|  | 49 | 
|  | 50 | 
|  | 51     def getFileFormats(): | 
|  | 52         return ["axt"] | 
|  | 53     getFileFormats = staticmethod(getFileFormats) | 
|  | 54 | 
|  | 55 | 
|  | 56     def skipFirstLines(self): | 
|  | 57         pass | 
|  | 58 | 
|  | 59 | 
|  | 60     def getInfos(self): | 
|  | 61         self.chromosomes = set() | 
|  | 62         self.nbMappings  = 0 | 
|  | 63         self.size        = 0 | 
|  | 64         cpt              = 0 | 
|  | 65         self.reset() | 
|  | 66         for line in self.handle: | 
|  | 67             line = line.strip() | 
|  | 68             if line == "": continue | 
|  | 69             if cpt % 3 == 0: | 
|  | 70                 line    = line.strip() | 
|  | 71                 parts = line.split(" ") | 
|  | 72                 self.chromosomes.add(parts[1]) | 
|  | 73                 self.size       += int(parts[6]) | 
|  | 74                 self.nbMappings += 1 | 
|  | 75             cpt += 1 | 
|  | 76             if self.verbosity >= 10 and self.nbMappings % 100000 == 0: | 
|  | 77                 sys.stdout.write("    %d mappings read\r" % (self.nbMappings)) | 
|  | 78                 sys.stdout.flush() | 
|  | 79         self.reset() | 
|  | 80         if self.verbosity >= 10: | 
|  | 81             print "    %d mappings read" % (self.nbMappings) | 
|  | 82             print "Done." | 
|  | 83 | 
|  | 84 | 
|  | 85     def parseLine(self, line): | 
|  | 86 | 
|  | 87         if line.strip() == "": | 
|  | 88             for line in self.handle: | 
|  | 89                 self.currentLineNb += 1 | 
|  | 90                 break | 
|  | 91         if line.strip() == "": | 
|  | 92             return None | 
|  | 93 | 
|  | 94         m = re.search(r"^\s*\d+\s+(\S+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\d+)\s+(\d+)\s+([+-])\s+\d+\s*$", line) | 
|  | 95         if m != None: | 
|  | 96             mapping = Mapping() | 
|  | 97             subMapping = SubMapping() | 
|  | 98 | 
|  | 99             subMapping.queryInterval.setName(m.group(4)) | 
|  | 100             subMapping.queryInterval.setStart(min(int(m.group(5)), int(m.group(6)))) | 
|  | 101             subMapping.queryInterval.setEnd(max(int(m.group(5)), int(m.group(6)))) | 
|  | 102             subMapping.queryInterval.setDirection(m.group(7)) | 
|  | 103 | 
|  | 104             subMapping.targetInterval.setChromosome(m.group(1)) | 
|  | 105             subMapping.targetInterval.setStart(min(int(m.group(2)), int(m.group(3)))) | 
|  | 106             subMapping.targetInterval.setEnd(max(int(m.group(2)), int(m.group(3)))) | 
|  | 107             subMapping.targetInterval.setDirection(1) | 
|  | 108 | 
|  | 109             subMapping.setSize(min(subMapping.targetInterval.getSize(), subMapping.queryInterval.getSize())) | 
|  | 110             subMapping.setDirection(m.group(7)) | 
|  | 111 | 
|  | 112             mapping.addSubMapping(subMapping) | 
|  | 113 | 
|  | 114             mapping.setDirection(m.group(7)) | 
|  | 115             mapping.targetInterval.setChromosome(m.group(1)) | 
|  | 116             mapping.targetInterval.setStart(min(int(m.group(2)), int(m.group(3)))) | 
|  | 117             mapping.targetInterval.setEnd(max(int(m.group(2)), int(m.group(3)))) | 
|  | 118 | 
|  | 119             mapping.queryInterval.setName(m.group(4)) | 
|  | 120             mapping.queryInterval.setStart(min(int(m.group(5)), int(m.group(6)))) | 
|  | 121             mapping.queryInterval.setEnd(max(int(m.group(5)), int(m.group(6)))) | 
|  | 122 | 
|  | 123             mapping.setSize(min(mapping.targetInterval.getSize(), mapping.queryInterval.getSize())) | 
|  | 124 | 
|  | 125             self.currentMapping = mapping | 
|  | 126             return None | 
|  | 127         if self.queryLine == None: | 
|  | 128             self.queryLine = line | 
|  | 129             return None | 
|  | 130         self.subjectLine = line | 
|  | 131         seqLen = float(len(self.subjectLine)) | 
|  | 132         dist = float(getHammingDistance(self.queryLine, self.subjectLine)) | 
|  | 133         self.currentMapping.setNbMismatches(getHammingDistance(self.queryLine, self.subjectLine)) | 
|  | 134         self.currentMapping.setNbGaps(0) | 
|  | 135         self.queryLine = None | 
|  | 136         self.subjectLine = None | 
|  | 137         return self.currentMapping | 
|  | 138 | 
|  | 139 | 
|  | 140 |