| 6 | 1 # | 
|  | 2 # Copyright INRA-URGI 2009-2010 | 
|  | 3 # | 
|  | 4 # This software is governed by the CeCILL license under French law and | 
|  | 5 # abiding by the rules of distribution of free software. You can use, | 
|  | 6 # modify and/ or redistribute the software under the terms of the CeCILL | 
|  | 7 # license as circulated by CEA, CNRS and INRIA at the following URL | 
|  | 8 # "http://www.cecill.info". | 
|  | 9 # | 
|  | 10 # As a counterpart to the access to the source code and rights to copy, | 
|  | 11 # modify and redistribute granted by the license, users are provided only | 
|  | 12 # with a limited warranty and the software's author, the holder of the | 
|  | 13 # economic rights, and the successive licensors have only limited | 
|  | 14 # liability. | 
|  | 15 # | 
|  | 16 # In this respect, the user's attention is drawn to the risks associated | 
|  | 17 # with loading, using, modifying and/or developing or reproducing the | 
|  | 18 # software by the user in light of its specific status of free software, | 
|  | 19 # that may mean that it is complicated to manipulate, and that also | 
|  | 20 # therefore means that it is reserved for developers and experienced | 
|  | 21 # professionals having in-depth computer knowledge. Users are therefore | 
|  | 22 # encouraged to load and test the software's suitability as regards their | 
|  | 23 # requirements in conditions enabling the security of their systems and/or | 
|  | 24 # data to be ensured and, more generally, to use and operate it in the | 
|  | 25 # same conditions as regards security. | 
|  | 26 # | 
|  | 27 # The fact that you are presently reading this means that you have had | 
|  | 28 # knowledge of the CeCILL license and that you accept its terms. | 
|  | 29 # | 
|  | 30 import sys | 
|  | 31 import re | 
|  | 32 from commons.core.seq.Bioseq import Bioseq | 
|  | 33 | 
|  | 34 reverseComplementString = { | 
|  | 35     "A": "T", | 
|  | 36     "C": "G", | 
|  | 37     "G": "C", | 
|  | 38     "T": "A", | 
|  | 39     "U": "A", | 
|  | 40     "M": "K", | 
|  | 41     "R": "Y", | 
|  | 42     "W": "W", | 
|  | 43     "S": "S", | 
|  | 44     "Y": "R", | 
|  | 45     "K": "M", | 
|  | 46     "V": "B", | 
|  | 47     "H": "D", | 
|  | 48     "D": "H", | 
|  | 49     "B": "V", | 
|  | 50     "N": "N", | 
|  | 51     "a": "t", | 
|  | 52     "c": "g", | 
|  | 53     "g": "c", | 
|  | 54     "t": "a", | 
|  | 55     "u": "a", | 
|  | 56     "m": "k", | 
|  | 57     "r": "y", | 
|  | 58     "w": "w", | 
|  | 59     "s": "s", | 
|  | 60     "y": "r", | 
|  | 61     "k": "m", | 
|  | 62     "v": "b", | 
|  | 63     "h": "d", | 
|  | 64     "d": "h", | 
|  | 65     "b": "v", | 
|  | 66     "n": "n" | 
|  | 67 } | 
|  | 68 | 
|  | 69 class Sequence(Bioseq): | 
|  | 70     """A class that codes for a sequence""" | 
|  | 71 | 
|  | 72     def __init__(self, name = "", sequence = ""): | 
|  | 73         super(Sequence, self).__init__(name, sequence) | 
|  | 74         self.name            = self.header | 
|  | 75         self.quality         = None | 
|  | 76         self.chunkedSequence = None | 
|  | 77         self.chunkedQuality  = None | 
|  | 78         self.integerQuality  = False | 
|  | 79 | 
|  | 80     def setName(self, name=""): | 
|  | 81         super(Sequence, self).setHeader(name) | 
|  | 82 | 
|  | 83     def getName(self): | 
|  | 84         return self.getHeader() | 
|  | 85 | 
|  | 86     def setSequence(self, seq=""): | 
|  | 87         super(Sequence, self).setSequence(seq) | 
|  | 88 | 
|  | 89     def setQuality(self, quality): | 
|  | 90         if quality == None: | 
|  | 91             self.quality = None | 
|  | 92             return | 
|  | 93         if " " in quality: | 
|  | 94             self.quality        = quality.split() | 
|  | 95             self.integerQuality = True | 
|  | 96         else: | 
|  | 97             self.quality = list(quality) | 
|  | 98 | 
|  | 99     def getQuality(self): | 
|  | 100         if self.quality == None: | 
|  | 101             return None | 
|  | 102         if self.integerQuality: | 
|  | 103             return " ".join(self.quality) | 
|  | 104         return "".join(self.quality) | 
|  | 105 | 
|  | 106     def getSize(self): | 
|  | 107         return len(self.getSequence()) | 
|  | 108 | 
|  | 109 | 
|  | 110     def copy(self, sequence): | 
|  | 111         self.setName(sequence.getName()) | 
|  | 112         self.setSequence(sequence.getSequence()) | 
|  | 113         self.setQuality(sequence.getQuality()) | 
|  | 114         self.chunkedSequence = None | 
|  | 115         self.chunkedQuality  = None | 
|  | 116 | 
|  | 117 | 
|  | 118     def chunkSequence(self): | 
|  | 119         self.chunkedSequence = [] | 
|  | 120         for i in range (0, self.getSize() / 60 + 1): | 
|  | 121             self.chunkedSequence.append(self.getSequence()[i * 60 : min(self.getSize(), (i+1) * 60)]) | 
|  | 122         if self.quality != None: | 
|  | 123             self.chunkedQuality = [] | 
|  | 124             for i in range (0, self.getSize() / 60 + 1): | 
|  | 125                 self.chunkedQuality.append(self.quality[i * 60 : min(self.getSize(), (i+1) * 60)]) | 
|  | 126 | 
|  | 127     def concatenate(self, seq): | 
|  | 128         sequence  = self.getSequence() | 
|  | 129         sequence += seq.getSequence() | 
|  | 130         self.setSequence(sequence) | 
|  | 131         if self.quality != None: | 
|  | 132             sep = " " if self.integerQuality else "" | 
|  | 133             self.setQuality(self.getQuality() + sep + seq.getQuality()) | 
|  | 134         self.chunkedSequence = None | 
|  | 135         self.chunkedQuality  = None | 
|  | 136 | 
|  | 137 | 
|  | 138     def printFasta(self): | 
|  | 139         if self.chunkedSequence == None: | 
|  | 140             self.chunkSequence() | 
|  | 141         return ">%s\n%s\n" % (self.getHeader(), "\n".join(self.chunkedSequence)) | 
|  | 142 | 
|  | 143 | 
|  | 144     def printFastq(self): | 
|  | 145         if self.chunkedSequence == None: | 
|  | 146             self.chunkSequence() | 
|  | 147         return "@%s\n%s\n+%s\n%s\n" % (self.getHeader(), self.getSequence(), self.getHeader(), self.getQuality()) | 
|  | 148 | 
|  | 149 | 
|  | 150     def reverseComplement(self): | 
|  | 151         seq = "" | 
|  | 152         self.chunkedSequence = None | 
|  | 153         self.chunkedQuality  = None | 
|  | 154         for i in range(0, self.getSize()): | 
|  | 155             char = self.getSequence()[i:i+1] | 
|  | 156             if char not in reverseComplementString: | 
|  | 157                 sys.exit("Cannot understand character %s from string %s" % (char, self.getSequence())) | 
|  | 158             seq = "%s%s" % (reverseComplementString[char], seq) | 
|  | 159         self.setSequence(seq) | 
|  | 160         if self.quality != None: | 
|  | 161             self.quality = self.quality[::-1] | 
|  | 162 | 
|  | 163 | 
|  | 164     def containsAmbiguousNucleotides(self): | 
|  | 165         m = re.search("[^ACGTUacgtu]", self.getSequence()) | 
|  | 166         if m != None: | 
|  | 167             return True | 
|  | 168         return False | 
|  | 169 | 
|  | 170 | 
|  | 171     def shrinkToFirstNucleotides(self, nbNucleotides): | 
|  | 172         self.chunkedSequence = None | 
|  | 173         self.chunkedQuality  = None | 
|  | 174         self.setSequence(self.getSequence()[0:nbNucleotides]) | 
|  | 175         if self.quality != None: | 
|  | 176             self.quality = self.quality[0:nbNucleotides] | 
|  | 177 | 
|  | 178 | 
|  | 179     def shrinkToLastNucleotides(self, nbNucleotides): | 
|  | 180         self.chunkedSequence = None | 
|  | 181         self.chunkedQuality  = None | 
|  | 182         self.setSequence(self.getSequence()[-nbNucleotides:]) | 
|  | 183         if self.quality != None: | 
|  | 184             self.quality = self.quality[-nbNucleotides:] |