annotate TEisotools-1.1.a/commons/core/seq/AlignedBioseqDB.py @ 15:255c852351c5 draft

Uploaded
author urgi-team
date Thu, 21 Jul 2016 07:36:44 -0400
parents feef9a0db09d
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
13
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
1 # Copyright INRA (Institut National de la Recherche Agronomique)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
2 # http://www.inra.fr
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
3 # http://urgi.versailles.inra.fr
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
4 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
5 # This software is governed by the CeCILL license under French law and
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
6 # abiding by the rules of distribution of free software. You can use,
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
7 # modify and/ or redistribute the software under the terms of the CeCILL
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
8 # license as circulated by CEA, CNRS and INRIA at the following URL
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
9 # "http://www.cecill.info".
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
10 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
11 # As a counterpart to the access to the source code and rights to copy,
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
12 # modify and redistribute granted by the license, users are provided only
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
13 # with a limited warranty and the software's author, the holder of the
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
14 # economic rights, and the successive licensors have only limited
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
15 # liability.
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
16 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
17 # In this respect, the user's attention is drawn to the risks associated
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
18 # with loading, using, modifying and/or developing or reproducing the
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
19 # software by the user in light of its specific status of free software,
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
20 # that may mean that it is complicated to manipulate, and that also
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
21 # therefore means that it is reserved for developers and experienced
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
22 # professionals having in-depth computer knowledge. Users are therefore
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
23 # encouraged to load and test the software's suitability as regards their
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
24 # requirements in conditions enabling the security of their systems and/or
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
25 # data to be ensured and, more generally, to use and operate it in the
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
26 # same conditions as regards security.
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
27 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
28 # The fact that you are presently reading this means that you have had
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
29 # knowledge of the CeCILL license and that you accept its terms.
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
30
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
31
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
32 import sys
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
33 from commons.core.seq.BioseqDB import BioseqDB
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
34 from commons.core.seq.Bioseq import Bioseq
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
35 from commons.core.coord.Align import Align
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
36 from commons.core.coord.Range import Range
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
37 from commons.core.stat.Stat import Stat
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
38 from math import log
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
39
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
40
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
41 ## Multiple Sequence Alignment Representation
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
42 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
43 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
44 class AlignedBioseqDB( BioseqDB ):
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
45
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
def __init__(self, name=""):
    """Initialise the bank and check that the aligned sequences share one length.

    The parent initialiser receives ``name`` unchanged.  When the bank
    holds more than one sequence, every sequence after the first whose
    length differs from the alignment length (see getLength) triggers one
    error message on stdout.  No exception is raised.

    @param name: value forwarded as-is to BioseqDB.__init__ (default: "")
    """
    BioseqDB.__init__(self, name)
    seqLength = self.getLength()
    if self.getSize() > 1:
        for bs in self.db[1:]:
            if bs.getLength() != seqLength:
                # single-argument print(...) behaves the same on Python 2 and 3
                print("ERROR: aligned sequences have different length")
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
53
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
54
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
55 ## Get length of the alignment
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
56 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
57 # @return length
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
58 # @warning name before migration was 'length'
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
59 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
def getLength(self):
    """Return the length of the alignment.

    Only the first sequence of the bank is consulted; the other
    sequences are not compared with it here.

    @return: result of getLength() on the first sequence, or 0 when the
             bank is an empty list
    """
    if self.db == []:
        return 0
    return self.db[0].getLength()
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
65
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
66
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
67 ## Get the true length of a given sequence (without gaps)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
68 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
69 # @param header string header of the sequence to analyze
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
70 # @return length integer
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
71 # @warning name before migration was 'true_length'
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
72 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
def getSeqLengthWithoutGaps(self, header):
    """Return the number of symbols of a sequence that are not the gap '-'.

    @param header: header passed to self.fetch() to retrieve the sequence
    @return: integer count of non-gap symbols in the fetched sequence
    """
    sequence = self.fetch(header).sequence
    # every symbol other than '-' counts, whatever it is (N, IUPAC codes, ...)
    return len(sequence) - sequence.count("-")
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
80
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
def cleanMSA(self):
    """Clean the MSA by dropping sequences that do not overlap another one.

    Two sequences "overlap" when at least one column holds a non-gap
    symbol in both.  Each sequence is taken in turn as the reference and
    compared with every following sequence that is not already scheduled
    for deletion.  For a pair that does not overlap, the reference is
    scheduled for deletion when it has strictly fewer non-gap symbols
    than the other sequence; otherwise the other sequence is scheduled.

    The scheduled sequences are then removed from self.db and self.idx is
    rebuilt so that each remaining header maps to its new rank.
    """
    # TODO: Refactoring
    toDelete = set()  # ranks in self.db scheduled for deletion
    nbSeq = self.getSize()
    alignLength = self.getLength()

    for iRef in range(nbSeq):
        if iRef in toDelete:
            continue
        ref = self.db[iRef].sequence
        refHeader = self.db[iRef].header

        # NOTE: the reference keeps being compared with the following
        # sequences even once it has itself been scheduled for deletion
        # inside this loop (historical behaviour, kept as is).
        for iOther in range(iRef + 1, nbSeq):
            if iOther in toDelete:
                continue
            other = self.db[iOther].sequence
            otherHeader = self.db[iOther].header

            overlap = False
            for pos in range(alignLength):
                if other[pos] != "-" and ref[pos] != "-":
                    overlap = True
                    break
            if overlap:
                continue

            # no shared column: schedule the one with fewer non-gap symbols
            # (the following sequence when both have the same count)
            if self.getSeqLengthWithoutGaps(refHeader) < self.getSeqLengthWithoutGaps(otherHeader):
                toDelete.add(iRef)
            else:
                toDelete.add(iOther)

    # delete from the highest rank down so the remaining ranks stay valid
    for rank in sorted(toDelete, reverse=True):
        del self.db[rank]

    self.idx = {}
    for rank, bs in enumerate(self.db):
        self.idx[bs.header] = rank
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
125
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
126 ## Record the occurrences of symbols (A, T, G, C, N, -, ...) at each site
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
127 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
128 # @return: list of dico whose keys are symbols and values are their occurrences
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
129 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
def getListOccPerSite(self):
    """Record the occurrences of each symbol (A, T, G, C, N, -, ...) per site.

    Symbols are upper-cased before being counted.  Parsing stops at the
    first sequence whose ``sequence`` attribute is None.  The number of
    sites is fixed by the first sequence parsed; a later sequence that is
    longer raises IndexError.

    @return: list with one dictionary per site, whose keys are the
             symbols seen at that site and whose values are their counts
    """
    lOccPerSite = []
    for rank, bs in enumerate(self.db):
        if bs.sequence is None:
            break
        if rank == 0:
            # the first sequence defines how many sites there are
            lOccPerSite = [{} for _ in bs.sequence]
        for pos, symbol in enumerate(bs.sequence):
            symbol = symbol.upper()
            dSymbol2Occ = lOccPerSite[pos]
            dSymbol2Occ[symbol] = dSymbol2Occ.get(symbol, 0) + 1
    return lOccPerSite
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
156
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
157 #TODO: review minNbNt !!! It should be at least 2 nucleotides to build a consensus...
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
158 ## Make a consensus from the MSA
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
159 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
160 # @param minNbNt: minimum nb of nucleotides to edit a consensus
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
161 # @param minPropNt: minimum proportion for the major nucleotide to be used, otherwise add 'N' (default=0.0)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
162 # @param verbose: level of information sent to stdout (default=0/1)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
163 # @return: consensus
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
164 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
def getConsensus(self, minNbNt, minPropNt=0.0, verbose=0, isHeaderSAtannot=False):
    """Make a consensus from the MSA.

    For each site, the most frequent non-gap symbol is chosen (on a tie,
    the first one met while iterating over the site dictionary).  It is
    replaced by 'N' when minPropNt > 0 and its share among the non-gap
    symbols of the site is below minPropNt.  The site contributes to the
    consensus only if it holds at least minNbNt non-gap symbols; sites
    made of gaps only never contribute and are reported as removed.

    Messages (errors, warnings, verbose traces) are written to stdout.

    @param minNbNt: minimum nb of nucleotides at a site to edit the
                    consensus there; lowered to (nb of sequences - 1)
                    when it is not below the nb of sequences
    @param minPropNt: minimum proportion for the major nucleotide to be
                      used, otherwise add 'N' (default=0.0)
    @param verbose: level of information sent to stdout (default=0)
    @param isHeaderSAtannot: when True, the first header of the bank is
                             split on "Gr" and "Cl" to add 'pile' and
                             'pyramid' fields to the consensus header
                             (raises IndexError if a marker is missing)
    @return: a Bioseq holding the consensus, or None when the consensus
             is empty or when its proportion of 'N' reaches 40%
    @note: calls sys.exit(1) with less than 2 sequences or when
           minPropNt >= 1.0
    """
    maxPropN = 0.40  # discard consensus if more than 40% of N's

    nbInSeq = self.getSize()
    if verbose > 0:
        print("nb of aligned sequences: %i" % (nbInSeq))
        sys.stdout.flush()
    if nbInSeq < 2:
        print("ERROR: can't make a consensus with less than 2 sequences")
        sys.exit(1)
    if minNbNt >= nbInSeq:
        minNbNt = nbInSeq - 1
        print("minNbNt=%i" % (minNbNt))
    if minPropNt >= 1.0:
        print("ERROR: minPropNt=%.2f should be a proportion (below 1.0)" % (minPropNt))
        sys.exit(1)

    lOccPerSite = self.getListOccPerSite()
    nbSites = len(lOccPerSite)
    if verbose > 0:
        print("nb of sites: %i" % (nbSites))
        sys.stdout.flush()

    lConsensus = []  # one symbol per retained site, joined at the end
    nbRmvColumns = 0

    # for each site (i.e. each column of the MSA)
    for countSites, dNt2Occ in enumerate(lOccPerSite, 1):
        if verbose > 1:
            print("site %s / %i" % (str(countSites).zfill(len(str(nbSites))), nbSites))
            sys.stdout.flush()
        occMaxNt = 0  # occurrences of the predominant nucleotide at this site
        bestNt = None
        nbNt = 0  # total nb of non-gap symbols at this site

        # for each distinct symbol at this site, gaps excluded
        for symbol in dNt2Occ:
            if symbol == "-":
                continue
            occ = dNt2Occ[symbol]
            nbNt += occ
            if verbose > 1:
                print("%s: %i" % (symbol, occ))
            if occ > occMaxNt:  # strict: the first symbol reaching the max wins
                occMaxNt = occ
                bestNt = symbol

        if nbNt == 0:
            # some MSA programs can remove some sequences (e.g. Muscle after
            # Recon) or when using Refalign (non-alignable TE fragments put
            # together via a refseq).  Such a site is always skipped, so that
            # no symbol left over from a previous site can be appended when
            # minNbNt <= 0.
            nbRmvColumns += 1
            continue

        # if the predominant nucleotide occurs in less than x% of the
        # non-gap symbols, put a "N"
        if minPropNt > 0.0 and float(occMaxNt) / float(nbNt) < minPropNt:
            bestNt = "N"

        if int(nbNt) >= int(minNbNt):
            lConsensus.append(bestNt)
            if verbose > 1:
                print("-> %s" % (bestNt))

    if nbRmvColumns:
        if nbRmvColumns == 1:
            print("WARNING: 1 site was removed (%.2f%%)" % (nbRmvColumns / float(nbSites) * 100))
        else:
            print("WARNING: %i sites were removed (%.2f%%)" % (nbRmvColumns, nbRmvColumns / float(nbSites) * 100))
        sys.stdout.flush()

    seqConsensus = "".join(lConsensus)
    # checked before the division below, whatever the nb of removed sites
    if seqConsensus == "":
        print("WARNING: no consensus can be built (no sequence left)")
        return None

    propN = seqConsensus.count("N") / float(len(seqConsensus))
    if propN >= maxPropN:
        print("WARNING: no consensus can be built (%i%% of N's >= %i%%)" % (propN * 100, maxPropN * 100))
        return None
    elif propN >= maxPropN * 0.5:
        print("WARNING: %i%% of N's" % (propN * 100))

    consensus = Bioseq()
    consensus.sequence = seqConsensus
    if isHeaderSAtannot:
        header = self.db[0].header
        pyramid = header.split("Gr")[1].split("Cl")[0]
        pile = header.split("Cl")[1].split(" ")[0]
        consensus.header = "consensus=%s length=%i nbAlign=%i pile=%s pyramid=%s" % (self.name, len(seqConsensus), self.getSize(), pile, pyramid)
    else:
        consensus.header = "consensus=%s length=%i nbAlign=%i" % (self.name, len(seqConsensus), self.getSize())

    if verbose > 0:
        statEntropy = self.getEntropy(verbose - 1)
        print("entropy: %s" % (statEntropy.stringQuantiles()))
        sys.stdout.flush()

    return consensus
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
262
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
263
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
264 ## Get the entropy of the whole multiple alignment (only for A, T, G and C)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
265 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
266 # @param verbose level of verbosity
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
267 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
268 # @return statistics about the entropy of the MSA
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
269 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
def getEntropy(self, verbose=0):
    """Get the entropy of the whole multiple alignment (only for A, T, G and C).

    For each site, the occurrences of every non-gap symbol are assigned
    to A, T, G or C: a symbol that self.getATGCNFromIUPAC() maps to one
    of these four bases adds its count to that base, any other symbol
    adds a quarter of its count to each of the four bases.  The entropy
    of the site is the sum of self.computeEntropy() over the four bases,
    using the number of non-gap symbols of the site as total.

    @param verbose: level of verbosity (site summary above 1, per-base
                    counts above 2, both on stdout)
    @return: Stat instance to which the entropy of each site was added
    """
    stats = Stat()
    lBases = ["A", "T", "G", "C"]

    # for each site, from the occurrences of symbols recorded at that site
    for countSite, dSymbol2Occ in enumerate(self.getListOccPerSite(), 1):

        # count the number of nucleotides (doesn't count gap '-')
        nbNt = 0
        dATGC2Occ = {}
        for base in lBases:
            dATGC2Occ[base] = 0.0
        for nt in dSymbol2Occ:
            if nt == "-":
                continue
            # always use the count of the symbol actually seen at the site,
            # not the count of the letter it is mapped to: the mapped letter
            # may be absent from the site or already counted on its own
            occ = dSymbol2Occ[nt]
            nbNt += occ
            checkedNt = self.getATGCNFromIUPAC(nt)
            if checkedNt in lBases:
                dATGC2Occ[checkedNt] += occ
            else:  # for 'N'
                for base in lBases:
                    dATGC2Occ[base] += 0.25 * occ
        if verbose > 2:
            for base in dATGC2Occ:
                print("%s: %i" % (base, dATGC2Occ[base]))

        # compute the entropy for the site
        entropySite = 0.0
        for base in dATGC2Occ:
            entropySite += self.computeEntropy(dATGC2Occ[base], nbNt)
        if verbose > 1:
            print("site %i (%i nt): entropy = %.3f" % (countSite, nbNt, entropySite))
        stats.add(entropySite)

    return stats
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
313
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
314
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
315 ## Get A, T, G, C or N from an IUPAC letter
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
316 # IUPAC = ['A','T','G','C','U','R','Y','M','K','W','S','B','D','H','V','N']
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
317 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
318 # @return A, T, G, C or N
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
319 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
def getATGCNFromIUPAC(self, nt):
    """Get A, T, G, C or N from an IUPAC letter.

    Thin wrapper: the conversion itself is delegated to the method of the
    same name on a freshly created Bioseq.

    @param nt: symbol to convert
    @return: whatever Bioseq.getATGCNFromIUPAC() returns for ``nt``
    """
    return Bioseq().getATGCNFromIUPAC(nt)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
323
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
324
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
325 ## Compute the entropy based on the occurrences of a certain nucleotide and the total number of nucleotides
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
326 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
def computeEntropy(self, nbOcc, nbNt):
    """Compute the entropy term of one nucleotide, in bits.

    The term is -f * log2(f), with f = nbOcc / nbNt.

    @param nbOcc: occurrences of the nucleotide
    @param nbNt: total number of nucleotides (only used when nbOcc is not 0)
    @return: 0.0 when nbOcc is 0, the entropy term otherwise
    """
    if nbOcc == 0.0:
        # by convention an absent nucleotide contributes nothing
        return 0.0
    freq = nbOcc / float(nbNt)
    return -freq * log(freq) / log(2)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
333
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
334
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
335 ## Save the multiple alignment as a matrix with '0' if gap, '1' otherwise
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
336 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
def saveAsBinaryMatrix(self, outFile):
    """Save the multiple alignment as a matrix with '0' if gap, '1' otherwise.

    One line is written per sequence: its header followed by one
    tab-separated flag per position of the sequence.

    @param outFile: path of the file to write (opened in "w" mode)
    """
    # the 'with' block closes the file even if a write fails
    with open(outFile, "w") as outFileHandler:
        for bs in self.db:
            lFields = ["%s" % (bs.header)]
            lFields.extend("0" if nt == "-" else "1" for nt in bs.sequence)
            outFileHandler.write("%s\n" % ("\t".join(lFields)))
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
348
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
349
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
350 ## Return a list of Align instances corresponding to the aligned regions (without gaps)
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
351 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
352 # @param query string header of the sequence considered as query
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
353 # @param subject string header of the sequence considered as subject
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
354 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
def getAlignList( self, query, subject ):
    """Build one Align instance per gap-free stretch shared by two aligned sequences.

    The two aligned sequences are fetched by header and scanned column by
    column. A run of consecutive columns in which neither sequence has a
    gap ('-') yields a single Align; any column with a gap in at least
    one of the two sequences ends the current run.

    Coordinates are counted on the ungapped sequences and start at 1.
    While a run grows, 'score' and 'identity' both count the columns whose
    two symbols are equal. When the run ends, 'identity' is replaced by
    100 * identity / getLengthOnQuery().

    @param query string header of the sequence considered as query
    @param subject string header of the sequence considered as subject
    @return list of Align instances, in alignment order
    """
    seqQ = self.fetch( query ).sequence
    seqS = self.fetch( subject ).sequence
    lAligns = []
    posQ = 0
    posS = 0
    inRun = False

    def finalizeIdentity( iAlign ):
        # Turn the raw count of identical columns into a value relative
        # to the length of the run on the query.
        iAlign.identity = 100 * iAlign.identity / iAlign.getLengthOnQuery()

    for column, symbolQ in enumerate( seqQ ):
        symbolS = seqS[ column ]
        gapInQ = ( symbolQ == "-" )
        gapInS = ( symbolS == "-" )

        if gapInQ or gapInS:
            # A gap on either side closes the run in progress, if any.
            if inRun:
                finalizeIdentity( lAligns[-1] )
                inRun = False
            # Only the sequence carrying a real symbol advances; a column
            # with two gaps advances neither.
            if not gapInQ:
                posQ += 1
            elif not gapInS:
                posS += 1
            continue

        posQ += 1
        posS += 1
        isSame = int( symbolQ == symbolS )
        if inRun:
            current = lAligns[-1]
            current.range_query.end += 1
            current.range_subject.end += 1
            current.score += isSame
            current.identity += isSame
        else:
            # NOTE(review): the third argument (0) is passed positionally;
            # its meaning is defined by Align's constructor - confirm there.
            lAligns.append( Align( Range( query, posQ, posQ ),
                                   Range( subject, posS, posS ),
                                   0,
                                   isSame,
                                   isSame ) )
            inRun = True

    # The alignment may end inside a run that still needs normalising.
    if inRun:
        finalizeIdentity( lAligns[-1] )
    return lAligns
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
392
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
393
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
def removeGaps(self):
    """Ask every sequence in self.db to remove the gap symbol '-'.

    Each element of self.db is visited in order and its removeSymbol
    method is called with "-". Nothing is returned.
    """
    gapSymbol = "-"
    for bioseq in self.db:
        bioseq.removeSymbol( gapSymbol )
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
397
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
398 ## Compute mean per cent identity for MSA.
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
399 # First sequence in MSA is considered as reference sequence.
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
400 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
401 #
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
def computeMeanPcentIdentity(self):
    """Return the rounded mean per cent identity of the MSA against its first sequence.

    The first element of self.db is the reference. Every other element is
    compared to it with _computePcentIdentityBetweenSeqRefAndCurrentSeq,
    and the mean of those values is passed through round().

    Raises IndexError when self.db is empty and ZeroDivisionError when it
    holds only the reference sequence.
    """
    reference = self.db[0]
    lOthers = self.db[1:]

    total = 0
    for current in lOthers:
        total += self._computePcentIdentityBetweenSeqRefAndCurrentSeq(reference, current)

    return round( total / len(lOthers) )
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
414
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
def _computePcentIdentityBetweenSeqRefAndCurrentSeq(self, seqRef, seq):
    """Return the per cent identity of one aligned sequence against the reference.

    The symbols of seq.sequence are compared position by position with
    seqRef.sequence. A position counts as identical when the reference
    symbol is not a gap ('-') and equals the symbol of 'seq'. The count is
    divided by seqRef.getLength() and multiplied by 100.

    NOTE(review): whether getLength() includes gap columns depends on the
    sequence class - confirm there.

    @param seqRef reference sequence (needs 'sequence' and getLength())
    @param seq sequence compared to the reference (needs 'sequence')
    @return float percentage
    """
    nbIdentical = 0
    for position, symbol in enumerate( seq.sequence ):
        # Index the reference directly so that a 'seq' longer than the
        # reference still raises IndexError.
        symbolRef = seqRef.sequence[position]
        if symbolRef == "-":
            continue
        if symbolRef == symbol:
            nbIdentical += 1

    return float(nbIdentical) / float(seqRef.getLength()) * 100
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
426
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
427
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
428
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
429
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
430
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
431
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
432
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
433
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
434
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
435
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
436
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
437
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
438
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
439
feef9a0db09d Uploaded
urgi-team
parents:
diff changeset
440