comparison commons/tools/RmvPairAlignInChunkOverlaps.py @ 31:0ab839023fe4

Uploaded
author m-zytnicki
date Tue, 30 Apr 2013 14:33:21 -0400
parents 94ab73e8a190
children
comparison
equal deleted inserted replaced
30:5677346472b5 31:0ab839023fe4
1 #!/usr/bin/env python
2
3 """
4 Remove hits due to chunk overlaps.
5 """
6
7 import os
8 import sys
9 import getopt
10 import exceptions
11 import copy
12 from commons.core.coord.Align import *
13
14
15 class RmvPairAlignInChunkOverlaps( object ):
16 """
17 Remove hits due to chunk overlaps.
18 """
19
20
21 def __init__( self, inFileName="", chunkLength=200000, chunkOverlap=10000, margin=10, outFileName="", verbose=0 ):
22 """
23 Constructor.
24 """
25 self._inFileName = inFileName
26 self._chunkLength = chunkLength
27 self._chunkOverlap = chunkOverlap
28 self._margin = margin
29 self._outFileName = outFileName
30 self._verbose = verbose
31
32 def help( self ):
33 """
34 Display the help.
35 """
36 print
37 print "usage: %s [ options ]" % ( sys.argv[0] )
38 print "options:"
39 print " -h: this help"
40 print " -i: name of the input file (format='align')"
41 print " -l: chunk length (in bp)"
42 print " -o: chunk overlap (in bp)"
43 print " -m: margin to remove match included into a chunk overlap (default=10)"
44 print " -O: name of the output file (default=inFileName+'.not_over')"
45 print " -v: verbose (default=0/1)"
46 print
47
48 def setAttributesFromCmdLine( self ):
49 """
50 Set attributes from the command-line arguments.
51 """
52 try:
53 opts, args = getopt.getopt(sys.argv[1:],"h:i:l:o:m:O:v:")
54 except getopt.GetoptError, err:
55 print str(err); self.help(); sys.exit(1)
56 for o,a in opts:
57 if o == "-h":
58 self.help(); sys.exit(0)
59 elif o == "-i":
60 self.setInputFileName( a )
61 elif o == "-l":
62 self.setChunkLength( a )
63 elif o == "-o":
64 self.setChunkOverlap( a )
65 elif o == "-m":
66 self.setMargin( a )
67 elif o == "-O":
68 self.setOutputFileName( a )
69 elif o == "-v":
70 self.setVerbosityLevel( a )
71
72 def setInputFileName( self, inFileName ):
73 self._inFileName = inFileName
74
75 def setChunkLength( self, chunkLength ):
76 self._chunkLength = int(chunkLength)
77
78 def setChunkOverlap( self, chunkOverlap ):
79 self._chunkOverlap = int(chunkOverlap)
80
81 def setMargin( self, margin ):
82 self._margin = int(margin)
83
84 def setOutputFileName( self, outFileName ):
85 self._outFileName = outFileName
86
87 def setVerbosityLevel( self, verbose ):
88 self._verbose = int(verbose)
89
90 def checkAttributes( self ):
91 """
92 Before running, check the required attributes are properly filled.
93 """
94 if self._inFileName == "":
95 print "ERROR: missing input file"; self.help(); sys.exit(1)
96 if not os.path.exists(self._inFileName ):
97 print "ERROR: input file '%s' doesn't exist" %( self._inFileName )
98 if self._outFileName == "":
99 self._outFileName = "%s.not_over" % ( self._inFileName )
100
101
102 def isPairAlignAChunkOverlap( self, a, chunkQuery, chunkSubject ):
103 """
104 Return True if the pairwise alignment exactly corresponds to a 2-chunk overlap, False otherwise.
105 Take into account cases specific to BLASTER or PALS.
106 """
107
108 if a.range_query.isOnDirectStrand() != a.range_subject.isOnDirectStrand():
109 if self._verbose > 1: print "on different strand"
110 return False
111
112 if chunkQuery == chunkSubject + 1:
113 if self._verbose > 1: print "query > subject"
114 if a.range_query.start == 1 and a.range_subject.end == self._chunkLength \
115 and ( a.range_query.getLength() == self._chunkOverlap \
116 or a.range_query.getLength() == self._chunkOverlap + 1 ) \
117 and ( a.range_subject.getLength() == self._chunkOverlap \
118 or a.range_subject.getLength() == self._chunkOverlap + 1 ):
119 if self._verbose > 1: print "chunk overlap"
120 return True
121
122 elif chunkQuery == chunkSubject - 1:
123 if self._verbose > 1: print "query < subject"
124 if a.range_query.end == self._chunkLength and a.range_subject.start == 1 \
125 and ( a.range_query.getLength() == self._chunkOverlap \
126 or a.range_query.getLength() == self._chunkOverlap + 1 ) \
127 and ( a.range_subject.getLength() == self._chunkOverlap \
128 or a.range_subject.getLength() == self._chunkOverlap + 1 ):
129 if self._verbose > 1: print "chunk overlap"
130 return True
131
132 if self._verbose > 1: print "not a chunk overlap"
133 return False
134
135
136 def isInInterval( self, coord, midpoint ):
137 """
138 Check if a given coordinate is inside the interval [midpoint-margin;midpoint+margin].
139 """
140 if coord >= midpoint - self._margin and coord <= midpoint + self._margin:
141 return True
142 return False
143
144
145 def isPairAlignWithinAndDueToAChunkOverlap( self, a, chunkQuery, chunkSubject ):
146 """
147 Return True if the pairwise alignment lies within an overlap between two contiguous chunks and is due to it, False otherwise.
148 """
149 uniqLength = self._chunkLength - self._chunkOverlap
150
151 if a.range_query.isOnDirectStrand() != a.range_subject.isOnDirectStrand():
152 if self._verbose > 1: print "on different strand"
153 return False
154
155 if chunkQuery == chunkSubject + 1:
156 if self._verbose > 1: print "query > subject"
157 if a.range_query.getMin() >= 1 and a.range_query.getMax() <= self._chunkOverlap \
158 and a.range_subject.getMin() >= self._chunkLength - self._chunkOverlap + 1 \
159 and a.range_subject.getMax() <= self._chunkLength:
160 if self._verbose > 1: print "included"
161 if self.isInInterval( a.range_query.getMin(), a.range_subject.getMin() - uniqLength ) \
162 and self.isInInterval( self._chunkOverlap - a.range_query.getMax(), self._chunkLength - a.range_subject.getMax() ):
163 if self._verbose > 1: print "due to overlap"
164 return True
165 else:
166 if self._verbose > 1: print "not due to overlap"
167 return False
168
169 elif chunkQuery == chunkSubject - 1:
170 if self._verbose > 1: print "query < subject"
171 if a.range_query.getMin() >= self._chunkLength - self._chunkOverlap + 1 \
172 and a.range_query.getMax() <= self._chunkLength \
173 and a.range_subject.getMin() >= 1 \
174 and a.range_subject.getMax() <= self._chunkOverlap:
175 if self._verbose > 1: print "included"
176 if self.isInInterval( a.range_subject.getMin(), a.range_query.getMin() - uniqLength ) \
177 and self.isInInterval( self._chunkOverlap - a.range_subject.getMax(), self._chunkLength - a.range_query.getMax() ):
178 if self._verbose > 1: print "due to overlap"
179 return True
180 else:
181 if self._verbose > 1: print "not due to overlap"
182 return False
183
184 else:
185 if self._verbose > 1: print "not contiguous chunks"
186 return False
187
188 if self._verbose > 1: print "not included"
189 return False
190
191
192 def removeChunkOverlaps( self ):
193 """
194 Remove pairwise alignments exactly corresponding to chunk overlaps or those included within such overlaps.
195 """
196 totalNbPairAlign = 0
197 nbChunkOverlaps = 0
198 d = {}
199 nbPairAlignWithinChunkOverlaps = 0
200
201 inF = open( self._inFileName, "r" )
202 outF = open( self._outFileName, "w" )
203 alignInstance = Align()
204
205 while True:
206 if not alignInstance.read( inF ): break
207 totalNbPairAlign += 1
208 if self._verbose > 1: alignInstance.show()
209
210 if "chunk" not in alignInstance.range_query.seqname or "chunk" not in alignInstance.range_subject.seqname:
211 print "WARNING: no 'chunk' in query or subject name"; return False
212
213 chunkQuery = int(alignInstance.range_query.seqname.replace("chunk",""))
214 chunkSubject = int(alignInstance.range_subject.seqname.replace("chunk",""))
215
216 if abs( chunkSubject - chunkQuery ) > 1:
217 if self._verbose > 1: print "non contiguous chunks -> keep"
218 alignInstance.write( outF )
219 continue
220
221 if alignInstance.range_query.isOnDirectStrand() != alignInstance.range_subject.isOnDirectStrand():
222 if self._verbose > 1: print "on different strand"
223 alignInstance.write( outF )
224 continue
225
226 if abs( chunkSubject - chunkQuery ) == 0:
227 if alignInstance.range_query.start == 1 \
228 and alignInstance.range_query.end == self._chunkLength \
229 and alignInstance.range_subject.start == 1 \
230 and alignInstance.range_subject.end == self._chunkLength:
231 if self._verbose > 1: print "self-alignment on whole chunk -> remove"
232 continue
233
234 if self.isPairAlignAChunkOverlap( alignInstance, chunkQuery, chunkSubject ):
235 if self._verbose > 1: print "chunk overlap -> remove"
236 nbChunkOverlaps += 1
237
238 elif self.isPairAlignWithinAndDueToAChunkOverlap( alignInstance, chunkQuery, chunkSubject ):
239 if self._verbose > 1: print "within chunk overlap -> remove"
240 nbPairAlignWithinChunkOverlaps += 1
241
242 else:
243 if self._verbose > 1: print "keep"
244 alignInstance.write( outF )
245
246 inF.close()
247 if self._verbose > 0: print "nb of pairwise alignments in input file: %i" % ( totalNbPairAlign )
248 if self._verbose > 0: print "nb of chunk overlaps: %i" % ( nbChunkOverlaps )
249 if self._verbose > 0: print "nb of pairwise alignments within chunk overlaps: %i" % ( nbPairAlignWithinChunkOverlaps )
250
251 for names,lAligns in d.items():
252 for alignInstance in lAligns:
253 alignInstance.write( outF )
254 outF.close()
255
256
257 def start( self ):
258 """
259 Useful commands before running the program.
260 """
261 if self._verbose > 0:
262 print "START %s" % ( type(self).__name__ ); sys.stdout.flush()
263 self.checkAttributes()
264
265
266 def end( self ):
267 """
268 Useful commands before ending the program.
269 """
270 if self._verbose > 0:
271 print "END %s" % ( type(self).__name__ ); sys.stdout.flush()
272
273
274 def run( self ):
275 """
276 Run the program.
277 """
278 self.start()
279 self.removeChunkOverlaps()
280 self.end()
281
282
283 if __name__ == '__main__':
284 i = RmvPairAlignInChunkOverlaps()
285 i.setAttributesFromCmdLine()
286 i.run()