annotate commons/tools/RmvPairAlignInChunkOverlaps.py @ 18:94ab73e8a190

Uploaded
author m-zytnicki
date Mon, 29 Apr 2013 03:20:15 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
18
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
1 #!/usr/bin/env python
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
2
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
3 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
4 Remove hits due to chunk overlaps.
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
5 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
6
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
7 import os
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
8 import sys
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
9 import getopt
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
10 import exceptions
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
11 import copy
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
12 from commons.core.coord.Align import *
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
13
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
14
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
15 class RmvPairAlignInChunkOverlaps( object ):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
16 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
17 Remove hits due to chunk overlaps.
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
18 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
19
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
20
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
21 def __init__( self, inFileName="", chunkLength=200000, chunkOverlap=10000, margin=10, outFileName="", verbose=0 ):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
22 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
23 Constructor.
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
24 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
25 self._inFileName = inFileName
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
26 self._chunkLength = chunkLength
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
27 self._chunkOverlap = chunkOverlap
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
28 self._margin = margin
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
29 self._outFileName = outFileName
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
30 self._verbose = verbose
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
31
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
32 def help( self ):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
33 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
34 Display the help.
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
35 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
36 print
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
37 print "usage: %s [ options ]" % ( sys.argv[0] )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
38 print "options:"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
39 print " -h: this help"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
40 print " -i: name of the input file (format='align')"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
41 print " -l: chunk length (in bp)"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
42 print " -o: chunk overlap (in bp)"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
43 print " -m: margin to remove match included into a chunk overlap (default=10)"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
44 print " -O: name of the output file (default=inFileName+'.not_over')"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
45 print " -v: verbose (default=0/1)"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
46 print
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
47
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
48 def setAttributesFromCmdLine( self ):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
49 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
50 Set attributes from the command-line arguments.
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
51 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
52 try:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
53 opts, args = getopt.getopt(sys.argv[1:],"h:i:l:o:m:O:v:")
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
54 except getopt.GetoptError, err:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
55 print str(err); self.help(); sys.exit(1)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
56 for o,a in opts:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
57 if o == "-h":
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
58 self.help(); sys.exit(0)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
59 elif o == "-i":
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
60 self.setInputFileName( a )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
61 elif o == "-l":
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
62 self.setChunkLength( a )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
63 elif o == "-o":
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
64 self.setChunkOverlap( a )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
65 elif o == "-m":
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
66 self.setMargin( a )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
67 elif o == "-O":
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
68 self.setOutputFileName( a )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
69 elif o == "-v":
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
70 self.setVerbosityLevel( a )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
71
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
72 def setInputFileName( self, inFileName ):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
73 self._inFileName = inFileName
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
74
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
75 def setChunkLength( self, chunkLength ):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
76 self._chunkLength = int(chunkLength)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
77
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
78 def setChunkOverlap( self, chunkOverlap ):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
79 self._chunkOverlap = int(chunkOverlap)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
80
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
81 def setMargin( self, margin ):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
82 self._margin = int(margin)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
83
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
84 def setOutputFileName( self, outFileName ):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
85 self._outFileName = outFileName
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
86
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
87 def setVerbosityLevel( self, verbose ):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
88 self._verbose = int(verbose)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
89
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
90 def checkAttributes( self ):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
91 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
92 Before running, check the required attributes are properly filled.
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
93 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
94 if self._inFileName == "":
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
95 print "ERROR: missing input file"; self.help(); sys.exit(1)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
96 if not os.path.exists(self._inFileName ):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
97 print "ERROR: input file '%s' doesn't exist" %( self._inFileName )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
98 if self._outFileName == "":
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
99 self._outFileName = "%s.not_over" % ( self._inFileName )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
100
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
101
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
102 def isPairAlignAChunkOverlap( self, a, chunkQuery, chunkSubject ):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
103 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
104 Return True if the pairwise alignment exactly corresponds to a 2-chunk overlap, False otherwise.
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
105 Take into account cases specific to BLASTER or PALS.
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
106 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
107
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
108 if a.range_query.isOnDirectStrand() != a.range_subject.isOnDirectStrand():
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
109 if self._verbose > 1: print "on different strand"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
110 return False
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
111
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
112 if chunkQuery == chunkSubject + 1:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
113 if self._verbose > 1: print "query > subject"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
114 if a.range_query.start == 1 and a.range_subject.end == self._chunkLength \
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
115 and ( a.range_query.getLength() == self._chunkOverlap \
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
116 or a.range_query.getLength() == self._chunkOverlap + 1 ) \
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
117 and ( a.range_subject.getLength() == self._chunkOverlap \
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
118 or a.range_subject.getLength() == self._chunkOverlap + 1 ):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
119 if self._verbose > 1: print "chunk overlap"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
120 return True
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
121
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
122 elif chunkQuery == chunkSubject - 1:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
123 if self._verbose > 1: print "query < subject"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
124 if a.range_query.end == self._chunkLength and a.range_subject.start == 1 \
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
125 and ( a.range_query.getLength() == self._chunkOverlap \
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
126 or a.range_query.getLength() == self._chunkOverlap + 1 ) \
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
127 and ( a.range_subject.getLength() == self._chunkOverlap \
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
128 or a.range_subject.getLength() == self._chunkOverlap + 1 ):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
129 if self._verbose > 1: print "chunk overlap"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
130 return True
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
131
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
132 if self._verbose > 1: print "not a chunk overlap"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
133 return False
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
134
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
135
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
136 def isInInterval( self, coord, midpoint ):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
137 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
138 Check if a given coordinate is inside the interval [midpoint-margin;midpoint+margin].
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
139 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
140 if coord >= midpoint - self._margin and coord <= midpoint + self._margin:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
141 return True
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
142 return False
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
143
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
144
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
145 def isPairAlignWithinAndDueToAChunkOverlap( self, a, chunkQuery, chunkSubject ):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
146 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
147 Return True if the pairwise alignment lies within an overlap between two contiguous chunks and is due to it, False otherwise.
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
148 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
149 uniqLength = self._chunkLength - self._chunkOverlap
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
150
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
151 if a.range_query.isOnDirectStrand() != a.range_subject.isOnDirectStrand():
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
152 if self._verbose > 1: print "on different strand"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
153 return False
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
154
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
155 if chunkQuery == chunkSubject + 1:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
156 if self._verbose > 1: print "query > subject"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
157 if a.range_query.getMin() >= 1 and a.range_query.getMax() <= self._chunkOverlap \
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
158 and a.range_subject.getMin() >= self._chunkLength - self._chunkOverlap + 1 \
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
159 and a.range_subject.getMax() <= self._chunkLength:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
160 if self._verbose > 1: print "included"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
161 if self.isInInterval( a.range_query.getMin(), a.range_subject.getMin() - uniqLength ) \
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
162 and self.isInInterval( self._chunkOverlap - a.range_query.getMax(), self._chunkLength - a.range_subject.getMax() ):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
163 if self._verbose > 1: print "due to overlap"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
164 return True
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
165 else:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
166 if self._verbose > 1: print "not due to overlap"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
167 return False
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
168
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
169 elif chunkQuery == chunkSubject - 1:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
170 if self._verbose > 1: print "query < subject"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
171 if a.range_query.getMin() >= self._chunkLength - self._chunkOverlap + 1 \
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
172 and a.range_query.getMax() <= self._chunkLength \
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
173 and a.range_subject.getMin() >= 1 \
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
174 and a.range_subject.getMax() <= self._chunkOverlap:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
175 if self._verbose > 1: print "included"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
176 if self.isInInterval( a.range_subject.getMin(), a.range_query.getMin() - uniqLength ) \
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
177 and self.isInInterval( self._chunkOverlap - a.range_subject.getMax(), self._chunkLength - a.range_query.getMax() ):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
178 if self._verbose > 1: print "due to overlap"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
179 return True
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
180 else:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
181 if self._verbose > 1: print "not due to overlap"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
182 return False
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
183
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
184 else:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
185 if self._verbose > 1: print "not contiguous chunks"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
186 return False
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
187
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
188 if self._verbose > 1: print "not included"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
189 return False
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
190
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
191
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
192 def removeChunkOverlaps( self ):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
193 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
194 Remove pairwise alignments exactly corresponding to chunk overlaps or those included within such overlaps.
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
195 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
196 totalNbPairAlign = 0
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
197 nbChunkOverlaps = 0
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
198 d = {}
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
199 nbPairAlignWithinChunkOverlaps = 0
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
200
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
201 inF = open( self._inFileName, "r" )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
202 outF = open( self._outFileName, "w" )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
203 alignInstance = Align()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
204
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
205 while True:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
206 if not alignInstance.read( inF ): break
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
207 totalNbPairAlign += 1
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
208 if self._verbose > 1: alignInstance.show()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
209
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
210 if "chunk" not in alignInstance.range_query.seqname or "chunk" not in alignInstance.range_subject.seqname:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
211 print "WARNING: no 'chunk' in query or subject name"; return False
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
212
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
213 chunkQuery = int(alignInstance.range_query.seqname.replace("chunk",""))
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
214 chunkSubject = int(alignInstance.range_subject.seqname.replace("chunk",""))
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
215
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
216 if abs( chunkSubject - chunkQuery ) > 1:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
217 if self._verbose > 1: print "non contiguous chunks -> keep"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
218 alignInstance.write( outF )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
219 continue
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
220
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
221 if alignInstance.range_query.isOnDirectStrand() != alignInstance.range_subject.isOnDirectStrand():
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
222 if self._verbose > 1: print "on different strand"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
223 alignInstance.write( outF )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
224 continue
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
225
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
226 if abs( chunkSubject - chunkQuery ) == 0:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
227 if alignInstance.range_query.start == 1 \
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
228 and alignInstance.range_query.end == self._chunkLength \
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
229 and alignInstance.range_subject.start == 1 \
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
230 and alignInstance.range_subject.end == self._chunkLength:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
231 if self._verbose > 1: print "self-alignment on whole chunk -> remove"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
232 continue
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
233
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
234 if self.isPairAlignAChunkOverlap( alignInstance, chunkQuery, chunkSubject ):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
235 if self._verbose > 1: print "chunk overlap -> remove"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
236 nbChunkOverlaps += 1
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
237
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
238 elif self.isPairAlignWithinAndDueToAChunkOverlap( alignInstance, chunkQuery, chunkSubject ):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
239 if self._verbose > 1: print "within chunk overlap -> remove"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
240 nbPairAlignWithinChunkOverlaps += 1
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
241
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
242 else:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
243 if self._verbose > 1: print "keep"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
244 alignInstance.write( outF )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
245
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
246 inF.close()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
247 if self._verbose > 0: print "nb of pairwise alignments in input file: %i" % ( totalNbPairAlign )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
248 if self._verbose > 0: print "nb of chunk overlaps: %i" % ( nbChunkOverlaps )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
249 if self._verbose > 0: print "nb of pairwise alignments within chunk overlaps: %i" % ( nbPairAlignWithinChunkOverlaps )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
250
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
251 for names,lAligns in d.items():
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
252 for alignInstance in lAligns:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
253 alignInstance.write( outF )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
254 outF.close()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
255
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
256
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
257 def start( self ):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
258 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
259 Useful commands before running the program.
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
260 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
261 if self._verbose > 0:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
262 print "START %s" % ( type(self).__name__ ); sys.stdout.flush()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
263 self.checkAttributes()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
264
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
265
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
266 def end( self ):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
267 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
268 Useful commands before ending the program.
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
269 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
270 if self._verbose > 0:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
271 print "END %s" % ( type(self).__name__ ); sys.stdout.flush()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
272
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
273
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
274 def run( self ):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
275 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
276 Run the program.
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
277 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
278 self.start()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
279 self.removeChunkOverlaps()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
280 self.end()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
281
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
282
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
283 if __name__ == '__main__':
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
284 i = RmvPairAlignInChunkOverlaps()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
285 i.setAttributesFromCmdLine()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
286 i.run()