annotate commons/tools/OrientSequences.py @ 18:94ab73e8a190

Uploaded
author m-zytnicki
date Mon, 29 Apr 2013 03:20:15 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
18
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
1 #!/usr/bin/env python
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
2
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
3 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
4 Interface to orient sequences before making a multiple alignment.
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
5 Use hashing or suffix tree to get an idea of the appropriate strand.
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
6 Use 'orienter' by default, otherwise use 'mummer'.
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
7 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
8
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
9 import sys
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
10 import os
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
11 import glob
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
12 import getopt
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
13
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
14 from commons.core.seq.BioseqDB import BioseqDB
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
15 import pyRepet.seq.fastaDB
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
16 from commons.core.checker.CheckerUtils import CheckerUtils
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
17
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
18 class OrientSequences( object ):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
19 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
20 Interface to orient sequences before making a multiple alignment.
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
21 Use hashing or suffix tree to get an idea of the appropriate strand.
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
22 Use 'orienter' by default, otherwise use 'mummer'.
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
23 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
24
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
25 def __init__(self, inFileName="", minMatchLength=10, prgToOrient = "orienter", outFileName="", clean=False, verbosity=1):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
26 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
27 Constructor.
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
28 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
29 self._inFileName = inFileName
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
30 self._minMatchLength = minMatchLength
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
31 self._prgToOrient = prgToOrient
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
32 self._outFileName = outFileName
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
33 self._clean = clean
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
34 self._verbose = verbosity
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
35
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
36 def help( self ):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
37 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
38 Display the help on stdout.
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
39 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
40 print
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
41 print "usage:",sys.argv[0].split("/")[-1],"[options]"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
42 print "options:"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
43 print " -h: this help"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
44 print " -i: name of the input file (format='fasta')"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
45 print " -m: minimum match length (default=10)"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
46 print " -p: program to use first (default=orienter/mummer)"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
47 print " -o: name of the output file (default=inFileName+'.oriented')"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
48 print " -c: clean"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
49 print " -v: verbosity level (0/default=1/2)"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
50 print
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
51
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
52 def setAttributesFromCmdLine( self ):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
53 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
54 Set the attributes from the command-line.
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
55 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
56 try:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
57 opts, args = getopt.getopt(sys.argv[1:],"hi:m:p:o:cv:")
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
58 except getopt.GetoptError, err:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
59 print str(err); self.help(); sys.exit(1)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
60 for o,a in opts:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
61 if o == "-h":
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
62 self.help(); sys.exit(0)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
63 elif o == "-i":
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
64 self.setInputFileName( a )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
65 elif o == "-m":
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
66 self.setMinMatchLength( a )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
67 elif o == "-p":
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
68 self.setPrgToOrient( a )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
69 elif o == "-o":
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
70 self.setOutputFileName( a )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
71 elif o == "-c":
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
72 self.setClean()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
73 elif o == "-v":
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
74 self.setVerbosityLevel( a )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
75
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
76 def setInputFileName( self, inFileName ):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
77 self._inFileName = inFileName
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
78
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
79 def setMinMatchLength( self, minMatchLength ):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
80 self._minMatchLength = int(minMatchLength)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
81
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
82 def setPrgToOrient( self, prgToOrient ):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
83 self._prgToOrient = prgToOrient
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
84
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
85 def setOutputFileName( self, outFileName ):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
86 self._outFileName = outFileName
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
87
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
88 def setClean( self ):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
89 self._clean = True
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
90
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
91 def setVerbosityLevel( self, verbose ):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
92 self._verbose = int(verbose)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
93
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
94 def checkAttributes( self ):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
95 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
96 Check the attributes are valid before running the algorithm.
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
97 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
98 if self._inFileName == "":
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
99 print "ERROR: missing input file name"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
100 self.help(); sys.exit(1)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
101 if not os.path.exists( self._inFileName ):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
102 print "ERROR: input file '%s' doesn't exist" % ( self._inFileName )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
103 self.help(); sys.exit(1)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
104 if self._prgToOrient not in [ "orienter", "mummer" ]:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
105 print "ERROR: unknown program '%s'" % ( self._prgToOrient )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
106 self.help(); sys.exit(1)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
107 if self._outFileName == "":
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
108 self._outFileName = "%s.oriented" % ( self._inFileName )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
109
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
110 def useOrienter( self ):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
111 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
112 Use 'orienter'.
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
113 @return: exit value of 'orienter'
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
114 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
115 prg = "orienter"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
116 cmd = prg
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
117 cmd += " -k"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
118 # cmd += " -l %i" % ( self._minMatchLength )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
119 cmd += " -v %i" % ( self._verbose )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
120 cmd += " %s" % ( self._inFileName )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
121 log = os.system( cmd )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
122 if log == 0 and self._outFileName.split(".")[-1] != "oriented":
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
123 os.rename( self._inFileName + ".oriented", self._outFileName )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
124 return log
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
125
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
126 def compareInputSequencesWithMummer( self, nbInSeq ):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
127 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
128 Launch MUmmer on two single-sequence fasta files to find all maximal matches regardless of their uniqueness and record stdout.
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
129 Only N(N-1)/2 comparisons are made.
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
130 @param nbInSeq: number of input sequences
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
131 @type nbInSeq: integer
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
132 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
133 if self._verbose > 0:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
134 print "aligning input sequences..."
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
135 sys.stdout.flush()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
136 if not CheckerUtils.isExecutableInUserPath( "mummer" ):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
137 msg = "ERROR: 'mummer' is not in your PATH"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
138 sys.stderr.write( "%s\n" % ( msg ) )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
139 sys.exit(1)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
140 lInFiles = glob.glob( "batch_*.fa" )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
141 for i in range( 1, nbInSeq+1 ):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
142 for j in range( i+1, nbInSeq+1 ):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
143 if self._verbose > 1:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
144 print "launch MUmmer on %i versus %i" % ( i, j )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
145 sys.stdout.flush()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
146 prg = "mummer"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
147 cmd = prg
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
148 cmd += " -maxmatch"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
149 cmd += " -l %i" % ( self._minMatchLength )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
150 cmd += " -b"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
151 cmd += " -F"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
152 cmd += " batch_%s.fa" % ( str(j).zfill( len( str( len(lInFiles) ) ) ) )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
153 cmd += " batch_%s.fa" % ( str(i).zfill( len( str( len(lInFiles) ) ) ) )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
154 cmd += " > mummer_%i_vs_%i.txt" % ( i, j )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
155 if self._verbose < 3:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
156 cmd += " 2> /dev/null"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
157 returnStatus = os.system( cmd )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
158 if returnStatus != 0:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
159 msg = "ERROR: '%s' returned '%i'" % ( prg, returnStatus )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
160 sys.stderr.write( "%s\n" % ( msg ) )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
161 sys.exit(1)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
162
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
163 def parseMummerOutput( self, mummerFileName, nameSeq1, nameSeq2 ):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
164 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
165 Parse the output from MUmmer.
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
166 @param mummerFileName: file with the output from MUmmer
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
167 @type mummerFileName: string
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
168 @param nameSeq1: name of the first sequence in the pairwise comparison
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
169 @type nameSeq1: string
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
170 @param nameSeq2: name of the first sequence in the pairwise comparison
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
171 @type nameSeq2: string
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
172 @return: dictionary whose keys are strands and values the number of maximal matches
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
173 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
174 if self._verbose > 1:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
175 print "parse '%s'" % ( mummerFileName )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
176 sys.stdout.flush()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
177 dStrand2NbMatches = {}
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
178 mummerF = open( mummerFileName, "r" )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
179 strand = "direct"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
180 nbMatches = 0
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
181 line = mummerF.readline()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
182 while True:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
183 if line == "":
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
184 break
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
185 if line[0] == ">":
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
186 if nameSeq1 not in line:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
187 print "ERROR"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
188 print nameSeq1
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
189 print nameSeq2
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
190 sys.exit(1)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
191 if "Reverse" in line:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
192 dStrand2NbMatches[ strand ] = nbMatches
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
193 strand = "reverse"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
194 nbMatches = 0
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
195 else:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
196 nbMatches += 1
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
197 line = mummerF.readline()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
198 dStrand2NbMatches[ strand ] = nbMatches
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
199 mummerF.close()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
200 if self._verbose > 1:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
201 print " direct: %i maximal matches" % ( dStrand2NbMatches["direct"] )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
202 print " reverse: %i maximal matches" % ( dStrand2NbMatches["reverse"] )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
203 sys.stdout.flush()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
204 return dStrand2NbMatches
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
205
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
206 def getCumulativeMatchLengthsOnBothStrandForEachPairwiseComparison( self, lInHeaders, nbInSeq ):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
207 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
208 For each pairwise comparison, retrieve the number of matches on both strand.
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
209 @param lInHeaders: list of the sequence headers
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
210 @type lInHeaders: list of strings
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
211 @param nbInSeq: number of input sequences
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
212 @type nbInSeq: integer
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
213 @return: dictionary whose keys are pairwise comparisons and values are number of matches on both strands
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
214 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
215 dMatrix = {}
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
216 for i in range( 1, nbInSeq+1 ):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
217 for j in range( i+1, nbInSeq+1 ):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
218 pairComp = "%i_vs_%i" % ( i, j )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
219 dStrand2NbMatches = self.parseMummerOutput( "mummer_%s.txt" % ( pairComp ), lInHeaders[i-1], lInHeaders[j-1] )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
220 dMatrix[ pairComp ] = dStrand2NbMatches
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
221 return dMatrix
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
222
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
223 def showResultsAsOrienter( self, nbInSeq, dMatrix ):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
224 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
225 Not used for the moment but can be useful...
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
226 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
227 for i in range( 1, nbInSeq+1 ):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
228 for j in range( i+1, nbInSeq+1 ):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
229 pairComp = "%i_vs_%i" % ( i, j )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
230 string = "aligning %i with %i" % ( i, j )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
231 string += " %i %i" % ( dMatrix[pairComp]["direct"], dMatrix[pairComp]["reverse"] )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
232 string += " max=%i" % ( max( dMatrix[pairComp]["direct"], dMatrix[pairComp]["reverse"] ) )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
233 if dMatrix[pairComp]["reverse"] > dMatrix[pairComp]["direct"]:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
234 string += " strand=-"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
235 string += " nb=%i" % ( dMatrix[pairComp]["reverse"] )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
236 else:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
237 string += " strand=+"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
238 string += " nb=%i" % ( dMatrix[pairComp]["direct"] )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
239 print string; sys.stdout.flush()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
240
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
241 def getSequencesToReverseFromMatrix( self, dMatrix, lNewHeaders ):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
242 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
243 Retrieve the sequences than need to be re-oriented.
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
244 @param dMatrix:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
245 @type dMatrix:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
246 @param lInHeaders: list of the new sequence headers
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
247 @type lInHeaders: list of strings
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
248 @return: list of headers corresponding to sequences than need to be re-oriented
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
249 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
250 lSeqToOrient = []
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
251
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
252 for i in range( 1, len(lNewHeaders)+1 ):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
253 for j in range( i+1, len(lNewHeaders)+1 ):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
254 comp = "%i_vs_%i" % ( i, j )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
255 nbDirectMatches = dMatrix[ comp ][ "direct" ]
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
256 nbReverseMatches = dMatrix[ comp ][ "reverse" ]
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
257 if self._verbose > 1:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
258 print "%s: direct=%i reverse=%i" % ( comp, nbDirectMatches, nbReverseMatches )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
259 if nbReverseMatches > nbDirectMatches and lNewHeaders[i-1] not in lSeqToOrient:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
260 if lNewHeaders[ j-1 ] not in lSeqToOrient:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
261 if self._verbose > 2:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
262 "need to reverse '%s'" % ( lNewHeaders[j-1] )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
263 lSeqToOrient.append( lNewHeaders[ j-1 ] )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
264 return lSeqToOrient
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
265
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
266
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
267 def getSequencesToReverse( self ):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
268 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
269 Return a list with the headers of the sequences to reverse.
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
270 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
271 lSequenceHeadersToReverse = []
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
272
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
273 if self._prgToOrient == "orienter":
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
274 exitStatus = self.useOrienter()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
275 if exitStatus == 0:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
276 self.end()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
277 sys.exit(0)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
278 if exitStatus != 0:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
279 print "\nWARNING: 'orienter' had a problem, switching to 'mummer'"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
280 sys.stdout.flush()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
281
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
282 lInHeaders = pyRepet.seq.fastaDB.dbHeaders( self._inFileName )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
283 nbInSeq = len( lInHeaders )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
284 if self._verbose > 0:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
285 print "nb of input sequences: %i" % ( nbInSeq )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
286 sys.stdout.flush()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
287
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
288 pyRepet.seq.fastaDB.shortenSeqHeaders( self._inFileName, 1 )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
289 tmpFileName = "%s.shortH" % ( self._inFileName )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
290 lNewHeaders = pyRepet.seq.fastaDB.dbHeaders( tmpFileName )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
291 dNew2Init = pyRepet.seq.fastaDB.retrieveLinksNewInitialHeaders( "%slink" % ( tmpFileName ) )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
292
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
293 pyRepet.seq.fastaDB.dbSplit( tmpFileName, nbSeqPerBatch=1, newDir=True )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
294 os.chdir( "batches" )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
295 self.compareInputSequencesWithMummer( nbInSeq )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
296 dMatrix = self.getCumulativeMatchLengthsOnBothStrandForEachPairwiseComparison( lNewHeaders, nbInSeq )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
297 os.chdir( ".." )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
298
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
299 lNewHeadersToReverse = self.getSequencesToReverseFromMatrix( dMatrix, lNewHeaders )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
300 for newH in lNewHeadersToReverse:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
301 lSequenceHeadersToReverse.append( dNew2Init[ newH ] )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
302 if self._verbose > 0:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
303 print "nb of sequences to reverse: %i" % ( len(lNewHeadersToReverse) )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
304 for initH in lSequenceHeadersToReverse: print " %s" % ( initH )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
305 sys.stdout.flush()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
306
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
307 if self._clean:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
308 os.remove( tmpFileName )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
309 os.remove( "%slink" % ( tmpFileName ) )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
310
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
311 return lSequenceHeadersToReverse
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
312
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
313 def orientInputSequences( self, lSequenceHeadersToReverse, tmpFileName="" ):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
314 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
315 Save input sequences while re-orienting those needing it.
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
316 @param lSequenceHeadersToReverse: list of headers corresponding to sequences than need to be re-oriented
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
317 @type lSequenceHeadersToReverse: list of strings
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
318 @param tmpFileName: name of a fasta file (inFileName by default)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
319 @type tmpFileName: string
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
320 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
321 if self._verbose > 0:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
322 print "saving oriented sequences..."
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
323 sys.stdout.flush()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
324 if tmpFileName == "":
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
325 tmpFileName = self._inFileName
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
326 inDB = BioseqDB( tmpFileName )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
327 outDB = BioseqDB()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
328 for bs in inDB.db:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
329 if bs.header in lSequenceHeadersToReverse:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
330 bs.reverseComplement()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
331 bs.header += " re-oriented"
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
332 outDB.add( bs )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
333 outDB.save( self._outFileName )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
334
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
335 def clean( self ):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
336 if os.path.exists( "batches" ):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
337 os.system( "rm -rf batches" )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
338 if os.path.exists( "orienter_error.log" ):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
339 os.remove( "orienter_error.log" )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
340 for f in glob.glob( "core.*" ):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
341 os.remove( f )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
342
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
343 def start( self ):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
344 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
345 Useful commands before running the program.
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
346 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
347 self.checkAttributes()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
348 if self._verbose > 0:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
349 print "START %s" % ( type(self).__name__ )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
350 print "input file: %s" % ( self._inFileName )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
351 sys.stdout.flush()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
352
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
353 def end( self ):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
354 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
355 Useful commands before ending the program.
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
356 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
357 if self._clean:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
358 self.clean()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
359 if self._verbose > 0:
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
360 print "END %s" % ( type(self).__name__ )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
361 sys.stdout.flush()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
362
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
363 def run( self ):
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
364 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
365 Run the program.
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
366 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
367 self.start()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
368 lSequenceHeadersToReverse = self.getSequencesToReverse()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
369 self.orientInputSequences( lSequenceHeadersToReverse )
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
370 self.end()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
371
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
372 if __name__ == "__main__":
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
373 i = OrientSequences()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
374 i.setAttributesFromCmdLine()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
375 i.run()