18
|
1 #!/usr/bin/env python
|
|
2
|
|
3 ##@file
|
|
4 # For each TE reference sequence, it computes a multiple alignment and a phylogeny of all its copies.
|
|
5 # usage: GetMultAlignAndPhylogenyPerTErefSeq.py [ options ]
|
|
6 # options:
|
|
7 # -h: this help
|
|
8 # -S: step (0: all steps [default], 1:file generation, 2:multiple alignements, 3:phylogenies)
|
|
9 # -p: table with the annotations (format=path)
|
|
10 # -s: table with the TE reference sequences (format=seq)
|
|
11 # -g: table with the genome sequence (format=seq)
|
|
12 # -r: name or file with TE reference sequence(s) (all by default)
|
|
13 # -m: MSA method (default=Refalign/Map)
|
|
14 # -l: minimum length of copies (default=100)
|
|
15 # -n: number of longest copies to use (default=20)
|
|
16 # -y: minimum copy proportion compare to references (default=0.5)
|
|
17 # -R: keep the reference sequence (only with Refalign)
|
|
18 # -C: configuration file
|
|
19 # -q: queue name
|
|
20 # -c: clean
|
|
21 # -d: temporary directory
|
|
22 # -v: verbosity level (default=0/1)
|
|
23
|
|
24 import os
|
|
25 import sys
|
|
26 import glob
|
|
27 import ConfigParser
|
|
28
|
|
29 import pyRepet.launcher.programLauncher
|
|
30
|
|
31 from commons.core.coord.PathUtils import PathUtils
|
|
32 from commons.core.seq.FastaUtils import FastaUtils
|
|
33 from commons.core.coord.SetUtils import SetUtils
|
|
34 from commons.core.sql.TablePathAdaptator import TablePathAdaptator
|
|
35 from commons.core.sql.TableSeqAdaptator import TableSeqAdaptator
|
|
36 from commons.tools.OrientSequences import OrientSequences
|
|
37 from ConfigParser import MissingSectionHeaderError
|
|
38 from commons.core.utils.RepetOptionParser import RepetOptionParser
|
|
39 from commons.core.LoggerFactory import LoggerFactory
|
|
40 from commons.core.seq.AlignedBioseqDB import AlignedBioseqDB
|
|
41 from commons.launcher import LaunchMap
|
|
42 from commons.core.sql.DbFactory import DbFactory
|
|
43 from commons.core.sql.TableJobAdaptatorFactory import TableJobAdaptatorFactory
|
|
44 from commons.core.launcher.Launcher import Launcher
|
|
45 from commons.core.utils.FileUtils import FileUtils
|
|
46
|
|
47
|
|
48 LOG_DEPTH = "repet.tools"
|
|
49
|
|
50 ## For each TE reference sequence, it computes a multiple alignment and a phylogeny of all its copies.
|
|
51 #
|
|
52 class GetMultAlignAndPhylogenyPerTErefSeq(object):
|
|
53
|
|
54 def __init__(self, pathTableName="",refSeqTableName="", genomeSeqTableName="", step=0, mSAmethod="RefAlign",keepRefseq=False, configFileName= "", clean = True, verbosity=3):
|
|
55 """
|
|
56 Constructor.
|
|
57 """
|
|
58 self.step = step
|
|
59 self._pathTable = pathTableName
|
|
60 self._refSeqTable = refSeqTableName
|
|
61 self._genomeSeqTable = genomeSeqTableName
|
|
62 self._TErefseq = ""
|
|
63 self._MSAmethod = mSAmethod
|
|
64 self._minCopyLength = 100
|
|
65 self._nbLongestCopies = 20
|
|
66 self._minPropCopy = 0.5
|
|
67 self._keepRefseq = keepRefseq
|
|
68 self.setConfigFileName(configFileName)
|
|
69 self._queue = ""
|
|
70 self._tmpDir = ""
|
|
71 self._clean = clean
|
|
72 self._verbosity = verbosity
|
|
73 self._db = None
|
|
74 self._tpaAnnot = None
|
|
75 self._tsaRef = None
|
|
76 self._pL = pyRepet.launcher.programLauncher.programLauncher()
|
|
77 self._log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self._verbosity)
|
|
78
|
|
79 def _logAndRaise(self, errorMsg):
|
|
80 self._log.error(errorMsg)
|
|
81 raise Exception(errorMsg)
|
|
82
|
|
83 def setAttributesFromCmdLine(self):
|
|
84 desc = "For each TE reference sequence, it computes a multiple alignment and a phylogeny of all its copies.\n"
|
|
85 #Commented: it's not true, Config File is mandatory!
|
|
86 # desc += "Connection to the database parameters are retrieved from the environment"
|
|
87
|
|
88 #TODO: format options as other scripts (have a look at LaunchTemplate)
|
|
89 parser = RepetOptionParser(description = desc, epilog = "")
|
|
90 parser.add_option("-S", "--step", dest = "step" , action = "store", type = "int", help = "step (0: all steps [default], 1:file generation, 2:multiple alignments, 3:phylogenies)", default = 0 )
|
|
91 parser.add_option("-p", "--pathTable", dest = "path", action= "store", type = "string", help = "(mandatory) table with the annotations (format=path)", default = "")
|
|
92 parser.add_option("-s", "--refSeqTable",dest = "refSeqTable", action= "store", type = "string", help = "(mandatory) table with the TE reference sequences (format=seq)", default = "")
|
|
93 parser.add_option("-g", "--genomeSeqTable",dest = "genomeSeqTable",action= "store", type = "string", help = "(mandatory) table with the genome sequence (format=seq)", default = "")
|
|
94 parser.add_option("-r", "--TErefseq",dest = "TErefseq", action= "store", type = "string", help = "name or file with TE reference sequence(s) (all by default)", default = "")
|
|
95 parser.add_option("-m", "--MSAmethod",dest = "MSAmethod", action= "store", type = "string", help = "MSA method (default=RefAlign/Map)", default = "RefAlign")
|
|
96 parser.add_option("-l", "--minCopyLength",dest = "minCopyLength", action= "store", type = "int", help = "minimum length of copies (default=100)", default = 100)
|
|
97 parser.add_option("-n", "--nbLongestCopies",dest = "nbLongestCopies", action= "store", type = "int", help = "number of longest copies to use (default=20)", default = 20)
|
|
98 parser.add_option("-y", "--minPropCopy",dest = "minPropCopy", action= "store", type = "float", help = "minimum copy proportion compare to references (default=0.5)", default = 0.5)
|
|
99 parser.add_option("-R", "--keepRefseq",dest = "keepRefseq", action="store_true", help = "keep the reference sequence (only with Refalign)", default = False)
|
|
100 parser.add_option("-C", "--config", dest = "configFileName", action = "store", type = "string", help = "(mandatory) config file name to set database connection", default = "")
|
|
101 parser.add_option("-q", "--queue",dest = "queue", action= "store", type = "string", help = "queue name", default = "")
|
|
102 parser.add_option("-c", "--clean", action="store_false", dest = "clean", help = "don't clean", default = True)
|
|
103 parser.add_option("-d", "--tmpDir",dest = "tmpDir", action= "store", type = "string", help = "temporary directory", default = "")
|
|
104 parser.add_option("-v", "--verbosity", dest = "verbosity", action = "store", type = "int", help = "verbosity level (default=0)", default = 0)
|
|
105 options = parser.parse_args()[0]
|
|
106 self._setAttributesFromOptions(options)
|
|
107
|
|
108 def _setAttributesFromOptions(self, options):
|
|
109 self.setConfigFileName(options.configFileName)
|
|
110 self.setStep(options.step)
|
|
111 self.setPathTable(options.path)
|
|
112 self.setRefSeqTable(options.refSeqTable)
|
|
113 self.setGenomeSeqTable(options.genomeSeqTable)
|
|
114 self.setTErefseq(options.TErefseq)
|
|
115 self.setMSAmethod(options.MSAmethod)
|
|
116 self.setMinCopyLength(options.minCopyLength)
|
|
117 self.setNbLongestCopies(options.nbLongestCopies)
|
|
118 self.setMinPropCopy(options.minPropCopy)
|
|
119 self.setKeepRefseq(options.keepRefseq)
|
|
120 self.setQueue(options.queue)
|
|
121 self.setClean(options.clean)
|
|
122 self.setTmpDir(options.tmpDir)
|
|
123 self.setVerbosity(options.verbosity)
|
|
124
|
|
125 def setStep(self, step):
|
|
126 self.step = step
|
|
127
|
|
128 def setPathTable(self, path):
|
|
129 self._pathTable = path
|
|
130
|
|
131 def setRefSeqTable(self, refSeqTable):
|
|
132 self._refSeqTable = refSeqTable
|
|
133
|
|
134 def setGenomeSeqTable(self, genomeSeqTable):
|
|
135 self._genomeSeqTable = genomeSeqTable
|
|
136
|
|
137 def setTErefseq(self, TErefseq):
|
|
138 self._TErefseq = TErefseq
|
|
139
|
|
140 def setMSAmethod(self, MSAmethod):
|
|
141 self._MSAmethod = MSAmethod
|
|
142
|
|
143 def setMinCopyLength(self, minCopyLength):
|
|
144 self._minCopyLength = minCopyLength
|
|
145
|
|
146 def setNbLongestCopies(self, nbLongestCopies):
|
|
147 self._nbLongestCopies = nbLongestCopies
|
|
148
|
|
149 def setMinPropCopy(self, minPropCopy):
|
|
150 self._minPropCopy = minPropCopy
|
|
151
|
|
152 def setKeepRefseq(self, keepRefseq):
|
|
153 self._keepRefseq = keepRefseq
|
|
154
|
|
155 def setQueue(self, queue):
|
|
156 self._queue = queue
|
|
157
|
|
158 def setClean(self, clean):
|
|
159 self._clean = clean
|
|
160
|
|
161 def setTmpDir(self, tmpDir):
|
|
162 self._tmpDir = tmpDir
|
|
163
|
|
164 def setVerbosity(self, verbosity):
|
|
165 self._verbosity = verbosity
|
|
166
|
|
167 def setup_env(self):
|
|
168 configFileHandle = open(self._configFileName)
|
|
169 # Use RepetConfigParser?
|
|
170 config = ConfigParser.ConfigParser()
|
|
171 try :
|
|
172 config.readfp(configFileHandle)
|
|
173 except MissingSectionHeaderError:
|
|
174 self._logAndRaise("Configuration file %s must begin with a section header" % self._configFileName)
|
|
175
|
|
176 os.environ["REPET_HOST"] = config.get("repet_env", "repet_host")
|
|
177 os.environ["REPET_USER"] = config.get("repet_env", "repet_user")
|
|
178 os.environ["REPET_PW"] = config.get("repet_env", "repet_pw")
|
|
179 os.environ["REPET_DB"] = config.get("repet_env", "repet_db")
|
|
180 os.environ["REPET_PORT"] = config.get("repet_env", "repet_port")
|
|
181 os.environ["REPET_JOB_MANAGER"] = config.get("repet_env", "repet_job_manager")
|
|
182
|
|
183 def setConfigFileName(self, configFileName):
|
|
184 self._configFileName = configFileName
|
|
185
|
|
186 def checkAttributes( self ):
|
|
187 """
|
|
188 Check the attributes are valid before running the algorithm.
|
|
189 """
|
|
190 if self._pathTable == "":
|
|
191 self._logAndRaise("PathTable is mandatory")
|
|
192 if self._refSeqTable == "":
|
|
193 self._logAndRaise("RefSeqTable is mandatory")
|
|
194 if self._genomeSeqTable == "":
|
|
195 self._logAndRaise("GenomeSeqTable is mandatory")
|
|
196 if self._configFileName == "":
|
|
197 self._logAndRaise("Configuration file is mandatory")
|
|
198 else:
|
|
199 if FileUtils.isRessourceExists(self._configFileName):
|
|
200 self.setup_env()
|
|
201 else:
|
|
202 self._logAndRaise("Configuration file '%s' does not exist!" % self._configFileName)
|
|
203
|
|
204 if ( self.step == 2 or self.step == 3 ) and self._MSAmethod not in ["RefAlign","Map"]:
|
|
205 if self._MSAmethod == None or self._MSAmethod == "":
|
|
206 self._logAndRaise("Missing method option")
|
|
207 self._logAndRaise("Method '%s' not yet available" % ( self._MSAmethod ))
|
|
208
|
|
209 if self._tmpDir == "":
|
|
210 self._tmpDir = os.getcwd()
|
|
211
|
|
212 def connectSql(self):
|
|
213 self._db = DbFactory().createInstance(configFileName = self._configFileName, verbosity = 1)
|
|
214 self._tpaAnnot = TablePathAdaptator(self._db, self._pathTable)
|
|
215 self._tsaRef = TableSeqAdaptator(self._db,self._refSeqTable)
|
|
216
|
|
217 def getNamesOfTErefSeq( self ):
|
|
218 """
|
|
219 Return a list with the names of reference TEs.
|
|
220 """
|
|
221 lNamesTErefSeq = []
|
|
222 if self._TErefseq == "":
|
|
223 lNamesTErefSeq = self._tsaRef.getAccessionsList()
|
|
224 else:
|
|
225 if os.path.isfile( self._TErefseq ):
|
|
226 refseqFileHandler = open( self._TErefseq, "r" )
|
|
227 while True:
|
|
228 line = refseqFileHandler.readline()
|
|
229 if line == "":
|
|
230 break
|
|
231 lNamesTErefSeq.append( line[:-1].split("\t")[0] )
|
|
232 refseqFileHandler.close()
|
|
233 else:
|
|
234 lNamesTErefSeq = [ self._TErefseq ]
|
|
235 for name in lNamesTErefSeq:
|
|
236 if not self._tsaRef.isAccessionInTable( name ):
|
|
237 self._log.warning("'%s' not in table '%s'" % (name, self._refSeqTable))
|
|
238 lNamesTErefSeq.remove( name )
|
|
239 lNamesTErefSeq.sort()
|
|
240 self._log.info("nb of TE reference sequences: %d" % (len(lNamesTErefSeq)))
|
|
241 return lNamesTErefSeq
|
|
242
|
|
243
|
|
244 def getTErefSeqInFastaFiles( self, lNamesTErefSeq ):
|
|
245 """
|
|
246 Save sequences of reference TEs in fasta files.
|
|
247 """
|
|
248 for name in lNamesTErefSeq:
|
|
249 self._log.debug("save sequence of '%s'..." % ( name ))
|
|
250 self._tsaRef.saveAccessionsListInFastaFile( [name], name+".fa" )
|
|
251
|
|
252 def getCopiesInFastaFilesPerTErefSeq( self, lNamesTErefSeq ):
|
|
253 """
|
|
254 Save sequences of TE copies in fasta files.
|
|
255 """
|
|
256 self._log.info("retrieve the copies...")
|
|
257 tsaChr = TableSeqAdaptator( self._db, self._genomeSeqTable )
|
|
258 totalNbCopies = 0
|
|
259 totalNbSavedCopies = 0
|
|
260 for name in lNamesTErefSeq:
|
|
261 nbCopies = 0
|
|
262 if os.path.exists(name+"_copies.fa"):
|
|
263 continue
|
|
264 outFile = open( name+"_copies.fa", "w" )
|
|
265 self._log.debug("Fetching path nums for subject: %s " % name)
|
|
266 lPathNums = self._tpaAnnot.getIdListSortedByDecreasingChainLengthFromSubject(name)
|
|
267 nbCopies = len(lPathNums)
|
|
268 totalNbCopies += nbCopies
|
|
269 self._log.debug("refseq '%s': %d copies" % ( name, nbCopies ))
|
|
270 i = 0
|
|
271 nbSavedCopies = 0
|
|
272 nbSavedFragments = 0
|
|
273 lengthRefseq = self._tsaRef.getSeqLengthFromAccession( name )
|
|
274 while i < len(lPathNums) and nbSavedCopies < self._nbLongestCopies:
|
|
275 lPaths = self._tpaAnnot.getPathListFromId( lPathNums[i] )
|
|
276 lSets = PathUtils.getSetListFromQueries( lPaths )
|
|
277 copyLength = SetUtils.getCumulLength( lSets )
|
|
278 if copyLength >= self._minCopyLength \
|
|
279 and copyLength >= self._minPropCopy * lengthRefseq:
|
|
280 bs = tsaChr.getBioseqFromSetList( lSets )
|
|
281 bs.write(outFile)
|
|
282 nbSavedCopies += 1
|
|
283 nbSavedFragments += len(lPaths)
|
|
284 i += 1
|
|
285
|
|
286 self._log.debug(" (saved: %d copies, %d fragments)\n" % ( nbSavedCopies, nbSavedFragments ) )
|
|
287 outFile.close()
|
|
288 totalNbSavedCopies += nbSavedCopies
|
|
289 if nbSavedCopies == 0 and nbCopies != 0:
|
|
290 self._log.warning("No copy >= %d" % ( self._minCopyLength ))
|
|
291 self._log.info("nb of copies: %d (%d saved)" % ( totalNbCopies, totalNbSavedCopies ))
|
|
292
|
|
293 def filter4Alignments( self, lNamesTErefSeq ):
|
|
294 """
|
|
295 Filter TE copy sequences according to their length.
|
|
296 """
|
|
297 self._log.info("filtering copies...")
|
|
298 if len(lNamesTErefSeq) == 1:
|
|
299 if not os.path.exists( "%s_copies.fa" % ( lNamesTErefSeq[0] ) ):
|
|
300 self._logAndRaise("first run step 1")
|
|
301 lInFiles = [ "%s_copies.fa" % ( lNamesTErefSeq[0] ) ]
|
|
302 else:
|
|
303 lInFiles = glob.glob( "*_copies.fa" )
|
|
304 if len(lInFiles) == 0:
|
|
305 self._logAndRaise("no file '*_copies.fa'")
|
|
306 count = 0
|
|
307 for inFileName in lInFiles:
|
|
308 if os.path.exists( "%s.filtered" % ( inFileName ) ):
|
|
309 continue
|
|
310 count += 1
|
|
311 self._log.debug("TE %d --> %s" %(count,inFileName))
|
|
312 FastaUtils.dbLengthFilter( self._minCopyLength, inFileName, verbose=self._verbosity )
|
|
313 FastaUtils.dbLongestSequences( self._nbLongestCopies, inFileName+".Sup"+str(self._minCopyLength), verbose=self._verbosity )
|
|
314 os.rename( inFileName+".Sup"+str(self._minCopyLength)+".best"+str(self._nbLongestCopies), inFileName+".filtered" )
|
|
315 os.remove( inFileName+".Sup"+str(self._minCopyLength) )
|
|
316 os.remove( inFileName+".Inf"+str(self._minCopyLength) )
|
|
317 self._log.info("files filtered: %d" % (count))
|
|
318
|
|
319 def buildInFiles4Launcher( self, lNamesTErefSeq ):
|
|
320 """
|
|
321 Save sequences of TE copies with reference in fasta files for launcher usage.
|
|
322 """
|
|
323 self._log.info("building input files for launcher...")
|
|
324 for name in lNamesTErefSeq:
|
|
325 if os.path.exists( "%s_all.fa.oriented" % ( name ) ):
|
|
326 continue
|
|
327 if FastaUtils.dbSize( "%s_copies.fa.filtered" % ( name ) ) > 0:
|
|
328 os.system( "cat "+name+".fa "+ name+"_copies.fa.filtered > " + name+"_all.fa" )
|
|
329 ors = OrientSequences(inFileName= "%s_all.fa" %(name), prgToOrient="mummer", clean=True, verbosity =self._verbosity - 1)
|
|
330 ors.run()
|
|
331 ors.clean()
|
|
332 if len( glob.glob("*_all.fa.oriented") ) == 0:
|
|
333 self._logAndRaise("no copies")
|
|
334 self._log.info("done building input files for launcher...")
|
|
335
|
|
336 def renameFile( self, fromName, toName):
|
|
337 lFiles = glob.glob( "*%s" %fromName )
|
|
338 for f in lFiles:
|
|
339 os.rename( f, f.replace(fromName,toName))
|
|
340
|
|
341 def _preparejobs(self, iLauncher, cDir):
|
|
342 self._log.info("Preparing Align jobs")
|
|
343 lCmdsTuples = []
|
|
344 print(self.queriesDir,self.alignFileSuffix)
|
|
345 queryFiles = glob.glob("%s/%s" % (self.queriesDir,self.alignFileSuffix))
|
|
346 print("queryFiles",queryFiles)
|
|
347 for queryFile in queryFiles:#os.listdir(self.queriesDir):
|
|
348 lCmds = []
|
|
349 lCmdStart = []
|
|
350 lCmdFinish = [] #['shutil.move("%s/*" %newDir, "../" )']
|
|
351 queryFilePath = os.path.join(self.queriesDir,queryFile)
|
|
352 lCmds.append(self._createLaunchAlignCommands(iLauncher, queryFilePath))
|
|
353 #lCmdFinish.append("shutil.move(\"%s.%s\", \"%s/%s.%s\")" % (benchName,self.outputformat,self.resultsDir,benchName,self.outputformat))
|
|
354 lCmdsTuples.append(iLauncher.prepareCommands_withoutIndentation(lCmds, lCmdStart, lCmdFinish))
|
|
355 self._log.info("Finished preparing Align jobs")
|
|
356 return lCmdsTuples
|
|
357
|
|
358 def _preparePhyMljobs(self, iLauncher, cDir):
|
|
359 self._log.info("Preparing PhyMl jobs")
|
|
360 lCmdsTuples = []
|
|
361 queryFiles = glob.glob("%s/%s" % (self.queriesDir,self.phyloFileSuffix))
|
|
362 print("queryFiles",queryFiles)
|
|
363 for queryFile in queryFiles:#os.listdir(self.queriesDir):
|
|
364 lCmds = []
|
|
365 lCmdStart = []
|
|
366 lCmdFinish = [] #['shutil.move("%s/*" %newDir, "../" )']
|
|
367 queryFilePath = os.path.join(self.queriesDir,queryFile)
|
|
368 lCmds.append(self._createLaunchPhyMLCommands(iLauncher, queryFilePath))
|
|
369 lCmdsTuples.append(iLauncher.prepareCommands_withoutIndentation(lCmds, lCmdStart, lCmdFinish))
|
|
370 self._log.info("Finished preparing PhyMl jobs")
|
|
371 return lCmdsTuples
|
|
372
|
|
373 def _createLaunchAlignCommands(self, iLauncher, query):
|
|
374 if self._MSAmethod == "Map":
|
|
375 prg = "LaunchMap.py"
|
|
376 elif self._MSAmethod == "RefAlign":
|
|
377 prg = "LaunchRefAlign.py"
|
|
378
|
|
379 lArgs = []
|
|
380 lArgs.append("-i %s" % query)
|
|
381 lArgs.append(" -o %s.fa_aln" % query)
|
|
382 lArgs.append("-v 1" )
|
|
383
|
|
384 if self._MSAmethod == "RefAlign" and self._keepRefseq:
|
|
385 lArgs.append("-r " )
|
|
386
|
|
387 self._log.debug("Prepared Align commands : %s %s" % (prg, " ".join(lArgs)))
|
|
388
|
|
389
|
|
390 return iLauncher.getSystemCommand("%s" % prg, lArgs)
|
|
391
|
|
392 def launchMultAlignments(self, lNamesTErefSeq):
|
|
393 """
|
|
394 Make multiple alignments via Map for each TE family
|
|
395 """
|
|
396 self.queriesDir = os.getcwd()
|
|
397
|
|
398 if len(lNamesTErefSeq) == 1:
|
|
399 self.alignFileSuffix = "%s_all.fa.oriented" % (lNamesTErefSeq[0])
|
|
400 else:
|
|
401 self.alignFileSuffix = "*_all.fa.oriented"
|
|
402
|
|
403 queue = self._queue
|
|
404 cDir = os.getcwd()
|
|
405 tmpDir = self._tmpDir
|
|
406 groupid = "%s_%s" % ( self._refSeqTable, self._MSAmethod )
|
|
407 acronym = "Align"
|
|
408 iDb = DbFactory.createInstance(configFileName=self._configFileName)
|
|
409 iTJA = TableJobAdaptatorFactory.createInstance(iDb, "jobs")
|
|
410 iLauncher = Launcher(iTJA, os.getcwd(), "", "", cDir, tmpDir, "jobs", queue, groupid)
|
|
411 lCmdsTuples = self._preparejobs(iLauncher, cDir)
|
|
412 iLauncher.runLauncherForMultipleJobs(acronym, lCmdsTuples, self._clean)
|
|
413
|
|
414 self.renameFile("_all.fa.oriented.fa_aln", "_all.fa.oriented_%s.fa_aln" % (self._MSAmethod.lower()) )
|
|
415
|
|
416 # def __makeMultAlignments( self, lNamesTErefSeq ):
|
|
417 # """
|
|
418 # Make multiple alignments via Map or Refalign for each TE family
|
|
419 # """
|
|
420 # self._pL.reset("")
|
|
421 # if self._MSAmethod == "Map":
|
|
422 #
|
|
423 # prg = os.environ["REPET_PATH"] + "/bin/srptMAP.py"
|
|
424 # elif self._MSAmethod == "RefAlign":
|
|
425 # prg = os.environ["REPET_PATH"] + "/bin/srptRefalign.py"
|
|
426 # cmd = prg
|
|
427 # cmd += " -g %s_%s" % ( self._refSeqTable, self._MSAmethod )
|
|
428 # cmd += " -q %s" % ( os.getcwd() )
|
|
429 # if len(lNamesTErefSeq) == 1:
|
|
430 # cmd += " -S %s_all.fa.oriented" % ( lNamesTErefSeq[0] )
|
|
431 # else:
|
|
432 # cmd += " -S '*_all.fa.oriented'"
|
|
433 # cmd += " -Q %s" % ( self._queue )
|
|
434 # cmd += " -C %s" % ( self._configFileName )
|
|
435 # if self._MSAmethod == "Refalign" and self._keepRefseq:
|
|
436 # cmd += " -r"
|
|
437 # if not self._clean :
|
|
438 # cmd += " -c"
|
|
439 # if self._tmpDir != "":
|
|
440 # cmd += " -d %s" % ( self._tmpDir )
|
|
441 # cmd += " -v %d" % ( self._verbosity )
|
|
442 # self._pL.launch( prg, cmd )
|
|
443 # lFiles = glob.glob( "*_all.fa.oriented.fa_aln" )
|
|
444 # for f in lFiles:
|
|
445 # os.rename( f, f.replace("_all.fa.oriented.fa_aln","_all.fa.oriented_%s.fa_aln" % (self._MSAmethod.lower()) ) )
|
|
446
|
|
447
|
|
448 def filter4phylogenies( self, verbosity=0 ):
|
|
449 """
|
|
450 Filter TE copy alignment for better phylogenies.
|
|
451 """
|
|
452 self._log.info("Filtering MSA")
|
|
453 lInFiles = glob.glob( "*_all.fa.oriented_%s.fa_aln" % ( self._MSAmethod.lower() ) )
|
|
454 count = 0
|
|
455 for inFileName in lInFiles:
|
|
456 count += 1
|
|
457 self._log.debug("clean MSA %d --> %s" % (count,inFileName))
|
|
458 alignDB = AlignedBioseqDB()
|
|
459 alignDB.load(inFileName)
|
|
460 alignDB.cleanMSA()
|
|
461 if alignDB.getSize() > 2:
|
|
462 alignDB.save( inFileName + ".clean" )
|
|
463 self._log.debug("clean!")
|
|
464 else:
|
|
465 self._log.debug("skip!")
|
|
466 self._log.info("MSA cleaned: %d" % count)
|
|
467
|
|
468
|
|
469 def _createLaunchPhyMLCommands(self, iLauncher, query):
|
|
470 # prg = os.environ["REPET_PATH"] + "/bin/srptPhyML.py"
|
|
471 # cmd = prg
|
|
472 # cmd += " -g %s_PHY_%s" % ( self._refSeqTable, os.getpid() )
|
|
473 # cmd += " -q %s" % ( os.getcwd() )
|
|
474 # cmd += " -S '*_all.fa.oriented_%s.fa_aln.clean'" % ( self._MSAmethod.lower() )
|
|
475 # cmd += " -Q %s" % ( self._queue )
|
|
476 # cmd += " -C %s" % ( self._configFileName )
|
|
477
|
|
478 prg = "LaunchPhyML.py"
|
|
479 lArgs = []
|
|
480 lArgs.append("-i %s" % query)
|
|
481 lArgs.append("-o %s.fa_phylo" % query)
|
|
482 lArgs.append("-v %d" % (self._verbosity-1))
|
|
483
|
|
484 self._log.debug("Prepared Phyml commands : %s %s" % (prg, " ".join(lArgs)))
|
|
485 return iLauncher.getSystemCommand("%s" % prg, lArgs)
|
|
486
|
|
487 def makePhylogenies( self ):
|
|
488 """
|
|
489 Launch PhyML on each TE family.
|
|
490 """
|
|
491 self.phyloFileSuffix = "*_all.fa.oriented_%s.fa_aln.clean" % ( self._MSAmethod.lower() )
|
|
492
|
|
493 queue = self._queue
|
|
494 cDir = os.getcwd()
|
|
495 tmpDir = self._tmpDir
|
|
496 groupid = "%s_PHY_%s" % ( self._refSeqTable, os.getpid() )
|
|
497 acronym = "Phylo"
|
|
498 iDb = DbFactory.createInstance(configFileName=self._configFileName)
|
|
499 iTJA = TableJobAdaptatorFactory.createInstance(iDb, "jobs")
|
|
500 iLauncher = Launcher(iTJA, os.getcwd(), "", "", cDir, tmpDir, "jobs", queue, groupid)
|
|
501 lCmdsTuples = self._preparePhyMljobs(iLauncher, cDir)
|
|
502 iLauncher.runLauncherForMultipleJobs(acronym, lCmdsTuples, self._clean)
|
|
503
|
|
504
|
|
505 def start( self ):
|
|
506 """
|
|
507 Useful commands before running the program.
|
|
508 """
|
|
509 self.checkAttributes()
|
|
510 self._log.info("START GetMultAlignAndPhylogenyPerTErefSeq.py STEP %d" % self.step)
|
|
511 self.connectSql()
|
|
512
|
|
513 def end( self ):
|
|
514 """
|
|
515 Useful commands before ending the program.
|
|
516 """
|
|
517 self._db.close()
|
|
518 self._log.info("END GetMultAlignAndPhylogenyPerTErefSeq.py STEP %d" % self.step)
|
|
519
|
|
520 def run( self ):
|
|
521 """
|
|
522 Run the program.
|
|
523 """
|
|
524 LoggerFactory.setLevel(self._log, self._verbosity)
|
|
525 self.start()
|
|
526 lNamesTErefSeq = self.getNamesOfTErefSeq()
|
|
527 self._log.debug("lNamesTErefSeq: %s" % " ".join(lNamesTErefSeq))
|
|
528
|
|
529 if self.step in [0, 1]:
|
|
530 self.getTErefSeqInFastaFiles( lNamesTErefSeq )
|
|
531 self.getCopiesInFastaFilesPerTErefSeq( lNamesTErefSeq )
|
|
532
|
|
533 if self.step in [0, 2]:
|
|
534 self.filter4Alignments(lNamesTErefSeq)
|
|
535 self.buildInFiles4Launcher(lNamesTErefSeq)
|
|
536 self.launchMultAlignments(lNamesTErefSeq)
|
|
537
|
|
538 if self.step in [0, 3]:
|
|
539 self.filter4phylogenies(verbosity=self._verbosity)
|
|
540 self.makePhylogenies()
|
|
541 self.end()
|
|
542
|
|
543 if __name__ == "__main__":
|
|
544 iGetMultAlignAndPhylogenyPerTErefSeq = GetMultAlignAndPhylogenyPerTErefSeq()
|
|
545 iGetMultAlignAndPhylogenyPerTErefSeq.setAttributesFromCmdLine()
|
|
546 iGetMultAlignAndPhylogenyPerTErefSeq.run()
|
|
547
|