comparison TEdenovo_lite.py @ 0:baea09e6722b draft default tip

1st Uploaded
author vmarcon
date Mon, 06 Feb 2017 13:31:53 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:baea09e6722b
1 #!/usr/bin/env python
2
3
4 import os
5 import sys
6 import time
7 import glob
8 import shutil
9 import ConfigParser
10 from commons.core.seq.FastaUtils import *
11 import operator
12 import re
13
14
15
16 if not "REPET_PATH" in os.environ.keys():
17 print "ERROR: no environment variable REPET_PATH"
18 sys.exit(1)
19
20 if (not "REPET_DB" in os.environ.keys()) or (not "REPET_HOST" in os.environ.keys()) or (not "REPET_PORT" in os.environ.keys()) or (not "REPET_USER" in os.environ.keys()) or (not "REPET_PW" in os.environ.keys()):
21 print "ERROR: there is at least one environment database variable missing : REPET_DB, REPET_PORT, REPET_HOST, REPET_USER or REPET_PW"
22 sys.exit(1)
23
24 if not "REPET_JOB_MANAGER" in os.environ.keys():
25 print "ERROR: no environment variable REPET_JOB_MANAGER"
26 sys.exit(1)
27
28
29 if not "%s/bin" % os.environ["REPET_PATH"] in os.environ["PATH"]:
30 os.environ["PATH"] = "%s/bin:%s" % (os.environ["REPET_PATH"], os.environ["PATH"])
31
32 sys.path.append(os.environ["REPET_PATH"])
33 if not "PYTHONPATH" in os.environ.keys():
34 os.environ["PYTHONPATH"] = os.environ["REPET_PATH"]
35 else:
36 os.environ["PYTHONPATH"] = "%s:%s" % (os.environ["REPET_PATH"], os.environ["PYTHONPATH"])
37
38 from commons.core.LoggerFactory import LoggerFactory
39 from commons.core.checker.RepetException import RepetException
40 from commons.core.utils.FileUtils import FileUtils
41 from commons.core.utils.RepetOptionParser import RepetOptionParser
42 from commons.core.seq.FastaUtils import FastaUtils
43 from commons.core.sql.DbFactory import DbFactory
44 from itertools import islice
45
46 LOG_DEPTH = "TEdenovo.pipeline"
47
48 class TEdenovo_lite(object):
49
50 def __init__(self, configFileName = "", fastaFileName = "", verbosity = 0):
51 self._configFileName = configFileName
52 self._fastaFileName = os.path.abspath(fastaFileName)
53 self._projectName = time.strftime("%Y%m%d%H%M%S")
54 self._limitSeqSize = 200000000
55
56 if "REPET_NUCL_BANK" in os.environ.keys():
57 if os.path.exists(os.environ["REPET_NUCL_BANK"]):
58 self._nucl_bank = os.environ["REPET_NUCL_BANK"]
59 else :
60 print "ERROR : the nucleotides bank configured doesn't exist. Please correct it in the REPET_NUCL_BANK variable"
61 sys.exit(1)
62 else :
63 self._nucl_bank = ""
64 if "REPET_PROT_BANK" in os.environ.keys():
65 if os.path.exists(os.environ["REPET_PROT_BANK"]):
66 self._prot_bank = os.environ["REPET_PROT_BANK"]
67 else :
68 print "ERROR : the proteins bank configured doesn't exist. Please correct it in the REPET_PROT_BANK variable"
69 sys.exit(1)
70 else :
71 self._prot_bank = ""
72 if "REPET_HMM_PROFILES" in os.environ.keys():
73 if os.path.exists(os.environ["REPET_HMM_PROFILES"]):
74 self._HMM_profiles = os.environ["REPET_HMM_PROFILES"]
75 else :
76 print "ERROR : the hmm profiles bank configured doesn't exist. Please correct it in the REPET_HMM_PROFILES variable"
77 sys.exit(1)
78 else :
79 self._HMM_profiles = ""
80 if "REPET_RDNA_BANK" in os.environ.keys():
81 if os.path.exists(os.environ["REPET_RDNA_BANK"]):
82 self._rdna_bank = os.environ["REPET_RDNA_BANK"]
83 else :
84 print "ERROR : the rDNA bank configured doesn't exist. Please correct it in the REPET_PROT_BANK variable"
85 sys.exit(1)
86 else :
87 self._rdna_bank = ""
88 if self._nucl_bank == "" and self._prot_bank == "" and self._HMM_profiles == "" and self._rdna_bank == "" :
89 print "WARNING : No bank are configured ... To set banks please add REPET_NUCL_BANK, REPET_PROT_BANK, REPET_HMM_PROFILES and/or REPET_RDNA_BANK in your environment"
90 if "REPET_TMP_DIR" in os.environ.keys():
91 self._tmp_dir = os.environ["REPET_TMP_DIR"]
92 else :
93 self._tmp_dir = ""
94 self._outputFasta = ""
95 self._classif = False
96 self._outputClassif = ""
97 self._outputStats = ""
98 self._verbosity = verbosity
99 self._log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self._verbosity)
100
101 def setAttributesFromCommandLine(self):
102 description = "This script is a ligth version of TEdenovo. It writes configuration file and launches TEdenovo."
103 epilog = "Example: TEdenovo_lite.py -i fastaFileName \n"
104 version = "2.0"
105 parser = RepetOptionParser(description = description, epilog = epilog, version = version)
106 parser.add_option("-i", "--fasta", dest = "fastaFileName" , action = "store" , type = "string", help ="input fasta file name ", default = "")
107 parser.add_option("-c", "--withClassif", dest="withClassif", action="store_true", help = " Get classification files in output.", default = False)
108 parser.add_option("-o", "--output", dest="outputLabel" , action = "store", type = "string", help = "[optional] Prefix label for output file(s).", default = "")
109 parser.add_option("-v", "--verbosity", dest = "verbosity", action = "store", type = "int", help = "Verbosity [optional] [default: 2]", default = 2)
110 options = parser.parse_args()[0]
111 self._setAttributesFromOptions(options)
112
113 def _setAttributesFromOptions(self, options):
114 self.setConfigFileName("")
115 if options.fastaFileName=="":
116 print "ERROR : You have to enter an input fasta file"
117 print "Example: TEdenovo_lite.py -i fastaFileName \n"
118 print "More option : TEdenovo_lite.py --help "
119 exit(1)
120 else :
121 self._fastaFileName = os.path.abspath(options.fastaFileName)
122 if options.outputLabel=="":
123 fastaBaseName=os.path.abspath(re.search(r'([^\/\\]*)\.[fa|fasta|fsa|fas]',options.fastaFileName).groups()[0])
124 options.outputLabel=fastaBaseName
125 self._outputFasta = os.path.abspath(options.outputLabel+"-%s-denovoLibTEs_filtered.fa"%self._projectName[:8])
126 self._outputStats = os.path.abspath(options.outputLabel+"-%s-classif_stats.txt"%self._projectName[:8])
127 self._verbosity = options.verbosity
128 if options.withClassif :
129 self._classif=True
130 self._outputClassif = os.path.abspath(options.outputLabel+'-%s.classif'%self._projectName[:8])
131
132 def setConfigFileName(self, configFileName):
133 self._configFileName = configFileName
134 if not self._configFileName:
135 self._configFileName = "TEdenovo_Galaxy_config_%s" % self._projectName
136
137 def setAttributesFromConfigFile(self, configFileName):
138 config = ConfigParser.ConfigParser()
139 config.readfp( open(configFileName) )
140
141
142 def _writeConfigFile(self):
143 if FileUtils.isRessourceExists(self._configFileName):
144 self._logAndRaise("Configuration file '%s' already exists. Won't be overwritten.")
145
146 shutil.copy("%s/config/TEdenovo.cfg" % os.environ.get("REPET_PATH"), self._configFileName)
147 self.setAttributesFromConfigFile(self._configFileName)
148
149 os.system("sed -i 's|repet_host: <your_MySQL_host>|repet_host: %s|' %s" % (os.environ["REPET_HOST"], self._configFileName))
150 os.system("sed -i 's|repet_user: <your_MySQL_login>|repet_user: %s|' %s" % (os.environ["REPET_USER"], self._configFileName))
151 os.system("sed -i 's|repet_pw: <your_MySQL_password>|repet_pw: %s|' %s" % (os.environ["REPET_PW"], self._configFileName))
152 os.system("sed -i 's|repet_db: <your_MySQL_db>|repet_db: %s|' %s" % (os.environ["REPET_DB"], self._configFileName))
153 os.system("sed -i 's|repet_port: 3306|repet_port: %s|' %s" % (os.environ["REPET_PORT"], self._configFileName))
154 os.system("sed -i 's|repet_job_manager: SGE|repet_job_manager: %s|' %s" % (os.environ["REPET_JOB_MANAGER"], self._configFileName))
155 os.system("sed -i 's|project_name: <your_project_name>|project_name: %s|' %s" % (self._projectName, self._configFileName))
156 os.system("sed -i 's|project_dir: <absolute_path_to_your_project_directory>|project_dir: %s|' %s" % (os.getcwd().replace("/", "\/"), self._configFileName))
157 os.system("sed -i 's|tmpDir:|tmpDir: %s|g' %s" % (self._tmp_dir, self._configFileName))
158
159 if self._nucl_bank != "" and self._nucl_bank != None:
160 os.system("sed -i 's|TE_BLRn: no|TE_BLRn: yes|' %s" % self._configFileName)
161 os.system("sed -i 's|TE_BLRtx: no|TE_BLRtx: yes|' %s" % self._configFileName)
162 os.system("sed -i 's|TE_nucl_bank: <bank_of_TE_nucleotide_sequences_such_as_Repbase>|TE_nucl_bank: %s|' %s" % (os.path.basename(self._nucl_bank), self._configFileName))
163
164 if self._prot_bank != "" and self._prot_bank != None:
165 os.system("sed -i 's|TE_BLRx: no|TE_BLRx: yes|' %s" % self._configFileName)
166 os.system("sed -i 's|TE_prot_bank: <bank_of_TE_amino-acid_sequences_such_as_Repbase>|TE_prot_bank: %s|' %s" % (os.path.basename(self._prot_bank), self._configFileName))
167
168 if self._HMM_profiles != "" and self._HMM_profiles != None:
169 os.system("sed -i 's|TE_HMMER: no|TE_HMMER: yes|' %s" % self._configFileName)
170 os.system("sed -i 's|TE_HMM_profiles: <bank_of_HMM_profiles>|TE_HMM_profiles: %s|' %s" % (os.path.basename(self._HMM_profiles),self._configFileName))
171
172 if self._rdna_bank != "" and self._rdna_bank != None:
173 os.system("sed -i 's|rDNA_BLRn: no|rDNA_BLRn: yes|' %s" % self._configFileName)
174 os.system("sed -i 's|rDNA_bank: <bank_of_rDNA_sequences_from_eukaryota>|rDNA_bank: %s|' %s" % (os.path.basename(self._rdna_bank),self._configFileName))
175
176 os.system("sed -i 's|filter_host_gene: no|filter_host_gene: yes|' %s" % (self._configFileName))
177
178
179 def removeNstretches(self,maxNstretchesSize=11,minContigsize=10000):
180 if self._verbosity > 0:
181 print "Removing Nstretches longer than %d pb and removing conting shorter than %d pb"%(maxNstretchesSize,minContigsize)
182 t0=time.time()
183 Nstretches=FastaUtils.getNstretchesRangesList(self._fastaFileName,maxNstretchesSize)
184 t1=time.time()
185 refBSDB = BioseqDB(self._fastaFileName)
186 t3=time.time()
187 debut=1
188 t2=time.time()
189 if len(Nstretches)>0:
190 currentchrom=Nstretches[0].seqname
191 refBS=refBSDB.fetch(currentchrom)
192 t3=time.time()
193 newBSDB = BioseqDB()
194 i=0
195 seqInNstretches = []
196 for Nstretch in Nstretches :
197 i+=1
198 tmpBS=""
199 if Nstretch.seqname not in seqInNstretches :
200 seqInNstretches.append(Nstretch.seqname)
201 if currentchrom==Nstretch.seqname:
202 fin=Nstretch.start-1
203 size=fin-debut+1
204 if size>minContigsize :
205 tmpBS=refBS.subseq(debut,fin)
206 newBSDB.add(tmpBS)
207
208 debut=Nstretch.end+1
209
210 else :
211 fin = refBSDB.getSeqLength(currentchrom)
212 size=fin-debut+1
213 if size>minContigsize :
214 tmpBS=refBS.subseq(debut,fin)
215 newBSDB.add(tmpBS)
216 currentchrom=Nstretch.seqname
217 refBS=refBSDB.fetch(currentchrom)
218 debut=1
219 fin==Nstretch.start
220 size=fin-debut+1
221 if size>minContigsize :
222 tmpBS=refBS.subseq(debut,fin)
223 newBSDB.add(tmpBS)
224 debut=Nstretch.end+1
225
226 if len(Nstretches)>0:
227 fin = refBSDB.getSeqLength(currentchrom)
228 size=fin-debut+1
229 if size>minContigsize :
230 tmpBS=refBS.subseq(debut,fin)
231 newBSDB.add(tmpBS)
232
233 for refName in refBSDB.getHeaderList() :
234 if refName not in seqInNstretches:
235 debut=1
236 fin=refBSDB.getSeqLength(refName)
237 size=fin-debut+1
238 if size>minContigsize :
239 refBS=refBSDB.fetch(refName)
240 tmpBS=refBS.subseq(debut,fin)
241 newBSDB.add(tmpBS)
242
243
244 t5b=time.time()
245 if self._verbosity >= 2:
246 print "%s contigs selected from %s scaffolds"%(newBSDB.getSize(),refBSDB.getSize())
247 #newBSDB.sortByLength(reverse=True)
248
249 return newBSDB
250
251
252
253 #TODO refactoring about min size of genome for preprocess
254 def selectContigs4givenSize(self,BSDB,limit=200000000):
255 if self._verbosity > 0:
256 print "Selecting contigs to reach %s pb "%limit
257 contigsHeadersAndLength=zip(BSDB.getHeaderList(),BSDB.getListOfSequencesLength())
258 size=0
259 size_small=0
260 size_big=500000000
261 lselectedContigs=[]
262
263 for seq in BSDB.db :
264 size+=seq.getLength()
265 if size<limit:
266 lselectedContigs.append(seq)
267 size_small=size
268 else :
269 size_big=size
270 break
271
272 if size_big-limit<limit-size_small :
273 lselectedContigs.append(seq)
274
275 if self._verbosity > 0:
276 print "%s contigs selected to reach %s pb (%s contigs initially) "%(len(lselectedContigs),limit,len(contigsHeadersAndLength))
277
278 selectedContigsBSDB=BioseqDB()
279 selectedContigsBSDB.setData(lselectedContigs)
280 return selectedContigsBSDB
281
282 def writeFastaInput(self,BSDB,outFileName=''):
283 if self._verbosity > 0:
284 print "Writing fasta file"
285
286 if not outFileName:
287 outFileName = self._projectName + ".fastaExtract"
288
289 BSDB.save(outFileName)
290 if self._verbosity > 0:
291 print '%d sequences saved.'%BSDB.getSize()
292
293 return outFileName
294
295 def correctHeader(self,BSDB):
296 if self._verbosity > 0:
297 print "Correcting fasta headers"
298 replacedSeqNb=0
299 for header in BSDB.getHeaderList() :
300 p = re.compile('[^a-zA-Z0-9_:\.\-]', re.IGNORECASE)
301 if p.search(header):
302 sub=list(set(p.findall(header)))
303 correctedHeader=header
304 for s in sub :
305 correctedHeader=correctedHeader.replace(s,'_')
306 if self._verbosity>2:
307 print "Correct Header : '%s' replaced by '%s'"%(header,correctedHeader)
308 BSDB.fetch(header).setHeader(correctedHeader)
309 replacedSeqNb+=1
310 if self._verbosity > 0:
311 print '%s sequence headers corrected'%replacedSeqNb
312 return BSDB
313
314
315 def _launchTEdenovo(self):
316 print "START time: %s" % time.strftime("%Y-%m-%d %H:%M:%S")
317 lCmds = []
318 lCmds.append( "TEdenovo.py -P %s -C %s -S 1 -v %i" % (self._projectName, self._configFileName, self._verbosity) )
319 lCmds.append( "TEdenovo.py -P %s -C %s -S 2 -s Blaster -v %i" % (self._projectName, self._configFileName, self._verbosity) )
320 lCmds.append( "TEdenovo.py -P %s -C %s -S 3 -s Blaster -c Grouper -v %i" % (self._projectName, self._configFileName, self._verbosity) )
321 lCmds.append( "TEdenovo.py -P %s -C %s -S 3 -s Blaster -c Recon -v %i" % (self._projectName, self._configFileName, self._verbosity) )
322 lCmds.append( "TEdenovo.py -P %s -C %s -S 3 -s Blaster -c Piler -v %i" % (self._projectName, self._configFileName, self._verbosity) )
323 lCmds.append( "TEdenovo.py -P %s -C %s -S 4 -s Blaster -c Grouper -m Map -v %i" % (self._projectName, self._configFileName, self._verbosity) )
324 lCmds.append( "TEdenovo.py -P %s -C %s -S 4 -s Blaster -c Recon -m Map -v %i" % (self._projectName, self._configFileName, self._verbosity) )
325 lCmds.append( "TEdenovo.py -P %s -C %s -S 4 -s Blaster -c Piler -m Map -v %i" % (self._projectName, self._configFileName, self._verbosity) )
326 lCmds.append( "TEdenovo.py -P %s -C %s -S 5 -s Blaster -c GrpRecPil -m Map -v %i" % (self._projectName, self._configFileName, self._verbosity) )
327 lCmds.append( "TEdenovo.py -P %s -C %s -S 6 -s Blaster -c GrpRecPil -m Map -v %i" % (self._projectName, self._configFileName, self._verbosity) )
328 lCmds.append( "TEdenovo.py -P %s -C %s -S 7 -s Blaster -c GrpRecPil -m Map -v %i" % (self._projectName, self._configFileName, self._verbosity) )
329
330 for cmd in lCmds:
331 returnValue = os.system(cmd)
332 if returnValue != 0:
333 print "ERROR: command '%s' returned %i" % (cmd, returnValue)
334 self._cleanTables()
335 sys.exit(1)
336
337 print "END time: %s" % time.strftime("%Y-%m-%d %H:%M:%S")
338 outFastaFile = glob.glob("%s_Blaster_GrpRecPil_Map_TEclassif_Filtered/*_denovoLibTEs_filtered.fa"%self._projectName)
339 shutil.copy(outFastaFile[0], self._outputFasta)
340 outStatsFile = glob.glob("%s_Blaster_GrpRecPil_Map_TEclassif_Filtered/*.classif_stats.txt"%self._projectName)
341 shutil.copy(outStatsFile[0], self._outputStats)
342 if self._classif:
343 outClassifFile = glob.glob("%s_Blaster_GrpRecPil_Map_TEclassif/classifConsensus/*_withoutRedundancy_negStrandReversed_WickerH.classif"%self._projectName)
344 shutil.copy(outClassifFile[0], self._outputClassif)
345 self._renameTE()
346
347 def _renameTE(self):
348 name=re.search(r'([^\/\\]*)-\d{8}-denovoLibTEs_filtered\.[fa|fasta|fsa|fas]',self._outputFasta).groups()[0]
349 os.system("sed -i 's|%s|%s|' %s" % (self._projectName,name, self._outputFasta))
350 if self._classif:
351 os.system("sed -i 's|%s|%s|' %s" % (self._projectName,name, self._outputClassif))
352
353 def preprocessFastaFile(self):
354 inFileHandler = open(self._fastaFileName, "r")
355 cumulLength = FastaUtils.dbCumLength(inFileHandler)
356 inFileHandler.close()
357 if cumulLength >= self._limitSeqSize:
358 print "Preprocess lauched"
359 allContigsBSDB=self.removeNstretches()
360 selectedContigsBSDB=self.selectContigs4givenSize(allContigsBSDB)
361 self.correctHeader(selectedContigsBSDB)
362 fastaFile=self.writeFastaInput(selectedContigsBSDB)
363 print "Preprocess finished"
364 else:
365 fastaFile=self._fastaFileName
366 print "No preprocess : the genome size %s lower than %s Mbp" % (cumulLength, self._limitSeqSize/1000000)
367 os.symlink(fastaFile,"%s/%s.fa" %(os.getcwd(),self._projectName)) #creer repertoire projet
368
369 def _launchListAndDropTables(self):
370 cmd = "ListAndDropTables.py"
371 cmd += " -C %s" % self._configFileName
372 cmd += " -d '%s'" % self._projectName
373 os.system(cmd)
374
375 def _cleanJobsTable(self):
376 db = DbFactory.createInstance( configFileName = self._configFileName )
377 sql_cmd="DELETE FROM jobs WHERE groupid like '%s%%';"%self._projectName
378 db.execute( sql_cmd )
379 db.close()
380
381 def _cleanTables(self):
382 self._launchListAndDropTables()
383 self. _cleanJobsTable()
384
385
386 def run(self):
387 os.mkdir(self._projectName)
388 os.chdir(self._projectName)
389 self._writeConfigFile()
390 self.preprocessFastaFile()
391 if self._nucl_bank != "" and self._nucl_bank != None:
392 os.symlink(self._nucl_bank,"%s/%s" %(os.getcwd(),os.path.basename(self._nucl_bank)))
393 if self._prot_bank != "" and self._prot_bank != None:
394 os.symlink(self._prot_bank,"%s/%s" %(os.getcwd(),os.path.basename(self._prot_bank)))
395 if self._HMM_profiles != "" and self._HMM_profiles != None:
396 os.symlink(self._HMM_profiles,"%s/%s" %(os.getcwd(),os.path.basename(self._HMM_profiles)))
397 if self._rdna_bank != "" and self._rdna_bank != None:
398 os.symlink(self._rdna_bank,"%s/%s" %(os.getcwd(),os.path.basename(self._rdna_bank)))
399
400 self._launchTEdenovo()
401 self._cleanTables()
402
403 if __name__ == '__main__':
404 iTEdenovo = TEdenovo_lite()
405 iTEdenovo.setAttributesFromCommandLine()
406 iTEdenovo.run()