Mercurial > repos > vmarcon > repet_tedenovo
comparison TEdenovo_lite.py @ 0:baea09e6722b draft default tip
1st Uploaded
author | vmarcon |
---|---|
date | Mon, 06 Feb 2017 13:31:53 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:baea09e6722b |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 | |
4 import os | |
5 import sys | |
6 import time | |
7 import glob | |
8 import shutil | |
9 import ConfigParser | |
10 from commons.core.seq.FastaUtils import * | |
11 import operator | |
12 import re | |
13 | |
14 | |
15 | |
16 if not "REPET_PATH" in os.environ.keys(): | |
17 print "ERROR: no environment variable REPET_PATH" | |
18 sys.exit(1) | |
19 | |
20 if (not "REPET_DB" in os.environ.keys()) or (not "REPET_HOST" in os.environ.keys()) or (not "REPET_PORT" in os.environ.keys()) or (not "REPET_USER" in os.environ.keys()) or (not "REPET_PW" in os.environ.keys()): | |
21 print "ERROR: there is at least one environment database variable missing : REPET_DB, REPET_PORT, REPET_HOST, REPET_USER or REPET_PW" | |
22 sys.exit(1) | |
23 | |
24 if not "REPET_JOB_MANAGER" in os.environ.keys(): | |
25 print "ERROR: no environment variable REPET_JOB_MANAGER" | |
26 sys.exit(1) | |
27 | |
28 | |
29 if not "%s/bin" % os.environ["REPET_PATH"] in os.environ["PATH"]: | |
30 os.environ["PATH"] = "%s/bin:%s" % (os.environ["REPET_PATH"], os.environ["PATH"]) | |
31 | |
32 sys.path.append(os.environ["REPET_PATH"]) | |
33 if not "PYTHONPATH" in os.environ.keys(): | |
34 os.environ["PYTHONPATH"] = os.environ["REPET_PATH"] | |
35 else: | |
36 os.environ["PYTHONPATH"] = "%s:%s" % (os.environ["REPET_PATH"], os.environ["PYTHONPATH"]) | |
37 | |
38 from commons.core.LoggerFactory import LoggerFactory | |
39 from commons.core.checker.RepetException import RepetException | |
40 from commons.core.utils.FileUtils import FileUtils | |
41 from commons.core.utils.RepetOptionParser import RepetOptionParser | |
42 from commons.core.seq.FastaUtils import FastaUtils | |
43 from commons.core.sql.DbFactory import DbFactory | |
44 from itertools import islice | |
45 | |
46 LOG_DEPTH = "TEdenovo.pipeline" | |
47 | |
48 class TEdenovo_lite(object): | |
49 | |
50 def __init__(self, configFileName = "", fastaFileName = "", verbosity = 0): | |
51 self._configFileName = configFileName | |
52 self._fastaFileName = os.path.abspath(fastaFileName) | |
53 self._projectName = time.strftime("%Y%m%d%H%M%S") | |
54 self._limitSeqSize = 200000000 | |
55 | |
56 if "REPET_NUCL_BANK" in os.environ.keys(): | |
57 if os.path.exists(os.environ["REPET_NUCL_BANK"]): | |
58 self._nucl_bank = os.environ["REPET_NUCL_BANK"] | |
59 else : | |
60 print "ERROR : the nucleotides bank configured doesn't exist. Please correct it in the REPET_NUCL_BANK variable" | |
61 sys.exit(1) | |
62 else : | |
63 self._nucl_bank = "" | |
64 if "REPET_PROT_BANK" in os.environ.keys(): | |
65 if os.path.exists(os.environ["REPET_PROT_BANK"]): | |
66 self._prot_bank = os.environ["REPET_PROT_BANK"] | |
67 else : | |
68 print "ERROR : the proteins bank configured doesn't exist. Please correct it in the REPET_PROT_BANK variable" | |
69 sys.exit(1) | |
70 else : | |
71 self._prot_bank = "" | |
72 if "REPET_HMM_PROFILES" in os.environ.keys(): | |
73 if os.path.exists(os.environ["REPET_HMM_PROFILES"]): | |
74 self._HMM_profiles = os.environ["REPET_HMM_PROFILES"] | |
75 else : | |
76 print "ERROR : the hmm profiles bank configured doesn't exist. Please correct it in the REPET_HMM_PROFILES variable" | |
77 sys.exit(1) | |
78 else : | |
79 self._HMM_profiles = "" | |
80 if "REPET_RDNA_BANK" in os.environ.keys(): | |
81 if os.path.exists(os.environ["REPET_RDNA_BANK"]): | |
82 self._rdna_bank = os.environ["REPET_RDNA_BANK"] | |
83 else : | |
84 print "ERROR : the rDNA bank configured doesn't exist. Please correct it in the REPET_PROT_BANK variable" | |
85 sys.exit(1) | |
86 else : | |
87 self._rdna_bank = "" | |
88 if self._nucl_bank == "" and self._prot_bank == "" and self._HMM_profiles == "" and self._rdna_bank == "" : | |
89 print "WARNING : No bank are configured ... To set banks please add REPET_NUCL_BANK, REPET_PROT_BANK, REPET_HMM_PROFILES and/or REPET_RDNA_BANK in your environment" | |
90 if "REPET_TMP_DIR" in os.environ.keys(): | |
91 self._tmp_dir = os.environ["REPET_TMP_DIR"] | |
92 else : | |
93 self._tmp_dir = "" | |
94 self._outputFasta = "" | |
95 self._classif = False | |
96 self._outputClassif = "" | |
97 self._outputStats = "" | |
98 self._verbosity = verbosity | |
99 self._log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self._verbosity) | |
100 | |
101 def setAttributesFromCommandLine(self): | |
102 description = "This script is a ligth version of TEdenovo. It writes configuration file and launches TEdenovo." | |
103 epilog = "Example: TEdenovo_lite.py -i fastaFileName \n" | |
104 version = "2.0" | |
105 parser = RepetOptionParser(description = description, epilog = epilog, version = version) | |
106 parser.add_option("-i", "--fasta", dest = "fastaFileName" , action = "store" , type = "string", help ="input fasta file name ", default = "") | |
107 parser.add_option("-c", "--withClassif", dest="withClassif", action="store_true", help = " Get classification files in output.", default = False) | |
108 parser.add_option("-o", "--output", dest="outputLabel" , action = "store", type = "string", help = "[optional] Prefix label for output file(s).", default = "") | |
109 parser.add_option("-v", "--verbosity", dest = "verbosity", action = "store", type = "int", help = "Verbosity [optional] [default: 2]", default = 2) | |
110 options = parser.parse_args()[0] | |
111 self._setAttributesFromOptions(options) | |
112 | |
113 def _setAttributesFromOptions(self, options): | |
114 self.setConfigFileName("") | |
115 if options.fastaFileName=="": | |
116 print "ERROR : You have to enter an input fasta file" | |
117 print "Example: TEdenovo_lite.py -i fastaFileName \n" | |
118 print "More option : TEdenovo_lite.py --help " | |
119 exit(1) | |
120 else : | |
121 self._fastaFileName = os.path.abspath(options.fastaFileName) | |
122 if options.outputLabel=="": | |
123 fastaBaseName=os.path.abspath(re.search(r'([^\/\\]*)\.[fa|fasta|fsa|fas]',options.fastaFileName).groups()[0]) | |
124 options.outputLabel=fastaBaseName | |
125 self._outputFasta = os.path.abspath(options.outputLabel+"-%s-denovoLibTEs_filtered.fa"%self._projectName[:8]) | |
126 self._outputStats = os.path.abspath(options.outputLabel+"-%s-classif_stats.txt"%self._projectName[:8]) | |
127 self._verbosity = options.verbosity | |
128 if options.withClassif : | |
129 self._classif=True | |
130 self._outputClassif = os.path.abspath(options.outputLabel+'-%s.classif'%self._projectName[:8]) | |
131 | |
132 def setConfigFileName(self, configFileName): | |
133 self._configFileName = configFileName | |
134 if not self._configFileName: | |
135 self._configFileName = "TEdenovo_Galaxy_config_%s" % self._projectName | |
136 | |
137 def setAttributesFromConfigFile(self, configFileName): | |
138 config = ConfigParser.ConfigParser() | |
139 config.readfp( open(configFileName) ) | |
140 | |
141 | |
142 def _writeConfigFile(self): | |
143 if FileUtils.isRessourceExists(self._configFileName): | |
144 self._logAndRaise("Configuration file '%s' already exists. Won't be overwritten.") | |
145 | |
146 shutil.copy("%s/config/TEdenovo.cfg" % os.environ.get("REPET_PATH"), self._configFileName) | |
147 self.setAttributesFromConfigFile(self._configFileName) | |
148 | |
149 os.system("sed -i 's|repet_host: <your_MySQL_host>|repet_host: %s|' %s" % (os.environ["REPET_HOST"], self._configFileName)) | |
150 os.system("sed -i 's|repet_user: <your_MySQL_login>|repet_user: %s|' %s" % (os.environ["REPET_USER"], self._configFileName)) | |
151 os.system("sed -i 's|repet_pw: <your_MySQL_password>|repet_pw: %s|' %s" % (os.environ["REPET_PW"], self._configFileName)) | |
152 os.system("sed -i 's|repet_db: <your_MySQL_db>|repet_db: %s|' %s" % (os.environ["REPET_DB"], self._configFileName)) | |
153 os.system("sed -i 's|repet_port: 3306|repet_port: %s|' %s" % (os.environ["REPET_PORT"], self._configFileName)) | |
154 os.system("sed -i 's|repet_job_manager: SGE|repet_job_manager: %s|' %s" % (os.environ["REPET_JOB_MANAGER"], self._configFileName)) | |
155 os.system("sed -i 's|project_name: <your_project_name>|project_name: %s|' %s" % (self._projectName, self._configFileName)) | |
156 os.system("sed -i 's|project_dir: <absolute_path_to_your_project_directory>|project_dir: %s|' %s" % (os.getcwd().replace("/", "\/"), self._configFileName)) | |
157 os.system("sed -i 's|tmpDir:|tmpDir: %s|g' %s" % (self._tmp_dir, self._configFileName)) | |
158 | |
159 if self._nucl_bank != "" and self._nucl_bank != None: | |
160 os.system("sed -i 's|TE_BLRn: no|TE_BLRn: yes|' %s" % self._configFileName) | |
161 os.system("sed -i 's|TE_BLRtx: no|TE_BLRtx: yes|' %s" % self._configFileName) | |
162 os.system("sed -i 's|TE_nucl_bank: <bank_of_TE_nucleotide_sequences_such_as_Repbase>|TE_nucl_bank: %s|' %s" % (os.path.basename(self._nucl_bank), self._configFileName)) | |
163 | |
164 if self._prot_bank != "" and self._prot_bank != None: | |
165 os.system("sed -i 's|TE_BLRx: no|TE_BLRx: yes|' %s" % self._configFileName) | |
166 os.system("sed -i 's|TE_prot_bank: <bank_of_TE_amino-acid_sequences_such_as_Repbase>|TE_prot_bank: %s|' %s" % (os.path.basename(self._prot_bank), self._configFileName)) | |
167 | |
168 if self._HMM_profiles != "" and self._HMM_profiles != None: | |
169 os.system("sed -i 's|TE_HMMER: no|TE_HMMER: yes|' %s" % self._configFileName) | |
170 os.system("sed -i 's|TE_HMM_profiles: <bank_of_HMM_profiles>|TE_HMM_profiles: %s|' %s" % (os.path.basename(self._HMM_profiles),self._configFileName)) | |
171 | |
172 if self._rdna_bank != "" and self._rdna_bank != None: | |
173 os.system("sed -i 's|rDNA_BLRn: no|rDNA_BLRn: yes|' %s" % self._configFileName) | |
174 os.system("sed -i 's|rDNA_bank: <bank_of_rDNA_sequences_from_eukaryota>|rDNA_bank: %s|' %s" % (os.path.basename(self._rdna_bank),self._configFileName)) | |
175 | |
176 os.system("sed -i 's|filter_host_gene: no|filter_host_gene: yes|' %s" % (self._configFileName)) | |
177 | |
178 | |
179 def removeNstretches(self,maxNstretchesSize=11,minContigsize=10000): | |
180 if self._verbosity > 0: | |
181 print "Removing Nstretches longer than %d pb and removing conting shorter than %d pb"%(maxNstretchesSize,minContigsize) | |
182 t0=time.time() | |
183 Nstretches=FastaUtils.getNstretchesRangesList(self._fastaFileName,maxNstretchesSize) | |
184 t1=time.time() | |
185 refBSDB = BioseqDB(self._fastaFileName) | |
186 t3=time.time() | |
187 debut=1 | |
188 t2=time.time() | |
189 if len(Nstretches)>0: | |
190 currentchrom=Nstretches[0].seqname | |
191 refBS=refBSDB.fetch(currentchrom) | |
192 t3=time.time() | |
193 newBSDB = BioseqDB() | |
194 i=0 | |
195 seqInNstretches = [] | |
196 for Nstretch in Nstretches : | |
197 i+=1 | |
198 tmpBS="" | |
199 if Nstretch.seqname not in seqInNstretches : | |
200 seqInNstretches.append(Nstretch.seqname) | |
201 if currentchrom==Nstretch.seqname: | |
202 fin=Nstretch.start-1 | |
203 size=fin-debut+1 | |
204 if size>minContigsize : | |
205 tmpBS=refBS.subseq(debut,fin) | |
206 newBSDB.add(tmpBS) | |
207 | |
208 debut=Nstretch.end+1 | |
209 | |
210 else : | |
211 fin = refBSDB.getSeqLength(currentchrom) | |
212 size=fin-debut+1 | |
213 if size>minContigsize : | |
214 tmpBS=refBS.subseq(debut,fin) | |
215 newBSDB.add(tmpBS) | |
216 currentchrom=Nstretch.seqname | |
217 refBS=refBSDB.fetch(currentchrom) | |
218 debut=1 | |
219 fin==Nstretch.start | |
220 size=fin-debut+1 | |
221 if size>minContigsize : | |
222 tmpBS=refBS.subseq(debut,fin) | |
223 newBSDB.add(tmpBS) | |
224 debut=Nstretch.end+1 | |
225 | |
226 if len(Nstretches)>0: | |
227 fin = refBSDB.getSeqLength(currentchrom) | |
228 size=fin-debut+1 | |
229 if size>minContigsize : | |
230 tmpBS=refBS.subseq(debut,fin) | |
231 newBSDB.add(tmpBS) | |
232 | |
233 for refName in refBSDB.getHeaderList() : | |
234 if refName not in seqInNstretches: | |
235 debut=1 | |
236 fin=refBSDB.getSeqLength(refName) | |
237 size=fin-debut+1 | |
238 if size>minContigsize : | |
239 refBS=refBSDB.fetch(refName) | |
240 tmpBS=refBS.subseq(debut,fin) | |
241 newBSDB.add(tmpBS) | |
242 | |
243 | |
244 t5b=time.time() | |
245 if self._verbosity >= 2: | |
246 print "%s contigs selected from %s scaffolds"%(newBSDB.getSize(),refBSDB.getSize()) | |
247 #newBSDB.sortByLength(reverse=True) | |
248 | |
249 return newBSDB | |
250 | |
251 | |
252 | |
253 #TODO refactoring about min size of genome for preprocess | |
254 def selectContigs4givenSize(self,BSDB,limit=200000000): | |
255 if self._verbosity > 0: | |
256 print "Selecting contigs to reach %s pb "%limit | |
257 contigsHeadersAndLength=zip(BSDB.getHeaderList(),BSDB.getListOfSequencesLength()) | |
258 size=0 | |
259 size_small=0 | |
260 size_big=500000000 | |
261 lselectedContigs=[] | |
262 | |
263 for seq in BSDB.db : | |
264 size+=seq.getLength() | |
265 if size<limit: | |
266 lselectedContigs.append(seq) | |
267 size_small=size | |
268 else : | |
269 size_big=size | |
270 break | |
271 | |
272 if size_big-limit<limit-size_small : | |
273 lselectedContigs.append(seq) | |
274 | |
275 if self._verbosity > 0: | |
276 print "%s contigs selected to reach %s pb (%s contigs initially) "%(len(lselectedContigs),limit,len(contigsHeadersAndLength)) | |
277 | |
278 selectedContigsBSDB=BioseqDB() | |
279 selectedContigsBSDB.setData(lselectedContigs) | |
280 return selectedContigsBSDB | |
281 | |
282 def writeFastaInput(self,BSDB,outFileName=''): | |
283 if self._verbosity > 0: | |
284 print "Writing fasta file" | |
285 | |
286 if not outFileName: | |
287 outFileName = self._projectName + ".fastaExtract" | |
288 | |
289 BSDB.save(outFileName) | |
290 if self._verbosity > 0: | |
291 print '%d sequences saved.'%BSDB.getSize() | |
292 | |
293 return outFileName | |
294 | |
295 def correctHeader(self,BSDB): | |
296 if self._verbosity > 0: | |
297 print "Correcting fasta headers" | |
298 replacedSeqNb=0 | |
299 for header in BSDB.getHeaderList() : | |
300 p = re.compile('[^a-zA-Z0-9_:\.\-]', re.IGNORECASE) | |
301 if p.search(header): | |
302 sub=list(set(p.findall(header))) | |
303 correctedHeader=header | |
304 for s in sub : | |
305 correctedHeader=correctedHeader.replace(s,'_') | |
306 if self._verbosity>2: | |
307 print "Correct Header : '%s' replaced by '%s'"%(header,correctedHeader) | |
308 BSDB.fetch(header).setHeader(correctedHeader) | |
309 replacedSeqNb+=1 | |
310 if self._verbosity > 0: | |
311 print '%s sequence headers corrected'%replacedSeqNb | |
312 return BSDB | |
313 | |
314 | |
315 def _launchTEdenovo(self): | |
316 print "START time: %s" % time.strftime("%Y-%m-%d %H:%M:%S") | |
317 lCmds = [] | |
318 lCmds.append( "TEdenovo.py -P %s -C %s -S 1 -v %i" % (self._projectName, self._configFileName, self._verbosity) ) | |
319 lCmds.append( "TEdenovo.py -P %s -C %s -S 2 -s Blaster -v %i" % (self._projectName, self._configFileName, self._verbosity) ) | |
320 lCmds.append( "TEdenovo.py -P %s -C %s -S 3 -s Blaster -c Grouper -v %i" % (self._projectName, self._configFileName, self._verbosity) ) | |
321 lCmds.append( "TEdenovo.py -P %s -C %s -S 3 -s Blaster -c Recon -v %i" % (self._projectName, self._configFileName, self._verbosity) ) | |
322 lCmds.append( "TEdenovo.py -P %s -C %s -S 3 -s Blaster -c Piler -v %i" % (self._projectName, self._configFileName, self._verbosity) ) | |
323 lCmds.append( "TEdenovo.py -P %s -C %s -S 4 -s Blaster -c Grouper -m Map -v %i" % (self._projectName, self._configFileName, self._verbosity) ) | |
324 lCmds.append( "TEdenovo.py -P %s -C %s -S 4 -s Blaster -c Recon -m Map -v %i" % (self._projectName, self._configFileName, self._verbosity) ) | |
325 lCmds.append( "TEdenovo.py -P %s -C %s -S 4 -s Blaster -c Piler -m Map -v %i" % (self._projectName, self._configFileName, self._verbosity) ) | |
326 lCmds.append( "TEdenovo.py -P %s -C %s -S 5 -s Blaster -c GrpRecPil -m Map -v %i" % (self._projectName, self._configFileName, self._verbosity) ) | |
327 lCmds.append( "TEdenovo.py -P %s -C %s -S 6 -s Blaster -c GrpRecPil -m Map -v %i" % (self._projectName, self._configFileName, self._verbosity) ) | |
328 lCmds.append( "TEdenovo.py -P %s -C %s -S 7 -s Blaster -c GrpRecPil -m Map -v %i" % (self._projectName, self._configFileName, self._verbosity) ) | |
329 | |
330 for cmd in lCmds: | |
331 returnValue = os.system(cmd) | |
332 if returnValue != 0: | |
333 print "ERROR: command '%s' returned %i" % (cmd, returnValue) | |
334 self._cleanTables() | |
335 sys.exit(1) | |
336 | |
337 print "END time: %s" % time.strftime("%Y-%m-%d %H:%M:%S") | |
338 outFastaFile = glob.glob("%s_Blaster_GrpRecPil_Map_TEclassif_Filtered/*_denovoLibTEs_filtered.fa"%self._projectName) | |
339 shutil.copy(outFastaFile[0], self._outputFasta) | |
340 outStatsFile = glob.glob("%s_Blaster_GrpRecPil_Map_TEclassif_Filtered/*.classif_stats.txt"%self._projectName) | |
341 shutil.copy(outStatsFile[0], self._outputStats) | |
342 if self._classif: | |
343 outClassifFile = glob.glob("%s_Blaster_GrpRecPil_Map_TEclassif/classifConsensus/*_withoutRedundancy_negStrandReversed_WickerH.classif"%self._projectName) | |
344 shutil.copy(outClassifFile[0], self._outputClassif) | |
345 self._renameTE() | |
346 | |
347 def _renameTE(self): | |
348 name=re.search(r'([^\/\\]*)-\d{8}-denovoLibTEs_filtered\.[fa|fasta|fsa|fas]',self._outputFasta).groups()[0] | |
349 os.system("sed -i 's|%s|%s|' %s" % (self._projectName,name, self._outputFasta)) | |
350 if self._classif: | |
351 os.system("sed -i 's|%s|%s|' %s" % (self._projectName,name, self._outputClassif)) | |
352 | |
353 def preprocessFastaFile(self): | |
354 inFileHandler = open(self._fastaFileName, "r") | |
355 cumulLength = FastaUtils.dbCumLength(inFileHandler) | |
356 inFileHandler.close() | |
357 if cumulLength >= self._limitSeqSize: | |
358 print "Preprocess lauched" | |
359 allContigsBSDB=self.removeNstretches() | |
360 selectedContigsBSDB=self.selectContigs4givenSize(allContigsBSDB) | |
361 self.correctHeader(selectedContigsBSDB) | |
362 fastaFile=self.writeFastaInput(selectedContigsBSDB) | |
363 print "Preprocess finished" | |
364 else: | |
365 fastaFile=self._fastaFileName | |
366 print "No preprocess : the genome size %s lower than %s Mbp" % (cumulLength, self._limitSeqSize/1000000) | |
367 os.symlink(fastaFile,"%s/%s.fa" %(os.getcwd(),self._projectName)) #creer repertoire projet | |
368 | |
369 def _launchListAndDropTables(self): | |
370 cmd = "ListAndDropTables.py" | |
371 cmd += " -C %s" % self._configFileName | |
372 cmd += " -d '%s'" % self._projectName | |
373 os.system(cmd) | |
374 | |
375 def _cleanJobsTable(self): | |
376 db = DbFactory.createInstance( configFileName = self._configFileName ) | |
377 sql_cmd="DELETE FROM jobs WHERE groupid like '%s%%';"%self._projectName | |
378 db.execute( sql_cmd ) | |
379 db.close() | |
380 | |
381 def _cleanTables(self): | |
382 self._launchListAndDropTables() | |
383 self. _cleanJobsTable() | |
384 | |
385 | |
386 def run(self): | |
387 os.mkdir(self._projectName) | |
388 os.chdir(self._projectName) | |
389 self._writeConfigFile() | |
390 self.preprocessFastaFile() | |
391 if self._nucl_bank != "" and self._nucl_bank != None: | |
392 os.symlink(self._nucl_bank,"%s/%s" %(os.getcwd(),os.path.basename(self._nucl_bank))) | |
393 if self._prot_bank != "" and self._prot_bank != None: | |
394 os.symlink(self._prot_bank,"%s/%s" %(os.getcwd(),os.path.basename(self._prot_bank))) | |
395 if self._HMM_profiles != "" and self._HMM_profiles != None: | |
396 os.symlink(self._HMM_profiles,"%s/%s" %(os.getcwd(),os.path.basename(self._HMM_profiles))) | |
397 if self._rdna_bank != "" and self._rdna_bank != None: | |
398 os.symlink(self._rdna_bank,"%s/%s" %(os.getcwd(),os.path.basename(self._rdna_bank))) | |
399 | |
400 self._launchTEdenovo() | |
401 self._cleanTables() | |
402 | |
403 if __name__ == '__main__': | |
404 iTEdenovo = TEdenovo_lite() | |
405 iTEdenovo.setAttributesFromCommandLine() | |
406 iTEdenovo.run() |