comparison commons/launcher/LaunchBlastclust.py @ 18:94ab73e8a190

Uploaded
author m-zytnicki
date Mon, 29 Apr 2013 03:20:15 -0400
parents
children
comparison
equal deleted inserted replaced
17:b0e8584489e6 18:94ab73e8a190
1 #!/usr/bin/env python
2
3 """
4 Launch Blastclust on nucleotide sequences and return a fasta file.
5 """
6
7 # Copyright INRA (Institut National de la Recherche Agronomique)
8 # http://www.inra.fr
9 # http://urgi.versailles.inra.fr
10 #
11 # This software is governed by the CeCILL license under French law and
12 # abiding by the rules of distribution of free software. You can use,
13 # modify and/ or redistribute the software under the terms of the CeCILL
14 # license as circulated by CEA, CNRS and INRIA at the following URL
15 # "http://www.cecill.info".
16 #
17 # As a counterpart to the access to the source code and rights to copy,
18 # modify and redistribute granted by the license, users are provided only
19 # with a limited warranty and the software's author, the holder of the
20 # economic rights, and the successive licensors have only limited
21 # liability.
22 #
23 # In this respect, the user's attention is drawn to the risks associated
24 # with loading, using, modifying and/or developing or reproducing the
25 # software by the user in light of its specific status of free software,
26 # that may mean that it is complicated to manipulate, and that also
27 # therefore means that it is reserved for developers and experienced
28 # professionals having in-depth computer knowledge. Users are therefore
29 # encouraged to load and test the software's suitability as regards their
30 # requirements in conditions enabling the security of their systems and/or
31 # data to be ensured and, more generally, to use and operate it in the
32 # same conditions as regards security.
33 #
34 # The fact that you are presently reading this means that you have had
35 # knowledge of the CeCILL license and that you accept its terms.
36
37 import os
38 import sys
39 import subprocess
40 from commons.core.seq.BioseqDB import BioseqDB
41 from commons.core.seq.Bioseq import Bioseq
42 from commons.core.utils.RepetOptionParser import RepetOptionParser
43 from commons.tools.ChangeSequenceHeaders import ChangeSequenceHeaders
44
45 class LaunchBlastclust(object):
46 """
47 Launch Blastclust on nucleotide sequences and return a fasta file.
48 """
49
50 def __init__(self, input = "", outFilePrefix = "", clean = False, verbose = 0):
51 """
52 Constructor.
53 """
54 self._inFileName = input
55 self._identityThreshold = 95
56 self._coverageThreshold = 0.9
57 self._bothSeq = "T"
58 self._filterUnclusteredSeq = False
59 self._outFilePrefix = outFilePrefix
60 self._isBlastToMap = False
61 self._isHeaderForTEdenovo = False
62 self._nbCPUs = 1
63 self._clean = clean
64 self._verbose = verbose
65 self._tmpFileName = ""
66
67 def setAttributesFromCmdLine(self):
68 """
69 Set the attributes from the command-line.
70 """
71
72 description = "Launch Blastclust on nucleotide sequences and return a fasta file."
73 usage = "LaunchBlastclust.py -i inputFileName [options]"
74
75 examples = "\nExample 1: launch Blastclust with default options, highest verbose and clean temporary files.\n"
76 examples += "\t$ python ./LaunchBlastclust.py -i MyBank.fa -v 2 -c"
77 examples += "\n\t"
78 examples += "\t\nExample 2: launch Blastclust with an identity threshold of 90%, rename output files and generate a map file corresponding to the fasta output.\n"
79 examples += "\t$ python ./LaunchBlastclust.py -i MyBank.fa -S 90 -o SpecialOutputName -m"
80 examples += "\n\tWARNING: Please refer to -m option limitations in the description above.\n"
81
82 #TODO: check if the optionParser can handle '\' into strings for a better code readability in -m option
83
84 parser = RepetOptionParser(description = description, usage = usage, version = "v1.0", epilog = examples)
85 parser.add_option("-i", "--input", dest = "inFileName", type = "string", help = "name of the input fasta file (nucleotides)", default = "")
86 parser.add_option("-L", "--length", dest = "coverageThreshold", type = "float", help = "length coverage threshold (default=0.9)", default = 0.9)
87 parser.add_option("-S", "--ident", dest = "identityThreshold", type = "int", help = "identity threshold (default=95)", default = 95)
88 parser.add_option("-b", "--both", dest = "bothSeq", type = "string", help = "require coverage on both neighbours (default=T/F)", default = "T")
89 parser.add_option("-f", "--filter", dest = "filterUnclusteredSeq", help = "filter unclustered sequences", default = False, action="store_true")
90 parser.add_option("-o", "--out", dest = "outFilePrefix", type = "string", help = "prefix of the output files (default=input fasta file name)", default = "")
91 parser.add_option("-m", "--map", dest = "isBlast2Map", help = "generate an additional output file in map format (Warning: only works if blastclust's fasta input headers are formated like LTRharvest fasta output)", default = False, action="store_true")
92 parser.add_option("", "--TEdenovoHeader", dest = "isHeaderForTEdenovo", help = "format headers for TEdenovo pipeline", default = False, action="store_true")
93 parser.add_option("-n", "--num", dest = "nbCPUs", type = "int", help = "number of CPU's to use (default=1)", default = 1)
94 parser.add_option("-c", "--clean", dest = "clean", help = "clean temporary files", default = False, action="store_true")
95 parser.add_option("-v", "--verbose", dest = "verbose", type = "int", help = "verbosity level (default=0/1/2)", default = 0)
96
97 options = parser.parse_args()[0]
98 self._setAttributesFromOptions(options)
99
100 def _setAttributesFromOptions(self, options):
101 self.setInputFileName(options.inFileName)
102 self.setCoverageThreshold(options.coverageThreshold)
103 self.setIdentityThreshold(options.identityThreshold)
104 self.setBothSequences(options.bothSeq)
105 self.setNbCPUs(options.nbCPUs)
106 self.setIsHeaderForTEdenovo(options.isHeaderForTEdenovo)
107 if options.filterUnclusteredSeq:
108 self.setFilterUnclusteredSequences()
109 if options.outFilePrefix != "":
110 self.setOutputFilePrefix(options.outFilePrefix)
111 else:
112 self._outFilePrefix = self._inFileName
113 if options.isBlast2Map:
114 self.setIsBlastToMap()
115 if options.clean:
116 self.setClean()
117 self.setVerbosityLevel(options.verbose)
118
119 def setInputFileName(self , inFileName):
120 self._inFileName = inFileName
121
122 def setCoverageThreshold(self, lengthThresh):
123 self._coverageThreshold = float(lengthThresh)
124
125 def setIdentityThreshold(self, identityThresh):
126 self._identityThreshold = int(identityThresh)
127
128 def setBothSequences(self, bothSeq):
129 self._bothSeq = bothSeq
130
131 def setNbCPUs(self, nbCPUs):
132 self._nbCPUs = int(nbCPUs)
133
134 def setFilterUnclusteredSequences(self):
135 self._filterUnclusteredSeq = True
136
137 def setOutputFilePrefix(self, outFilePrefix):
138 self._outFilePrefix = outFilePrefix
139
140 def setIsBlastToMap(self):
141 self._isBlastToMap = True
142
143 def setIsHeaderForTEdenovo(self, isHeaderForTEdenovo):
144 self._isHeaderForTEdenovo = isHeaderForTEdenovo
145
146 def setClean(self):
147 self._clean = True
148
149 def setVerbosityLevel(self, verbose):
150 self._verbose = int(verbose)
151
152 def setTmpFileName(self, tmpFileName):
153 self._tmpFileName = tmpFileName
154
155
156 def checkAttributes(self):
157 """
158 Check the attributes are valid before running the algorithm.
159 """
160 if self._inFileName == "":
161 print "ERROR: missing input file name (-i)"
162 sys.exit(1)
163 if self._outFilePrefix == "":
164 self._outFilePrefix = self._inFileName
165 self._tmpFileName = "%s_blastclust.txt" % (self._outFilePrefix)
166
167
168 def launchBlastclust(self, inFile):
169 """
170 Launch Blastclust in command-line.
171 """
172 if os.path.exists(os.path.basename(inFile)):
173 inFile = os.path.basename(inFile)
174 prg = "blastclust"
175 cmd = prg
176 cmd += " -i %s" % (inFile)
177 cmd += " -o %s" % (self._tmpFileName)
178 cmd += " -S %i" % (self._identityThreshold)
179 cmd += " -L %f" % (self._coverageThreshold)
180 cmd += " -b %s" % (self._bothSeq)
181 cmd += " -p F"
182 cmd += " -a %i" % (self._nbCPUs)
183 if self._verbose == 0:
184 cmd += " -v blastclust.log"
185 if self._verbose > 0:
186 print cmd
187 sys.stdout.flush()
188 process = subprocess.Popen(cmd, shell = True)
189 process.communicate()
190 if process.returncode != 0:
191 raise Exception("ERROR when launching '%s'" % cmd)
192 if self._clean and os.path.exists("error.log"):
193 os.remove("error.log")
194 if self._clean and os.path.exists("blastclust.log"):
195 os.remove("blastclust.log")
196
197
198 def getClustersFromTxtFile(self):
199 """
200 Return a dictionary with cluster IDs as keys and sequence headers as values.
201 """
202 dClusterId2SeqHeaders = {}
203 inF = open(self._tmpFileName, "r")
204 line = inF.readline()
205 clusterId = 1
206 while True:
207 if line == "":
208 break
209 tokens = line[:-1].split(" ")
210 dClusterId2SeqHeaders[clusterId] = []
211 for seqHeader in tokens:
212 if seqHeader != "":
213 dClusterId2SeqHeaders[clusterId].append(seqHeader)
214 line = inF.readline()
215 clusterId += 1
216 inF.close()
217 if self._verbose > 0:
218 print "nb of clusters: %i" % (len(dClusterId2SeqHeaders.keys()))
219 sys.stdout.flush()
220 return dClusterId2SeqHeaders
221
222
223 def filterUnclusteredSequences(self, dClusterId2SeqHeaders):
224 """
225 Filter clusters having only one sequence.
226 """
227 for clusterId in dClusterId2SeqHeaders.keys():
228 if len(dClusterId2SeqHeaders[clusterId]) == 1:
229 del dClusterId2SeqHeaders[clusterId]
230 if self._verbose > 0:
231 print "nb of clusters (>1seq): %i" % (len(dClusterId2SeqHeaders.keys()))
232 sys.stdout.flush()
233 return dClusterId2SeqHeaders
234
235
236 def getClusteringResultsInFasta(self, inFile):
237 """
238 Write a fasta file whose sequence headers contain the clustering IDs.
239 """
240 dClusterId2SeqHeaders = self.getClustersFromTxtFile()
241 if self._filterUnclusteredSeq:
242 dClusterId2SeqHeaders = self.filterUnclusteredSequences(dClusterId2SeqHeaders)
243 inDB = BioseqDB(inFile)
244 outFileName = "%s_Blastclust.fa" % (inFile)
245 outF = open(outFileName, "w")
246 for clusterId in dClusterId2SeqHeaders.keys():
247 memberId = 1
248 for seqHeader in dClusterId2SeqHeaders[clusterId]:
249 bs = inDB.fetch(seqHeader)
250 bs.header = "BlastclustCluster%iMb%i_%s" % (clusterId, memberId, seqHeader)
251 bs.write(outF)
252 memberId += 1
253 outF.close()
254
255
256 def getLinkInitNewHeaders(self):
257 dNew2Init = {}
258 linkFileName = "%s.shortHlink" % (self._inFileName)
259 linkFile = open(linkFileName,"r")
260 line = linkFile.readline()
261 while True:
262 if line == "":
263 break
264 data = line.split("\t")
265 dNew2Init[data[0]] = data[1]
266 line = linkFile.readline()
267 linkFile.close()
268 return dNew2Init
269
270
271 def retrieveInitHeaders(self, dNewH2InitH):
272 tmpFaFile = "%s.shortH_Blastclust.fa" % (self._inFileName)
273 tmpFaFileHandler = open(tmpFaFile, "r")
274 outFaFile = "%s_Blastclust.fa" % (self._outFilePrefix)
275 outFaFileHandler = open(outFaFile, "w")
276 while True:
277 line = tmpFaFileHandler.readline()
278 if line == "":
279 break
280 if line[0] == ">":
281 tokens = line[1:-1].split("_")
282 initHeader = dNewH2InitH[tokens[1]]
283 if self._isHeaderForTEdenovo:
284 classif = initHeader.split("_")[0]
285 consensusName = "_".join(initHeader.split("_")[1:])
286 clusterId = tokens[0].split("Cluster")[1].split("Mb")[0]
287 newHeader = "%s_Blc%s_%s" % (classif, clusterId, consensusName)
288 else:
289 newHeader = "%s_%s" % (tokens[0], initHeader)
290 outFaFileHandler.write(">%s\n" % (newHeader))
291 else:
292 outFaFileHandler.write(line)
293 tmpFaFileHandler.close()
294 outFaFileHandler.close()
295 if self._clean:
296 os.remove(tmpFaFile)
297
298
299 def blastclustToMap(self, blastclustFastaOut):
300 """
301 Write a map file from blastclust fasta output.
302 Warning: only works if blastclust's fasta input headers are formated like LTRharvest fasta output.
303 """
304 fileDb = open(blastclustFastaOut , "r")
305 mapFilename = "%s.map" % (os.path.splitext(blastclustFastaOut)[0])
306 fileMap = open(mapFilename, "w")
307 seq = Bioseq()
308 numseq = 0
309 while 1:
310 seq.read(fileDb)
311 if seq.sequence == None:
312 break
313 numseq = numseq + 1
314 ID = seq.header.split(' ')[0].split('_')[0]
315 chunk = seq.header.split(' ')[0].split('_')[1]
316 start = seq.header.split(' ')[-1].split(',')[0][1:]
317 end = seq.header.split(' ')[-1].split(',')[1][:-1]
318 line= '%s\t%s\t%s\t%s' % (ID, chunk, start, end)
319 fileMap.write(line + "\n")
320
321 fileDb.close()
322 fileMap.close()
323 print "saved in %s" % mapFilename
324
325
326 def start(self):
327 """
328 Useful commands before running the program.
329 """
330 self.checkAttributes()
331 if self._verbose > 0:
332 print "START %s" % (type(self).__name__)
333
334
335 def end(self):
336 """
337 Useful commands before ending the program.
338 """
339 if self._verbose > 0:
340 print "END %s" % (type(self).__name__)
341
342
343 def run(self):
344 """
345 Run the program.
346 """
347 self.start()
348
349 iCSH = ChangeSequenceHeaders(inFile = self._inFileName, format = "fasta", step = 1, outFile = "%s.shortH" % self._inFileName, linkFile = "%s.shortHlink" % self._inFileName)
350 iCSH.run()
351
352 self.launchBlastclust("%s.shortH" % (self._inFileName))
353
354 self.getClusteringResultsInFasta("%s.shortH" % (self._inFileName))
355
356 dNewH2InitH = self.getLinkInitNewHeaders()
357 self.retrieveInitHeaders(dNewH2InitH)
358
359 if self._isBlastToMap:
360 blastclustFileName = "%s_Blastclust.fa" % (self._outFilePrefix)
361 self.blastclustToMap(blastclustFileName)
362
363 if self._clean:
364 os.remove("%s.shortH" % (self._inFileName))
365 os.remove("%s.shortHlink" % (self._inFileName))
366
367 self.end()
368
369 if __name__ == "__main__":
370 i = LaunchBlastclust()
371 i.setAttributesFromCmdLine()
372 i.run()