Mercurial > repos > yufei-luo > s_mart
comparison commons/launcher/LaunchBlastclust.py @ 18:94ab73e8a190
Uploaded
| author | m-zytnicki |
|---|---|
| date | Mon, 29 Apr 2013 03:20:15 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 17:b0e8584489e6 | 18:94ab73e8a190 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 """ | |
| 4 Launch Blastclust on nucleotide sequences and return a fasta file. | |
| 5 """ | |
| 6 | |
| 7 # Copyright INRA (Institut National de la Recherche Agronomique) | |
| 8 # http://www.inra.fr | |
| 9 # http://urgi.versailles.inra.fr | |
| 10 # | |
| 11 # This software is governed by the CeCILL license under French law and | |
| 12 # abiding by the rules of distribution of free software. You can use, | |
| 13 # modify and/ or redistribute the software under the terms of the CeCILL | |
| 14 # license as circulated by CEA, CNRS and INRIA at the following URL | |
| 15 # "http://www.cecill.info". | |
| 16 # | |
| 17 # As a counterpart to the access to the source code and rights to copy, | |
| 18 # modify and redistribute granted by the license, users are provided only | |
| 19 # with a limited warranty and the software's author, the holder of the | |
| 20 # economic rights, and the successive licensors have only limited | |
| 21 # liability. | |
| 22 # | |
| 23 # In this respect, the user's attention is drawn to the risks associated | |
| 24 # with loading, using, modifying and/or developing or reproducing the | |
| 25 # software by the user in light of its specific status of free software, | |
| 26 # that may mean that it is complicated to manipulate, and that also | |
| 27 # therefore means that it is reserved for developers and experienced | |
| 28 # professionals having in-depth computer knowledge. Users are therefore | |
| 29 # encouraged to load and test the software's suitability as regards their | |
| 30 # requirements in conditions enabling the security of their systems and/or | |
| 31 # data to be ensured and, more generally, to use and operate it in the | |
| 32 # same conditions as regards security. | |
| 33 # | |
| 34 # The fact that you are presently reading this means that you have had | |
| 35 # knowledge of the CeCILL license and that you accept its terms. | |
| 36 | |
| 37 import os | |
| 38 import sys | |
| 39 import subprocess | |
| 40 from commons.core.seq.BioseqDB import BioseqDB | |
| 41 from commons.core.seq.Bioseq import Bioseq | |
| 42 from commons.core.utils.RepetOptionParser import RepetOptionParser | |
| 43 from commons.tools.ChangeSequenceHeaders import ChangeSequenceHeaders | |
| 44 | |
| 45 class LaunchBlastclust(object): | |
| 46 """ | |
| 47 Launch Blastclust on nucleotide sequences and return a fasta file. | |
| 48 """ | |
| 49 | |
| 50 def __init__(self, input = "", outFilePrefix = "", clean = False, verbose = 0): | |
| 51 """ | |
| 52 Constructor. | |
| 53 """ | |
| 54 self._inFileName = input | |
| 55 self._identityThreshold = 95 | |
| 56 self._coverageThreshold = 0.9 | |
| 57 self._bothSeq = "T" | |
| 58 self._filterUnclusteredSeq = False | |
| 59 self._outFilePrefix = outFilePrefix | |
| 60 self._isBlastToMap = False | |
| 61 self._isHeaderForTEdenovo = False | |
| 62 self._nbCPUs = 1 | |
| 63 self._clean = clean | |
| 64 self._verbose = verbose | |
| 65 self._tmpFileName = "" | |
| 66 | |
| 67 def setAttributesFromCmdLine(self): | |
| 68 """ | |
| 69 Set the attributes from the command-line. | |
| 70 """ | |
| 71 | |
| 72 description = "Launch Blastclust on nucleotide sequences and return a fasta file." | |
| 73 usage = "LaunchBlastclust.py -i inputFileName [options]" | |
| 74 | |
| 75 examples = "\nExample 1: launch Blastclust with default options, highest verbose and clean temporary files.\n" | |
| 76 examples += "\t$ python ./LaunchBlastclust.py -i MyBank.fa -v 2 -c" | |
| 77 examples += "\n\t" | |
| 78 examples += "\t\nExample 2: launch Blastclust with an identity threshold of 90%, rename output files and generate a map file corresponding to the fasta output.\n" | |
| 79 examples += "\t$ python ./LaunchBlastclust.py -i MyBank.fa -S 90 -o SpecialOutputName -m" | |
| 80 examples += "\n\tWARNING: Please refer to -m option limitations in the description above.\n" | |
| 81 | |
| 82 #TODO: check if the optionParser can handle '\' into strings for a better code readability in -m option | |
| 83 | |
| 84 parser = RepetOptionParser(description = description, usage = usage, version = "v1.0", epilog = examples) | |
| 85 parser.add_option("-i", "--input", dest = "inFileName", type = "string", help = "name of the input fasta file (nucleotides)", default = "") | |
| 86 parser.add_option("-L", "--length", dest = "coverageThreshold", type = "float", help = "length coverage threshold (default=0.9)", default = 0.9) | |
| 87 parser.add_option("-S", "--ident", dest = "identityThreshold", type = "int", help = "identity threshold (default=95)", default = 95) | |
| 88 parser.add_option("-b", "--both", dest = "bothSeq", type = "string", help = "require coverage on both neighbours (default=T/F)", default = "T") | |
| 89 parser.add_option("-f", "--filter", dest = "filterUnclusteredSeq", help = "filter unclustered sequences", default = False, action="store_true") | |
| 90 parser.add_option("-o", "--out", dest = "outFilePrefix", type = "string", help = "prefix of the output files (default=input fasta file name)", default = "") | |
| 91 parser.add_option("-m", "--map", dest = "isBlast2Map", help = "generate an additional output file in map format (Warning: only works if blastclust's fasta input headers are formated like LTRharvest fasta output)", default = False, action="store_true") | |
| 92 parser.add_option("", "--TEdenovoHeader", dest = "isHeaderForTEdenovo", help = "format headers for TEdenovo pipeline", default = False, action="store_true") | |
| 93 parser.add_option("-n", "--num", dest = "nbCPUs", type = "int", help = "number of CPU's to use (default=1)", default = 1) | |
| 94 parser.add_option("-c", "--clean", dest = "clean", help = "clean temporary files", default = False, action="store_true") | |
| 95 parser.add_option("-v", "--verbose", dest = "verbose", type = "int", help = "verbosity level (default=0/1/2)", default = 0) | |
| 96 | |
| 97 options = parser.parse_args()[0] | |
| 98 self._setAttributesFromOptions(options) | |
| 99 | |
| 100 def _setAttributesFromOptions(self, options): | |
| 101 self.setInputFileName(options.inFileName) | |
| 102 self.setCoverageThreshold(options.coverageThreshold) | |
| 103 self.setIdentityThreshold(options.identityThreshold) | |
| 104 self.setBothSequences(options.bothSeq) | |
| 105 self.setNbCPUs(options.nbCPUs) | |
| 106 self.setIsHeaderForTEdenovo(options.isHeaderForTEdenovo) | |
| 107 if options.filterUnclusteredSeq: | |
| 108 self.setFilterUnclusteredSequences() | |
| 109 if options.outFilePrefix != "": | |
| 110 self.setOutputFilePrefix(options.outFilePrefix) | |
| 111 else: | |
| 112 self._outFilePrefix = self._inFileName | |
| 113 if options.isBlast2Map: | |
| 114 self.setIsBlastToMap() | |
| 115 if options.clean: | |
| 116 self.setClean() | |
| 117 self.setVerbosityLevel(options.verbose) | |
| 118 | |
| 119 def setInputFileName(self , inFileName): | |
| 120 self._inFileName = inFileName | |
| 121 | |
| 122 def setCoverageThreshold(self, lengthThresh): | |
| 123 self._coverageThreshold = float(lengthThresh) | |
| 124 | |
| 125 def setIdentityThreshold(self, identityThresh): | |
| 126 self._identityThreshold = int(identityThresh) | |
| 127 | |
| 128 def setBothSequences(self, bothSeq): | |
| 129 self._bothSeq = bothSeq | |
| 130 | |
| 131 def setNbCPUs(self, nbCPUs): | |
| 132 self._nbCPUs = int(nbCPUs) | |
| 133 | |
| 134 def setFilterUnclusteredSequences(self): | |
| 135 self._filterUnclusteredSeq = True | |
| 136 | |
| 137 def setOutputFilePrefix(self, outFilePrefix): | |
| 138 self._outFilePrefix = outFilePrefix | |
| 139 | |
| 140 def setIsBlastToMap(self): | |
| 141 self._isBlastToMap = True | |
| 142 | |
| 143 def setIsHeaderForTEdenovo(self, isHeaderForTEdenovo): | |
| 144 self._isHeaderForTEdenovo = isHeaderForTEdenovo | |
| 145 | |
| 146 def setClean(self): | |
| 147 self._clean = True | |
| 148 | |
| 149 def setVerbosityLevel(self, verbose): | |
| 150 self._verbose = int(verbose) | |
| 151 | |
| 152 def setTmpFileName(self, tmpFileName): | |
| 153 self._tmpFileName = tmpFileName | |
| 154 | |
| 155 | |
| 156 def checkAttributes(self): | |
| 157 """ | |
| 158 Check the attributes are valid before running the algorithm. | |
| 159 """ | |
| 160 if self._inFileName == "": | |
| 161 print "ERROR: missing input file name (-i)" | |
| 162 sys.exit(1) | |
| 163 if self._outFilePrefix == "": | |
| 164 self._outFilePrefix = self._inFileName | |
| 165 self._tmpFileName = "%s_blastclust.txt" % (self._outFilePrefix) | |
| 166 | |
| 167 | |
| 168 def launchBlastclust(self, inFile): | |
| 169 """ | |
| 170 Launch Blastclust in command-line. | |
| 171 """ | |
| 172 if os.path.exists(os.path.basename(inFile)): | |
| 173 inFile = os.path.basename(inFile) | |
| 174 prg = "blastclust" | |
| 175 cmd = prg | |
| 176 cmd += " -i %s" % (inFile) | |
| 177 cmd += " -o %s" % (self._tmpFileName) | |
| 178 cmd += " -S %i" % (self._identityThreshold) | |
| 179 cmd += " -L %f" % (self._coverageThreshold) | |
| 180 cmd += " -b %s" % (self._bothSeq) | |
| 181 cmd += " -p F" | |
| 182 cmd += " -a %i" % (self._nbCPUs) | |
| 183 if self._verbose == 0: | |
| 184 cmd += " -v blastclust.log" | |
| 185 if self._verbose > 0: | |
| 186 print cmd | |
| 187 sys.stdout.flush() | |
| 188 process = subprocess.Popen(cmd, shell = True) | |
| 189 process.communicate() | |
| 190 if process.returncode != 0: | |
| 191 raise Exception("ERROR when launching '%s'" % cmd) | |
| 192 if self._clean and os.path.exists("error.log"): | |
| 193 os.remove("error.log") | |
| 194 if self._clean and os.path.exists("blastclust.log"): | |
| 195 os.remove("blastclust.log") | |
| 196 | |
| 197 | |
| 198 def getClustersFromTxtFile(self): | |
| 199 """ | |
| 200 Return a dictionary with cluster IDs as keys and sequence headers as values. | |
| 201 """ | |
| 202 dClusterId2SeqHeaders = {} | |
| 203 inF = open(self._tmpFileName, "r") | |
| 204 line = inF.readline() | |
| 205 clusterId = 1 | |
| 206 while True: | |
| 207 if line == "": | |
| 208 break | |
| 209 tokens = line[:-1].split(" ") | |
| 210 dClusterId2SeqHeaders[clusterId] = [] | |
| 211 for seqHeader in tokens: | |
| 212 if seqHeader != "": | |
| 213 dClusterId2SeqHeaders[clusterId].append(seqHeader) | |
| 214 line = inF.readline() | |
| 215 clusterId += 1 | |
| 216 inF.close() | |
| 217 if self._verbose > 0: | |
| 218 print "nb of clusters: %i" % (len(dClusterId2SeqHeaders.keys())) | |
| 219 sys.stdout.flush() | |
| 220 return dClusterId2SeqHeaders | |
| 221 | |
| 222 | |
| 223 def filterUnclusteredSequences(self, dClusterId2SeqHeaders): | |
| 224 """ | |
| 225 Filter clusters having only one sequence. | |
| 226 """ | |
| 227 for clusterId in dClusterId2SeqHeaders.keys(): | |
| 228 if len(dClusterId2SeqHeaders[clusterId]) == 1: | |
| 229 del dClusterId2SeqHeaders[clusterId] | |
| 230 if self._verbose > 0: | |
| 231 print "nb of clusters (>1seq): %i" % (len(dClusterId2SeqHeaders.keys())) | |
| 232 sys.stdout.flush() | |
| 233 return dClusterId2SeqHeaders | |
| 234 | |
| 235 | |
| 236 def getClusteringResultsInFasta(self, inFile): | |
| 237 """ | |
| 238 Write a fasta file whose sequence headers contain the clustering IDs. | |
| 239 """ | |
| 240 dClusterId2SeqHeaders = self.getClustersFromTxtFile() | |
| 241 if self._filterUnclusteredSeq: | |
| 242 dClusterId2SeqHeaders = self.filterUnclusteredSequences(dClusterId2SeqHeaders) | |
| 243 inDB = BioseqDB(inFile) | |
| 244 outFileName = "%s_Blastclust.fa" % (inFile) | |
| 245 outF = open(outFileName, "w") | |
| 246 for clusterId in dClusterId2SeqHeaders.keys(): | |
| 247 memberId = 1 | |
| 248 for seqHeader in dClusterId2SeqHeaders[clusterId]: | |
| 249 bs = inDB.fetch(seqHeader) | |
| 250 bs.header = "BlastclustCluster%iMb%i_%s" % (clusterId, memberId, seqHeader) | |
| 251 bs.write(outF) | |
| 252 memberId += 1 | |
| 253 outF.close() | |
| 254 | |
| 255 | |
| 256 def getLinkInitNewHeaders(self): | |
| 257 dNew2Init = {} | |
| 258 linkFileName = "%s.shortHlink" % (self._inFileName) | |
| 259 linkFile = open(linkFileName,"r") | |
| 260 line = linkFile.readline() | |
| 261 while True: | |
| 262 if line == "": | |
| 263 break | |
| 264 data = line.split("\t") | |
| 265 dNew2Init[data[0]] = data[1] | |
| 266 line = linkFile.readline() | |
| 267 linkFile.close() | |
| 268 return dNew2Init | |
| 269 | |
| 270 | |
| 271 def retrieveInitHeaders(self, dNewH2InitH): | |
| 272 tmpFaFile = "%s.shortH_Blastclust.fa" % (self._inFileName) | |
| 273 tmpFaFileHandler = open(tmpFaFile, "r") | |
| 274 outFaFile = "%s_Blastclust.fa" % (self._outFilePrefix) | |
| 275 outFaFileHandler = open(outFaFile, "w") | |
| 276 while True: | |
| 277 line = tmpFaFileHandler.readline() | |
| 278 if line == "": | |
| 279 break | |
| 280 if line[0] == ">": | |
| 281 tokens = line[1:-1].split("_") | |
| 282 initHeader = dNewH2InitH[tokens[1]] | |
| 283 if self._isHeaderForTEdenovo: | |
| 284 classif = initHeader.split("_")[0] | |
| 285 consensusName = "_".join(initHeader.split("_")[1:]) | |
| 286 clusterId = tokens[0].split("Cluster")[1].split("Mb")[0] | |
| 287 newHeader = "%s_Blc%s_%s" % (classif, clusterId, consensusName) | |
| 288 else: | |
| 289 newHeader = "%s_%s" % (tokens[0], initHeader) | |
| 290 outFaFileHandler.write(">%s\n" % (newHeader)) | |
| 291 else: | |
| 292 outFaFileHandler.write(line) | |
| 293 tmpFaFileHandler.close() | |
| 294 outFaFileHandler.close() | |
| 295 if self._clean: | |
| 296 os.remove(tmpFaFile) | |
| 297 | |
| 298 | |
| 299 def blastclustToMap(self, blastclustFastaOut): | |
| 300 """ | |
| 301 Write a map file from blastclust fasta output. | |
| 302 Warning: only works if blastclust's fasta input headers are formated like LTRharvest fasta output. | |
| 303 """ | |
| 304 fileDb = open(blastclustFastaOut , "r") | |
| 305 mapFilename = "%s.map" % (os.path.splitext(blastclustFastaOut)[0]) | |
| 306 fileMap = open(mapFilename, "w") | |
| 307 seq = Bioseq() | |
| 308 numseq = 0 | |
| 309 while 1: | |
| 310 seq.read(fileDb) | |
| 311 if seq.sequence == None: | |
| 312 break | |
| 313 numseq = numseq + 1 | |
| 314 ID = seq.header.split(' ')[0].split('_')[0] | |
| 315 chunk = seq.header.split(' ')[0].split('_')[1] | |
| 316 start = seq.header.split(' ')[-1].split(',')[0][1:] | |
| 317 end = seq.header.split(' ')[-1].split(',')[1][:-1] | |
| 318 line= '%s\t%s\t%s\t%s' % (ID, chunk, start, end) | |
| 319 fileMap.write(line + "\n") | |
| 320 | |
| 321 fileDb.close() | |
| 322 fileMap.close() | |
| 323 print "saved in %s" % mapFilename | |
| 324 | |
| 325 | |
| 326 def start(self): | |
| 327 """ | |
| 328 Useful commands before running the program. | |
| 329 """ | |
| 330 self.checkAttributes() | |
| 331 if self._verbose > 0: | |
| 332 print "START %s" % (type(self).__name__) | |
| 333 | |
| 334 | |
| 335 def end(self): | |
| 336 """ | |
| 337 Useful commands before ending the program. | |
| 338 """ | |
| 339 if self._verbose > 0: | |
| 340 print "END %s" % (type(self).__name__) | |
| 341 | |
| 342 | |
| 343 def run(self): | |
| 344 """ | |
| 345 Run the program. | |
| 346 """ | |
| 347 self.start() | |
| 348 | |
| 349 iCSH = ChangeSequenceHeaders(inFile = self._inFileName, format = "fasta", step = 1, outFile = "%s.shortH" % self._inFileName, linkFile = "%s.shortHlink" % self._inFileName) | |
| 350 iCSH.run() | |
| 351 | |
| 352 self.launchBlastclust("%s.shortH" % (self._inFileName)) | |
| 353 | |
| 354 self.getClusteringResultsInFasta("%s.shortH" % (self._inFileName)) | |
| 355 | |
| 356 dNewH2InitH = self.getLinkInitNewHeaders() | |
| 357 self.retrieveInitHeaders(dNewH2InitH) | |
| 358 | |
| 359 if self._isBlastToMap: | |
| 360 blastclustFileName = "%s_Blastclust.fa" % (self._outFilePrefix) | |
| 361 self.blastclustToMap(blastclustFileName) | |
| 362 | |
| 363 if self._clean: | |
| 364 os.remove("%s.shortH" % (self._inFileName)) | |
| 365 os.remove("%s.shortHlink" % (self._inFileName)) | |
| 366 | |
| 367 self.end() | |
| 368 | |
| 369 if __name__ == "__main__": | |
| 370 i = LaunchBlastclust() | |
| 371 i.setAttributesFromCmdLine() | |
| 372 i.run() |
