comparison commons/launcher/LaunchRefAlign.py @ 31:0ab839023fe4

Uploaded
author m-zytnicki
date Tue, 30 Apr 2013 14:33:21 -0400
parents 94ab73e8a190
children
comparison
equal deleted inserted replaced
30:5677346472b5 31:0ab839023fe4
1 #!/usr/bin/env python
2
3 # Copyright INRA (Institut National de la Recherche Agronomique)
4 # http://www.inra.fr
5 # http://urgi.versailles.inra.fr
6 #
7 # This software is governed by the CeCILL license under French law and
8 # abiding by the rules of distribution of free software. You can use,
9 # modify and/ or redistribute the software under the terms of the CeCILL
10 # license as circulated by CEA, CNRS and INRIA at the following URL
11 # "http://www.cecill.info".
12 #
13 # As a counterpart to the access to the source code and rights to copy,
14 # modify and redistribute granted by the license, users are provided only
15 # with a limited warranty and the software's author, the holder of the
16 # economic rights, and the successive licensors have only limited
17 # liability.
18 #
19 # In this respect, the user's attention is drawn to the risks associated
20 # with loading, using, modifying and/or developing or reproducing the
21 # software by the user in light of its specific status of free software,
22 # that may mean that it is complicated to manipulate, and that also
23 # therefore means that it is reserved for developers and experienced
24 # professionals having in-depth computer knowledge. Users are therefore
25 # encouraged to load and test the software's suitability as regards their
26 # requirements in conditions enabling the security of their systems and/or
27 # data to be ensured and, more generally, to use and operate it in the
28 # same conditions as regards security.
29 #
30 # The fact that you are presently reading this means that you have had
31 # knowledge of the CeCILL license and that you accept its terms.
32
33 from commons.core.LoggerFactory import LoggerFactory
34 from commons.core.utils.RepetOptionParser import RepetOptionParser
35 from commons.core.checker.ConfigChecker import ConfigRules
36 from commons.core.checker.ConfigChecker import ConfigChecker
37 import subprocess
38 import os
39 from commons.core.seq.Bioseq import Bioseq
40
41 LOG_DEPTH = "repet.core.launchers"
42
43 from commons.core.seq.BioseqDB import BioseqDB
44 from commons.tools.ChangeSequenceHeaders import ChangeSequenceHeaders
45
46
47 class LaunchRefAlign(object):
48 """
49 Launch 'refalign' to build a master-slave multiple sequence alignment.
50 """
51 def __init__(self, inputFileName="", outFileName="", gapSize=10, match=10, mismatch=8, gapOpen=16, gapExtend=4, refseqName="", keepRefseq =False, verbosity=3 ):
52 self.inputFileName = inputFileName
53 self.outFileName=outFileName
54 self.gapSize = gapSize
55 self.match = match
56 self.mismatch = mismatch
57 self.gapOpen = gapOpen
58 self.gapExtend = gapExtend
59 self.gapExtend = gapExtend
60 self.refseqName = refseqName
61 self.keepRefseq = keepRefseq
62 self._verbosity = verbosity
63 self._log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self._verbosity)
64
65 def setAttributesFromCmdLine(self):
66 description = "usage: LaunchRefalign.py [ options ]"
67 epilog = "\n -h: this help\n"
68 epilog += "\t -i: name of the input file (refseq is first, format='fasta')"
69 epilog += "\t -r: keep the reference sequence"
70 epilog += "\t -o: name of the output file (default=inFileName+'.fa_aln')"
71 epilog += "\t -v: verbosity (default=0)"
72 epilog += "\n\t"
73 parser = RepetOptionParser(description = description, epilog = epilog)
74 parser.add_option("-i", "--fasta", dest = "inputFileName", action = "store", type = "string", help = "input fasta file name [compulsory] [format: fasta]", default = "")
75 parser.add_option("-o", "--out", dest = "outFileName", action = "store", type = "string", help = "output file name [default: <input>.out]", default = "")
76 parser.add_option("-r", "--keepRefseq", dest = "keepRefseq", action = "store_true", help = "keep reference sequence [optional] [default: False]", default = False)
77 parser.add_option("-v", "--verbosity", dest = "verbosity", action = "store", type = "int", help = "verbosity [optional] [default: 1]", default = 1)
78 options = parser.parse_args()[0]
79 self._setAttributesFromOptions(options)
80
81 def _setAttributesFromOptions(self, options):
82 self.inputFileName = options.inputFileName
83 self.setOutFileName = options.outFileName
84 self.keepRefseq = options.keepRefseq
85 self._verbosity = options.verbosity
86
87 def _checkOptions(self):
88 if self.inputFileName == "":
89 self._logAndRaise("ERROR: Missing input file name")
90
91 if self.outFileName == "":
92 self.outFileName = "%s.fa_aln" % (self.inputFileName)
93
94 def _logAndRaise(self, errorMsg):
95 self._log.error(errorMsg)
96 raise Exception(errorMsg)
97
98 def _prepareRefAlign(self):
99 self.shortInputFileName = self.inputFileName+".shortH"
100 self.refFileName= self.shortInputFileName + ".ref"
101 self.cpyFileName=self.shortInputFileName + ".cpy"
102
103 file_db = open(self.shortInputFileName)
104 file_ref = open(self.refFileName,"w")
105 file_cpy = open(self.cpyFileName,"w")
106
107 self._numseq=0
108 while 1:
109 seq=Bioseq()
110 seq.read(file_db)
111 if seq.sequence==None:
112 break
113 self._numseq+=1
114 if self._numseq==1:
115 seq.write(file_ref)
116 else:
117 seq.write(file_cpy)
118 file_db.close()
119 file_ref.close()
120 file_cpy.close()
121
122 def _shortenHeaders(self):
123 self.csh = ChangeSequenceHeaders()
124 self.csh.setInputFile(self.inputFileName)
125 self.csh.setFormat("fasta")
126 self.csh.setStep(1)
127 self.csh.setPrefix("seq")
128 self.csh.setLinkFile(self.inputFileName+".shortHlink")
129 self.csh.setOutputFile(self.inputFileName+".shortH")
130 self.csh.setVerbosityLevel(self._verbosity-1)
131 self.csh.run()
132
133 bsDB = BioseqDB(self.inputFileName+".shortH")
134 bsDB.upCase()
135 bsDB.save(self.inputFileName+".shortHtmp")
136 del bsDB
137 os.rename(self.inputFileName+".shortHtmp", self.inputFileName+".shortH")
138
139 def _renameHeaders(self):
140 self.csh.setInputFile(self.inputFileName+".shortH.fa_aln")
141 self.csh.setFormat("fasta")
142 self.csh.setStep(2)
143 self.csh.setLinkFile(self.inputFileName+".shortHlink" )
144 self.csh.setOutputFile(self.outFileName)
145 self.csh.setVerbosityLevel(self._verbosity-1)
146 self.csh.run()
147
148 def run(self):
149 LoggerFactory.setLevel(self._log, self._verbosity)
150 self._checkOptions()
151 self._log.info("START LaunchRefAlign")
152 self._log.debug("building a multiple alignment from '%s'..." % ( self.inputFileName))
153
154 inputFileName = "%s/%s" % (os.getcwd(), os.path.basename(self.inputFileName))
155 if not os.path.exists(inputFileName):
156 os.symlink(self.inputFileName, inputFileName)
157 self.inputFileName = inputFileName
158
159 self._shortenHeaders()
160 if self.keepRefseq:
161 self.refseqName="seq1"
162 self._prepareRefAlign()
163
164 if self._numseq > 1:
165 cmd = "refalign %s %s -m %d -l %d -d %d -g %d -e %d" % (self.refFileName, self.cpyFileName, self.match, self.gapSize, self.mismatch, self.gapOpen, self.gapExtend)
166
167 process = subprocess.Popen(cmd.split(' '), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
168 self._log.debug("Running : %s" % cmd)
169 output = process.communicate()
170 self._log.debug("Output:\n%s" % output[0])
171 if process.returncode != 0:
172 self._logAndRaise("ERROR when launching '%s'" % cmd)
173 refseqNameParam = ""
174 if self.refseqName != "":
175 refseqNameParam = "-r %s " % (self.refseqName)
176 outFileName = self.inputFileName+".shortH.fa_aln"
177 #self.cpyFileName = os.path.join(os.getcwd(),os.path.basename(self.cpyFileName))
178
179 self._log.info("Copy file path %s " % self.cpyFileName)
180 print("Copy file path %s " % self.cpyFileName)
181 cmd = "refalign2fasta.py -i %s.aligner %s-g d -o %s -v 1" % (self.cpyFileName, refseqNameParam, outFileName)
182 self._log.debug("Running : %s" % cmd)
183 process = subprocess.Popen(cmd.split(' '), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
184 output = process.communicate()
185 self._log.debug("Output:\n%s" % output[0])
186
187 if process.returncode != 0:
188 self._logAndRaise("ERROR when launching '%s'" % cmd)
189
190 cmd = "rm -f "+ self.refFileName + " " + self.cpyFileName + " " + self.cpyFileName + ".aligner " + self.cpyFileName + ".oriented " + self.cpyFileName + ".refalign.stat"
191 os.system(cmd)
192
193 else:
194 self._logAndRaise("Only one sequence available")
195 cmd = "echo empty"
196
197 self._renameHeaders()
198
199 for fileName in [self.inputFileName + ".shortH", self.inputFileName + ".shortHlink", self.inputFileName + ".shortH.fa_aln"]:
200 os.remove(fileName)
201 self._log.info("END LaunchRefAlign")
202 return 0
203
204
205 if __name__ == "__main__":
206 iLaunchRefAlign = LaunchRefAlign()
207 iLaunchRefAlign.setAttributesFromCmdLine()
208 iLaunchRefAlign.run()