comparison commons/tools/AlignTEOnGenomeAccordingToAnnotation.py @ 31:0ab839023fe4

Uploaded
author m-zytnicki
date Tue, 30 Apr 2013 14:33:21 -0400
parents 94ab73e8a190
children
comparison
equal deleted inserted replaced
30:5677346472b5 31:0ab839023fe4
1 #!/usr/bin/env python
2
3 # Copyright INRA (Institut National de la Recherche Agronomique)
4 # http://www.inra.fr
5 # http://urgi.versailles.inra.fr
6 #
7 # This software is governed by the CeCILL license under French law and
8 # abiding by the rules of distribution of free software. You can use,
9 # modify and/ or redistribute the software under the terms of the CeCILL
10 # license as circulated by CEA, CNRS and INRIA at the following URL
11 # "http://www.cecill.info".
12 #
13 # As a counterpart to the access to the source code and rights to copy,
14 # modify and redistribute granted by the license, users are provided only
15 # with a limited warranty and the software's author, the holder of the
16 # economic rights, and the successive licensors have only limited
17 # liability.
18 #
19 # In this respect, the user's attention is drawn to the risks associated
20 # with loading, using, modifying and/or developing or reproducing the
21 # software by the user in light of its specific status of free software,
22 # that may mean that it is complicated to manipulate, and that also
23 # therefore means that it is reserved for developers and experienced
24 # professionals having in-depth computer knowledge. Users are therefore
25 # encouraged to load and test the software's suitability as regards their
26 # requirements in conditions enabling the security of their systems and/or
27 # data to be ensured and, more generally, to use and operate it in the
28 # same conditions as regards security.
29 #
30 # The fact that you are presently reading this means that you have had
31 # knowledge of the CeCILL license and that you accept its terms.
32
33 import re
34 import os
35 from commons.core.LoggerFactory import LoggerFactory
36 from commons.core.utils.RepetOptionParser import RepetOptionParser
37 from commons.core.checker.ConfigChecker import ConfigRules, ConfigChecker
38 import subprocess
39 from commons.core.seq.AlignedBioseqDB import AlignedBioseqDB
40 from commons.core.sql.TableSeqAdaptator import TableSeqAdaptator
41 from commons.core.coord.PathUtils import PathUtils
42 from commons.core.coord.SetUtils import SetUtils
43 from commons.core.sql.TablePathAdaptator import TablePathAdaptator
44 from commons.core.coord.Set import Set
45 from commons.core.sql.DbFactory import DbFactory
46
47 ## Align a TE on genome according to annotation
48
49 LOG_DEPTH = "repet.tools"
50
51 class AlignTEOnGenomeAccordingToAnnotation(object):
52
53 def __init__(self, pathTableName = "", queryTableName = "", subjectTableName = "", mergeSamePathId = False, outTableName = "", matchPenality=10, mism=8, gapo=16, gape=4, gapl=20, configFileName = "", doClean = False, verbosity = 0):
54 self._pathTableName = pathTableName
55 self._queryTableName = queryTableName
56 self._subjectTableName = subjectTableName
57 self._mergeSamePathId = mergeSamePathId
58 self.setOutTableName(outTableName)
59 self._matchPenality = matchPenality
60 self._mismatch = mism
61 self._gapOpening = gapo
62 self._gapExtend = gape
63 self._gapLength = gapl
64 self._configFileName = configFileName
65 self._doClean = doClean
66 self._verbosity = verbosity
67 self._iDb = None
68 self._log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self._verbosity)
69
70 def setAttributesFromCmdLine(self):
71 description = "Align a TE on genome according to annotation."
72 epilog = "\nExample 1: launch without verbosity and keep temporary files.\n"
73 epilog += "\t$ python AlignTEOnGenomeAccordingToAnnotation.py -p DmelChr4_chr_allTEs_nr_noSSR_join_path -q DmelChr4_chr_seq -s DmelChr4_refTEs_seq -v 0"
74 epilog += "\n"
75 parser = RepetOptionParser(description = description, epilog = epilog)
76 parser.add_option("-p", "--path", dest = "pathTableName", action = "store", type = "string", help = "path table name [compulsory] [format: path]", default = "")
77 parser.add_option("-q", "--query", dest = "queryTableName", action = "store", type = "string", help = "query table name [compulsory] [format: seq]", default = "")
78 parser.add_option("-s", "--subject", dest = "subjectTableName", action = "store", type = "string", help = "subject table name [compulsory] [format: seq]", default = "")
79 parser.add_option("-m", "--merge", dest = "mergeSamePathId", action = "store_true", help = "merge joined matchs [optional] [default: False]", default = False)
80 parser.add_option("-o", "--out", dest = "outTableName", action = "store", type = "string", help = "output table name [default: <pathTableName>_align]", default = "")
81 #TODO: add options for : matchPenality=10, mism=8, gapo=16, gape=4, gapl=20
82 parser.add_option("-C", "--config", dest = "configFileName", action = "store", type = "string", help = "configuration file name (e.g. TEannot.cfg)", default = "")
83 #NOTE: doClean unused
84 # parser.add_option("-c", "--clean", dest = "doClean", action = "store_true", help = "clean temporary files [optional] [default: False]", default = False)
85 parser.add_option("-v", "--verbosity", dest = "verbosity", action = "store", type = "int", help = "verbosity [optional] [default: 1]", default = 1)
86 options = parser.parse_args()[0]
87 self._setAttributesFromOptions(options)
88
89 def _setAttributesFromOptions(self, options):
90 self.setPathTableName(options.pathTableName)
91 self.setQueryTableName(options.queryTableName)
92 self.setSubjectTableName(options.subjectTableName)
93 self.setMergeSamePathId(options.mergeSamePathId)
94 self.setOutTableName(options.outTableName)
95 self.setConfigFileName(options.configFileName)
96 # self.setDoClean(options.doClean)
97 self.setVerbosity(options.verbosity)
98
99 def _checkConfig(self):
100 iConfigRules = ConfigRules()
101 iConfigRules.addRuleSection(section="", mandatory=True)
102 iConfigRules.addRuleOption(section="", option ="fasta_name", mandatory=True, type="string")
103 iConfigRules.addRuleOption(section="", option ="clean", mandatory=True, type="bool")
104 iConfigChecker = ConfigChecker(self._configFileName, iConfigRules)
105 iConfig = iConfigChecker.getConfig()
106 self._setAttributesFromConfig(iConfig)
107
108 def _setAttributesFromConfig(self, iConfig):
109 self.setOutTableName(self._outTableName)
110 self.setDoClean(iConfig.get("", "clean"))
111
112 def setPathTableName(self, pathTableName):
113 self._pathTableName = pathTableName
114
115 def setQueryTableName(self, queryTableName):
116 self._queryTableName = queryTableName
117
118 def setSubjectTableName(self, subjectTableName):
119 self._subjectTableName = subjectTableName
120
121 def setMergeSamePathId(self, mergeSamePathId):
122 self._mergeSamePathId = mergeSamePathId
123
124 def setOutTableName(self, outTableName):
125 if outTableName == "":
126 self._outTableName = "%s_align" % self._pathTableName
127 else:
128 self._outTableName = outTableName
129
130 def setConfigFileName(self, configFileName):
131 self._configFileName = configFileName
132
133 def setDoClean(self, doClean):
134 self._doClean = doClean
135
136 def setVerbosity(self, verbosity):
137 self._verbosity = verbosity
138
139 def setDbInstance(self, iDb):
140 self._iDb = iDb
141
142 def _checkOptions(self):
143 if self._pathTableName == "" or not self._iDb.doesTableExist(self._pathTableName):
144 self._logAndRaise("ERROR: Missing path table")
145 if self._queryTableName == "" or not self._iDb.doesTableExist(self._queryTableName):
146 self._logAndRaise("ERROR: Missing query table")
147 if self._subjectTableName == "" or not self._iDb.doesTableExist(self._subjectTableName):
148 self._logAndRaise("ERROR: Missing subject table")
149
150 def _logAndRaise(self, errorMsg):
151 self._log.error(errorMsg)
152 raise Exception(errorMsg)
153
154 def alignBioseqWithNWalign(self, iBioseq1, iBioseq2):
155 fastaFileName1 = "seqtoalign1.tmp"
156 iBioseq1.save(fastaFileName1)
157 fastaFileName2 = "seqtoalign2.tmp"
158 iBioseq2.save(fastaFileName2)
159 alignBioseqDBFileName = "aligned.tmp"
160 cmd = "NWalign"
161 cmd += " %s" % fastaFileName1
162 cmd += " %s" % fastaFileName2
163 cmd += " -m %s" % self._matchPenality
164 cmd += " -d %s" % self._mismatch
165 cmd += " -g %s" % self._gapOpening
166 cmd += " -e %s" % self._gapExtend
167 cmd += " -l %s" % self._gapLength
168 cmd += " -D"
169 cmd += " -o %s" % alignBioseqDBFileName
170 process = subprocess.Popen(cmd, shell = True)
171 self._log.debug("Running : %s" % cmd)
172 process.communicate()
173 if process.returncode != 0:
174 self._logAndRaise("ERROR when launching '%s'" % cmd)
175 iAlignedBioseqDB = AlignedBioseqDB()
176 iAlignedBioseqDB.load(alignBioseqDBFileName)
177 os.remove(fastaFileName1)
178 os.remove(fastaFileName2)
179 os.remove(alignBioseqDBFileName)
180 return iAlignedBioseqDB
181
182 def alignSeqAccordingToPathAndBuildAlignedSeqTable(self):
183 if self._iDb.doesTableExist(self._outTableName):
184 self._logAndRaise("ERROR: out table %s already exists" % self._outTableName)
185
186 #TODO: create alignedSeq table in DbMySql...
187 sqlCmd="CREATE TABLE %s (path int unsigned, query_aligned_seq longtext, subject_aligned_seq longtext, score int unsigned, identity float unsigned)" % self._outTableName
188 self._iDb.execute(sqlCmd)
189
190 iQueryTSA = TableSeqAdaptator(self._iDb, self._queryTableName)
191 iSubjectTSA = TableSeqAdaptator(self._iDb, self._subjectTableName)
192 iTPA = TablePathAdaptator(self._iDb, self._pathTableName)
193
194 if self._mergeSamePathId:
195 lPathId = iTPA.getIdList()
196 pathNb = len(lPathId)
197 count = 0
198 for pathNum in lPathId:
199 self._log.debug(count,"/",pathNb,"=>path",pathNum,"...")
200 lPaths = iTPA.getPathListFromId(pathNum)
201 #TODO: getSetListFromQueries() call getSubjectAsSetOfQuery() => "reverse complement" the query (coordinates are inversed, so getBioseq() will take the reverse-comp.) : is it correct ?
202 lQuerySets = PathUtils.getSetListFromQueries(lPaths)
203 #NOTE: merge sets if overlap
204 lQueryMergedSets = SetUtils.mergeSetsInList(lQuerySets)
205 #TODO: getBioseqFromSetList() build a sequence that does not exist : is it correct ?
206 iQueryBioseq = iQueryTSA.getBioseqFromSetList(lQueryMergedSets)
207 lSubjectSets = PathUtils.getSetListFromSubjects(lPaths)
208 #TODO: no merge for subjects : is it correct ? matcher allow overlap on query and not on subject ?
209 iSubjectBioseq = iSubjectTSA.getBioseqFromSetList(lSubjectSets)
210 iAlignedBioseqDB = self.alignBioseqWithNWalign(iQueryBioseq, iSubjectBioseq)
211 self._insertAlignedBioseqDBWithScoreAndIdentityInTable(pathNum, iAlignedBioseqDB)
212 else:
213 lPathId = iTPA.getIdList()
214 pathNb = len(lPathId)
215 count = 0
216 for pathNum in lPathId:
217 self._log.debug(count,"/",pathNb,"=>path",pathNum,"...")
218 lPaths = iTPA.getPathListFromId(pathNum)
219 queryName = lPaths[0].getQueryName()
220 subjectName = lPaths[0].getSubjectName()
221 lQueryStart = []
222 lQueryEnd = []
223 lSubjectStart = []
224 lSubjectEnd = []
225 isReversed = not lPaths[0].isSubjectOnDirectStrand()
226 for iPath in lPaths:
227 lQueryStart.append(iPath.getQueryStart())
228 lQueryEnd.append(iPath.getQueryEnd())
229 lSubjectStart.append(iPath.getSubjectStart())
230 lSubjectEnd.append(iPath.getSubjectEnd())
231 queryStart = min(lQueryStart)
232 queryEnd = max(lQueryEnd)
233 if isReversed:
234 subjectStart = max(lSubjectStart)
235 subjectEnd = min(lSubjectEnd)
236 else:
237 subjectStart = min(lSubjectStart)
238 subjectEnd = max(lSubjectEnd)
239 lQuerySets = [Set(pathNum,subjectName, queryName,queryStart,queryEnd)]
240 lSubjectSets = [Set(pathNum,queryName, subjectName,subjectStart,subjectEnd)]
241
242 iQueryBioseq = iQueryTSA.getBioseqFromSetList(lQuerySets)
243 iSubjectBioseq = iSubjectTSA.getBioseqFromSetList(lSubjectSets)
244 iAlignedBioseqDB = self.alignBioseqWithNWalign(iQueryBioseq, iSubjectBioseq)
245 self._insertAlignedBioseqDBWithScoreAndIdentityInTable(pathNum, iAlignedBioseqDB)
246
247 def run(self):
248 LoggerFactory.setLevel(self._log, self._verbosity)
249 if self._configFileName:
250 self._checkConfig()
251 self._iDb = DbFactory.createInstance()
252 self._checkOptions()
253 self._log.info("START AlignTEOnGenomeAccordingToAnnotation")
254 self._log.debug("path table name: %s" % self._pathTableName)
255 self._log.debug("query table name: %s" % self._queryTableName)
256 self._log.debug("subject table name: %s" % self._subjectTableName)
257 self.alignSeqAccordingToPathAndBuildAlignedSeqTable()
258 self._iDb.close()
259 self._log.info("END AlignTEOnGenomeAccordingToAnnotation")
260
261 def _insertAlignedBioseqDBWithScoreAndIdentityInTable(self, pathNum, iAlignedBioseqDB):
262 scoreWithEndLine = re.split("Score=", iAlignedBioseqDB.db[0].header)[1]
263 score = int(scoreWithEndLine.split()[0])
264
265 identity = re.split("Identity=", scoreWithEndLine)[1]
266 if identity == "nan":
267 identity = "0.0"
268 identity = float(identity)*100.0
269
270 #TODO: create TableAlignedSeqAdaptator (to use insert...)
271 sqlCmd = 'INSERT INTO %s VALUES (%d,"%s","%s", %d,%f);' % (self._outTableName, pathNum, iAlignedBioseqDB.db[0].sequence, iAlignedBioseqDB.db[1].sequence, score, identity)
272 self._iDb.execute(sqlCmd)
273
274 self._log.debug("header:", iAlignedBioseqDB.db[0].header)
275 self._log.debug("path", pathNum, "Score=", score, "Identity=", identity, "ok")
276 self._log.debug(iAlignedBioseqDB.db[0].sequence[:80])
277 self._log.debug(iAlignedBioseqDB.db[1].sequence[:80])
278
279 if __name__ == "__main__":
280 iLaunch = AlignTEOnGenomeAccordingToAnnotation()
281 iLaunch.setAttributesFromCmdLine()
282 iLaunch.run()