Mercurial > repos > yufei-luo > s_mart
comparison commons/tools/AlignTEOnGenomeAccordingToAnnotation.py @ 31:0ab839023fe4
Uploaded
author | m-zytnicki |
---|---|
date | Tue, 30 Apr 2013 14:33:21 -0400 |
parents | 94ab73e8a190 |
children |
comparison
equal
deleted
inserted
replaced
30:5677346472b5 | 31:0ab839023fe4 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 # Copyright INRA (Institut National de la Recherche Agronomique) | |
4 # http://www.inra.fr | |
5 # http://urgi.versailles.inra.fr | |
6 # | |
7 # This software is governed by the CeCILL license under French law and | |
8 # abiding by the rules of distribution of free software. You can use, | |
9 # modify and/ or redistribute the software under the terms of the CeCILL | |
10 # license as circulated by CEA, CNRS and INRIA at the following URL | |
11 # "http://www.cecill.info". | |
12 # | |
13 # As a counterpart to the access to the source code and rights to copy, | |
14 # modify and redistribute granted by the license, users are provided only | |
15 # with a limited warranty and the software's author, the holder of the | |
16 # economic rights, and the successive licensors have only limited | |
17 # liability. | |
18 # | |
19 # In this respect, the user's attention is drawn to the risks associated | |
20 # with loading, using, modifying and/or developing or reproducing the | |
21 # software by the user in light of its specific status of free software, | |
22 # that may mean that it is complicated to manipulate, and that also | |
23 # therefore means that it is reserved for developers and experienced | |
24 # professionals having in-depth computer knowledge. Users are therefore | |
25 # encouraged to load and test the software's suitability as regards their | |
26 # requirements in conditions enabling the security of their systems and/or | |
27 # data to be ensured and, more generally, to use and operate it in the | |
28 # same conditions as regards security. | |
29 # | |
30 # The fact that you are presently reading this means that you have had | |
31 # knowledge of the CeCILL license and that you accept its terms. | |
32 | |
33 import re | |
34 import os | |
35 from commons.core.LoggerFactory import LoggerFactory | |
36 from commons.core.utils.RepetOptionParser import RepetOptionParser | |
37 from commons.core.checker.ConfigChecker import ConfigRules, ConfigChecker | |
38 import subprocess | |
39 from commons.core.seq.AlignedBioseqDB import AlignedBioseqDB | |
40 from commons.core.sql.TableSeqAdaptator import TableSeqAdaptator | |
41 from commons.core.coord.PathUtils import PathUtils | |
42 from commons.core.coord.SetUtils import SetUtils | |
43 from commons.core.sql.TablePathAdaptator import TablePathAdaptator | |
44 from commons.core.coord.Set import Set | |
45 from commons.core.sql.DbFactory import DbFactory | |
46 | |
47 ## Align a TE on genome according to annotation | |
48 | |
49 LOG_DEPTH = "repet.tools" | |
50 | |
51 class AlignTEOnGenomeAccordingToAnnotation(object): | |
52 | |
53 def __init__(self, pathTableName = "", queryTableName = "", subjectTableName = "", mergeSamePathId = False, outTableName = "", matchPenality=10, mism=8, gapo=16, gape=4, gapl=20, configFileName = "", doClean = False, verbosity = 0): | |
54 self._pathTableName = pathTableName | |
55 self._queryTableName = queryTableName | |
56 self._subjectTableName = subjectTableName | |
57 self._mergeSamePathId = mergeSamePathId | |
58 self.setOutTableName(outTableName) | |
59 self._matchPenality = matchPenality | |
60 self._mismatch = mism | |
61 self._gapOpening = gapo | |
62 self._gapExtend = gape | |
63 self._gapLength = gapl | |
64 self._configFileName = configFileName | |
65 self._doClean = doClean | |
66 self._verbosity = verbosity | |
67 self._iDb = None | |
68 self._log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self._verbosity) | |
69 | |
70 def setAttributesFromCmdLine(self): | |
71 description = "Align a TE on genome according to annotation." | |
72 epilog = "\nExample 1: launch without verbosity and keep temporary files.\n" | |
73 epilog += "\t$ python AlignTEOnGenomeAccordingToAnnotation.py -p DmelChr4_chr_allTEs_nr_noSSR_join_path -q DmelChr4_chr_seq -s DmelChr4_refTEs_seq -v 0" | |
74 epilog += "\n" | |
75 parser = RepetOptionParser(description = description, epilog = epilog) | |
76 parser.add_option("-p", "--path", dest = "pathTableName", action = "store", type = "string", help = "path table name [compulsory] [format: path]", default = "") | |
77 parser.add_option("-q", "--query", dest = "queryTableName", action = "store", type = "string", help = "query table name [compulsory] [format: seq]", default = "") | |
78 parser.add_option("-s", "--subject", dest = "subjectTableName", action = "store", type = "string", help = "subject table name [compulsory] [format: seq]", default = "") | |
79 parser.add_option("-m", "--merge", dest = "mergeSamePathId", action = "store_true", help = "merge joined matchs [optional] [default: False]", default = False) | |
80 parser.add_option("-o", "--out", dest = "outTableName", action = "store", type = "string", help = "output table name [default: <pathTableName>_align]", default = "") | |
81 #TODO: add options for : matchPenality=10, mism=8, gapo=16, gape=4, gapl=20 | |
82 parser.add_option("-C", "--config", dest = "configFileName", action = "store", type = "string", help = "configuration file name (e.g. TEannot.cfg)", default = "") | |
83 #NOTE: doClean unused | |
84 # parser.add_option("-c", "--clean", dest = "doClean", action = "store_true", help = "clean temporary files [optional] [default: False]", default = False) | |
85 parser.add_option("-v", "--verbosity", dest = "verbosity", action = "store", type = "int", help = "verbosity [optional] [default: 1]", default = 1) | |
86 options = parser.parse_args()[0] | |
87 self._setAttributesFromOptions(options) | |
88 | |
89 def _setAttributesFromOptions(self, options): | |
90 self.setPathTableName(options.pathTableName) | |
91 self.setQueryTableName(options.queryTableName) | |
92 self.setSubjectTableName(options.subjectTableName) | |
93 self.setMergeSamePathId(options.mergeSamePathId) | |
94 self.setOutTableName(options.outTableName) | |
95 self.setConfigFileName(options.configFileName) | |
96 # self.setDoClean(options.doClean) | |
97 self.setVerbosity(options.verbosity) | |
98 | |
99 def _checkConfig(self): | |
100 iConfigRules = ConfigRules() | |
101 iConfigRules.addRuleSection(section="", mandatory=True) | |
102 iConfigRules.addRuleOption(section="", option ="fasta_name", mandatory=True, type="string") | |
103 iConfigRules.addRuleOption(section="", option ="clean", mandatory=True, type="bool") | |
104 iConfigChecker = ConfigChecker(self._configFileName, iConfigRules) | |
105 iConfig = iConfigChecker.getConfig() | |
106 self._setAttributesFromConfig(iConfig) | |
107 | |
108 def _setAttributesFromConfig(self, iConfig): | |
109 self.setOutTableName(self._outTableName) | |
110 self.setDoClean(iConfig.get("", "clean")) | |
111 | |
112 def setPathTableName(self, pathTableName): | |
113 self._pathTableName = pathTableName | |
114 | |
115 def setQueryTableName(self, queryTableName): | |
116 self._queryTableName = queryTableName | |
117 | |
118 def setSubjectTableName(self, subjectTableName): | |
119 self._subjectTableName = subjectTableName | |
120 | |
121 def setMergeSamePathId(self, mergeSamePathId): | |
122 self._mergeSamePathId = mergeSamePathId | |
123 | |
124 def setOutTableName(self, outTableName): | |
125 if outTableName == "": | |
126 self._outTableName = "%s_align" % self._pathTableName | |
127 else: | |
128 self._outTableName = outTableName | |
129 | |
130 def setConfigFileName(self, configFileName): | |
131 self._configFileName = configFileName | |
132 | |
133 def setDoClean(self, doClean): | |
134 self._doClean = doClean | |
135 | |
136 def setVerbosity(self, verbosity): | |
137 self._verbosity = verbosity | |
138 | |
139 def setDbInstance(self, iDb): | |
140 self._iDb = iDb | |
141 | |
142 def _checkOptions(self): | |
143 if self._pathTableName == "" or not self._iDb.doesTableExist(self._pathTableName): | |
144 self._logAndRaise("ERROR: Missing path table") | |
145 if self._queryTableName == "" or not self._iDb.doesTableExist(self._queryTableName): | |
146 self._logAndRaise("ERROR: Missing query table") | |
147 if self._subjectTableName == "" or not self._iDb.doesTableExist(self._subjectTableName): | |
148 self._logAndRaise("ERROR: Missing subject table") | |
149 | |
150 def _logAndRaise(self, errorMsg): | |
151 self._log.error(errorMsg) | |
152 raise Exception(errorMsg) | |
153 | |
154 def alignBioseqWithNWalign(self, iBioseq1, iBioseq2): | |
155 fastaFileName1 = "seqtoalign1.tmp" | |
156 iBioseq1.save(fastaFileName1) | |
157 fastaFileName2 = "seqtoalign2.tmp" | |
158 iBioseq2.save(fastaFileName2) | |
159 alignBioseqDBFileName = "aligned.tmp" | |
160 cmd = "NWalign" | |
161 cmd += " %s" % fastaFileName1 | |
162 cmd += " %s" % fastaFileName2 | |
163 cmd += " -m %s" % self._matchPenality | |
164 cmd += " -d %s" % self._mismatch | |
165 cmd += " -g %s" % self._gapOpening | |
166 cmd += " -e %s" % self._gapExtend | |
167 cmd += " -l %s" % self._gapLength | |
168 cmd += " -D" | |
169 cmd += " -o %s" % alignBioseqDBFileName | |
170 process = subprocess.Popen(cmd, shell = True) | |
171 self._log.debug("Running : %s" % cmd) | |
172 process.communicate() | |
173 if process.returncode != 0: | |
174 self._logAndRaise("ERROR when launching '%s'" % cmd) | |
175 iAlignedBioseqDB = AlignedBioseqDB() | |
176 iAlignedBioseqDB.load(alignBioseqDBFileName) | |
177 os.remove(fastaFileName1) | |
178 os.remove(fastaFileName2) | |
179 os.remove(alignBioseqDBFileName) | |
180 return iAlignedBioseqDB | |
181 | |
182 def alignSeqAccordingToPathAndBuildAlignedSeqTable(self): | |
183 if self._iDb.doesTableExist(self._outTableName): | |
184 self._logAndRaise("ERROR: out table %s already exists" % self._outTableName) | |
185 | |
186 #TODO: create alignedSeq table in DbMySql... | |
187 sqlCmd="CREATE TABLE %s (path int unsigned, query_aligned_seq longtext, subject_aligned_seq longtext, score int unsigned, identity float unsigned)" % self._outTableName | |
188 self._iDb.execute(sqlCmd) | |
189 | |
190 iQueryTSA = TableSeqAdaptator(self._iDb, self._queryTableName) | |
191 iSubjectTSA = TableSeqAdaptator(self._iDb, self._subjectTableName) | |
192 iTPA = TablePathAdaptator(self._iDb, self._pathTableName) | |
193 | |
194 if self._mergeSamePathId: | |
195 lPathId = iTPA.getIdList() | |
196 pathNb = len(lPathId) | |
197 count = 0 | |
198 for pathNum in lPathId: | |
199 self._log.debug(count,"/",pathNb,"=>path",pathNum,"...") | |
200 lPaths = iTPA.getPathListFromId(pathNum) | |
201 #TODO: getSetListFromQueries() call getSubjectAsSetOfQuery() => "reverse complement" the query (coordinates are inversed, so getBioseq() will take the reverse-comp.) : is it correct ? | |
202 lQuerySets = PathUtils.getSetListFromQueries(lPaths) | |
203 #NOTE: merge sets if overlap | |
204 lQueryMergedSets = SetUtils.mergeSetsInList(lQuerySets) | |
205 #TODO: getBioseqFromSetList() build a sequence that does not exist : is it correct ? | |
206 iQueryBioseq = iQueryTSA.getBioseqFromSetList(lQueryMergedSets) | |
207 lSubjectSets = PathUtils.getSetListFromSubjects(lPaths) | |
208 #TODO: no merge for subjects : is it correct ? matcher allow overlap on query and not on subject ? | |
209 iSubjectBioseq = iSubjectTSA.getBioseqFromSetList(lSubjectSets) | |
210 iAlignedBioseqDB = self.alignBioseqWithNWalign(iQueryBioseq, iSubjectBioseq) | |
211 self._insertAlignedBioseqDBWithScoreAndIdentityInTable(pathNum, iAlignedBioseqDB) | |
212 else: | |
213 lPathId = iTPA.getIdList() | |
214 pathNb = len(lPathId) | |
215 count = 0 | |
216 for pathNum in lPathId: | |
217 self._log.debug(count,"/",pathNb,"=>path",pathNum,"...") | |
218 lPaths = iTPA.getPathListFromId(pathNum) | |
219 queryName = lPaths[0].getQueryName() | |
220 subjectName = lPaths[0].getSubjectName() | |
221 lQueryStart = [] | |
222 lQueryEnd = [] | |
223 lSubjectStart = [] | |
224 lSubjectEnd = [] | |
225 isReversed = not lPaths[0].isSubjectOnDirectStrand() | |
226 for iPath in lPaths: | |
227 lQueryStart.append(iPath.getQueryStart()) | |
228 lQueryEnd.append(iPath.getQueryEnd()) | |
229 lSubjectStart.append(iPath.getSubjectStart()) | |
230 lSubjectEnd.append(iPath.getSubjectEnd()) | |
231 queryStart = min(lQueryStart) | |
232 queryEnd = max(lQueryEnd) | |
233 if isReversed: | |
234 subjectStart = max(lSubjectStart) | |
235 subjectEnd = min(lSubjectEnd) | |
236 else: | |
237 subjectStart = min(lSubjectStart) | |
238 subjectEnd = max(lSubjectEnd) | |
239 lQuerySets = [Set(pathNum,subjectName, queryName,queryStart,queryEnd)] | |
240 lSubjectSets = [Set(pathNum,queryName, subjectName,subjectStart,subjectEnd)] | |
241 | |
242 iQueryBioseq = iQueryTSA.getBioseqFromSetList(lQuerySets) | |
243 iSubjectBioseq = iSubjectTSA.getBioseqFromSetList(lSubjectSets) | |
244 iAlignedBioseqDB = self.alignBioseqWithNWalign(iQueryBioseq, iSubjectBioseq) | |
245 self._insertAlignedBioseqDBWithScoreAndIdentityInTable(pathNum, iAlignedBioseqDB) | |
246 | |
247 def run(self): | |
248 LoggerFactory.setLevel(self._log, self._verbosity) | |
249 if self._configFileName: | |
250 self._checkConfig() | |
251 self._iDb = DbFactory.createInstance() | |
252 self._checkOptions() | |
253 self._log.info("START AlignTEOnGenomeAccordingToAnnotation") | |
254 self._log.debug("path table name: %s" % self._pathTableName) | |
255 self._log.debug("query table name: %s" % self._queryTableName) | |
256 self._log.debug("subject table name: %s" % self._subjectTableName) | |
257 self.alignSeqAccordingToPathAndBuildAlignedSeqTable() | |
258 self._iDb.close() | |
259 self._log.info("END AlignTEOnGenomeAccordingToAnnotation") | |
260 | |
261 def _insertAlignedBioseqDBWithScoreAndIdentityInTable(self, pathNum, iAlignedBioseqDB): | |
262 scoreWithEndLine = re.split("Score=", iAlignedBioseqDB.db[0].header)[1] | |
263 score = int(scoreWithEndLine.split()[0]) | |
264 | |
265 identity = re.split("Identity=", scoreWithEndLine)[1] | |
266 if identity == "nan": | |
267 identity = "0.0" | |
268 identity = float(identity)*100.0 | |
269 | |
270 #TODO: create TableAlignedSeqAdaptator (to use insert...) | |
271 sqlCmd = 'INSERT INTO %s VALUES (%d,"%s","%s", %d,%f);' % (self._outTableName, pathNum, iAlignedBioseqDB.db[0].sequence, iAlignedBioseqDB.db[1].sequence, score, identity) | |
272 self._iDb.execute(sqlCmd) | |
273 | |
274 self._log.debug("header:", iAlignedBioseqDB.db[0].header) | |
275 self._log.debug("path", pathNum, "Score=", score, "Identity=", identity, "ok") | |
276 self._log.debug(iAlignedBioseqDB.db[0].sequence[:80]) | |
277 self._log.debug(iAlignedBioseqDB.db[1].sequence[:80]) | |
278 | |
279 if __name__ == "__main__": | |
280 iLaunch = AlignTEOnGenomeAccordingToAnnotation() | |
281 iLaunch.setAttributesFromCmdLine() | |
282 iLaunch.run() |