comparison commons/tools/AnnotationStats.py @ 18:94ab73e8a190

Uploaded
author m-zytnicki
date Mon, 29 Apr 2013 03:20:15 -0400
parents
children
comparison
equal deleted inserted replaced
17:b0e8584489e6 18:94ab73e8a190
1 #!/usr/bin/env python
2
3 # Copyright INRA (Institut National de la Recherche Agronomique)
4 # http://www.inra.fr
5 # http://urgi.versailles.inra.fr
6 #
7 # This software is governed by the CeCILL license under French law and
8 # abiding by the rules of distribution of free software. You can use,
9 # modify and/ or redistribute the software under the terms of the CeCILL
10 # license as circulated by CEA, CNRS and INRIA at the following URL
11 # "http://www.cecill.info".
12 #
13 # As a counterpart to the access to the source code and rights to copy,
14 # modify and redistribute granted by the license, users are provided only
15 # with a limited warranty and the software's author, the holder of the
16 # economic rights, and the successive licensors have only limited
17 # liability.
18 #
19 # In this respect, the user's attention is drawn to the risks associated
20 # with loading, using, modifying and/or developing or reproducing the
21 # software by the user in light of its specific status of free software,
22 # that may mean that it is complicated to manipulate, and that also
23 # therefore means that it is reserved for developers and experienced
24 # professionals having in-depth computer knowledge. Users are therefore
25 # encouraged to load and test the software's suitability as regards their
26 # requirements in conditions enabling the security of their systems and/or
27 # data to be ensured and, more generally, to use and operate it in the
28 # same conditions as regards security.
29 #
30 # The fact that you are presently reading this means that you have had
31 # knowledge of the CeCILL license and that you accept its terms.
32
33 ##@file
34 # Give summary information on a TE annotation table.
35 # options:
36 # -h: this help
37 # -t: analysis type (default = 1, 1: per transposable element (TE), 2: per cluster, 3: per classification, 4: with map input file)
38 # -p: name of the table (_path) or file (.path) with the annotated TE copies
39 # -s: name of the table (_seq) or file (.fasta or .fa) with the TE reference sequences
40 # -g: length of the genome (in bp)
41 # -m: name of the file with the group and the corresponding TE names (format = 'map')
42 # -o: name of the output file (default = pathTableName + '_stats.txt')
43 # -C: name of the configuration file to access MySQL (e.g. 'TEannot.cfg')
44 # -c: remove map files and blastclust file (if analysis type is 2 or 3)
45 # -I: identity coverage threshold (default = 0)
46 # -L: length coverage threshold (default=0.8)
47 # -v: verbosity level (default = 0)
48
49 from commons.core.LoggerFactory import LoggerFactory
50 from commons.core.stat.Stat import Stat
51 from commons.core.sql.DbFactory import DbFactory
52 from commons.core.sql.TablePathAdaptator import TablePathAdaptator
53 from commons.core.sql.TableSeqAdaptator import TableSeqAdaptator
54 from commons.tools.getCumulLengthFromTEannot import getCumulLengthFromTEannot
55
56 LOG_DEPTH = "repet.tools"
57
58 #TODO: use templating engine instead of raw strings for AnnotationStatsWriter
59 class AnnotationStats( object ):
60
61 def __init__(self, analysisName="TE", clusterFileName="",seqTableName="", pathTableName="", genomeLength=0, statsFileName="", globalStatsFileName="", verbosity=3):
62 self._analysisName = analysisName
63 self._clusterFileName = clusterFileName
64 self._seqTableName = seqTableName
65 self._pathTableName = pathTableName
66 self._genomeLength = genomeLength
67 self._statsFileName = statsFileName
68 self._globalStatsFileName = globalStatsFileName
69 self._iDb = None
70 self._iTablePathAdaptator = None
71 self._iTableSeqAdaptator = None
72 self._save = False
73 self._clean = False
74 self._verbosity = verbosity
75 self._log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self._verbosity)
76
77 def _logAndRaise(self, errorMsg):
78 self._log.error(errorMsg)
79 raise Exception(errorMsg)
80
81 def setCoverageThreshold( self, lengthThresh ):
82 self._coverageThreshold = float(lengthThresh)
83
84 def setIdentityThreshold( self, identityThresh ):
85 self._identityThreshold = int(identityThresh)
86
87 def setAnalyseType(self, analyseType):
88 self._analyseType = str(analyseType)
89
90 def setPathTableName(self, pathTableName):
91 self._pathTableName = pathTableName
92
93 def setDBInstance(self, iDb):
94 self._iDb = iDb
95
96 def setTablePathAdaptator(self, iTablePathAdaptator):
97 self._iTablePathAdaptator = iTablePathAdaptator
98
99 def setTableSeqAdaptator(self, iTableSeqAdaptator):
100 self._iTableSeqAdaptator = iTableSeqAdaptator
101
102 ## Get the coverage of TE copies for a given family (using 'mapOp')
103 #
104 # @param consensus string name of a TE family ('subject_name' in the 'path' table)
105 # @return cumulCoverage integer cumulative coverage
106 #
107 def getCumulCoverage( self, consensus = "" ):
108 gclft = getCumulLengthFromTEannot()
109 gclft.setInputTable( self._pathTableName )
110 gclft.setTErefseq( consensus )
111 gclft.setClean()
112 gclft._db = self._iDb
113 gclft._tpA = self._iTablePathAdaptator
114 mapFileName = gclft.getAllSubjectsAsMapOfQueries()
115 mergeFileName = gclft.mergeRanges( mapFileName )
116 cumulCoverage = gclft.getCumulLength( mergeFileName ) #self._iTablePathAdaptator.getCumulPathLength_from_subject( consensus )
117 return cumulCoverage
118
119 ## Get the number of full-lengths (95% <= L =< 105%)
120 #
121 # @param consensusLength integer
122 # @param lLengths list of integers
123 # @return fullLengthConsensusNb integer
124 #
125 def getNbFullLengths( self, consensusLength, lLengths ):
126 fullLengthConsensusNb = 0
127 for i in lLengths:
128 if i / float(consensusLength ) >= 0.95 and i / float(consensusLength ) <= 1.05:
129 fullLengthConsensusNb += 1
130 return fullLengthConsensusNb
131
132 def getStatPerTE(self, consensusName):
133 dConsensusStats = {}
134 lLengthPerFragment = self._iTablePathAdaptator.getPathLengthListFromSubject(consensusName)
135 lLengthPerCopy = self._iTablePathAdaptator.getChainLengthListFromSubject(consensusName)
136 lIdentityPerCopy = self._iTablePathAdaptator.getChainIdentityListFromSubject(consensusName)
137 dConsensusStats["length"] = self._iTableSeqAdaptator.getSeqLengthFromAccession(consensusName)
138 dConsensusStats["cumulCoverage"] = self.getCumulCoverage(consensusName)
139 dConsensusStats["nbFragments"] = len(lLengthPerFragment)
140 dConsensusStats["nbFullLengthFragments"] = self.getNbFullLengths(dConsensusStats["length"], lLengthPerFragment)
141 dConsensusStats["nbCopies"] = len(lLengthPerCopy)
142 dConsensusStats["nbFullLengthCopies"] = self.getNbFullLengths(dConsensusStats["length"], lLengthPerCopy)
143 dConsensusStats["statsIdentityPerChain"] = Stat()
144 dConsensusStats["statsLengthPerChain"] = Stat()
145 dConsensusStats["statsLengthPerChainPerc"] = Stat()
146 self._statsForIdentityAndLength(dConsensusStats, lLengthPerCopy, lIdentityPerCopy)
147 return dConsensusStats
148
149 def getStatPerCluster(self, lConsensusNames):
150 dConsensusClusterStats = {}
151 lLengthPerFragment = []
152 lLengthPerCopy = []
153 cumulCoverageLength = 0
154 for consensusName in lConsensusNames:
155 cumulCoverageLength += self.getCumulCoverage(consensusName)
156 lLengthPerFragment.extend(self._iTablePathAdaptator.getPathLengthListFromSubject(consensusName))
157 lLengthPerCopy.extend(self._iTablePathAdaptator.getChainLengthListFromSubject(consensusName))
158 dConsensusClusterStats["cumulCoverage"] = cumulCoverageLength
159 dConsensusClusterStats["nbFragments"] = len(lLengthPerFragment)
160 dConsensusClusterStats["nbCopies"] = len(lLengthPerCopy)
161 return dConsensusClusterStats
162
163 def getClusterListFromFile(self):
164 lClusters = []
165 with open(self._clusterFileName) as fCluster:
166 for line in fCluster:
167 lConsensusNames = line.rstrip().split("\t")
168 lClusters.append(lConsensusNames)
169 return lClusters
170
171 def run(self):
172 LoggerFactory.setLevel(self._log, self._verbosity)
173 self._iDb = DbFactory.createInstance()
174 self._iTablePathAdaptator = TablePathAdaptator(self._iDb, self._pathTableName)
175 self._iTableSeqAdaptator = TableSeqAdaptator(self._iDb, self._seqTableName)
176
177 iASW = AnnotationStatsWriter()
178 if self._analysisName == "TE":
179 with open(self._statsFileName, "w") as fStats:
180 string = "%s\tlength\tcovg\tfrags\tfullLgthFrags\tcopies\tfullLgthCopies\tmeanId\tmeanLgth\tmeanLgthPerc\n" % self._analysisName
181 fStats.write(string)
182
183 lNamesTErefseq = self._iTableSeqAdaptator.getAccessionsList()
184 lDistinctSubjects = self._iTablePathAdaptator.getSubjectList()
185 totalCumulCoverage = self.getCumulCoverage()
186
187 with open(self._globalStatsFileName, "w") as fG:
188 fG.write("%s\n" % iASW.printResume(lNamesTErefseq, lDistinctSubjects, totalCumulCoverage, self._genomeLength))
189 for consensusName in lNamesTErefseq:
190 self._log.debug("processing '%s'..." % consensusName)
191 dStatForOneConsensus = self.getStatPerTE(consensusName)
192 iASW.addCalculsOfOneTE(dStatForOneConsensus)
193 fStats.write("%s\n" % iASW.getStatAsString(consensusName, dStatForOneConsensus))
194 fG.write(iASW.printStatsForAllTEs(len(lNamesTErefseq)))
195
196 elif self._analysisName == "Cluster":
197 lClusters = self.getClusterListFromFile()
198 lClusters.sort(key=lambda k: len(k), reverse=True)
199 with open(self._statsFileName, "w") as fStats:
200 string = "%s\tcovg\tfrags\tcopies\n" % self._analysisName
201 #TODO: add fullLgthFrags and fullLgthCopies ? Is addition of previous results significant ?
202 fStats.write(string)
203 for index, lConsensus in enumerate(lClusters):
204 self._log.debug("processing '%s'..." % lConsensus)
205 dStatForOneCluster = self.getStatPerCluster(lConsensus)
206 fStats.write("%s\n" % iASW.getStatAsStringForCluster(str(index + 1), dStatForOneCluster))
207
208 if self._save:
209 outTableName = "%s_statsPer%s" % (self._pathTableName, self._analysisName)
210 self._iDb.createTable(outTableName, "pathstat", self._statsFileName)
211
212 self._iDb.close()
213 self._log.info("END %s" % type(self).__name__)
214
215 def _statsForIdentityAndLength(self, dStat, lLengthPerCopy, lIdentityPerCopy):
216 for i in lIdentityPerCopy:
217 dStat["statsIdentityPerChain"].add(i)
218 lLengthPercPerCopy = []
219 for i in lLengthPerCopy:
220 dStat["statsLengthPerChain"].add(i)
221 lperc = 100 * i / float(dStat["length"])
222 lLengthPercPerCopy.append(lperc)
223 dStat["statsLengthPerChainPerc"].add(lperc)
224
225 class AnnotationStatsWriter(object):
226
227 def __init__(self):
228 self._dAllTErefseqs = { "sumCumulCoverage": 0,
229 "totalNbFragments": 0,
230 "totalNbFullLengthFragments": 0,
231 "totalNbCopies": 0,
232 "totalNbFullLengthCopies": 0,
233 "nbFamWithFullLengthFragments": 0,
234 "nbFamWithOneFullLengthFragment": 0,
235 "nbFamWithTwoFullLengthFragments": 0,
236 "nbFamWithThreeFullLengthFragments": 0,
237 "nbFamWithMoreThanThreeFullLengthFragments": 0,
238 "nbFamWithFullLengthCopies": 0,
239 "nbFamWithOneFullLengthCopy": 0,
240 "nbFamWithTwoFullLengthCopies": 0,
241 "nbFamWithThreeFullLengthCopies": 0,
242 "nbFamWithMoreThanThreeFullLengthCopies": 0,
243 "statsAllCopiesMedIdentity": Stat(),
244 "statsAllCopiesMedLengthPerc": Stat()
245 }
246
247 def getAllTEsRefSeqDict(self):
248 return self._dAllTErefseqs
249
250 def getStatAsString( self, name, d ):
251 """
252 Return a string with all data properly formatted.
253 """
254 string = ""
255 string += "%s" % name
256 string += "\t%i" % d["length"]
257 string += "\t%i" % d["cumulCoverage"]
258 string += "\t%i" % d["nbFragments"]
259 string += "\t%i" % d["nbFullLengthFragments"]
260 string += "\t%i" % d["nbCopies"]
261 string += "\t%i" % d["nbFullLengthCopies"]
262
263 if d["statsIdentityPerChain"].getValuesNumber() != 0:
264 string += "\t%.2f" % d["statsIdentityPerChain"].mean()
265 else:
266 string += "\tNA"
267
268 if d["statsLengthPerChain"].getValuesNumber() != 0:
269 string += "\t%.2f" % d["statsLengthPerChain"].mean()
270 else:
271 string += "\tNA"
272
273 if d["statsLengthPerChainPerc"].getValuesNumber() != 0:
274 string += "\t%.2f" % d["statsLengthPerChainPerc"].mean()
275 else:
276 string += "\tNA"
277
278 return string
279
280 def getStatAsStringForCluster( self, name, d ):
281 """
282 Return a string with all data properly formatted.
283 """
284 string = ""
285 string += "%s" % name
286 string += "\t%i" % d["cumulCoverage"]
287 string += "\t%i" % d["nbFragments"]
288 string += "\t%i" % d["nbCopies"]
289
290 return string
291
292 def addCalculsOfOneTE(self, dOneTErefseq):
293 self._dAllTErefseqs[ "sumCumulCoverage" ] += dOneTErefseq[ "cumulCoverage" ]
294
295 self._dAllTErefseqs[ "totalNbFragments" ] += dOneTErefseq[ "nbFragments" ]
296 self._dAllTErefseqs[ "totalNbFullLengthFragments" ] += dOneTErefseq[ "nbFullLengthFragments" ]
297 if dOneTErefseq[ "nbFullLengthFragments" ] > 0:
298 self._dAllTErefseqs[ "nbFamWithFullLengthFragments" ] += 1
299 if dOneTErefseq[ "nbFullLengthFragments" ] == 1:
300 self._dAllTErefseqs[ "nbFamWithOneFullLengthFragment" ] += 1
301 elif dOneTErefseq[ "nbFullLengthFragments" ] == 2:
302 self._dAllTErefseqs[ "nbFamWithTwoFullLengthFragments" ] += 1
303 elif dOneTErefseq[ "nbFullLengthFragments" ] == 3:
304 self._dAllTErefseqs[ "nbFamWithThreeFullLengthFragments" ] += 1
305 elif dOneTErefseq[ "nbFullLengthFragments" ] > 3:
306 self._dAllTErefseqs[ "nbFamWithMoreThanThreeFullLengthFragments" ] += 1
307
308 self._dAllTErefseqs[ "totalNbCopies" ] += dOneTErefseq[ "nbCopies" ]
309 self._dAllTErefseqs[ "totalNbFullLengthCopies" ] += dOneTErefseq[ "nbFullLengthCopies" ]
310 if dOneTErefseq[ "nbFullLengthCopies" ] > 0:
311 self._dAllTErefseqs[ "nbFamWithFullLengthCopies" ] += 1
312 if dOneTErefseq[ "nbFullLengthCopies" ] == 1:
313 self._dAllTErefseqs[ "nbFamWithOneFullLengthCopy" ] += 1
314 elif dOneTErefseq[ "nbFullLengthCopies" ] == 2:
315 self._dAllTErefseqs[ "nbFamWithTwoFullLengthCopies" ] += 1
316 elif dOneTErefseq[ "nbFullLengthCopies" ] == 3:
317 self._dAllTErefseqs[ "nbFamWithThreeFullLengthCopies" ] += 1
318 elif dOneTErefseq[ "nbFullLengthCopies" ] > 3:
319 self._dAllTErefseqs[ "nbFamWithMoreThanThreeFullLengthCopies" ] += 1
320
321 if dOneTErefseq[ "statsIdentityPerChain" ].getValuesNumber() != 0:
322 self._dAllTErefseqs[ "statsAllCopiesMedIdentity" ].add( dOneTErefseq[ "statsIdentityPerChain" ].median() )
323
324 if dOneTErefseq[ "statsLengthPerChainPerc" ].getValuesNumber() != 0:
325 self._dAllTErefseqs[ "statsAllCopiesMedLengthPerc" ].add( dOneTErefseq[ "statsLengthPerChainPerc" ].median() )
326
327 def printStatsForAllTEs(self, TEnb):
328 # statString += "(sum of cumulative coverages: %i bp)" % ( self._dAllTErefseqs[ "sumCumulCoverage" ] )
329 statString = "total nb of TE fragments: %i\n" % ( self._dAllTErefseqs[ "totalNbFragments" ] )
330
331 if self._dAllTErefseqs[ "totalNbFragments" ] != 0:
332
333 statString += "total nb full-length fragments: %i (%.2f%%)\n" % \
334 ( self._dAllTErefseqs[ "totalNbFullLengthFragments" ], \
335 100*self._dAllTErefseqs[ "totalNbFullLengthFragments" ] / float(self._dAllTErefseqs[ "totalNbFragments" ]) )
336
337 statString += "total nb of TE copies: %i\n" % ( self._dAllTErefseqs[ "totalNbCopies" ] )
338
339 statString += "total nb full-length copies: %i (%.2f%%)\n" % \
340 ( self._dAllTErefseqs[ "totalNbFullLengthCopies" ], \
341 100*self._dAllTErefseqs[ "totalNbFullLengthCopies" ] / float(self._dAllTErefseqs[ "totalNbCopies" ]) )
342
343 statString += "families with full-length fragments: %i (%.2f%%)\n" % \
344 ( self._dAllTErefseqs[ "nbFamWithFullLengthFragments" ], \
345 100*self._dAllTErefseqs[ "nbFamWithFullLengthFragments" ] / float(TEnb) )
346 statString += " with only one full-length fragment: %i\n" % ( self._dAllTErefseqs[ "nbFamWithOneFullLengthFragment" ] )
347 statString += " with only two full-length fragments: %i\n" % ( self._dAllTErefseqs[ "nbFamWithTwoFullLengthFragments" ] )
348 statString += " with only three full-length fragments: %i\n" % ( self._dAllTErefseqs[ "nbFamWithThreeFullLengthFragments" ] )
349 statString += " with more than three full-length fragments: %i\n" % ( self._dAllTErefseqs[ "nbFamWithMoreThanThreeFullLengthFragments" ] )
350
351 statString += "families with full-length copies: %i (%.2f%%)\n" % \
352 ( self._dAllTErefseqs[ "nbFamWithFullLengthCopies" ], \
353 100*self._dAllTErefseqs[ "nbFamWithFullLengthCopies" ] / float(TEnb) )
354 statString += " with only one full-length copy: %i\n" % ( self._dAllTErefseqs[ "nbFamWithOneFullLengthCopy" ] )
355 statString += " with only two full-length copies: %i\n" % ( self._dAllTErefseqs[ "nbFamWithTwoFullLengthCopies" ] )
356 statString += " with only three full-length copies: %i\n" % ( self._dAllTErefseqs[ "nbFamWithThreeFullLengthCopies" ] )
357 statString += " with more than three full-length copies: %i\n" % ( self._dAllTErefseqs[ "nbFamWithMoreThanThreeFullLengthCopies" ] )
358
359 statString += "mean of median identity of all families: %.2f +- %.2f\n" % \
360 ( self._dAllTErefseqs[ "statsAllCopiesMedIdentity" ].mean(), \
361 self._dAllTErefseqs[ "statsAllCopiesMedIdentity" ].sd() )
362
363 statString += "mean of median length percentage of all families: %.2f +- %.2f\n" % \
364 ( self._dAllTErefseqs[ "statsAllCopiesMedLengthPerc" ].mean(), \
365 self._dAllTErefseqs[ "statsAllCopiesMedLengthPerc" ].sd() )
366 return statString
367
368 def printResume(self, lNamesTErefseq, lDistinctSubjects, totalCumulCoverage, genomeLength):
369 statString = "nb of sequences: %i\n" % len(lNamesTErefseq)
370 statString += "nb of matched sequences: %i\n" % len(lDistinctSubjects)
371 statString += "cumulative coverage: %i bp\n" % totalCumulCoverage
372 statString += "coverage percentage: %.2f%%\n" % ( 100 * totalCumulCoverage / float(genomeLength) )
373 # statString += "processing the %i TE families..." % len(lNamesTErefseq)
374 return statString