comparison commons/tools/GetSpecificTELibAccordingToAnnotation.py @ 18:94ab73e8a190

Uploaded
author m-zytnicki
date Mon, 29 Apr 2013 03:20:15 -0400
parents
children
comparison
equal deleted inserted replaced
17:b0e8584489e6 18:94ab73e8a190
1 #!/usr/bin/env python
2
3 # Copyright INRA (Institut National de la Recherche Agronomique)
4 # http://www.inra.fr
5 # http://urgi.versailles.inra.fr
6 #
7 # This software is governed by the CeCILL license under French law and
8 # abiding by the rules of distribution of free software. You can use,
9 # modify and/ or redistribute the software under the terms of the CeCILL
10 # license as circulated by CEA, CNRS and INRIA at the following URL
11 # "http://www.cecill.info".
12 #
13 # As a counterpart to the access to the source code and rights to copy,
14 # modify and redistribute granted by the license, users are provided only
15 # with a limited warranty and the software's author, the holder of the
16 # economic rights, and the successive licensors have only limited
17 # liability.
18 #
19 # In this respect, the user's attention is drawn to the risks associated
20 # with loading, using, modifying and/or developing or reproducing the
21 # software by the user in light of its specific status of free software,
22 # that may mean that it is complicated to manipulate, and that also
23 # therefore means that it is reserved for developers and experienced
24 # professionals having in-depth computer knowledge. Users are therefore
25 # encouraged to load and test the software's suitability as regards their
26 # requirements in conditions enabling the security of their systems and/or
27 # data to be ensured and, more generally, to use and operate it in the
28 # same conditions as regards security.
29 #
30 # The fact that you are presently reading this means that you have had
31 # knowledge of the CeCILL license and that you accept its terms.
32
33
34 import os
35 import sys
36 from commons.core.sql.DbMySql import DbMySql
37 from commons.core.utils.RepetOptionParser import RepetOptionParser
38 from commons.core.utils.FileUtils import FileUtils
39 from commons.core.sql.TableSeqAdaptator import TableSeqAdaptator
40 from commons.core.LoggerFactory import LoggerFactory
41
42 LOG_DEPTH = "repet.tools"  # prefix of the logger name; the class name is appended to it after a dot
43 LOG_FORMAT = "%(message)s"  # format string handed to LoggerFactory.createLogger
44 #TODO: use configuration file
45
46 ## Get 3 annotation files, using output from TEannot:
47 #- consensus with one or more full length copy,
48 #- consensus with one or more full length fragment,
49 #- consensus with one or more copy
50
class GetSpecificTELibAccordingToAnnotation(object):
    """Split a GiveInfoTEannot "statsPerTE.txt" file into three sub-files.

    One sub-file is written per category of consensus: at least one full
    length copy, at least one copy, at least one full length fragment.
    For each sub-file, a fasta file holding the consensus it lists is then
    exported from the sequence table.
    """

    # (column of the info file, suffix of the matching sub-file), in the
    # order in which the sub-files and their fasta libraries are produced.
    _CATEGORIES = (
        ("fullLgthCopies", "FullLengthCopy"),
        ("copies", "OneCopyAndMore"),
        ("fullLgthFrags", "FullLengthFrag"),
    )

    def __init__(self, inInfoFileName = "", tableName = "", verbose = 0):
        """Store the settings and create the logger.

        inInfoFileName -- path of the GiveInfoTEannot statistics file
        tableName -- name of the table the consensus sequences are read from
        verbose -- verbosity level handed to the logger
        """
        self._inInfoFileName = inInfoFileName
        self._tableName = tableName
        self._verbose = verbose
        self._log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self._verbose, LOG_FORMAT)

    def setAttributesFromCmdLine(self):
        """Parse the command line (-i, -t, -v) and store the option values."""
        desc = "Splits a GiveInfoTEannot \"statsPerTE.txt\" file in 3 subfiles containing consensus which have at least one copy, one full length fragment or one full length copy. "
        desc += "A TEs library is built according to each category. Connection to the database parameters are retrieved from the environment"

        examples = "\nExample : with a project called \"MyTEannotAnalysis\":\n"
        examples += "\t$ python GetSpecificTELibAccordingToAnnotation.py -i MyTEannotAnalysis_chr_allTEs_nr_noSSR_join_path_statsPerTE.txt -t MyTEannotAnalysis_refTEs_seq"
        examples += "\n\t"
        examples += "\n\n"

        parser = RepetOptionParser(description = desc, epilog = examples)
        parser.add_option("-i", "--file", dest = "inInfoFileName", action = "store", type = "string", help = "input file (mandatory) = output file from GiveInfoTEannot.py (e.g. <project_name>_chr_allTEs_nr_noSSR_join_path_statsPerTE.txt)", default = "")
        parser.add_option("-t", "--table", dest = "tableName", action = "store", type = "string", help = "table name of TEs sequences (mandatory, seq format, e.g. <project_name>_refTEs_seq)", default = "")
        parser.add_option("-v", "--verbose", dest = "verbose", action = "store", type = "int", help = "verbosity level (default=0, else 1)", default = 0)
        options = parser.parse_args()[0]
        self._setAttributesFromOptions(options)

    def _setAttributesFromOptions(self, options):
        """Copy inInfoFileName, tableName and verbose from a parsed options object."""
        self.setInInfoFileName(options.inInfoFileName)
        self.setTableName(options.tableName)
        self.setVerbose(options.verbose)

    def setTableName(self, tableName):
        """Set the name of the table the consensus sequences are read from."""
        self._tableName = tableName

    def setInInfoFileName(self, inInfoFileName):
        """Set the path of the GiveInfoTEannot statistics file."""
        self._inInfoFileName = inInfoFileName

    def setVerbose(self, verbose):
        """Set the verbosity level (applied to the logger when run() starts)."""
        self._verbose = verbose

    def checkOptions(self):
        """Check that the input file and the sequence table are given and exist.

        Raises Exception (after logging the message) when an option is
        empty, when the input file does not exist or when the table does
        not exist.
        """
        if self._inInfoFileName == "":
            self._logAndRaise("ERROR: No specified -i option!")
        if not FileUtils.isRessourceExists(self._inInfoFileName):
            self._logAndRaise("ERROR: Input GiveInfoTEannot.txt output file does not exist!")

        if self._tableName == "":
            self._logAndRaise("ERROR: No specified -t option!")
        iDb = DbMySql()
        try:
            isTablePresent = iDb.doesTableExist(self._tableName)
        finally:
            # Close the connection before raising, so that a missing table
            # does not leave it open.
            iDb.close()
        if not isTablePresent:
            self._logAndRaise("ERROR: table does not exist!")

    def _logAndRaise(self, errorMsg):
        """Log errorMsg at error level, then raise it as an Exception."""
        self._log.error(errorMsg)
        raise Exception(errorMsg)

    def writeFastaFileFromGiveInfoTEAnnot(self, fileName):
        """Export to fasta the consensus listed in an info (sub-)file.

        The first line of fileName is skipped as a header; the first
        whitespace-separated token of every following non-blank line is
        taken as a consensus name. The fasta file is named after fileName,
        with its extension replaced by ".fa".
        """
        lConsensusName = []
        with open(fileName, "r") as fFileHandler:
            fFileHandler.readline()  # header line: carries no consensus name
            for line in fFileHandler:
                lTokens = line.split()
                if lTokens:
                    lConsensusName.append(lTokens[0])

        outPutFileName = "%s.fa" % os.path.splitext(fileName)[0]
        iDb = DbMySql()
        try:
            iTSA = TableSeqAdaptator(iDb, self._tableName)
            iTSA.saveAccessionsListInFastaFile(lConsensusName, outPutFileName)
        finally:
            iDb.close()

    def _splitInfoFile(self):
        """Write the three info sub-files and return their names.

        The sub-files are created in the current directory and named after
        the base name of the input file. Each one starts with the header
        line of the input file and receives every row whose category column
        is greater than zero. Raises Exception when one of the category
        columns is missing from the header.
        """
        baseName = os.path.splitext(os.path.basename(self._inInfoFileName))[0]
        lColumnAndFileName = [(column, "%s_%s.txt" % (baseName, suffix)) for column, suffix in self._CATEGORIES]

        with open(self._inInfoFileName, "r") as inFileFh:
            headerLine = inFileFh.readline()
            lHeaders = headerLine.split()
            # Validate the header before creating any output file, so that a
            # wrong input does not leave empty sub-files behind.
            for column, _ in lColumnAndFileName:
                if column not in lHeaders:
                    self._logAndRaise("ERROR: No headers in %s!" % self._inInfoFileName)

            lColumnAndHandle = []
            try:
                for column, outFileName in lColumnAndFileName:
                    outFh = open(outFileName, "w")
                    lColumnAndHandle.append((column, outFh))
                    outFh.write(headerLine)

                for line in inFileFh:
                    lTokens = line.split()
                    if not lTokens:
                        continue
                    # zip() ignores tokens in excess of the header columns
                    dTokens = dict(zip(lHeaders, lTokens))
                    for column, outFh in lColumnAndHandle:
                        if int(dTokens[column]) > 0:
                            outFh.write(line)
            finally:
                for _, outFh in lColumnAndHandle:
                    outFh.close()

        return [outFileName for _, outFileName in lColumnAndFileName]

    def run(self):
        """Split the info file, then export one fasta library per sub-file.

        Returns 0.
        """
        LoggerFactory.setLevel(self._log, self._verbose)
        self._log.info("START GetSpecificTELibAccordingToAnnotation\n input info file: %s" % self._inInfoFileName)

        for outInfoFileName in self._splitInfoFile():
            self.writeFastaFileFromGiveInfoTEAnnot(outInfoFileName)

        self._log.info("END GetSpecificTELibAccordingToAnnotation\n" )

        return 0
if __name__ == '__main__':
    # Command-line entry point: read the options from the command line,
    # then split the info file and export the matching TE libraries.
    telibExtractor = GetSpecificTELibAccordingToAnnotation()
    telibExtractor.setAttributesFromCmdLine()
    telibExtractor.run()
178