Mercurial > repos > yufei-luo > s_mart
comparison commons/tools/SplicerFromAnnotation.py @ 31:0ab839023fe4
Uploaded
| author | m-zytnicki |
|---|---|
| date | Tue, 30 Apr 2013 14:33:21 -0400 |
| parents | 94ab73e8a190 |
| children |
comparison
equal
deleted
inserted
replaced
| 30:5677346472b5 | 31:0ab839023fe4 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 # Copyright INRA (Institut National de la Recherche Agronomique) | |
| 4 # http://www.inra.fr | |
| 5 # http://urgi.versailles.inra.fr | |
| 6 # | |
| 7 # This software is governed by the CeCILL license under French law and | |
| 8 # abiding by the rules of distribution of free software. You can use, | |
| 9 # modify and/ or redistribute the software under the terms of the CeCILL | |
| 10 # license as circulated by CEA, CNRS and INRIA at the following URL | |
| 11 # "http://www.cecill.info". | |
| 12 # | |
| 13 # As a counterpart to the access to the source code and rights to copy, | |
| 14 # modify and redistribute granted by the license, users are provided only | |
| 15 # with a limited warranty and the software's author, the holder of the | |
| 16 # economic rights, and the successive licensors have only limited | |
| 17 # liability. | |
| 18 # | |
| 19 # In this respect, the user's attention is drawn to the risks associated | |
| 20 # with loading, using, modifying and/or developing or reproducing the | |
| 21 # software by the user in light of its specific status of free software, | |
| 22 # that may mean that it is complicated to manipulate, and that also | |
| 23 # therefore means that it is reserved for developers and experienced | |
| 24 # professionals having in-depth computer knowledge. Users are therefore | |
| 25 # encouraged to load and test the software's suitability as regards their | |
| 26 # requirements in conditions enabling the security of their systems and/or | |
| 27 # data to be ensured and, more generally, to use and operate it in the | |
| 28 # same conditions as regards security. | |
| 29 # | |
| 30 # The fact that you are presently reading this means that you have had | |
| 31 # knowledge of the CeCILL license and that you accept its terms. | |
| 32 | |
| 33 | |
| 34 import os | |
| 35 import sys | |
| 36 import ConfigParser | |
| 37 | |
| 38 from commons.core.sql.DbMySql import DbMySql | |
| 39 from commons.core.utils.RepetOptionParser import RepetOptionParser | |
| 40 from commons.core.utils.FileUtils import FileUtils | |
| 41 from commons.core.parsing.FastaParser import FastaParser | |
| 42 from ConfigParser import MissingSectionHeaderError | |
| 43 from commons.core.sql.DbFactory import DbFactory | |
| 44 from commons.core.sql.TablePathAdaptator import TablePathAdaptator | |
| 45 from commons.core.LoggerFactory import LoggerFactory | |
| 46 | |
| 47 #TODO: use configuration file | |
| 48 | |
# Prefix of the logger name; the class name is appended to it in __init__.
LOG_DEPTH = "repet.tools"

## Get 3 annotation files, using output from TEannot:
#- consensus with one or more full length copy,
#- consensus with one or more full length fragment,
#- consensus without copy

class SplicerFromAnnotation(object):
    """Splice selected TE annotations out of a genome fasta file.

    The run() method chains external tools and SQL statements:
    it launches PostAnalyzeTELib.py and
    GetSpecificTELibAccordingToAnnotation.py, reads the resulting list of
    consensus for the requested copy type, builds the
    "<project>_annotationToSplice_path" table from the paths whose identity
    is above the threshold, and finally launches SpliceTEsFromGenome.py.

    The project name is the input fasta base name up to its first dot.
    Database connection parameters are read from the config file and
    exported as REPET_* environment variables.
    """

    def __init__(self, inInfoFileName = "", tableName = "", verbose = 0):
        """Store the constructor arguments and create the logger.

        The attributes read by checkOptions() are initialised with the same
        defaults as the command-line options, so that calling run() without
        going through the setters reports a clear error instead of failing
        with an AttributeError.
        """
        self._inInfoFileName = inInfoFileName
        self._tableName = tableName
        self._verbosity = verbose
        self._inputFastaFileName = ""
        self._outputFileName = ""
        self._configFileName = ""
        self._copyType = 1
        self._identity = 80
        self._log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self._verbosity)

    def _logAndRaise(self, errorMsg):
        """Log errorMsg at error level, then raise it as an Exception."""
        self._log.error(errorMsg)
        raise Exception(errorMsg)

    def setAttributesFromCmdLine(self):
        """Parse the command line and set the attributes from its options."""
        desc = "Splice annotations from genome. These annotations are Full Length Copy or Full Length Fragment according to consensus."
        desc += "A TEs library and annotation are necessary. Connection to the database parameters are retrieved from the environment"

        # NOTE(review): the example below passes a table name to -t, whereas
        # -t is declared as the integer copy type -- confirm and update the
        # example text.
        examples = "\nExample : with a project called \"MyTEannotAnalysis\":\n"
        examples += "\t$ python SplicerFromAnnotation.py -i inputFastaFileName -C configFileName -t MyTEannotAnalysis_refTEs_seq "
        examples += "\n\t"
        examples += "\n\n"

        parser = RepetOptionParser(description = desc, epilog = examples)
        parser.add_option("-i", "--file", dest = "inputFastaFileName", action = "store", type = "string", help = "input file (mandatory) = output file with .splice)")
        parser.add_option("-C", "--config", dest = "configFileName", action = "store", type = "string", help = "config file name to set database connection", default = "")
        parser.add_option("-t", "--copyType", dest = "copyType" , action = "store", type = "int", help = "type number [default: 1, 2] 1 is Full Length Copy", default = 1 )
        parser.add_option("-I", "--identity", dest = "identity", action= "store", type = "float", help = "identity between 0 and 100 [default: 80]", default = 80)
        parser.add_option("-o", "--outputFile",dest = "outputFile", action= "store", type = "string", help = "output fasta file (default=input File + '.splice')", default = "")
        parser.add_option("-v", "--verbose", dest = "verbose", action = "store", type = "int", help = "verbosity level (default=0)", default = 0)
        # Positional arguments are not used.
        options = parser.parse_args()[0]
        self._setAttributesFromOptions(options)

    def _setAttributesFromOptions(self, options):
        """Forward each parsed option to its setter."""
        self.setConfigFileName(options.configFileName)
        self.setInputFileName(options.inputFastaFileName)
        self.setOutputFileName(options.outputFile)
        self.setIdentity(options.identity)
        self.setCopyType(options.copyType)
        self.setVerbose(options.verbose)

    def setCopyType(self, copyType):
        """Set the copy type (1 or 2; validated later by checkOptions)."""
        self._copyType = copyType

    def setIdentity(self,identity):
        """Set the identity threshold used to filter the path table."""
        self._identity = identity

    def setInputFileName(self, inputFastaFileName):
        """Set the input fasta file and derive the project name from it.

        The project name is the file base name up to its first dot. The
        file is also handed to FastaParser to fill genomeSize and
        nbSeqGenome.
        """
        self._inputFastaFileName = inputFastaFileName
        self._projectName = os.path.basename(self._inputFastaFileName).split('.')[0]
        self._fF = FastaParser(self._inputFastaFileName)
        self._fF.getInfos()
        self.genomeSize = self._fF.size
        self.nbSeqGenome = self._fF.nbSequences

    def setOutputFileName(self,outputFile):
        """Set the output file name ("" means: derive it in checkOptions)."""
        self._outputFileName = outputFile

    def setConfigFileName(self, configFileName):
        """Set the config file name and export its [repet_env] values.

        An empty name is only recorded: checkOptions() reports it with an
        explicit message, which could never happen when open("") failed
        here first.

        Raises an Exception (through _logAndRaise) when the file has no
        section header.
        """
        self._configFileName = configFileName
        if self._configFileName == "":
            return

        config = ConfigParser.ConfigParser()
        # The handle is only needed while parsing; close it in every case.
        with open(self._configFileName) as configFileHandle:
            try:
                config.readfp(configFileHandle)
            except MissingSectionHeaderError:
                self._logAndRaise("Config file " + self._configFileName + " must begin with a section header ")

        self.setup_env(config)

    def setVerbose(self, verbose):
        """Set the verbosity level applied to the logger by run()."""
        self._verbosity = verbose

    def setup_env(self, config):
        """Copy the "repet_env" section of config into REPET_* variables.

        For each of host, user, pw, db, port and job_manager, the option
        "repet_<name>" is exported as the environment variable
        "REPET_<NAME>".
        """
        for name in ("host", "user", "pw", "db", "port", "job_manager"):
            os.environ["REPET_" + name.upper()] = config.get("repet_env", "repet_" + name)

    def _checkCopyType(self):
        """Raise (through _logAndRaise) unless the copy type is 1 or 2."""
        # The former test "!= 1 or != 2" was true for every value, so every
        # run was rejected; a membership test expresses the intended check.
        if self._copyType not in (1, 2):
            self._logAndRaise("Copy type must be only 1 or 2!")

    def checkOptions(self):
        """Validate the attributes and fill in the default output name.

        Raises an Exception (through _logAndRaise) when the input fasta
        file is missing or unspecified, when the copy type is neither 1 nor
        2, or when no config file name was given.
        """
        if self._inputFastaFileName == "":
            self._logAndRaise("No specified -i option! It is mandatory")
        if not FileUtils.isRessourceExists(self._inputFastaFileName):
            self._logAndRaise("Input fasta file does not exist!")

        if self._outputFileName == "":
            self._outputFileName = os.path.basename(self._inputFastaFileName) + '.splice'

        self._checkCopyType()

        if self._configFileName == "":
            self._logAndRaise("No specified config file name!")
        # Open and immediately close a connection built from the config file.
        iDb = DbMySql(cfgFileName = self._configFileName)
        iDb.close()

    def run(self):
        """Run the whole splicing pipeline and return 0.

        Raises whatever checkOptions() raises; the exit status of the
        external commands is not inspected.
        """
        LoggerFactory.setLevel(self._log, self._verbosity)
        self.checkOptions()

        msg = "START SplicerFromAnnotation"
        msg += "\n input info file: %s" % self._inputFastaFileName
        msg += "\n Copy type is: %s" % self._copyType
        msg += "\n identity is: %s" % self._identity
        msg += "\n host is: %s" % os.environ["REPET_HOST"]
        msg += "\n user is: %s" % os.environ["REPET_USER"]
        msg += "\n DB is: %s" % os.environ["REPET_DB"]
        msg += "\n port is: %s" % os.environ["REPET_PORT"]
        self._log.debug("%s\n" % msg)

        cmd = "PostAnalyzeTELib.py -a 3 -p %s_chr_allTEs_nr_noSSR_join_path -s %s_refTEs_seq -g %s" % (self._projectName, self._projectName, self.genomeSize)
        os.system(cmd)

        cmd = "GetSpecificTELibAccordingToAnnotation.py -i %s_chr_allTEs_nr_noSSR_join_path.annotStatsPerTE.tab -t %s_refTEs_seq -v 2" % (self._projectName, self._projectName)
        os.system(cmd)

        # NOTE(review): these file names hard-code the "Splicer_inputFile"
        # prefix instead of using the project name -- confirm this is intended.
        if self._copyType == 1:
            statsFileName = "Splicer_inputFile_chr_allTEs_nr_noSSR_join_path.annotStatsPerTE_FullLengthCopy.txt"
        else:
            statsFileName = "Splicer_inputFile_chr_allTEs_nr_noSSR_join_path.annotStatsPerTE_FullLengthFrag.txt"

        # Skip the first line; the remaining lines start with a consensus name.
        with open(statsFileName, "r") as statsFile:
            lines = statsFile.readlines()[1:]

        if lines:
            lConsensusHeader_copyType = [line.split('\t', 1)[0] for line in lines]
            db = DbFactory.createInstance()
            # Release the connection even when one of the statements fails.
            try:
                sql_cmd = "CREATE TABLE %s_annotationIdentitySup%d_path SELECT * FROM %s_chr_allTEs_nr_noSSR_join_path where identity >=%f" % ( self._projectName,int(self._identity),self._projectName,self._identity)
                db.execute( sql_cmd )

                # Collect the path identifiers of every selected consensus
                # from the identity-filtered table.
                iTPA = TablePathAdaptator(db, "%s_annotationIdentitySup%d_path" % (self._projectName, int(self._identity)))
                lAllDistinctPath = []
                for consensusName in lConsensusHeader_copyType:
                    lAllDistinctPath.extend(iTPA.getIdListFromSubject(consensusName))

                # NOTE(review): this adaptator is not used afterwards; it is
                # kept in case its construction matters -- confirm.
                iTPA = TablePathAdaptator(db,"%s_chr_allTEs_nr_noSSR_join_path" % self._projectName)
                sql_cmd = "CREATE TABLE %s_annotationToSplice_path LIKE %s_chr_allTEs_nr_noSSR_join_path" % ( self._projectName, self._projectName )
                db.execute( sql_cmd )

                for pathId in lAllDistinctPath:
                    sql_cmd = "INSERT INTO %s_annotationToSplice_path SELECT * FROM %s_chr_allTEs_nr_noSSR_join_path where path =%d" % ( self._projectName, self._projectName, pathId )
                    db.execute( sql_cmd )
            finally:
                db.close()

            cmd = "SpliceTEsFromGenome.py -i %s_annotationToSplice_path -f path -g %s -o %s -C %s -v 2" % (self._projectName, self._inputFastaFileName, self._outputFileName, self._configFileName)
            os.system(cmd)

        else:
            msg = "There is no consensus in this copy type.\n"
            self._log.info(msg)

        self._log.info("END SplicerFromAnnotation")
        return 0
| 213 | |
if __name__ == '__main__':
    # Script entry point: configure a splicer from the command line, then run it.
    iSplicer = SplicerFromAnnotation()
    iSplicer.setAttributesFromCmdLine()
    iSplicer.run()
| 218 |
