diff commons/tools/GetSpecificTELibAccordingToAnnotation.py @ 18:94ab73e8a190

Uploaded
author m-zytnicki
date Mon, 29 Apr 2013 03:20:15 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/commons/tools/GetSpecificTELibAccordingToAnnotation.py	Mon Apr 29 03:20:15 2013 -0400
@@ -0,0 +1,178 @@
+#!/usr/bin/env python
+
+# Copyright INRA (Institut National de la Recherche Agronomique)
+# http://www.inra.fr
+# http://urgi.versailles.inra.fr
+#
+# This software is governed by the CeCILL license under French law and
+# abiding by the rules of distribution of free software.  You can  use, 
+# modify and/ or redistribute the software under the terms of the CeCILL
+# license as circulated by CEA, CNRS and INRIA at the following URL
+# "http://www.cecill.info". 
+#
+# As a counterpart to the access to the source code and  rights to copy,
+# modify and redistribute granted by the license, users are provided only
+# with a limited warranty  and the software's author,  the holder of the
+# economic rights,  and the successive licensors  have only  limited
+# liability. 
+#
+# In this respect, the user's attention is drawn to the risks associated
+# with loading,  using,  modifying and/or developing or reproducing the
+# software by the user in light of its specific status of free software,
+# that may mean  that it is complicated to manipulate,  and  that  also
+# therefore means  that it is reserved for developers  and  experienced
+# professionals having in-depth computer knowledge. Users are therefore
+# encouraged to load and test the software's suitability as regards their
+# requirements in conditions enabling the security of their systems and/or 
+# data to be ensured and,  more generally, to use and operate it in the 
+# same conditions as regards security. 
+#
+# The fact that you are presently reading this means that you have had
+# knowledge of the CeCILL license and that you accept its terms.
+
+
+import os
+import sys
+from commons.core.sql.DbMySql import DbMySql
+from commons.core.utils.RepetOptionParser import RepetOptionParser
+from commons.core.utils.FileUtils import FileUtils
+from commons.core.sql.TableSeqAdaptator import TableSeqAdaptator
+from commons.core.LoggerFactory import LoggerFactory
+
+LOG_DEPTH = "repet.tools"  # prefix of the logger name; the class name is appended to it at logger creation
+LOG_FORMAT = "%(message)s"  # format string handed to LoggerFactory.createLogger (message placeholder only)
+#TODO: use configuration file
+
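+## Get 3 annotation files, using the "statsPerTE.txt" output of GiveInfoTEannot (TEannot results):
+#- consensus with one or more full-length copies,
+#- consensus with one or more full-length fragments,
+#- consensus with one or more copies.
+# A fasta file of the corresponding consensus sequences is also written for each of them.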
+
+class GetSpecificTELibAccordingToAnnotation(object):
+    
+    def __init__(self, inInfoFileName = "", tableName = "", verbose = 0):
+        self._inInfoFileName = inInfoFileName
+        self._tableName = tableName
+        self._verbose = verbose
+        self._log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self._verbose, LOG_FORMAT)
+    
+    def setAttributesFromCmdLine(self):
+        desc = "Splits a GiveInfoTEannot \"statsPerTE.txt\" file in 3 subfiles containing consensus which have at least one copy, one full length fragment or one full length copy. "
+        desc += "A TEs library is built according to each category. Connection to the database parameters are retrieved from the environment"
+        
+        examples = "\nExample : with a project called \"MyTEannotAnalysis\":\n"
+        examples += "\t$ python GetSpecificTELibAccordingToAnnotation.py -i MyTEannotAnalysis_chr_allTEs_nr_noSSR_join_path_statsPerTE.txt -t MyTEannotAnalysis_refTEs_seq"
+        examples += "\n\t"
+        examples += "\n\n"
+        
+        parser = RepetOptionParser(description = desc, epilog = examples)
+        parser.add_option("-i", "--file",     dest = "inInfoFileName",  action = "store", type = "string", help = "input file (mandatory) = output file from GiveInfoTEannot.py (e.g. <project_name>_chr_allTEs_nr_noSSR_join_path_statsPerTE.txt)",  default = "")
+        parser.add_option("-t", "--table",    dest = "tableName",       action = "store", type = "string", help = "table name of TEs sequences (mandatory, seq format, e.g. <project_name>_refTEs_seq)", default = "")
+        parser.add_option("-v", "--verbose",  dest = "verbose",         action = "store", type = "int",    help = "verbosity level (default=0, else 1)", default = 0)
+        (options, args) = parser.parse_args()
+        self._setAttributesFromOptions(options)
+        
+    def _setAttributesFromOptions(self, options):
+        self.setInInfoFileName(options.inInfoFileName)
+        self.setTableName(options.tableName)
+        self.setVerbose(options.verbose)
+        
+    def setTableName(self, tableName):
+        self._tableName = tableName
+        
+    def setInInfoFileName(self, inInfoFileName):
+        self._inInfoFileName = inInfoFileName
+        
+    def setVerbose(self, verbose):
+        self._verbose = verbose
+   
+    def checkOptions(self):
+        if self._inInfoFileName != "":
+            if not FileUtils.isRessourceExists(self._inClassifFileName):
+                self._logAndRaise("ERROR: Input GiveInfoTEannot.txt output file does not exist!")
+        else:
+            self._logAndRaise("ERROR: No specified -i option!")
+        
+        if self._tableName != "":
+            iDb = DbMySql()
+            if not iDb.doesTableExist(self._tableName):
+                self._logAndRaise("ERROR: table does not exist!")
+            iDb.close()
+        else:
+            self._logAndRaise("ERROR: No specified -t option!")
+
+    def _logAndRaise(self, errorMsg):
+        self._log.error(errorMsg)
+        raise Exception(errorMsg)
+
+    def writeFastaFileFromGiveInfoTEAnnot(self, fileName):
+        fFileHandler = open(fileName,"r")
+        lineHeader = fFileHandler.readline()
+        line = fFileHandler.readline()
+        lConsensusName = []
+        while line:         
+            lConsensusName.append(line.split()[0])
+            line = fFileHandler.readline()
+
+        fFileHandler.close()
+        iDb = DbMySql()
+        iTSA = TableSeqAdaptator(iDb, self._tableName)
+        outPutFileName = "%s.fa" % os.path.splitext(fileName)[0]
+        iTSA.saveAccessionsListInFastaFile(lConsensusName, outPutFileName)
+        iDb.close()
+                      
+    def run(self):
+        LoggerFactory.setLevel(self._log, self._verbose)
+
+        outInfoFileNameFullCopy = "%s_FullLengthCopy.txt" % os.path.splitext(os.path.basename(self._inInfoFileName))[0]
+        outInfoFileNameCopy = "%s_OneCopyAndMore.txt" % os.path.splitext(os.path.basename(self._inInfoFileName))[0]
+        outInfoFileNameFullFrag = "%s_FullLengthFrag.txt" % os.path.splitext(os.path.basename(self._inInfoFileName))[0]
+        
+        outInfoFileFullCopy = open(outInfoFileNameFullCopy, "w")
+        outInfoFileCopy = open(outInfoFileNameCopy, "w")
+        outInfoFileFullFrag = open(outInfoFileNameFullFrag, "w")
+            
+        self._log.info("START GetSpecificTELibAccordingToAnnotation\n input info file: %s" % self._inInfoFileName)
+        
+        inFileFh = open(self._inInfoFileName, "r")
+        line = inFileFh.readline()
+        lHeaders = line.split()
+        if "fullLgthCopies" not in lHeaders:
+            self._logAndRaise("ERROR: No headers in %s!" % self._inInfoFileName )
+        
+        outInfoFileFullCopy.write(line)
+        outInfoFileCopy.write(line)
+        outInfoFileFullFrag.write(line)
+            
+        line = inFileFh.readline()
+        while line:
+            dTokens = {}
+            for index, token in enumerate(line.split()):
+                dTokens[lHeaders[index]] = token
+                    
+            if int(dTokens["fullLgthCopies"]) > 0:
+                outInfoFileFullCopy.write(line)
+            if int(dTokens["copies"]) > 0:
+                outInfoFileCopy.write(line)
+            if int(dTokens["fullLgthFrags"]) > 0:
+                outInfoFileFullFrag.write(line)
+            line = inFileFh.readline()
+                
+        inFileFh.close()
+        outInfoFileFullCopy.close()
+        outInfoFileCopy.close()
+        outInfoFileFullFrag.close()
+        
+        self.writeFastaFileFromGiveInfoTEAnnot(outInfoFileNameFullCopy)
+        self.writeFastaFileFromGiveInfoTEAnnot(outInfoFileNameCopy)
+        self.writeFastaFileFromGiveInfoTEAnnot(outInfoFileNameFullFrag)
+        
+        self._log.info("END GetSpecificTELibAccordingToAnnotation\n" )
+            
+        return 0
+
+if __name__ == '__main__':
+    iGetTELib = GetSpecificTELibAccordingToAnnotation()
+    iGetTELib.setAttributesFromCmdLine()
+    iGetTELib.run() 
+    
\ No newline at end of file