diff commons/pyRepetUnit/profilesDB/ProfilesDB4Repet.py @ 18:94ab73e8a190

Uploaded
author m-zytnicki
date Mon, 29 Apr 2013 03:20:15 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/commons/pyRepetUnit/profilesDB/ProfilesDB4Repet.py	Mon Apr 29 03:20:15 2013 -0400
@@ -0,0 +1,226 @@
+#!/usr/bin/env python
+
+import re
+import getopt
+import sys
+from commons.pyRepetUnit.profilesDB.Profiles import Profiles
+from commons.core.LoggerFactory import LoggerFactory
+
+LOG_DEPTH = "commons.pyRepetUnit.profiles"
+
+## Format a profiles DB for pipelines in REPET
+#    
+class ProfilesDB4Repet( object ):
+    
+    def __init__(self):
+        self.profile = Profiles()
+        self._inputFile = "" 
+        self._outputFile =  ""
+        self._verbosity = 2
+        self._log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self._verbosity)
+        
+        
+    def _help( self ):
+        print
+        print "usage: %s.py [ options ]" % ( type(self).__name__ )
+        print "options:"
+        print "     -h: this help"
+        print "     -i: name of the profiles DB to format for Repet"
+        print "     -o: name of the output profiles DB for Repet"
+        print
+        
+        
+    def _setAttributesFromCmdLine( self ):
+        try:
+            opts, args = getopt.getopt(sys.argv[1:],"hi:o:")
+        except getopt.GetoptError, err:
+            print str(err); self._help(); sys.exit(1)
+        for o,a in opts:
+            if o == "-h":
+                self._help(); sys.exit(0)
+            elif o == "-i":
+                self.setInputFile( a )
+            elif o == "-o":
+                self.setOutputFile( a )
+                
+    #TDOD: add nb of each domain in log file, verbose...
+    def _searchCurrentDomain(self, profile):
+        currentDomain = ""
+        #TODO: pattern GAGA and GAGE should be excluded !
+        #TODO: add new tag like "ORF1_LTR" for ATHILA as key word in Pfam
+        #TODO: add new tags from GypsyDB (MOV etc...)
+        if (re.search("[gG][aA][Gg]", profile[1]) or re.search("[gG][aA][Gg]", profile[3])):
+            currentDomain = "GAG"
+        elif (re.search("Zinc knuckle", profile[1]) or re.search("Zinc knuckle", profile[3])):
+            currentDomain = "GAG"
+        elif (re.search("PF02813", profile[2]) or re.search("PF01021", profile[2])):
+            currentDomain = "GAG"
+        elif (re.search("GAG_", profile[1])):
+            currentDomain = "GAG"
+        elif (re.search("GAGCOAT_", profile[1])):
+            currentDomain = "GAG"
+            
+        elif ((re.search("[aA]spartic", profile[1]) or re.search("[aA]aspartic", profile[3])) and (re.search("[pP]roteinase", profile[1]) or re.search("[pP]roteinase", profile[3]))):
+            currentDomain = "AP"
+        elif ((re.search("[aA]spartic", profile[1]) or re.search("[aA]spartic", profile[3])) and (re.search("[pP]rotease", profile[1]) or re.search("[pP]rotease", profile[3]))):
+            currentDomain = "AP"
+        elif ((re.search("[rR]etrotransposon", profile[1]) or re.search("[rR]etrotransposon", profile[3])) and (re.search("[pP]eptidase", profile[1]) or re.search("[pP]eptidase", profile[3]))):
+            currentDomain = "AP"
+        elif ((re.search("[aA]spartic", profile[1]) or re.search("[aA]spartic", profile[3])) and (re.search("[pP]eptidase", profile[1]) or re.search("[pP]eptidase", profile[3]))):
+            currentDomain = "AP"
+        elif ((re.search("[aA]spartic", profile[1]) or re.search("[aA]spartic", profile[3])) and (re.search("[eE]ndopeptidase", profile[1]) or re.search("[eE]ndopeptidase", profile[3]))):
+            currentDomain = "AP"
+        elif ((re.search("[aA]spartyl", profile[1]) or re.search("[aA]spartyl", profile[3])) and (re.search("[pP]roteinase", profile[1]) or re.search("[pP]roteinase", profile[3]))):
+            currentDomain = "AP"
+        elif ((re.search("[aA]spartyl", profile[1]) or re.search("[aA]spartyl", profile[3])) and (re.search("[pP]rotease", profile[1]) or re.search("[pP]rotease", profile[3]))):
+            currentDomain = "AP"
+        elif ((re.search("[aA]spartyl", profile[1]) or re.search("[aA]spartyl", profile[3])) and (re.search("[pP]eptidase", profile[1]) or re.search("[pP]eptidase", profile[3]))):
+            currentDomain = "AP"
+        elif ((re.search("[aA]spartyl", profile[1]) or re.search("[aA]spartyl", profile[3])) and (re.search("[eE]ndopeptidase", profile[1]) or re.search("[eE]ndopeptidase", profile[3]))):
+            currentDomain = "AP"
+        elif ((re.search("AP_", profile[1])) and not (re.search("[eE]ndonuclease", profile[3]))):
+            currentDomain = "AP"
+            
+        elif ((re.search("[iI]ntegrase", profile[1]) or re.search("[iI]ntegrase", profile[3])) and not (re.search(" C ", profile[1])) and not (re.search(" C ", profile[3]))):
+            currentDomain = "INT"
+        elif (re.search(".*[Cc]hromo.*", profile[1]) or re.search(".*[Cc]hromo.*", profile[3])):
+            currentDomain = "INT"
+        elif (re.search("INT_", profile[1])):
+            currentDomain = "INT"
+            
+            
+        elif ((re.search("[rR]everse", profile[1]) or re.search("[Rr]everse", profile[3])) and (re.search("[tT]ranscriptase", profile[1]) or re.search("[tT]ranscriptase", profile[3]))):
+            currentDomain = "RT"
+        elif (re.search("RT_", profile[1])):
+            currentDomain = "RT"
+            
+            
+        elif ((re.search("R[nN]ase", profile[1]) or re.search("R[nN]ase", profile[3])) and (re.search("H", profile[1]) or re.search("H", profile[3]))):
+            currentDomain = "RH"
+        elif ((re.search("R[nN]ase", profile[1]) or re.search("R[nN]ase", profile[3])) and (re.search("H ", profile[1]) or re.search("H ", profile[3]))):
+            currentDomain = "RH"
+        elif (re.search("RNaseH_", profile[1])):
+            currentDomain = "RH"
+            
+            
+        elif ((re.search("[eE]nvelope", profile[1]) or re.search("[eE]nvelope", profile[3])) and (re.search("[pP]rotein", profile[1]) or re.search("[pP]rotein", profile[3]))):
+            currentDomain = "ENV"
+        elif ((re.search("ENV", profile[1]) or re.search("ENV", profile[3])) and (re.search("[pP]olyprotein", profile[1]) or re.search("[pP]olyprotein", profile[3]))):
+            currentDomain = "ENV"
+        elif ((re.search("[eE]nvelope", profile[1]) or re.search("[eE]nvelope", profile[3])) and (re.search("[pP]olyprotein", profile[1]) or re.search("[pP]olyprotein", profile[3]))):
+            currentDomain = "ENV"
+        elif ((re.search("[eE]nvelope", profile[1]) or re.search("[eE]nvelope", profile[3])) and (re.search("[gG]lycoprotein", profile[1]) or re.search("[gG]lycoprotein", profile[3]))):
+            currentDomain = "ENV"
+        elif (re.search("PF07253", profile[2]) or re.search("PF03811", profile[2])):
+            currentDomain = "ENV"
+        elif (re.search("ENV_", profile[1])):
+            currentDomain = "ENV"
+            
+        elif ((re.search("[tT]yrosine", profile[1]) or re.search("[tT]yrosine", profile[3])) and (re.search("[rR]ecombinase", profile[1]) or re.search("[rR]ecombinase", profile[3]))):
+            currentDomain = "YR"
+            
+        elif ((re.search("[eE]ndonuclease", profile[1]) or re.search("[eE]ndonuclease", profile[3])) and not (re.search("AP ", profile[1]) or re.search("AP ", profile[3])) and not (re.search("[aA]purinic", profile[1]) or re.search("[aA]purinic", profile[3]))):
+            currentDomain = "EN"
+            
+        elif ((re.search("[eE]ndonuclease", profile[1]) or re.search("[eE]ndonuclease", profile[3])) and (re.search("AP ", profile[1]) or re.search("AP ", profile[3]))):
+            currentDomain = "APE"
+        elif ((re.search("[eE]ndonuclease", profile[1]) or re.search("[eE]ndonuclease", profile[3])) and (re.search("[aA]purinic", profile[1]) or re.search("[aA]purinic", profile[3]))):
+            currentDomain = "APE"
+            
+        elif ((re.search("[tT]ransposase", profile[1]) or re.search("[tT]ransposase", profile[3])) and not (re.search("DDE ", profile[1]) or re.search("DDE ", profile[3]))):
+            currentDomain = "Tase"
+        elif (re.search("DUF659", profile[1])):
+            currentDomain = "Tase"
+        elif (re.search("PF01695", profile[2])) or (re.search("PF02316", profile[2])) or (re.search("PF09039", profile[2])) or (re.search("PF04827", profile[2])) or (re.search("PF05699", profile[2])):
+            currentDomain = "Tase"
+            
+        elif ((re.search("[tT]ransposase", profile[1]) or re.search("[tT]ransposase", profile[3])) and (re.search("DDE ", profile[1]) or re.search("DDE ", profile[3]))):
+            currentDomain = "Tase*"
+            
+        elif ((re.search("[rR]eplication", profile[1]) or re.search("[rR]eplication", profile[3])) and (re.search("[pP]rotein", profile[1]) or re.search("[pP]rotein", profile[3])) and (re.search("A ", profile[1]) or re.search("A ", profile[3]))):
+            currentDomain = "RPA"
+        elif (re.search("[rR]epA ", profile[1]) or re.search("[rR]epA ", profile[3])):
+            currentDomain = "RPA"
+        elif (re.search("RPA", profile[1]) or re.search("RPA", profile[3])):
+            currentDomain = "RPA"
+            
+        elif (re.search("[cC]-integrase", profile[1]) or re.search("[cC]-integrase", profile[3])):
+            currentDomain = "C-INT"
+            
+        elif ((re.search("[pP]ackaging", profile[1]) or re.search("[pP]ackaging", profile[3])) and (re.search("ATPase", profile[1]) or re.search("ATPase", profile[3]))):
+            currentDomain = "ATP"
+            
+        elif ((re.search("[cC]ysteine", profile[1]) or re.search("[cC]ysteine", profile[3])) and (re.search("[pP]rotease", profile[1]) or re.search("[pP]rotease", profile[3]))):
+            currentDomain = "CYP"
+        elif ((re.search("[cC]ysteine", profile[1]) or re.search("[cC]ysteine", profile[3])) and (re.search("[pP]eptidase", profile[1]) or re.search("[pP]eptidase", profile[3]))):
+            currentDomain = "CYP"
+        elif (re.search("[pP]eptidase_C", profile[1]) or re.search("[pP]eptidase_C", profile[3])):
+            currentDomain = "CYP"
+        elif (re.search("PF00559", profile[2])):
+            currentDomain = "CYP"
+            
+        elif (re.search("[pP]ol\S*_*B", profile[1]) or re.search("[pP]ol\S*_*B", profile[3])):
+            currentDomain = "POLB"
+        elif ((re.search("[pP]olymerase", profile[1]) or re.search("[pP]olymerase", profile[3])) and (re.search("B ", profile[1]) or re.search("B ", profile[3]))):
+            currentDomain = "POLB"
+            
+        elif (re.search("[hH]elicase", profile[1]) or re.search("[hH]elicase", profile[3])):
+            currentDomain = "HEL"
+            
+        else :
+            currentDomain = "OTHER"
+        return currentDomain
+    
+    
+    ## Replace the old profile name by accession number, name, domain and gather cut off
+    #  
+    # @param fout file handle
+    # @param profile Profiles instance
+    # @param currentDomain string
+    #
+    def _writeModifiedProfile(self, fout, profile, currentDomain):
+        for i in xrange(0, len(profile), 1):
+            if i != 1:
+                fout.write(profile[i])
+            else:
+                fout.write("NAME  " + self.profile.accNumber + "_"\
+                            + self.profile.name + "_"\
+                             + currentDomain + "_"\
+                              + str(self.profile.GA_cut_off) + "\n")
+                
+                
+    ## Set input file name
+    #  
+    # @param inputFileName string
+    #
+    def setInputFile(self, inputFileName):
+        self._inputFile = inputFileName
+        
+        
+    ## Set output file name
+    #
+    # @param outputFileName string
+    #
+    def setOutputFile(self, outputFileName):
+        self._outputFile = outputFileName   
+        
+        
+    ## Read a profiles DB file, parse it and, write a new profiles DB with TE domain information and GA score cut_off placed side by side of the name
+    #    
+    def run( self ):
+        LoggerFactory.setLevel(self._log, self._verbosity)
+        fileIn = open( self._inputFile )
+        fout = open( self._outputFile, "w" )
+        profile = self.profile.readAndRetrieve( fileIn )
+        while profile != None:
+            currentDomain = self._searchCurrentDomain(profile)
+            if currentDomain == "OTHER":
+                self._log.warning(self.profile.accNumber + " " + self.profile.name + " has no associated domain")
+            self._writeModifiedProfile(fout, profile, currentDomain)
+            profile = self.profile.read( fileIn )
+            
+            
+if __name__ == "__main__":
+    i = ProfilesDB4Repet()
+    i._setAttributesFromCmdLine()
+    i.run()