Mercurial > repos > yufei-luo > s_mart
diff commons/pyRepetUnit/profilesDB/ProfilesDB4Repet.py @ 18:94ab73e8a190
Uploaded
author | m-zytnicki |
---|---|
date | Mon, 29 Apr 2013 03:20:15 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/pyRepetUnit/profilesDB/ProfilesDB4Repet.py Mon Apr 29 03:20:15 2013 -0400 @@ -0,0 +1,226 @@ +#!/usr/bin/env python + +import re +import getopt +import sys +from commons.pyRepetUnit.profilesDB.Profiles import Profiles +from commons.core.LoggerFactory import LoggerFactory + +LOG_DEPTH = "commons.pyRepetUnit.profiles" + +## Format a profiles DB for pipelines in REPET +# +class ProfilesDB4Repet( object ): + + def __init__(self): + self.profile = Profiles() + self._inputFile = "" + self._outputFile = "" + self._verbosity = 2 + self._log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self._verbosity) + + + def _help( self ): + print + print "usage: %s.py [ options ]" % ( type(self).__name__ ) + print "options:" + print " -h: this help" + print " -i: name of the profiles DB to format for Repet" + print " -o: name of the output profiles DB for Repet" + print + + + def _setAttributesFromCmdLine( self ): + try: + opts, args = getopt.getopt(sys.argv[1:],"hi:o:") + except getopt.GetoptError, err: + print str(err); self._help(); sys.exit(1) + for o,a in opts: + if o == "-h": + self._help(); sys.exit(0) + elif o == "-i": + self.setInputFile( a ) + elif o == "-o": + self.setOutputFile( a ) + + #TDOD: add nb of each domain in log file, verbose... + def _searchCurrentDomain(self, profile): + currentDomain = "" + #TODO: pattern GAGA and GAGE should be excluded ! + #TODO: add new tag like "ORF1_LTR" for ATHILA as key word in Pfam + #TODO: add new tags from GypsyDB (MOV etc...) + if (re.search("[gG][aA][Gg]", profile[1]) or re.search("[gG][aA][Gg]", profile[3])): + currentDomain = "GAG" + elif (re.search("Zinc knuckle", profile[1]) or re.search("Zinc knuckle", profile[3])): + currentDomain = "GAG" + elif (re.search("PF02813", profile[2]) or re.search("PF01021", profile[2])): + currentDomain = "GAG" + elif (re.search("GAG_", profile[1])): + currentDomain = "GAG" + elif (re.search("GAGCOAT_", profile[1])): + currentDomain = "GAG" + + elif ((re.search("[aA]spartic", profile[1]) or re.search("[aA]aspartic", profile[3])) and (re.search("[pP]roteinase", profile[1]) or re.search("[pP]roteinase", profile[3]))): + currentDomain = "AP" + elif ((re.search("[aA]spartic", profile[1]) or re.search("[aA]spartic", profile[3])) and (re.search("[pP]rotease", profile[1]) or re.search("[pP]rotease", profile[3]))): + currentDomain = "AP" + elif ((re.search("[rR]etrotransposon", profile[1]) or re.search("[rR]etrotransposon", profile[3])) and (re.search("[pP]eptidase", profile[1]) or re.search("[pP]eptidase", profile[3]))): + currentDomain = "AP" + elif ((re.search("[aA]spartic", profile[1]) or re.search("[aA]spartic", profile[3])) and (re.search("[pP]eptidase", profile[1]) or re.search("[pP]eptidase", profile[3]))): + currentDomain = "AP" + elif ((re.search("[aA]spartic", profile[1]) or re.search("[aA]spartic", profile[3])) and (re.search("[eE]ndopeptidase", profile[1]) or re.search("[eE]ndopeptidase", profile[3]))): + currentDomain = "AP" + elif ((re.search("[aA]spartyl", profile[1]) or re.search("[aA]spartyl", profile[3])) and (re.search("[pP]roteinase", profile[1]) or re.search("[pP]roteinase", profile[3]))): + currentDomain = "AP" + elif ((re.search("[aA]spartyl", profile[1]) or re.search("[aA]spartyl", profile[3])) and (re.search("[pP]rotease", profile[1]) or re.search("[pP]rotease", profile[3]))): + currentDomain = "AP" + elif ((re.search("[aA]spartyl", profile[1]) or re.search("[aA]spartyl", profile[3])) and (re.search("[pP]eptidase", profile[1]) or re.search("[pP]eptidase", profile[3]))): + currentDomain = "AP" + elif ((re.search("[aA]spartyl", profile[1]) or re.search("[aA]spartyl", profile[3])) and (re.search("[eE]ndopeptidase", profile[1]) or re.search("[eE]ndopeptidase", profile[3]))): + currentDomain = "AP" + elif ((re.search("AP_", profile[1])) and not (re.search("[eE]ndonuclease", profile[3]))): + currentDomain = "AP" + + elif ((re.search("[iI]ntegrase", profile[1]) or re.search("[iI]ntegrase", profile[3])) and not (re.search(" C ", profile[1])) and not (re.search(" C ", profile[3]))): + currentDomain = "INT" + elif (re.search(".*[Cc]hromo.*", profile[1]) or re.search(".*[Cc]hromo.*", profile[3])): + currentDomain = "INT" + elif (re.search("INT_", profile[1])): + currentDomain = "INT" + + + elif ((re.search("[rR]everse", profile[1]) or re.search("[Rr]everse", profile[3])) and (re.search("[tT]ranscriptase", profile[1]) or re.search("[tT]ranscriptase", profile[3]))): + currentDomain = "RT" + elif (re.search("RT_", profile[1])): + currentDomain = "RT" + + + elif ((re.search("R[nN]ase", profile[1]) or re.search("R[nN]ase", profile[3])) and (re.search("H", profile[1]) or re.search("H", profile[3]))): + currentDomain = "RH" + elif ((re.search("R[nN]ase", profile[1]) or re.search("R[nN]ase", profile[3])) and (re.search("H ", profile[1]) or re.search("H ", profile[3]))): + currentDomain = "RH" + elif (re.search("RNaseH_", profile[1])): + currentDomain = "RH" + + + elif ((re.search("[eE]nvelope", profile[1]) or re.search("[eE]nvelope", profile[3])) and (re.search("[pP]rotein", profile[1]) or re.search("[pP]rotein", profile[3]))): + currentDomain = "ENV" + elif ((re.search("ENV", profile[1]) or re.search("ENV", profile[3])) and (re.search("[pP]olyprotein", profile[1]) or re.search("[pP]olyprotein", profile[3]))): + currentDomain = "ENV" + elif ((re.search("[eE]nvelope", profile[1]) or re.search("[eE]nvelope", profile[3])) and (re.search("[pP]olyprotein", profile[1]) or re.search("[pP]olyprotein", profile[3]))): + currentDomain = "ENV" + elif ((re.search("[eE]nvelope", profile[1]) or re.search("[eE]nvelope", profile[3])) and (re.search("[gG]lycoprotein", profile[1]) or re.search("[gG]lycoprotein", profile[3]))): + currentDomain = "ENV" + elif (re.search("PF07253", profile[2]) or re.search("PF03811", profile[2])): + currentDomain = "ENV" + elif (re.search("ENV_", profile[1])): + currentDomain = "ENV" + + elif ((re.search("[tT]yrosine", profile[1]) or re.search("[tT]yrosine", profile[3])) and (re.search("[rR]ecombinase", profile[1]) or re.search("[rR]ecombinase", profile[3]))): + currentDomain = "YR" + + elif ((re.search("[eE]ndonuclease", profile[1]) or re.search("[eE]ndonuclease", profile[3])) and not (re.search("AP ", profile[1]) or re.search("AP ", profile[3])) and not (re.search("[aA]purinic", profile[1]) or re.search("[aA]purinic", profile[3]))): + currentDomain = "EN" + + elif ((re.search("[eE]ndonuclease", profile[1]) or re.search("[eE]ndonuclease", profile[3])) and (re.search("AP ", profile[1]) or re.search("AP ", profile[3]))): + currentDomain = "APE" + elif ((re.search("[eE]ndonuclease", profile[1]) or re.search("[eE]ndonuclease", profile[3])) and (re.search("[aA]purinic", profile[1]) or re.search("[aA]purinic", profile[3]))): + currentDomain = "APE" + + elif ((re.search("[tT]ransposase", profile[1]) or re.search("[tT]ransposase", profile[3])) and not (re.search("DDE ", profile[1]) or re.search("DDE ", profile[3]))): + currentDomain = "Tase" + elif (re.search("DUF659", profile[1])): + currentDomain = "Tase" + elif (re.search("PF01695", profile[2])) or (re.search("PF02316", profile[2])) or (re.search("PF09039", profile[2])) or (re.search("PF04827", profile[2])) or (re.search("PF05699", profile[2])): + currentDomain = "Tase" + + elif ((re.search("[tT]ransposase", profile[1]) or re.search("[tT]ransposase", profile[3])) and (re.search("DDE ", profile[1]) or re.search("DDE ", profile[3]))): + currentDomain = "Tase*" + + elif ((re.search("[rR]eplication", profile[1]) or re.search("[rR]eplication", profile[3])) and (re.search("[pP]rotein", profile[1]) or re.search("[pP]rotein", profile[3])) and (re.search("A ", profile[1]) or re.search("A ", profile[3]))): + currentDomain = "RPA" + elif (re.search("[rR]epA ", profile[1]) or re.search("[rR]epA ", profile[3])): + currentDomain = "RPA" + elif (re.search("RPA", profile[1]) or re.search("RPA", profile[3])): + currentDomain = "RPA" + + elif (re.search("[cC]-integrase", profile[1]) or re.search("[cC]-integrase", profile[3])): + currentDomain = "C-INT" + + elif ((re.search("[pP]ackaging", profile[1]) or re.search("[pP]ackaging", profile[3])) and (re.search("ATPase", profile[1]) or re.search("ATPase", profile[3]))): + currentDomain = "ATP" + + elif ((re.search("[cC]ysteine", profile[1]) or re.search("[cC]ysteine", profile[3])) and (re.search("[pP]rotease", profile[1]) or re.search("[pP]rotease", profile[3]))): + currentDomain = "CYP" + elif ((re.search("[cC]ysteine", profile[1]) or re.search("[cC]ysteine", profile[3])) and (re.search("[pP]eptidase", profile[1]) or re.search("[pP]eptidase", profile[3]))): + currentDomain = "CYP" + elif (re.search("[pP]eptidase_C", profile[1]) or re.search("[pP]eptidase_C", profile[3])): + currentDomain = "CYP" + elif (re.search("PF00559", profile[2])): + currentDomain = "CYP" + + elif (re.search("[pP]ol\S*_*B", profile[1]) or re.search("[pP]ol\S*_*B", profile[3])): + currentDomain = "POLB" + elif ((re.search("[pP]olymerase", profile[1]) or re.search("[pP]olymerase", profile[3])) and (re.search("B ", profile[1]) or re.search("B ", profile[3]))): + currentDomain = "POLB" + + elif (re.search("[hH]elicase", profile[1]) or re.search("[hH]elicase", profile[3])): + currentDomain = "HEL" + + else : + currentDomain = "OTHER" + return currentDomain + + + ## Replace the old profile name by accession number, name, domain and gather cut off + # + # @param fout file handle + # @param profile Profiles instance + # @param currentDomain string + # + def _writeModifiedProfile(self, fout, profile, currentDomain): + for i in xrange(0, len(profile), 1): + if i != 1: + fout.write(profile[i]) + else: + fout.write("NAME " + self.profile.accNumber + "_"\ + + self.profile.name + "_"\ + + currentDomain + "_"\ + + str(self.profile.GA_cut_off) + "\n") + + + ## Set input file name + # + # @param inputFileName string + # + def setInputFile(self, inputFileName): + self._inputFile = inputFileName + + + ## Set output file name + # + # @param outputFileName string + # + def setOutputFile(self, outputFileName): + self._outputFile = outputFileName + + + ## Read a profiles DB file, parse it and, write a new profiles DB with TE domain information and GA score cut_off placed side by side of the name + # + def run( self ): + LoggerFactory.setLevel(self._log, self._verbosity) + fileIn = open( self._inputFile ) + fout = open( self._outputFile, "w" ) + profile = self.profile.readAndRetrieve( fileIn ) + while profile != None: + currentDomain = self._searchCurrentDomain(profile) + if currentDomain == "OTHER": + self._log.warning(self.profile.accNumber + " " + self.profile.name + " has no associated domain") + self._writeModifiedProfile(fout, profile, currentDomain) + profile = self.profile.read( fileIn ) + + +if __name__ == "__main__": + i = ProfilesDB4Repet() + i._setAttributesFromCmdLine() + i.run()