Mercurial > repos > yufei-luo > s_mart
comparison commons/pyRepetUnit/profilesDB/ProfilesDB4Repet.py @ 31:0ab839023fe4
Uploaded
author | m-zytnicki |
---|---|
date | Tue, 30 Apr 2013 14:33:21 -0400 |
parents | 94ab73e8a190 |
children |
comparison
equal
deleted
inserted
replaced
30:5677346472b5 | 31:0ab839023fe4 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 import re | |
4 import getopt | |
5 import sys | |
6 from commons.pyRepetUnit.profilesDB.Profiles import Profiles | |
7 from commons.core.LoggerFactory import LoggerFactory | |
8 | |
9 LOG_DEPTH = "commons.pyRepetUnit.profiles" | |
10 | |
11 ## Format a profiles DB for pipelines in REPET | |
12 # | |
13 class ProfilesDB4Repet( object ): | |
14 | |
15 def __init__(self): | |
16 self.profile = Profiles() | |
17 self._inputFile = "" | |
18 self._outputFile = "" | |
19 self._verbosity = 2 | |
20 self._log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self._verbosity) | |
21 | |
22 | |
23 def _help( self ): | |
24 print | |
25 print "usage: %s.py [ options ]" % ( type(self).__name__ ) | |
26 print "options:" | |
27 print " -h: this help" | |
28 print " -i: name of the profiles DB to format for Repet" | |
29 print " -o: name of the output profiles DB for Repet" | |
30 print | |
31 | |
32 | |
33 def _setAttributesFromCmdLine( self ): | |
34 try: | |
35 opts, args = getopt.getopt(sys.argv[1:],"hi:o:") | |
36 except getopt.GetoptError, err: | |
37 print str(err); self._help(); sys.exit(1) | |
38 for o,a in opts: | |
39 if o == "-h": | |
40 self._help(); sys.exit(0) | |
41 elif o == "-i": | |
42 self.setInputFile( a ) | |
43 elif o == "-o": | |
44 self.setOutputFile( a ) | |
45 | |
46 #TDOD: add nb of each domain in log file, verbose... | |
47 def _searchCurrentDomain(self, profile): | |
48 currentDomain = "" | |
49 #TODO: pattern GAGA and GAGE should be excluded ! | |
50 #TODO: add new tag like "ORF1_LTR" for ATHILA as key word in Pfam | |
51 #TODO: add new tags from GypsyDB (MOV etc...) | |
52 if (re.search("[gG][aA][Gg]", profile[1]) or re.search("[gG][aA][Gg]", profile[3])): | |
53 currentDomain = "GAG" | |
54 elif (re.search("Zinc knuckle", profile[1]) or re.search("Zinc knuckle", profile[3])): | |
55 currentDomain = "GAG" | |
56 elif (re.search("PF02813", profile[2]) or re.search("PF01021", profile[2])): | |
57 currentDomain = "GAG" | |
58 elif (re.search("GAG_", profile[1])): | |
59 currentDomain = "GAG" | |
60 elif (re.search("GAGCOAT_", profile[1])): | |
61 currentDomain = "GAG" | |
62 | |
63 elif ((re.search("[aA]spartic", profile[1]) or re.search("[aA]aspartic", profile[3])) and (re.search("[pP]roteinase", profile[1]) or re.search("[pP]roteinase", profile[3]))): | |
64 currentDomain = "AP" | |
65 elif ((re.search("[aA]spartic", profile[1]) or re.search("[aA]spartic", profile[3])) and (re.search("[pP]rotease", profile[1]) or re.search("[pP]rotease", profile[3]))): | |
66 currentDomain = "AP" | |
67 elif ((re.search("[rR]etrotransposon", profile[1]) or re.search("[rR]etrotransposon", profile[3])) and (re.search("[pP]eptidase", profile[1]) or re.search("[pP]eptidase", profile[3]))): | |
68 currentDomain = "AP" | |
69 elif ((re.search("[aA]spartic", profile[1]) or re.search("[aA]spartic", profile[3])) and (re.search("[pP]eptidase", profile[1]) or re.search("[pP]eptidase", profile[3]))): | |
70 currentDomain = "AP" | |
71 elif ((re.search("[aA]spartic", profile[1]) or re.search("[aA]spartic", profile[3])) and (re.search("[eE]ndopeptidase", profile[1]) or re.search("[eE]ndopeptidase", profile[3]))): | |
72 currentDomain = "AP" | |
73 elif ((re.search("[aA]spartyl", profile[1]) or re.search("[aA]spartyl", profile[3])) and (re.search("[pP]roteinase", profile[1]) or re.search("[pP]roteinase", profile[3]))): | |
74 currentDomain = "AP" | |
75 elif ((re.search("[aA]spartyl", profile[1]) or re.search("[aA]spartyl", profile[3])) and (re.search("[pP]rotease", profile[1]) or re.search("[pP]rotease", profile[3]))): | |
76 currentDomain = "AP" | |
77 elif ((re.search("[aA]spartyl", profile[1]) or re.search("[aA]spartyl", profile[3])) and (re.search("[pP]eptidase", profile[1]) or re.search("[pP]eptidase", profile[3]))): | |
78 currentDomain = "AP" | |
79 elif ((re.search("[aA]spartyl", profile[1]) or re.search("[aA]spartyl", profile[3])) and (re.search("[eE]ndopeptidase", profile[1]) or re.search("[eE]ndopeptidase", profile[3]))): | |
80 currentDomain = "AP" | |
81 elif ((re.search("AP_", profile[1])) and not (re.search("[eE]ndonuclease", profile[3]))): | |
82 currentDomain = "AP" | |
83 | |
84 elif ((re.search("[iI]ntegrase", profile[1]) or re.search("[iI]ntegrase", profile[3])) and not (re.search(" C ", profile[1])) and not (re.search(" C ", profile[3]))): | |
85 currentDomain = "INT" | |
86 elif (re.search(".*[Cc]hromo.*", profile[1]) or re.search(".*[Cc]hromo.*", profile[3])): | |
87 currentDomain = "INT" | |
88 elif (re.search("INT_", profile[1])): | |
89 currentDomain = "INT" | |
90 | |
91 | |
92 elif ((re.search("[rR]everse", profile[1]) or re.search("[Rr]everse", profile[3])) and (re.search("[tT]ranscriptase", profile[1]) or re.search("[tT]ranscriptase", profile[3]))): | |
93 currentDomain = "RT" | |
94 elif (re.search("RT_", profile[1])): | |
95 currentDomain = "RT" | |
96 | |
97 | |
98 elif ((re.search("R[nN]ase", profile[1]) or re.search("R[nN]ase", profile[3])) and (re.search("H", profile[1]) or re.search("H", profile[3]))): | |
99 currentDomain = "RH" | |
100 elif ((re.search("R[nN]ase", profile[1]) or re.search("R[nN]ase", profile[3])) and (re.search("H ", profile[1]) or re.search("H ", profile[3]))): | |
101 currentDomain = "RH" | |
102 elif (re.search("RNaseH_", profile[1])): | |
103 currentDomain = "RH" | |
104 | |
105 | |
106 elif ((re.search("[eE]nvelope", profile[1]) or re.search("[eE]nvelope", profile[3])) and (re.search("[pP]rotein", profile[1]) or re.search("[pP]rotein", profile[3]))): | |
107 currentDomain = "ENV" | |
108 elif ((re.search("ENV", profile[1]) or re.search("ENV", profile[3])) and (re.search("[pP]olyprotein", profile[1]) or re.search("[pP]olyprotein", profile[3]))): | |
109 currentDomain = "ENV" | |
110 elif ((re.search("[eE]nvelope", profile[1]) or re.search("[eE]nvelope", profile[3])) and (re.search("[pP]olyprotein", profile[1]) or re.search("[pP]olyprotein", profile[3]))): | |
111 currentDomain = "ENV" | |
112 elif ((re.search("[eE]nvelope", profile[1]) or re.search("[eE]nvelope", profile[3])) and (re.search("[gG]lycoprotein", profile[1]) or re.search("[gG]lycoprotein", profile[3]))): | |
113 currentDomain = "ENV" | |
114 elif (re.search("PF07253", profile[2]) or re.search("PF03811", profile[2])): | |
115 currentDomain = "ENV" | |
116 elif (re.search("ENV_", profile[1])): | |
117 currentDomain = "ENV" | |
118 | |
119 elif ((re.search("[tT]yrosine", profile[1]) or re.search("[tT]yrosine", profile[3])) and (re.search("[rR]ecombinase", profile[1]) or re.search("[rR]ecombinase", profile[3]))): | |
120 currentDomain = "YR" | |
121 | |
122 elif ((re.search("[eE]ndonuclease", profile[1]) or re.search("[eE]ndonuclease", profile[3])) and not (re.search("AP ", profile[1]) or re.search("AP ", profile[3])) and not (re.search("[aA]purinic", profile[1]) or re.search("[aA]purinic", profile[3]))): | |
123 currentDomain = "EN" | |
124 | |
125 elif ((re.search("[eE]ndonuclease", profile[1]) or re.search("[eE]ndonuclease", profile[3])) and (re.search("AP ", profile[1]) or re.search("AP ", profile[3]))): | |
126 currentDomain = "APE" | |
127 elif ((re.search("[eE]ndonuclease", profile[1]) or re.search("[eE]ndonuclease", profile[3])) and (re.search("[aA]purinic", profile[1]) or re.search("[aA]purinic", profile[3]))): | |
128 currentDomain = "APE" | |
129 | |
130 elif ((re.search("[tT]ransposase", profile[1]) or re.search("[tT]ransposase", profile[3])) and not (re.search("DDE ", profile[1]) or re.search("DDE ", profile[3]))): | |
131 currentDomain = "Tase" | |
132 elif (re.search("DUF659", profile[1])): | |
133 currentDomain = "Tase" | |
134 elif (re.search("PF01695", profile[2])) or (re.search("PF02316", profile[2])) or (re.search("PF09039", profile[2])) or (re.search("PF04827", profile[2])) or (re.search("PF05699", profile[2])): | |
135 currentDomain = "Tase" | |
136 | |
137 elif ((re.search("[tT]ransposase", profile[1]) or re.search("[tT]ransposase", profile[3])) and (re.search("DDE ", profile[1]) or re.search("DDE ", profile[3]))): | |
138 currentDomain = "Tase*" | |
139 | |
140 elif ((re.search("[rR]eplication", profile[1]) or re.search("[rR]eplication", profile[3])) and (re.search("[pP]rotein", profile[1]) or re.search("[pP]rotein", profile[3])) and (re.search("A ", profile[1]) or re.search("A ", profile[3]))): | |
141 currentDomain = "RPA" | |
142 elif (re.search("[rR]epA ", profile[1]) or re.search("[rR]epA ", profile[3])): | |
143 currentDomain = "RPA" | |
144 elif (re.search("RPA", profile[1]) or re.search("RPA", profile[3])): | |
145 currentDomain = "RPA" | |
146 | |
147 elif (re.search("[cC]-integrase", profile[1]) or re.search("[cC]-integrase", profile[3])): | |
148 currentDomain = "C-INT" | |
149 | |
150 elif ((re.search("[pP]ackaging", profile[1]) or re.search("[pP]ackaging", profile[3])) and (re.search("ATPase", profile[1]) or re.search("ATPase", profile[3]))): | |
151 currentDomain = "ATP" | |
152 | |
153 elif ((re.search("[cC]ysteine", profile[1]) or re.search("[cC]ysteine", profile[3])) and (re.search("[pP]rotease", profile[1]) or re.search("[pP]rotease", profile[3]))): | |
154 currentDomain = "CYP" | |
155 elif ((re.search("[cC]ysteine", profile[1]) or re.search("[cC]ysteine", profile[3])) and (re.search("[pP]eptidase", profile[1]) or re.search("[pP]eptidase", profile[3]))): | |
156 currentDomain = "CYP" | |
157 elif (re.search("[pP]eptidase_C", profile[1]) or re.search("[pP]eptidase_C", profile[3])): | |
158 currentDomain = "CYP" | |
159 elif (re.search("PF00559", profile[2])): | |
160 currentDomain = "CYP" | |
161 | |
162 elif (re.search("[pP]ol\S*_*B", profile[1]) or re.search("[pP]ol\S*_*B", profile[3])): | |
163 currentDomain = "POLB" | |
164 elif ((re.search("[pP]olymerase", profile[1]) or re.search("[pP]olymerase", profile[3])) and (re.search("B ", profile[1]) or re.search("B ", profile[3]))): | |
165 currentDomain = "POLB" | |
166 | |
167 elif (re.search("[hH]elicase", profile[1]) or re.search("[hH]elicase", profile[3])): | |
168 currentDomain = "HEL" | |
169 | |
170 else : | |
171 currentDomain = "OTHER" | |
172 return currentDomain | |
173 | |
174 | |
175 ## Replace the old profile name by accession number, name, domain and gather cut off | |
176 # | |
177 # @param fout file handle | |
178 # @param profile Profiles instance | |
179 # @param currentDomain string | |
180 # | |
181 def _writeModifiedProfile(self, fout, profile, currentDomain): | |
182 for i in xrange(0, len(profile), 1): | |
183 if i != 1: | |
184 fout.write(profile[i]) | |
185 else: | |
186 fout.write("NAME " + self.profile.accNumber + "_"\ | |
187 + self.profile.name + "_"\ | |
188 + currentDomain + "_"\ | |
189 + str(self.profile.GA_cut_off) + "\n") | |
190 | |
191 | |
192 ## Set input file name | |
193 # | |
194 # @param inputFileName string | |
195 # | |
196 def setInputFile(self, inputFileName): | |
197 self._inputFile = inputFileName | |
198 | |
199 | |
200 ## Set output file name | |
201 # | |
202 # @param outputFileName string | |
203 # | |
204 def setOutputFile(self, outputFileName): | |
205 self._outputFile = outputFileName | |
206 | |
207 | |
208 ## Read a profiles DB file, parse it and, write a new profiles DB with TE domain information and GA score cut_off placed side by side of the name | |
209 # | |
210 def run( self ): | |
211 LoggerFactory.setLevel(self._log, self._verbosity) | |
212 fileIn = open( self._inputFile ) | |
213 fout = open( self._outputFile, "w" ) | |
214 profile = self.profile.readAndRetrieve( fileIn ) | |
215 while profile != None: | |
216 currentDomain = self._searchCurrentDomain(profile) | |
217 if currentDomain == "OTHER": | |
218 self._log.warning(self.profile.accNumber + " " + self.profile.name + " has no associated domain") | |
219 self._writeModifiedProfile(fout, profile, currentDomain) | |
220 profile = self.profile.read( fileIn ) | |
221 | |
222 | |
223 if __name__ == "__main__": | |
224 i = ProfilesDB4Repet() | |
225 i._setAttributesFromCmdLine() | |
226 i.run() |