0
|
1 #!/usr/bin/env python
|
|
2 # -*- coding: utf-8 -*-
|
|
3 #
|
|
4 # calclenchange.py
|
|
5 #
|
|
6 # Copyright 2011 Oscar Bedoya-Reina <oscar@niska.bx.psu.edu>
|
|
7 #
|
|
8 # This program is free software; you can redistribute it and/or modify
|
|
9 # it under the terms of the GNU General Public License as published by
|
|
10 # the Free Software Foundation; either version 2 of the License, or
|
|
11 # (at your option) any later version.
|
|
12 #
|
|
13 # This program is distributed in the hope that it will be useful,
|
|
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
16 # GNU General Public License for more details.
|
|
17 #
|
|
18 # You should have received a copy of the GNU General Public License
|
|
19 # along with this program; if not, write to the Free Software
|
|
20 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
|
|
21 # MA 02110-1301, USA.
|
|
22
|
|
23 import argparse,mechanize,os,sys
|
|
24 from decimal import Decimal,getcontext
|
|
25 from xml.etree.ElementTree import ElementTree,tostring
|
|
26 import networkx as nx
|
|
27 from copy import copy
|
|
28
|
|
29 #method to rank the the pthways by mut. freq.
|
|
30 def rankdN(ltfreqs):
|
|
31 ordvals=sorted(ltfreqs)#sort and reverse freqs.
|
|
32 #~
|
|
33 outrnk=[]
|
|
34 tmpChng0,tmpOri,tmpMut,tmpPthw=ordvals.pop()#the highest possible value
|
|
35 if tmpOri=='C':
|
|
36 if tmpMut!='C':
|
|
37 tmpChng0='C-%s'%tmpMut
|
|
38 else:
|
|
39 tmpChng0=Decimal('0')
|
|
40 crank=1
|
|
41 outrnk.append([str(tmpChng0),str(tmpOri),str(tmpMut),str(crank),tmpPthw])
|
|
42 totalnvals=len(ordvals)
|
|
43 cnt=0
|
|
44 while totalnvals>cnt:
|
|
45 cnt+=1
|
|
46 tmpChng,tmpOri,tmpMut,tmpPthw=ordvals.pop()
|
|
47 if tmpOri=='C':
|
|
48 if tmpMut!='C':
|
|
49 tmpChng='C-%s'%tmpMut
|
|
50 else:
|
|
51 tmpChng=Decimal('0')
|
|
52 if tmpChng!=tmpChng0:
|
|
53 crank=len(outrnk)+1
|
|
54 tmpChng0=tmpChng
|
|
55 outrnk.append([str(tmpChng),str(tmpOri),str(tmpMut),str(crank),tmpPthw])
|
|
56 return outrnk
|
|
57
|
|
58 #method to rank the the pthways by mut. freq.
|
|
59 def rankdAvr(ltfreqs):
|
|
60 ordvals=sorted(ltfreqs)#sort and reverse freqs.
|
|
61 #~
|
|
62 outrnk={}
|
|
63 tmpChng0,tmpOri,tmpMut,tmpPthw=ordvals.pop()#the highest possible value
|
|
64 if tmpOri=='I':
|
|
65 if tmpMut!='I':
|
|
66 tmpChng0='I-%s'%tmpMut
|
|
67 else:
|
|
68 tmpChng0=Decimal('0')
|
|
69 crank=1
|
|
70 outrnk[tmpPthw]='\t'.join([str(tmpChng0),str(tmpOri),str(tmpMut),str(crank)])
|
|
71 totalnvals=len(ordvals)
|
|
72 cnt=0
|
|
73 while totalnvals>cnt:
|
|
74 cnt+=1
|
|
75 tmpChng,tmpOri,tmpMut,tmpPthw=ordvals.pop()
|
|
76 if tmpOri=='I':
|
|
77 if tmpMut!='I':
|
|
78 tmpChng='I-%s'%tmpMut
|
|
79 else:
|
|
80 tmpChng=Decimal('0')
|
|
81 if tmpChng!=tmpChng0:
|
|
82 crank=len(outrnk)+1
|
|
83 tmpChng0=tmpChng
|
|
84 outrnk[tmpPthw]='\t'.join([str(tmpChng),str(tmpOri),str(tmpMut),str(crank)])
|
|
85 return outrnk
|
|
86
|
|
87 #this method takes as input a list of pairs of edges(beginNod,endNod) and returns a list of nodes with indegree 0 and outdegree 0
|
|
88 def returnstartanendnodes(edges):
|
|
89 listID0st=set()#starts
|
|
90 listOD0en=set()#end
|
|
91 for beginNod,endNod in edges:# O(n)
|
|
92 listID0st.add(beginNod)
|
|
93 listOD0en.add(endNod)
|
|
94 startNdsID0=listID0st.difference(listOD0en)
|
|
95 endNdsOD0=listOD0en.difference(listID0st)
|
|
96 return startNdsID0,endNdsOD0
|
|
97
|
|
98 #~ Method to return nodes and edges
|
|
99 def returnNodesNEdgesfKXML(fpthwKGXML):
|
|
100 #~
|
|
101 tree = ElementTree()
|
|
102 ptree=tree.parse(fpthwKGXML)
|
|
103 #~
|
|
104 title=ptree.get('title')
|
|
105 prots=ptree.findall('entry')
|
|
106 reactns=ptree.findall('reaction')
|
|
107 #~
|
|
108 edges,ndstmp=set(),set()
|
|
109 nreactns=len(reactns)
|
|
110 cr=0#count reacts
|
|
111 while nreactns>cr:
|
|
112 cr+=1
|
|
113 reactn=reactns.pop()
|
|
114 mainid=reactn.get('id')
|
|
115 ndstmp.add(mainid)#add node
|
|
116 reacttyp=reactn.get('type')
|
|
117 sbstrts=reactn.findall('substrate')
|
|
118 while len(sbstrts)>0:
|
|
119 csbstrt=sbstrts.pop()
|
|
120 csbtsid=csbstrt.get('id')
|
|
121 ndstmp.add(csbtsid)#add node
|
|
122 if reacttyp=='irreversible':
|
|
123 edges.add((csbtsid,mainid))#add edges
|
|
124 elif reacttyp=='reversible':
|
|
125 edges.add((mainid,csbtsid))#add edges
|
|
126 edges.add((csbtsid,mainid))#add edges
|
|
127 #~
|
|
128 prdcts=reactn.findall('product')
|
|
129 while len(prdcts)>0:
|
|
130 prdct=prdcts.pop()
|
|
131 prodctid=prdct.get('id')
|
|
132 ndstmp.add(prodctid)#add node
|
|
133 if reacttyp=='irreversible':
|
|
134 edges.add((mainid,prodctid))#add edges
|
|
135 elif reacttyp=='reversible':
|
|
136 edges.add((mainid,prodctid))#add edges
|
|
137 edges.add((prodctid,mainid))#add edges
|
|
138 #~ Nodes
|
|
139 nprots=len(prots)
|
|
140 cp=0#count prots
|
|
141 dnodes={}
|
|
142 while nprots>cp:
|
|
143 cp+=1
|
|
144 prot=prots.pop()
|
|
145 tmpProtnm=prot.get('id')
|
|
146 if tmpProtnm in ndstmp:
|
|
147 dnodes[prot.get('id')]=set(prot.get('name').split())#each genename for each Id
|
|
148 return dnodes,edges,title
|
|
149
|
|
150 #~ make calculation on pathways
|
|
151 def rtrnAvrgLen(edges,strNds,endNds):
|
|
152 wG=nx.DiGraph()#reference graph
|
|
153 wG.add_edges_from(edges)
|
|
154 dPairsSrcSnks=nx.all_pairs_shortest_path_length(wG)#dictionary between sources and sink and length
|
|
155 nstartNdsID0=len(strNds)
|
|
156 cstrtNds=0
|
|
157 nPaths=0
|
|
158 lPathLen=[]
|
|
159 while nstartNdsID0>cstrtNds:
|
|
160 cStartNd=strNds.pop()#current start node
|
|
161 dEndNdsLen=dPairsSrcSnks.pop(cStartNd)
|
|
162 for cendNd in dEndNdsLen:
|
|
163 if cendNd in endNds:
|
|
164 lPathLen.append(dEndNdsLen[cendNd])
|
|
165 nPaths+=1
|
|
166 cstrtNds+=1
|
|
167 AvrgPthLen=0
|
|
168 if nPaths!=0:
|
|
169 AvrgPthLen=Decimal(sum(lPathLen))/Decimal(str(nPaths))
|
|
170 return nPaths,AvrgPthLen
|
|
171
|
|
172 def main():
|
|
173 parser = argparse.ArgumentParser(description='Rank pathways based on the change in length and number of paths connecting sources and sinks.')
|
|
174 parser.add_argument('--loc_file',metavar='correlational database',type=str,help='correlational database')
|
|
175 parser.add_argument('--species',metavar='species name',type=str,help='the species of interest in loc_file')
|
|
176 parser.add_argument('--output',metavar='output TXT file',type=str,help='the output file with the table in txt format. Column 1 is the diference between column 2 and column 3, Column 2 is the pathway average length (between sources and sinks) including the genes in the input list, Column 3 is the pathway average length EXCLUDING the genes in the input list, Column 4 is the rank based on column 1. Column 5 is the diference between column 6 and column 7, Column 6 is the number of paths between sources and sinks, including the genes in the input list, Column 7 is the number of paths between sources and sinks EXCLUDING the genes in the input list, Column 8 is the rank based on column 5. Column 9 I the pathway name' )
|
|
177 parser.add_argument('--posKEGGclmn',metavar='column number',type=int,help='the column with the KEGG pathway code/name')
|
|
178 parser.add_argument('--KEGGgeneposcolmn',metavar='column number',type=int,help='column with the KEGG gene code')
|
|
179 parser.add_argument('--input',metavar='input TXT file',type=str,help='the input file with the table in txt format')
|
|
180 #~
|
|
181 #~Open arguments
|
|
182 class C(object):
|
|
183 pass
|
|
184 fulargs=C()
|
|
185 parser.parse_args(sys.argv[1:],namespace=fulargs)
|
|
186 #test input vars
|
|
187 inputf,loc_file,species,output,posKEGGclmn,Kgeneposcolmn=fulargs.input,fulargs.loc_file,fulargs.species,fulargs.output,fulargs.posKEGGclmn,fulargs.KEGGgeneposcolmn
|
|
188 posKEGGclmn-=1#correct pos
|
|
189 Kgeneposcolmn-=1
|
|
190 #~ Get the extra variables
|
|
191 crDB=[x.split() for x in open(loc_file).read().splitlines() if x.split()[0]==species][0]
|
|
192 sppPrefx,dinput=crDB[1],crDB[2]
|
|
193 #~ set decimal positions
|
|
194 getcontext().prec = 3
|
|
195 #make a dictionary of valid genes
|
|
196 dKEGGcPthws=dict([(x.split('\t')[Kgeneposcolmn],set([y.split('=')[0] for y in x.split('\t')[posKEGGclmn].split('.')])) for x in open(inputf).read().splitlines()[1:] if x.strip()])
|
|
197 sdGenes=set([x for x in dKEGGcPthws.keys() if x.find('.')>-1])
|
|
198 while True:#to crrect names with more than one gene
|
|
199 try:
|
|
200 mgenes=sdGenes.pop()
|
|
201 pthwsAssotd=dKEGGcPthws.pop(mgenes)
|
|
202 mgenes=mgenes.split('.')
|
|
203 for eachg in mgenes:
|
|
204 dKEGGcPthws[eachg]=pthwsAssotd
|
|
205 except:
|
|
206 break
|
|
207 #~
|
|
208 lPthwsF=[x for x in os.listdir(dinput) if x.find('.xml')>-1 if x not in ['cfa04070.xml']]
|
|
209 nPthws=len(lPthwsF)
|
|
210 cPthw=0
|
|
211 lPthwPthN=[]#the output list for number of paths
|
|
212 lPthwPthAvr=[]#the output list for the length of paths
|
|
213 #~
|
|
214 while cPthw<nPthws:
|
|
215 cPthw+=1
|
|
216 KEGGpathw=lPthwsF.pop()
|
|
217 comdKEGGpathw=KEGGpathw.split('.')[0]
|
|
218 tmpddGenrcgenPresent=set()
|
|
219 sKEGGc=dKEGGcPthws.keys()
|
|
220 lsKEGGc=len(sKEGGc)
|
|
221 ctPthw=0
|
|
222 while ctPthw < lsKEGGc:#to save memory
|
|
223 eachK=sKEGGc.pop()
|
|
224 alPthws=dKEGGcPthws[eachK]
|
|
225 if comdKEGGpathw in alPthws:
|
|
226 tmpddGenrcgenPresent.add(':'.join([sppPrefx,eachK]))
|
|
227 ctPthw+=1
|
|
228 #~ Make graph calculations
|
|
229 dnodes,edges,title=returnNodesNEdgesfKXML(open(os.path.join(dinput,KEGGpathw)))
|
|
230 startNdsID0,endNdsOD0=returnstartanendnodes(edges)
|
|
231 startNdsOri=copy(startNdsID0)
|
|
232 #~
|
|
233 nPaths='C'#stands for circuit
|
|
234 AvrgPthLen='I'#stand for infinite
|
|
235 if len(startNdsID0)>0 and len(endNdsOD0)>0:
|
|
236 nPaths,AvrgPthLen=rtrnAvrgLen(edges,startNdsID0,endNdsOD0)
|
|
237 #~ work with the genes in the list
|
|
238 genestodel=set()
|
|
239 lnodes=len(dnodes)
|
|
240 sNds=set(dnodes)
|
|
241 ctPthw=0
|
|
242 while ctPthw<lnodes:
|
|
243 ctPthw+=1
|
|
244 cNod=sNds.pop()
|
|
245 sgenes=dnodes.pop(cNod)
|
|
246 if len(sgenes.intersection(tmpddGenrcgenPresent))==len(sgenes):
|
|
247 genestodel.add(cNod)
|
|
248 #~ del nodes from graph edges
|
|
249 wnPaths,wAvrgPthLen=copy(nPaths),copy(AvrgPthLen)
|
|
250 if len(genestodel)>0:
|
|
251 wedges=set([x for x in edges if len(set(x).intersection(genestodel))==0])
|
|
252 wstartNds,wendNds=returnstartanendnodes(wedges)
|
|
253 if nPaths!='C':
|
|
254 wstartNds=[x for x in wstartNds if x in startNdsOri]
|
|
255 wendNds=[x for x in wendNds if x in endNdsOD0]
|
|
256 if len(wstartNds)>0 and len(wendNds)>0:
|
|
257 wnPaths,wAvrgPthLen=rtrnAvrgLen(wedges,wstartNds,wendNds)
|
|
258 #~ Calculate the differences
|
|
259 orNP,mutNP,oriLen,mutLen=nPaths,wnPaths,AvrgPthLen,wAvrgPthLen
|
|
260 if nPaths=='C':
|
|
261 orNP=Decimal('1000')
|
|
262 oriLen=Decimal('1000')
|
|
263 if wnPaths=='C':
|
|
264 mutNP=Decimal('1000')
|
|
265 mutLen=Decimal('1000')
|
|
266 lPthwPthN.append([orNP-mutNP,nPaths,wnPaths,'='.join([comdKEGGpathw,title])])#print nPaths,AvrgPthLen
|
|
267 lPthwPthAvr.append([oriLen-mutLen,AvrgPthLen,wAvrgPthLen,'='.join([comdKEGGpathw,title])])#print nPaths,AvrgPthLen
|
|
268 doutrnkPthN=rankdN(lPthwPthN)
|
|
269 doutrnkPthAvr=rankdAvr(lPthwPthAvr)
|
|
270 #~
|
|
271 sall=['\t'.join([doutrnkPthAvr[x[4]],'\t'.join(x)]) for x in doutrnkPthN]
|
|
272 salef=open(output,'w')
|
|
273 salef.write('\n'.join(sall))
|
|
274 salef.close()
|
|
275 return 0
|
|
276
|
|
277
|
|
278 if __name__ == '__main__':
|
|
279 main()
|
|
280
|