comparison rank_pathways_pct.py @ 27:8997f2ca8c7a

Update to Miller Lab devshed revision bae0d3306d3b
author Richard Burhans <burhans@bx.psu.edu>
date Mon, 15 Jul 2013 10:47:35 -0400
parents 95a05c1ef5d5
children
comparison
equal deleted inserted replaced
26:91e835060ad2 27:8997f2ca8c7a
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*- 2 # -*- coding: utf-8 -*-
3 # 3 #
4 # calcfreq.py 4 # KEGGFisher.py
5 # 5 #
6 # Copyright 2011 Oscar Bedoya-Reina <oscar@niska.bx.psu.edu> 6 # Copyright 2013 Oscar Reina <oscar@niska.bx.psu.edu>
7 # 7 #
8 # This program is free software; you can redistribute it and/or modify 8 # This program is free software; you can redistribute it and/or modify
9 # it under the terms of the GNU General Public License as published by 9 # it under the pathways of the GNU General Public License as published by
10 # the Free Software Foundation; either version 2 of the License, or 10 # the Free Software Foundation; either version 2 of the License, or
11 # (at your option) any later version. 11 # (at your option) any later version.
12 # 12 #
13 # This program is distributed in the hope that it will be useful, 13 # This program is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
18 # You should have received a copy of the GNU General Public License 18 # You should have received a copy of the GNU General Public License
19 # along with this program; if not, write to the Free Software 19 # along with this program; if not, write to the Free Software
20 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, 20 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
21 # MA 02110-1301, USA. 21 # MA 02110-1301, USA.
22 22
23 import argparse,os,sys 23 import argparse
24 import os
25 import sys
26 from fisher import pvalue as fisher
24 from decimal import Decimal,getcontext 27 from decimal import Decimal,getcontext
25 from LocationFile import LocationFile 28 from math import lgamma,exp,factorial
26 from fisher import pvalue 29
27 30 def binProb(SAPs_KEGG,NoSAPs_KEGG,SAPs_all,NoSAPs_all,CntKEGG_All,totalSAPs,pKEGG):
28 #method to rank the the pthways by mut. freq. 31 """
29 def rankd(ltfreqs): 32 Returns binomial probability.
30 ordvals=sorted(ltfreqs)#sort and reverse freqs. 33 """
31 #~ 34 def comb(CntKEGG_All,k):
32 outrnk=[] 35 return factorial(CntKEGG_All) / Decimal(str(factorial(k)*factorial(CntKEGG_All-k)))
33 tmpFreq0,tmpCount,pval,tmpPthw=ordvals.pop()#the highest possible value 36 probLow = 0
34 crank=1 37 for k in range(0, SAPs_KEGG+1):
35 outrnk.append('\t'.join([str(tmpCount),str(tmpFreq0),str(crank),str(pval),tmpPthw])) 38 cp=Decimal(str(comb(CntKEGG_All,k)))
36 totalnvals=len(ordvals) 39 bp=Decimal(str(pKEGG**k))
40 dp=Decimal(str(1.0-pKEGG))**Decimal(str(CntKEGG_All-k))
41 probLow+=cp*bp*dp
42 #~
43 probHigh = 0
44 for k in range(int(SAPs_KEGG),CntKEGG_All+1):
45 cp=Decimal(str(comb(CntKEGG_All,k)))
46 bp=Decimal(str(pKEGG**k))
47 dp=Decimal(str(1.0-pKEGG))**Decimal(str(CntKEGG_All-k))
48 probHigh+=cp*bp*dp
49 return probLow,probHigh
50
51 def gauss_hypergeom(X, CntKEGG_All, SAPs_all, totalSAPs):
52 CntKEGG_All,SAPs_all,totalSAPs
53 """
54 Returns the probability of drawing X successes of SAPs_all marked items
55 in CntKEGG_All draws from a bin of totalSAPs total items
56 """
57 def logchoose(ni, ki):
58 try:
59 lgn1 = lgamma(ni+1)
60 lgk1 = lgamma(ki+1)
61 lgnk1 = lgamma(ni-ki+1)
62 except ValueError:
63 raise ValueError
64 return lgn1 - (lgnk1 + lgk1)
65 #~
66 r1 = logchoose(SAPs_all, X)
67 try:
68 r2 = logchoose(totalSAPs-SAPs_all, CntKEGG_All-X)
69 except ValueError:
70 return 0
71 r3 = logchoose(totalSAPs,CntKEGG_All)
72 return exp(r1 + r2 - r3)
73
74 def hypergeo_sf(SAPs_KEGG,NoSAPs_KEGG,SAPs_all,NoSAPs_all,CntKEGG_All,totalSAPs,pKEGG):
75 """
76 Runs Hypergeometric probability test
77 """
78 s = 0
79 t=0
80 for i in range(SAPs_KEGG,min(SAPs_all,CntKEGG_All)+1):
81 s += max(gauss_hypergeom(i,CntKEGG_All,SAPs_all,totalSAPs), 0.0)
82 for i in range(0, SAPs_KEGG+1):
83 t += max(gauss_hypergeom(i,CntKEGG_All,SAPs_all,totalSAPs), 0.0)
84 return min(max(t,0.0), 1),min(max(s,0.0), 1)
85
86 def fisherexct(SAPs_KEGG,NoSAPs_KEGG,SAPs_all,NoSAPs_all,CntKEGG_All,totalSAPs,pKEGG):
87 """
88 Runs Fisher's exact test
89 """
90 ftest=fisher(SAPs_KEGG,NoSAPs_KEGG,SAPs_all,NoSAPs_all)
91 probLow,probHigh=ftest.left_tail,ftest.right_tail
92 return probLow,probHigh
93
94 def rtrnKEGGcENSEMBLc(inBckgrndfile,columnENSEMBLTBckgrnd,columnKEGGBckgrnd):
95 """
96 """
97 dKEGGTENSEMBLT={}
98 for eachl in open(inBckgrndfile,'r'):
99 if eachl.strip():
100 ENSEMBLT=eachl.splitlines()[0].split('\t')[columnENSEMBLTBckgrnd]
101 KEGGTs=set(eachl.splitlines()[0].split('\t')[columnKEGGBckgrnd].split('.'))
102 KEGGTs=KEGGTs.difference(set(['','U','N']))
103 for KEGGT in KEGGTs:
104 try:
105 dKEGGTENSEMBLT[KEGGT].add(ENSEMBLT)
106 except:
107 dKEGGTENSEMBLT[KEGGT]=set([ENSEMBLT])
108 ENSEMBLTGinKEGG=set.union(*dKEGGTENSEMBLT.values())
109 return dKEGGTENSEMBLT,ENSEMBLTGinKEGG
110
111 def rtrnENSEMBLcSAPs(inSAPsfile,columnENSEMBLT,ENSEMBLTGinKEGG):
112 """
113 returns a set of the ENSEMBLT codes present in the input list and
114 in the KEGG file
115 """
116 sENSEMBLTSAPsinKEGG=set()
117 for eachl in open(inSAPsfile,'r'):
118 ENSEMBLT=eachl.splitlines()[0].split('\t')[columnENSEMBLT]
119 if ENSEMBLT in ENSEMBLTGinKEGG:
120 sENSEMBLTSAPsinKEGG.add(ENSEMBLT)
121 return sENSEMBLTSAPsinKEGG
122
123 def rtrnCounts(dKEGGTENSEMBLT,ENSEMBLTGinKEGG,sENSEMBLTSAPsinKEGG,statsTest):
124 """
125 returns a list of the ENSEMBLT codes present in the input list and
126 in the KEGG file. The pathways in this list are: 'Go Term','# Genes in
127 the KEGG Term','# Genes in the list and in the KEGG Term','Enrichement
128 of the KEGG Term for genes in the input list','Genes in the input list
129 present in the KEGG term'
130 """
131 totalSAPs=len(ENSEMBLTGinKEGG)
132 SAPs_all=len(sENSEMBLTSAPsinKEGG)
133 NoSAPs_all=totalSAPs-SAPs_all
134 pKEGG=SAPs_all/float(totalSAPs)
135 #~
136 lp=len(dKEGGTENSEMBLT)
37 cnt=0 137 cnt=0
38 while totalnvals>cnt: 138 #~
139 if statsTest=='fisher':
140 ptest=fisherexct
141 elif statsTest=='hypergeometric':
142 ptest=hypergeo_sf
143 elif statsTest=='binomial':
144 ptest=binProb
145 #~
146 ltfreqs=[]
147 for echKEGGT in dKEGGTENSEMBLT:
39 cnt+=1 148 cnt+=1
40 tmpFreq,tmpCount,pval,tmpPthw=ordvals.pop() 149 CntKEGG_All=len(dKEGGTENSEMBLT[echKEGGT])
41 if tmpFreq!=tmpFreq0: 150 SAPs_KEGG=len(dKEGGTENSEMBLT[echKEGGT].intersection(sENSEMBLTSAPsinKEGG))
42 crank=len(outrnk)+1 151 NoSAPs_KEGG=CntKEGG_All-SAPs_KEGG
43 tmpFreq0=tmpFreq 152 probLow,probHigh=ptest(SAPs_KEGG,NoSAPs_KEGG,SAPs_all,NoSAPs_all,CntKEGG_All,totalSAPs,pKEGG)
44 outrnk.append('\t'.join([str(tmpCount),str(tmpFreq),str(crank),str(pval),tmpPthw])) 153 ltfreqs.append([(SAPs_KEGG/Decimal(CntKEGG_All)),SAPs_KEGG,probLow,probHigh,echKEGGT])
45 return outrnk 154 #~
46 155 ltfreqs.sort()
156 ltfreqs.reverse()
157 outl=[]
158 cper,crank=Decimal('2'),0
159 #~
160 getcontext().prec=2#set 2 decimal places
161 for perc,cnt_go,pvalLow,pvalHigh,goTerm in ltfreqs:
162 if perc<cper:
163 crank+=1
164 cper=perc
165 outl.append('\t'.join([str(cnt_go),str(Decimal(perc)*Decimal('1.0')),str(crank),str(Decimal(pvalLow)*Decimal('1.0')),str(Decimal(pvalHigh)*Decimal('1.0')),goTerm]))
166 #~
167 return outl
168
47 169
48 def main(): 170 def main():
49 parser = argparse.ArgumentParser(description='Obtain KEGG images from a list of genes.') 171 #~
50 parser.add_argument('--input',metavar='input TXT file',type=str,help='the input file with the table in txt format') 172 parser = argparse.ArgumentParser(description='Returns the count of genes in KEGG categories and their statistical overrrepresentation, from a list of genes and an background file (i.e. plane text with ENSEMBLT and KEGG pathways).')
51 parser.add_argument('--output',metavar='output TXT file',type=str,help='the output file with the table in txt format. Column 1 is the count of genes in the list, Column 2 is the percentage of the pathway genes present on the list. Column 3 is the rank based on column 2') 173 parser.add_argument('--input',metavar='input TXT file',type=str,help='the input file with the table in txt format.',required=True)
52 parser.add_argument('--posKEGGclmn',metavar='column number',type=int,help='the column with the KEGG pathway code/name') 174 parser.add_argument('--inBckgrndfile',metavar='input TXT file',type=str,help='the input file with the background table in txt format.',required=True)
53 parser.add_argument('--KEGGgeneposcolmn',metavar='column number',type=int,help='column with the KEGG gene code') 175 parser.add_argument('--output',metavar='output TXT file',type=str,help='the output file with the table in txt format.',required=True)
54 parser.add_argument('--loc_file',metavar='location file',type=str,help='location file') 176 parser.add_argument('--columnENSEMBLT',metavar='column number',type=int,help='column with the ENSEMBL transcript code in the input file.',required=True)
55 parser.add_argument('--species',metavar='species',type=str,help='species') 177 parser.add_argument('--columnENSEMBLTBckgrnd',metavar='column number',type=int,help='column with the ENSEMBL transcript code in the background file.',required=True)
56 #~Open arguments 178 parser.add_argument('--columnKEGGBckgrnd',metavar='column number',type=int,help='column with the KEGG pathways in the background file.',required=True)
57 class C(object): 179 parser.add_argument('--statsTest',metavar='input TXT file',type=str,help='statistical test to compare KEGG pathways (i.e. fisher, hypergeometric, binomial).',required=True)
58 pass 180
59 fulargs=C() 181 args = parser.parse_args()
60 parser.parse_args(sys.argv[1:],namespace=fulargs) 182
61 #test input vars 183 inSAPsfile = args.input
62 inputf,outputf,posKEGGclmn,Kgeneposcolmn=fulargs.input,fulargs.output,fulargs.posKEGGclmn,fulargs.KEGGgeneposcolmn 184 inBckgrndfile = args.inBckgrndfile
63 locf,species=fulargs.loc_file,fulargs.species 185 saleKEGGPCount = args.output
64 #make a dictionary of valid genes 186 columnENSEMBLT = args.columnENSEMBLT
65 posKEGGclmn-=1 187 columnENSEMBLTBckgrnd = args.columnENSEMBLTBckgrnd
66 Kgeneposcolmn-=1 188 columnKEGGBckgrnd = args.columnKEGGBckgrnd
67 dKEGGcPthws=dict([(x.split('\t')[Kgeneposcolmn],set(x.split('\t')[posKEGGclmn].split('.'))) for x in open(inputf).read().splitlines()[1:] if x.split('\t')[posKEGGclmn] not in set(['U','N'])]) 189 statsTest = args.statsTest
68 for u in ['U','N']: 190 columnENSEMBLT-=1
69 try: 191 columnENSEMBLTBckgrnd-=1
70 a=dKEGGcPthws.pop(u) 192 columnKEGGBckgrnd=-1
71 except: 193 #~
72 pass 194 dKEGGTENSEMBLT,ENSEMBLTGinKEGG=rtrnKEGGcENSEMBLc(inBckgrndfile,columnENSEMBLTBckgrnd,columnKEGGBckgrnd)
73 getcontext().prec=2#set 2 decimal places 195 sENSEMBLTSAPsinKEGG=rtrnENSEMBLcSAPs(inSAPsfile,columnENSEMBLT,ENSEMBLTGinKEGG)
74 sdGenes=set([x for x in dKEGGcPthws.keys() if x.find('.')>-1]) 196 outl=rtrnCounts(dKEGGTENSEMBLT,ENSEMBLTGinKEGG,sENSEMBLTSAPsinKEGG,statsTest)
75 while True:#to correct names with more than one gene 197 #~
76 try: 198 saleKEGGPCount=open(saleKEGGPCount,'w')
77 mgenes=sdGenes.pop() 199 saleKEGGPCount.write('\n'.join(outl))
78 pthwsAssotd=dKEGGcPthws.pop(mgenes) 200 saleKEGGPCount.close()
79 mgenes=mgenes.split('.') 201 #~
80 for eachg in mgenes:
81 dKEGGcPthws[eachg]=pthwsAssotd
82 except:
83 break
84 #~ Count genes
85
86 location_file = LocationFile(locf)
87 prefix, kxml_dir_path, dict_file = location_file.get_values(species)
88 dPthContsTotls = {}
89 try:
90 with open(dict_file) as fh:
91 for line in fh:
92 line = line.rstrip('\r\n')
93 value, key = line.split('\t')
94 dPthContsTotls[key] = int(value)
95 except IOError, err:
96 print >> sys.stderr, 'Error opening dict file {0}: {1}'.format(dict_file, err.strerror)
97 sys.exit(1)
98
99 dPthContsTmp=dict([(x,0) for x in dPthContsTotls.keys()])#create a list of genes
100 sdGenes=set(dKEGGcPthws.keys())#list of all genes
101 cntGens=0
102 ltGens=len(sdGenes)
103 while cntGens<ltGens:
104 cGen=sdGenes.pop()
105 sKEGGcPthws=dKEGGcPthws.pop(cGen)
106 for eachP in sKEGGcPthws:
107 if eachP!='N':
108 if eachP in dPthContsTmp:
109 dPthContsTmp[eachP]+=1
110 else:
111 print >> sys.stderr, "Error: pathway not found in database: '{0}'".format(eachP)
112 sys.exit(1)
113 cntGens+=1
114 #~ Calculate Freqs.
115 ltfreqs=[]
116 cntAllKEGGinGnm=sum(dPthContsTotls.values())
117 cntListKEGGinGnm=sum(dPthContsTmp.values())
118 cntAllKEGGNOTinGnm=cntAllKEGGinGnm-cntListKEGGinGnm
119 for pthw in dPthContsTotls:
120 cntAllKEGGinpthw=Decimal(dPthContsTotls[pthw])
121 try:
122 cntListKEGGinpthw=Decimal(dPthContsTmp[pthw])
123 except:
124 cntListKEGGinpthw=Decimal('0')
125 cntAllKEGGNOTinpthw=cntAllKEGGinpthw-cntListKEGGinpthw
126 pval=pvalue(cntListKEGGinpthw,cntAllKEGGNOTinpthw,cntListKEGGinGnm,cntAllKEGGNOTinGnm)
127
128 ltfreqs.append([(cntListKEGGinpthw/cntAllKEGGinpthw),cntListKEGGinpthw,Decimal(str(pval.two_tail))*1,pthw])
129 #~ ltfreqs=[((Decimal(dPthContsTmp[x])/Decimal(dPthContsTotls[x])),Decimal(dPthContsTmp[x]),x) for x in dPthContsTotls]
130 tabllfreqs='\n'.join(rankd(ltfreqs))
131 salef=open(outputf,'w')
132 salef.write(tabllfreqs)
133 salef.close()
134 return 0 202 return 0
135
136 203
137 if __name__ == '__main__': 204 if __name__ == '__main__':
138 main() 205 main()
206