comparison rank_terms.py @ 22:95a05c1ef5d5

update to devshed revision aaece207bd01
author Richard Burhans <burhans@bx.psu.edu>
date Mon, 11 Mar 2013 11:28:06 -0400
parents
children 248b06e86022
comparison
equal deleted inserted replaced
21:d6b961721037 22:95a05c1ef5d5
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 #
4 # GOFisher.py
5 #
6 # Copyright 2013 Oscar Reina <oscar@niska.bx.psu.edu>
7 #
8 # This program is free software; you can redistribute it and/or modify
9 # it under the terms of the GNU General Public License as published by
10 # the Free Software Foundation; either version 2 of the License, or
11 # (at your option) any later version.
12 #
13 # This program is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU General Public License for more details.
17 #
18 # You should have received a copy of the GNU General Public License
19 # along with this program; if not, write to the Free Software
20 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
21 # MA 02110-1301, USA.
22
23 import argparse
24 import os
25 import sys
26 from fisher import pvalue
27 from decimal import Decimal,getcontext
28
29 def rtrnGOcENSEMBLc(inExtnddfile,columnENSEMBLTExtndd,columnGOExtndd):
30 """
31 """
32 dGOTENSEMBLT={}
33 for eachl in open(inExtnddfile,'r'):
34 if eachl.strip():
35 ENSEMBLT=eachl.splitlines()[0].split('\t')[columnENSEMBLTExtndd]
36 GOTs=set(eachl.splitlines()[0].split('\t')[columnGOExtndd].split('.'))
37 GOTs=GOTs.difference(set(['','U','N']))
38 for GOT in GOTs:
39 try:
40 dGOTENSEMBLT[GOT].add(ENSEMBLT)
41 except:
42 dGOTENSEMBLT[GOT]=set([ENSEMBLT])
43 #~
44 ##dGOTENSEMBLT.pop('')
45 ENSEMBLTGinGO=set.union(*dGOTENSEMBLT.values())
46 return dGOTENSEMBLT,ENSEMBLTGinGO
47
48 def rtrnENSEMBLcSAPs(inSAPsfile,columnENSEMBLT,ENSEMBLTGinGO):
49 """
50 returns a set of the ENSEMBLT codes present in the input list and
51 in the GO file
52 """
53 sENSEMBLTSAPsinGO=set()
54 for eachl in open(inSAPsfile,'r'):
55 ENSEMBLT=eachl.splitlines()[0].split('\t')[columnENSEMBLT]
56 if ENSEMBLT in ENSEMBLTGinGO:
57 sENSEMBLTSAPsinGO.add(ENSEMBLT)
58 return sENSEMBLTSAPsinGO
59
60 def rtrnCounts(dGOTENSEMBLT,ENSEMBLTGinGO,sENSEMBLTSAPsinGO):
61 """
62 returns a list of the ENSEMBLT codes present in the input list and
63 in the GO file. The terms in this list are: 'Go Term','# Genes in
64 the GO Term','# Genes in the list and in the GO Term','Enrichement
65 of the GO Term for genes in the input list','Genes in the input list
66 present in the GO term'
67 """
68 getcontext().prec=2#set 2 decimal places
69 SAPs_all=len(sENSEMBLTSAPsinGO)
70 NoSAPs_all=len(ENSEMBLTGinGO)-SAPs_all
71 #~
72 lp=len(dGOTENSEMBLT)
73 cnt=0
74 #~
75 ltfreqs=[]
76 for echGOT in dGOTENSEMBLT:
77 cnt+=1
78 ##print 'Running "%s", %s out of %s'%(echGOT,cnt,lp)
79 CntGO_All=len(dGOTENSEMBLT[echGOT])
80 SAPs_GO=len(dGOTENSEMBLT[echGOT].intersection(sENSEMBLTSAPsinGO))
81 NoSAPs_GO=CntGO_All-SAPs_GO
82 pval=pvalue(SAPs_GO,NoSAPs_GO,SAPs_all,NoSAPs_all)
83 #~ outl.append('\t'.join([str(x) for x in(str(pval.two_tail),CntGO_All,SAPs_GO,'.'.join(sorted(dGOTENSEMBLT[echGOT].intersection(sENSEMBLTSAPsinGO))),echGOT)]))
84 ltfreqs.append([(SAPs_GO/Decimal(CntGO_All)),SAPs_GO,Decimal(str(pval.two_tail))*1,echGOT])
85 #~
86 ltfreqs.sort()
87 ltfreqs.reverse()
88 outl=[]
89 cper,crank=Decimal('2'),0
90 #~
91 for perc,cnt_go,pval,goTerm in ltfreqs:
92 if perc<cper:
93 crank+=1
94 cper=perc
95 outl.append('\t'.join([str(cnt_go),str(perc),str(crank),str(pval),goTerm]))
96 #~
97 return outl
98
99
100 def main():
101 #~
102 parser = argparse.ArgumentParser(description='Returns the count of genes in GO categories and their statistical overrrepresentation, from a list of genes and an extended file (i.e. plane text with ENSEMBLT and GO terms).')
103 parser.add_argument('--input',metavar='input TXT file',type=str,help='the input file with the table in txt format.',required=True)
104 parser.add_argument('--inExtnddfile',metavar='input TXT file',type=str,help='the input file with the extended table in txt format.',required=True)
105 parser.add_argument('--output',metavar='output TXT file',type=str,help='the output file with the table in txt format.',required=True)
106 parser.add_argument('--columnENSEMBLT',metavar='column number',type=int,help='column with the ENSEMBL transcript code in the input file.',required=True)
107 parser.add_argument('--columnENSEMBLTExtndd',metavar='column number',type=int,help='column with the ENSEMBL transcript code in the extended file.',required=True)
108 parser.add_argument('--columnGOExtndd',metavar='column number',type=int,help='column with the GO terms in the extended file.',required=True)
109
110 args = parser.parse_args()
111
112 inSAPsfile = args.input
113 inExtnddfile = args.inExtnddfile
114 saleGOPCount = args.output
115 columnENSEMBLT = args.columnENSEMBLT
116 columnENSEMBLTExtndd = args.columnENSEMBLTExtndd
117 columnGOExtndd = args.columnGOExtndd
118
119 #~
120 dGOTENSEMBLT,ENSEMBLTGinGO=rtrnGOcENSEMBLc(inExtnddfile,columnENSEMBLTExtndd,columnGOExtndd)
121 sENSEMBLTSAPsinGO=rtrnENSEMBLcSAPs(inSAPsfile,columnENSEMBLT,ENSEMBLTGinGO)
122 outl=rtrnCounts(dGOTENSEMBLT,ENSEMBLTGinGO,sENSEMBLTSAPsinGO)
123 #~
124 saleGOPCount=open(saleGOPCount,'w')
125 saleGOPCount.write('\n'.join(outl))
126 saleGOPCount.close()
127 #~
128 return 0
129
130 if __name__ == '__main__':
131 main()
132