Mercurial > repos > miller-lab > genome_diversity
diff rank_terms.py @ 24:248b06e86022
Added gd_genotype datatype. Modified tools to support new datatype.
author | Richard Burhans <burhans@bx.psu.edu> |
---|---|
date | Tue, 28 May 2013 16:24:19 -0400 |
parents | 95a05c1ef5d5 |
children | 8997f2ca8c7a |
line wrap: on
line diff
--- a/rank_terms.py Wed May 22 15:58:18 2013 -0400 +++ b/rank_terms.py Tue May 28 16:24:19 2013 -0400 @@ -1,132 +1,132 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# GOFisher.py -# -# Copyright 2013 Oscar Reina <oscar@niska.bx.psu.edu> -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, -# MA 02110-1301, USA. - -import argparse -import os -import sys -from fisher import pvalue -from decimal import Decimal,getcontext - -def rtrnGOcENSEMBLc(inExtnddfile,columnENSEMBLTExtndd,columnGOExtndd): - """ - """ - dGOTENSEMBLT={} - for eachl in open(inExtnddfile,'r'): - if eachl.strip(): - ENSEMBLT=eachl.splitlines()[0].split('\t')[columnENSEMBLTExtndd] - GOTs=set(eachl.splitlines()[0].split('\t')[columnGOExtndd].split('.')) - GOTs=GOTs.difference(set(['','U','N'])) - for GOT in GOTs: - try: - dGOTENSEMBLT[GOT].add(ENSEMBLT) - except: - dGOTENSEMBLT[GOT]=set([ENSEMBLT]) - #~ - ##dGOTENSEMBLT.pop('') - ENSEMBLTGinGO=set.union(*dGOTENSEMBLT.values()) - return dGOTENSEMBLT,ENSEMBLTGinGO - -def rtrnENSEMBLcSAPs(inSAPsfile,columnENSEMBLT,ENSEMBLTGinGO): - """ - returns a set of the ENSEMBLT codes present in the input list and - in the GO file - """ - sENSEMBLTSAPsinGO=set() - for eachl in open(inSAPsfile,'r'): - ENSEMBLT=eachl.splitlines()[0].split('\t')[columnENSEMBLT] - if ENSEMBLT in ENSEMBLTGinGO: - sENSEMBLTSAPsinGO.add(ENSEMBLT) - return sENSEMBLTSAPsinGO - -def rtrnCounts(dGOTENSEMBLT,ENSEMBLTGinGO,sENSEMBLTSAPsinGO): - """ - returns a list of the ENSEMBLT codes present in the input list and - in the GO file. The terms in this list are: 'Go Term','# Genes in - the GO Term','# Genes in the list and in the GO Term','Enrichement - of the GO Term for genes in the input list','Genes in the input list - present in the GO term' - """ - getcontext().prec=2#set 2 decimal places - SAPs_all=len(sENSEMBLTSAPsinGO) - NoSAPs_all=len(ENSEMBLTGinGO)-SAPs_all - #~ - lp=len(dGOTENSEMBLT) - cnt=0 - #~ - ltfreqs=[] - for echGOT in dGOTENSEMBLT: - cnt+=1 - ##print 'Running "%s", %s out of %s'%(echGOT,cnt,lp) - CntGO_All=len(dGOTENSEMBLT[echGOT]) - SAPs_GO=len(dGOTENSEMBLT[echGOT].intersection(sENSEMBLTSAPsinGO)) - NoSAPs_GO=CntGO_All-SAPs_GO - pval=pvalue(SAPs_GO,NoSAPs_GO,SAPs_all,NoSAPs_all) - #~ outl.append('\t'.join([str(x) for x in(str(pval.two_tail),CntGO_All,SAPs_GO,'.'.join(sorted(dGOTENSEMBLT[echGOT].intersection(sENSEMBLTSAPsinGO))),echGOT)])) - ltfreqs.append([(SAPs_GO/Decimal(CntGO_All)),SAPs_GO,Decimal(str(pval.two_tail))*1,echGOT]) - #~ - ltfreqs.sort() - ltfreqs.reverse() - outl=[] - cper,crank=Decimal('2'),0 - #~ - for perc,cnt_go,pval,goTerm in ltfreqs: - if perc<cper: - crank+=1 - cper=perc - outl.append('\t'.join([str(cnt_go),str(perc),str(crank),str(pval),goTerm])) - #~ - return outl - - -def main(): - #~ - parser = argparse.ArgumentParser(description='Returns the count of genes in GO categories and their statistical overrrepresentation, from a list of genes and an extended file (i.e. plane text with ENSEMBLT and GO terms).') - parser.add_argument('--input',metavar='input TXT file',type=str,help='the input file with the table in txt format.',required=True) - parser.add_argument('--inExtnddfile',metavar='input TXT file',type=str,help='the input file with the extended table in txt format.',required=True) - parser.add_argument('--output',metavar='output TXT file',type=str,help='the output file with the table in txt format.',required=True) - parser.add_argument('--columnENSEMBLT',metavar='column number',type=int,help='column with the ENSEMBL transcript code in the input file.',required=True) - parser.add_argument('--columnENSEMBLTExtndd',metavar='column number',type=int,help='column with the ENSEMBL transcript code in the extended file.',required=True) - parser.add_argument('--columnGOExtndd',metavar='column number',type=int,help='column with the GO terms in the extended file.',required=True) - - args = parser.parse_args() - - inSAPsfile = args.input - inExtnddfile = args.inExtnddfile - saleGOPCount = args.output - columnENSEMBLT = args.columnENSEMBLT - columnENSEMBLTExtndd = args.columnENSEMBLTExtndd - columnGOExtndd = args.columnGOExtndd - - #~ - dGOTENSEMBLT,ENSEMBLTGinGO=rtrnGOcENSEMBLc(inExtnddfile,columnENSEMBLTExtndd,columnGOExtndd) - sENSEMBLTSAPsinGO=rtrnENSEMBLcSAPs(inSAPsfile,columnENSEMBLT,ENSEMBLTGinGO) - outl=rtrnCounts(dGOTENSEMBLT,ENSEMBLTGinGO,sENSEMBLTSAPsinGO) - #~ - saleGOPCount=open(saleGOPCount,'w') - saleGOPCount.write('\n'.join(outl)) - saleGOPCount.close() - #~ - return 0 - -if __name__ == '__main__': - main() - +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# GOFisher.py +# +# Copyright 2013 Oscar Reina <oscar@niska.bx.psu.edu> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, +# MA 02110-1301, USA. + +import argparse +import os +import sys +from fisher import pvalue +from decimal import Decimal,getcontext + +def rtrnGOcENSEMBLc(inExtnddfile,columnENSEMBLTExtndd,columnGOExtndd): + """ + """ + dGOTENSEMBLT={} + for eachl in open(inExtnddfile,'r'): + if eachl.strip(): + ENSEMBLT=eachl.splitlines()[0].split('\t')[columnENSEMBLTExtndd] + GOTs=set(eachl.splitlines()[0].split('\t')[columnGOExtndd].split('.')) + GOTs=GOTs.difference(set(['','U','N'])) + for GOT in GOTs: + try: + dGOTENSEMBLT[GOT].add(ENSEMBLT) + except: + dGOTENSEMBLT[GOT]=set([ENSEMBLT]) + #~ + ##dGOTENSEMBLT.pop('') + ENSEMBLTGinGO=set.union(*dGOTENSEMBLT.values()) + return dGOTENSEMBLT,ENSEMBLTGinGO + +def rtrnENSEMBLcSAPs(inSAPsfile,columnENSEMBLT,ENSEMBLTGinGO): + """ + returns a set of the ENSEMBLT codes present in the input list and + in the GO file + """ + sENSEMBLTSAPsinGO=set() + for eachl in open(inSAPsfile,'r'): + ENSEMBLT=eachl.splitlines()[0].split('\t')[columnENSEMBLT] + if ENSEMBLT in ENSEMBLTGinGO: + sENSEMBLTSAPsinGO.add(ENSEMBLT) + return sENSEMBLTSAPsinGO + +def rtrnCounts(dGOTENSEMBLT,ENSEMBLTGinGO,sENSEMBLTSAPsinGO): + """ + returns a list of the ENSEMBLT codes present in the input list and + in the GO file. The terms in this list are: 'Go Term','# Genes in + the GO Term','# Genes in the list and in the GO Term','Enrichement + of the GO Term for genes in the input list','Genes in the input list + present in the GO term' + """ + getcontext().prec=2#set 2 decimal places + SAPs_all=len(sENSEMBLTSAPsinGO) + NoSAPs_all=len(ENSEMBLTGinGO)-SAPs_all + #~ + lp=len(dGOTENSEMBLT) + cnt=0 + #~ + ltfreqs=[] + for echGOT in dGOTENSEMBLT: + cnt+=1 + ##print 'Running "%s", %s out of %s'%(echGOT,cnt,lp) + CntGO_All=len(dGOTENSEMBLT[echGOT]) + SAPs_GO=len(dGOTENSEMBLT[echGOT].intersection(sENSEMBLTSAPsinGO)) + NoSAPs_GO=CntGO_All-SAPs_GO + pval=pvalue(SAPs_GO,NoSAPs_GO,SAPs_all,NoSAPs_all) + #~ outl.append('\t'.join([str(x) for x in(str(pval.two_tail),CntGO_All,SAPs_GO,'.'.join(sorted(dGOTENSEMBLT[echGOT].intersection(sENSEMBLTSAPsinGO))),echGOT)])) + ltfreqs.append([(SAPs_GO/Decimal(CntGO_All)),SAPs_GO,Decimal(str(pval.two_tail))*1,echGOT]) + #~ + ltfreqs.sort() + ltfreqs.reverse() + outl=[] + cper,crank=Decimal('2'),0 + #~ + for perc,cnt_go,pval,goTerm in ltfreqs: + if perc<cper: + crank+=1 + cper=perc + outl.append('\t'.join([str(cnt_go),str(perc),str(crank),str(pval),goTerm])) + #~ + return outl + + +def main(): + #~ + parser = argparse.ArgumentParser(description='Returns the count of genes in GO categories and their statistical overrrepresentation, from a list of genes and an extended file (i.e. plane text with ENSEMBLT and GO terms).') + parser.add_argument('--input',metavar='input TXT file',type=str,help='the input file with the table in txt format.',required=True) + parser.add_argument('--inExtnddfile',metavar='input TXT file',type=str,help='the input file with the extended table in txt format.',required=True) + parser.add_argument('--output',metavar='output TXT file',type=str,help='the output file with the table in txt format.',required=True) + parser.add_argument('--columnENSEMBLT',metavar='column number',type=int,help='column with the ENSEMBL transcript code in the input file.',required=True) + parser.add_argument('--columnENSEMBLTExtndd',metavar='column number',type=int,help='column with the ENSEMBL transcript code in the extended file.',required=True) + parser.add_argument('--columnGOExtndd',metavar='column number',type=int,help='column with the GO terms in the extended file.',required=True) + + args = parser.parse_args() + + inSAPsfile = args.input + inExtnddfile = args.inExtnddfile + saleGOPCount = args.output + columnENSEMBLT = args.columnENSEMBLT + columnENSEMBLTExtndd = args.columnENSEMBLTExtndd + columnGOExtndd = args.columnGOExtndd + + #~ + dGOTENSEMBLT,ENSEMBLTGinGO=rtrnGOcENSEMBLc(inExtnddfile,columnENSEMBLTExtndd,columnGOExtndd) + sENSEMBLTSAPsinGO=rtrnENSEMBLcSAPs(inSAPsfile,columnENSEMBLT,ENSEMBLTGinGO) + outl=rtrnCounts(dGOTENSEMBLT,ENSEMBLTGinGO,sENSEMBLTSAPsinGO) + #~ + saleGOPCount=open(saleGOPCount,'w') + saleGOPCount.write('\n'.join(outl)) + saleGOPCount.close() + #~ + return 0 + +if __name__ == '__main__': + main() +